Ignore language list

This commit is contained in:
Jan Bader
2026-04-04 20:41:00 +02:00
parent 75a4ab20fd
commit 99ba4f6ac8

View File

@@ -24,15 +24,15 @@ except ImportError:
class InstagramExtractor: class InstagramExtractor:
"""Extract content from Instagram reels.""" """Extract content from Instagram reels."""
def __init__(self, url: str, headless: bool = True): def __init__(self, url: str, headless: bool = True):
self.url = url self.url = url
self.headless = headless self.headless = headless
self.data = {} self.data = {}
if sync_playwright is None: if sync_playwright is None:
raise ImportError("playwright not installed. Run: pip install playwright && playwright install") raise ImportError("playwright not installed. Run: pip install playwright && playwright install")
def extract(self) -> Dict[str, Any]: def extract(self) -> Dict[str, Any]:
"""Extract content from Instagram reel.""" """Extract content from Instagram reel."""
try: try:
@@ -41,28 +41,28 @@ class InstagramExtractor:
page = browser.new_page( page = browser.new_page(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
) )
# Navigate to the reel # Navigate to the reel
print(f"📱 Loading Instagram reel...") print(f"📱 Loading Instagram reel...")
page.goto(self.url, timeout=30000) page.goto(self.url, timeout=30000)
# Wait for content to load # Wait for content to load
time.sleep(3) time.sleep(3)
# Try to close any cookies/login prompts # Try to close any cookies/login prompts
try: try:
page.click('button:has-text("Not Now")', timeout=3000) page.click('button:has-text("Not Now")', timeout=3000)
except: except:
pass pass
try: try:
page.click('button:has-text("Allow")', timeout=3000) page.click('button:has-text("Allow")', timeout=3000)
except: except:
pass pass
# Extract data # Extract data
self.data = self._extract_data(page) self.data = self._extract_data(page)
browser.close() browser.close()
except PlaywrightTimeout: except PlaywrightTimeout:
print("⚠️ Timeout loading Instagram page") print("⚠️ Timeout loading Instagram page")
@@ -70,9 +70,9 @@ class InstagramExtractor:
except Exception as e: except Exception as e:
print(f"⚠️ Error: {str(e)}") print(f"⚠️ Error: {str(e)}")
self.data = self._fallback_extract() self.data = self._fallback_extract()
return self.data return self.data
def _extract_data(self, page) -> Dict[str, Any]: def _extract_data(self, page) -> Dict[str, Any]:
"""Extract data from loaded page.""" """Extract data from loaded page."""
data = { data = {
@@ -83,20 +83,30 @@ class InstagramExtractor:
"key_points": [], "key_points": [],
"tags": ["instagram", "reel"], "tags": ["instagram", "reel"],
} }
def _looks_like_language_list(
    text: str,
    *,
    min_lines: int = 8,
    max_line_len: int = 20,
    max_words: int = 2,
    min_short_ratio: float = 0.8,
    min_token_ratio: float = 0.7,
) -> bool:
    """Return True when *text* is shaped like a list of short labels.

    The check is purely structural: the text is split into lines, blank
    lines are dropped, and the block is flagged when it has many lines
    that are almost all short and consist of only a word or two. It is
    used to reject blocks such as a list of language names; it does not
    inspect the words themselves.

    Args:
        text: Block of text to classify; may contain blank lines.
        min_lines: Minimum number of non-blank lines required before a
            block can be flagged at all.
        max_line_len: Longest line (in characters, after stripping) that
            still counts as "short".
        max_words: Most whitespace-separated words a short line may have
            to count as a single label.
        min_short_ratio: Fraction of lines that must be short; below
            this the block is not flagged.
        min_token_ratio: Fraction of all lines that must be short labels;
            the block is flagged only when this is strictly exceeded.

    Returns:
        True if the block matches the list shape, otherwise False.
    """
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    total = len(lines)
    # The explicit zero check keeps the divisions below safe even when a
    # caller passes a non-positive ``min_lines``.
    if total == 0 or total < min_lines:
        return False

    short_lines = [line for line in lines if len(line) <= max_line_len]
    if len(short_lines) / total < min_short_ratio:
        return False

    # Ratio is taken against *all* lines, not just the short ones, so a
    # few long lines still count against the block.
    label_count = sum(1 for line in short_lines if len(line.split()) <= max_words)
    return label_count / total > min_token_ratio
# Try to get caption/description # Try to get caption/description
try: try:
# Look for caption text # Look for caption text
captions = page.query_selector_all('h1, h2, span') captions = page.query_selector_all('h1, h2, span')
for caption in captions: for caption in captions:
text = caption.inner_text() text = caption.inner_text().strip()
if len(text) > 20 and len(text) < 500: if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
if not data["description"]: if not data["description"]:
data["description"] = text data["description"] = text
break break
except Exception as e: except Exception as e:
print(f"⚠️ Could not extract caption: {e}") print(f"⚠️ Could not extract caption: {e}")
# Try to get author # Try to get author
try: try:
author_elem = page.query_selector('a[href*="/"] h1, a[href*="/"] h2, header span') author_elem = page.query_selector('a[href*="/"] h1, a[href*="/"] h2, header span')
@@ -104,7 +114,7 @@ class InstagramExtractor:
data["author"] = author_elem.inner_text().strip() data["author"] = author_elem.inner_text().strip()
except: except:
pass pass
# Try to get engagement metrics # Try to get engagement metrics
try: try:
likes_elem = page.query_selector('span:has-text("likes"), span:has-text("views")') likes_elem = page.query_selector('span:has-text("likes"), span:has-text("views")')
@@ -112,41 +122,64 @@ class InstagramExtractor:
data["views"] = likes_elem.inner_text().strip() data["views"] = likes_elem.inner_text().strip()
except: except:
pass pass
# Extract any visible text as content # Extract any visible text as content
try: try:
# Get all text content # Get all text content
body_text = page.inner_text('body') body_text = page.inner_text('body')
# Filter for meaningful content # Filter for meaningful content
lines = body_text.split('\n') lines = body_text.split('\n')
cleaned_lines = []
buffer = []
def flush_buffer():
    """Move the pending run of short lines into ``cleaned_lines``.

    The buffered lines are joined into one block and handed to
    ``_looks_like_language_list``; the run is kept only when that check
    rejects it. The buffer is emptied either way so the next run starts
    fresh.

    NOTE(review): the caller only buffers lines of at most 24 characters,
    while the later ``meaningful_lines`` filter keeps only lines longer
    than 30 characters, so lines re-added here appear never to reach the
    final content -- confirm whether that is intended.
    """
    if not buffer:
        return
    candidate = "\n".join(buffer)
    if not _looks_like_language_list(candidate):
        cleaned_lines.extend(buffer)
    buffer.clear()
for line in lines:
stripped = line.strip()
if not stripped:
flush_buffer()
continue
if len(stripped) <= 24:
buffer.append(stripped)
else:
flush_buffer()
cleaned_lines.append(stripped)
flush_buffer()
meaningful_lines = [ meaningful_lines = [
line.strip() for line in lines line for line in cleaned_lines
if len(line.strip()) > 30 and len(line.strip()) < 300 if len(line) > 30 and len(line) < 300
] ]
data["content"] = "\n\n".join(meaningful_lines[:10])[:5000] data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]
except Exception as e: except Exception as e:
print(f"⚠️ Could not extract page text: {e}") print(f"⚠️ Could not extract page text: {e}")
# Generate key points from description # Generate key points from description
if data["description"]: if data["description"]:
sentences = data["description"].split('.')[:3] sentences = data["description"].split('.')[:3]
data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20] data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]
# Add URL-based tags # Add URL-based tags
parsed = urlparse(self.url) parsed = urlparse(self.url)
if '/reel/' in parsed.path: if '/reel/' in parsed.path:
data["tags"].append("reel") data["tags"].append("reel")
if '/video/' in parsed.path: if '/video/' in parsed.path:
data["tags"].append("video") data["tags"].append("video")
return data return data
def _fallback_extract(self) -> Dict[str, Any]: def _fallback_extract(self) -> Dict[str, Any]:
"""Fallback extraction when browser automation fails.""" """Fallback extraction when browser automation fails."""
print("⚠️ Using fallback extraction method...") print("⚠️ Using fallback extraction method...")
# Try to extract what we can from the URL itself # Try to extract what we can from the URL itself
data = { data = {
"title": "Instagram Content", "title": "Instagram Content",
@@ -159,7 +192,7 @@ class InstagramExtractor:
], ],
"tags": ["instagram", "fallback"], "tags": ["instagram", "fallback"],
} }
# Extract reel ID from URL # Extract reel ID from URL
try: try:
parsed = urlparse(self.url) parsed = urlparse(self.url)
@@ -171,5 +204,5 @@ class InstagramExtractor:
break break
except: except:
pass pass
return data return data