From 99ba4f6ac847f27f62cb8a419384f0c16b59a29d Mon Sep 17 00:00:00 2001 From: Jan Bader Date: Sat, 4 Apr 2026 20:41:00 +0200 Subject: [PATCH] Ignore language list --- extractors/instagram_extractor.py | 89 +++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 28 deletions(-) diff --git a/extractors/instagram_extractor.py b/extractors/instagram_extractor.py index c0a110a..0ab7b55 100644 --- a/extractors/instagram_extractor.py +++ b/extractors/instagram_extractor.py @@ -24,15 +24,15 @@ except ImportError: class InstagramExtractor: """Extract content from Instagram reels.""" - + def __init__(self, url: str, headless: bool = True): self.url = url self.headless = headless self.data = {} - + if sync_playwright is None: raise ImportError("playwright not installed. Run: pip install playwright && playwright install") - + def extract(self) -> Dict[str, Any]: """Extract content from Instagram reel.""" try: @@ -41,28 +41,28 @@ class InstagramExtractor: page = browser.new_page( user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" ) - + # Navigate to the reel print(f"📱 Loading Instagram reel...") page.goto(self.url, timeout=30000) - + # Wait for content to load time.sleep(3) - + # Try to close any cookies/login prompts try: page.click('button:has-text("Not Now")', timeout=3000) except: pass - + try: page.click('button:has-text("Allow")', timeout=3000) except: pass - + # Extract data self.data = self._extract_data(page) - + browser.close() except PlaywrightTimeout: print("⚠️ Timeout loading Instagram page") @@ -70,9 +70,9 @@ class InstagramExtractor: except Exception as e: print(f"⚠️ Error: {str(e)}") self.data = self._fallback_extract() - + return self.data - + def _extract_data(self, page) -> Dict[str, Any]: """Extract data from loaded page.""" data = { @@ -83,20 +83,30 @@ class InstagramExtractor: "key_points": [], "tags": ["instagram", "reel"], } - + + def _looks_like_language_list(text: str) -> bool: + lines = [line.strip() for line in text.splitlines() if line.strip()] + if len(lines) < 8: + return False + short_lines = [line for line in lines if len(line) <= 20] + if len(short_lines) / len(lines) < 0.8: + return False + single_tokenish = [line for line in short_lines if len(line.split()) <= 2] + return len(single_tokenish) / len(lines) > 0.7 + # Try to get caption/description try: # Look for caption text captions = page.query_selector_all('h1, h2, span') for caption in captions: - text = caption.inner_text() - if len(text) > 20 and len(text) < 500: + text = caption.inner_text().strip() + if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text): if not data["description"]: data["description"] = text break except Exception as e: print(f"⚠️ Could not extract caption: {e}") - + # Try to get author try: author_elem = page.query_selector('a[href*="/"] h1, a[href*="/"] h2, header span') @@ -104,7 +114,7 @@ class InstagramExtractor: data["author"] = author_elem.inner_text().strip() except: pass - + # Try to get engagement metrics try: likes_elem = page.query_selector('span:has-text("likes"), span:has-text("views")') @@ -112,41 +122,64 @@ class InstagramExtractor: data["views"] = likes_elem.inner_text().strip() except: pass - + # Extract any visible text as content try: # Get all text content body_text = page.inner_text('body') - + # Filter for meaningful content lines = body_text.split('\n') + cleaned_lines = [] + buffer = [] + + def flush_buffer(): + if buffer: + block = "\n".join(buffer) + if not _looks_like_language_list(block): + cleaned_lines.extend(buffer) + buffer.clear() + + for line in lines: + stripped = line.strip() + if not stripped: + flush_buffer() + continue + if len(stripped) <= 24: + buffer.append(stripped) + else: + flush_buffer() + cleaned_lines.append(stripped) + + flush_buffer() + meaningful_lines = [ - line.strip() for line in lines - if len(line.strip()) > 30 and len(line.strip()) < 300 + line for line in cleaned_lines + if len(line) > 30 and len(line) < 300 ] - + data["content"] = "\n\n".join(meaningful_lines[:10])[:5000] except Exception as e: print(f"⚠️ Could not extract page text: {e}") - + # Generate key points from description if data["description"]: sentences = data["description"].split('.')[:3] data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20] - + # Add URL-based tags parsed = urlparse(self.url) if '/reel/' in parsed.path: data["tags"].append("reel") if '/video/' in parsed.path: data["tags"].append("video") - + return data - + def _fallback_extract(self) -> Dict[str, Any]: """Fallback extraction when browser automation fails.""" print("⚠️ Using fallback extraction method...") - + # Try to extract what we can from the URL itself data = { "title": "Instagram Content", @@ -159,7 +192,7 @@ class InstagramExtractor: ], "tags": ["instagram", "fallback"], } - + # Extract reel ID from URL try: parsed = urlparse(self.url) @@ -171,5 +204,5 @@ class InstagramExtractor: break except: pass - + return data