From 99ba4f6ac847f27f62cb8a419384f0c16b59a29d Mon Sep 17 00:00:00 2001
From: Jan Bader <c.github@jan.javil.eu>
Date: Sat, 4 Apr 2026 20:41:00 +0200
Subject: [PATCH] Ignore language list

---
 extractors/instagram_extractor.py | 89 +++++++++++++++++++++----------
 1 file changed, 61 insertions(+), 28 deletions(-)

diff --git a/extractors/instagram_extractor.py b/extractors/instagram_extractor.py
index c0a110a..0ab7b55 100644
--- a/extractors/instagram_extractor.py
+++ b/extractors/instagram_extractor.py
@@ -24,15 +24,15 @@ except ImportError:
 
 class InstagramExtractor:
     """Extract content from Instagram reels."""
-    
+
     def __init__(self, url: str, headless: bool = True):
         self.url = url
         self.headless = headless
         self.data = {}
-        
+
         if sync_playwright is None:
             raise ImportError("playwright not installed. Run: pip install playwright && playwright install")
-    
+
     def extract(self) -> Dict[str, Any]:
         """Extract content from Instagram reel."""
         try:
@@ -41,28 +41,28 @@ class InstagramExtractor:
                 page = browser.new_page(
                     user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                 )
-                
+
                 # Navigate to the reel
                 print(f"📱 Loading Instagram reel...")
                 page.goto(self.url, timeout=30000)
-                
+
                 # Wait for content to load
                 time.sleep(3)
-                
+
                 # Try to close any cookies/login prompts
                 try:
                     page.click('button:has-text("Not Now")', timeout=3000)
                 except:
                     pass
-                
+
                 try:
                     page.click('button:has-text("Allow")', timeout=3000)
                 except:
                     pass
-                
+
                 # Extract data
                 self.data = self._extract_data(page)
-                
+
                 browser.close()
         except PlaywrightTimeout:
             print("⚠️  Timeout loading Instagram page")
@@ -70,9 +70,9 @@ class InstagramExtractor:
         except Exception as e:
             print(f"⚠️  Error: {str(e)}")
             self.data = self._fallback_extract()
-        
+
         return self.data
-    
+
     def _extract_data(self, page) -> Dict[str, Any]:
         """Extract data from loaded page."""
         data = {
@@ -83,20 +83,30 @@ class InstagramExtractor:
             "key_points": [],
             "tags": ["instagram", "reel"],
         }
-        
+
+        def _looks_like_language_list(text: str) -> bool:  # heuristic: True when text is mostly short, one/two-word lines (presumably a language-picker list; confirm)
+            lines = [line.strip() for line in text.splitlines() if line.strip()]  # trimmed, non-blank lines only
+            if len(lines) < 8:  # fewer than 8 non-blank lines is never treated as a list
+                return False
+            short_lines = [line for line in lines if len(line) <= 20]  # lines of at most 20 characters
+            if len(short_lines) / len(lines) < 0.8:  # require at least 80% of the lines to be short
+                return False
+            single_tokenish = [line for line in short_lines if len(line.split()) <= 2]  # short lines with at most two whitespace-separated words
+            return len(single_tokenish) / len(lines) > 0.7  # more than 70% of ALL lines must be such one/two-word lines
+
         # Try to get caption/description
         try:
             # Look for caption text
             captions = page.query_selector_all('h1, h2, span')
             for caption in captions:
-                text = caption.inner_text()
-                if len(text) > 20 and len(text) < 500:
+                text = caption.inner_text().strip()
+                if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
                     if not data["description"]:
                         data["description"] = text
                     break
         except Exception as e:
             print(f"⚠️  Could not extract caption: {e}")
-        
+
         # Try to get author
         try:
             author_elem = page.query_selector('a[href*="/"] h1, a[href*="/"] h2, header span')
@@ -104,7 +114,7 @@ class InstagramExtractor:
                 data["author"] = author_elem.inner_text().strip()
         except:
             pass
-        
+
         # Try to get engagement metrics
         try:
             likes_elem = page.query_selector('span:has-text("likes"), span:has-text("views")')
@@ -112,41 +122,64 @@ class InstagramExtractor:
                 data["views"] = likes_elem.inner_text().strip()
         except:
             pass
-        
+
         # Extract any visible text as content
         try:
             # Get all text content
             body_text = page.inner_text('body')
-            
+
             # Filter for meaningful content
             lines = body_text.split('\n')
+            cleaned_lines = []
+            buffer = []
+
+            def flush_buffer():  # move the pending run of short lines into cleaned_lines unless the run looks like a language list
+                if buffer:  # nothing to do when no short lines are pending
+                    block = "\n".join(buffer)  # re-join so the heuristic sees the run as one multi-line text
+                    if not _looks_like_language_list(block):  # a run flagged by the heuristic is dropped as a whole
+                        cleaned_lines.extend(buffer)  # NOTE(review): only lines <= 24 chars are buffered and the later `len(line) > 30` filter drops them anyway; confirm this filtering changes `content` at all
+                    buffer.clear()  # empty the list in place; `buffer` is the enclosing function's list, never rebound here
+
+            for line in lines:
+                stripped = line.strip()
+                if not stripped:
+                    flush_buffer()
+                    continue
+                if len(stripped) <= 24:
+                    buffer.append(stripped)
+                else:
+                    flush_buffer()
+                    cleaned_lines.append(stripped)
+
+            flush_buffer()
+
             meaningful_lines = [
-                line.strip() for line in lines 
-                if len(line.strip()) > 30 and len(line.strip()) < 300
+                line for line in cleaned_lines
+                if len(line) > 30 and len(line) < 300
             ]
-            
+
             data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]
         except Exception as e:
             print(f"⚠️  Could not extract page text: {e}")
-        
+
         # Generate key points from description
         if data["description"]:
             sentences = data["description"].split('.')[:3]
             data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]
-        
+
         # Add URL-based tags
         parsed = urlparse(self.url)
         if '/reel/' in parsed.path:
             data["tags"].append("reel")
         if '/video/' in parsed.path:
             data["tags"].append("video")
-        
+
         return data
-    
+
     def _fallback_extract(self) -> Dict[str, Any]:
         """Fallback extraction when browser automation fails."""
         print("⚠️  Using fallback extraction method...")
-        
+
         # Try to extract what we can from the URL itself
         data = {
             "title": "Instagram Content",
@@ -159,7 +192,7 @@ class InstagramExtractor:
             ],
             "tags": ["instagram", "fallback"],
         }
-        
+
         # Extract reel ID from URL
         try:
             parsed = urlparse(self.url)
@@ -171,5 +204,5 @@ class InstagramExtractor:
                     break
         except:
             pass
-        
+
         return data