Ignore language list

2026-04-04 20:41:00 +02:00
parent 75a4ab20fd
commit 99ba4f6ac8
1 changed files with 61 additions and 28 deletions
@@ -84,13 +84,23 @@ class InstagramExtractor:
            "tags": ["instagram", "reel"],
        }

+        def _looks_like_language_list(text: str) -> bool:  # Heuristic: True when text is a long run of short, 1-2 word lines.
+            lines = [line.strip() for line in text.splitlines() if line.strip()]  # trimmed, blank lines dropped
+            if len(lines) < 8:  # fewer than 8 non-blank lines is never treated as a list
+                return False
+            short_lines = [line for line in lines if len(line) <= 20]
+            if len(short_lines) / len(lines) < 0.8:  # at least 80% of the lines must be <= 20 chars
+                return False
+            single_tokenish = [line for line in short_lines if len(line.split()) <= 2]  # short lines of at most two words
+            return len(single_tokenish) / len(lines) > 0.7  # ratio is taken over ALL lines, not only the short ones
+
        # Try to get caption/description
        try:
            # Look for caption text
            captions = page.query_selector_all('h1, h2, span')
            for caption in captions:
-                text = caption.inner_text()
-                if len(text) > 20 and len(text) < 500:
+                text = caption.inner_text().strip()
+                if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
                    if not data["description"]:
                        data["description"] = text
                    break
@@ -120,9 +130,32 @@ class InstagramExtractor:

            # Filter for meaningful content
            lines = body_text.split('\n')
+            cleaned_lines = []
+            buffer = []
+
+            def flush_buffer():  # Move the pending run of short lines into cleaned_lines unless it looks like a language list.
+                if buffer:
+                    block = "\n".join(buffer)  # re-join so the heuristic sees the run as one multi-line text
+                    if not _looks_like_language_list(block):
+                        cleaned_lines.extend(buffer)  # NOTE(review): buffered lines are <= 24 chars, the later filter keeps only > 30 - looks redundant, confirm
+                    buffer.clear()  # emptied in place (no rebinding), so the enclosing scope keeps the same list object
+
+            for line in lines:
+                stripped = line.strip()
+                if not stripped:
+                    flush_buffer()
+                    continue
+                if len(stripped) <= 24:
+                    buffer.append(stripped)
+                else:
+                    flush_buffer()
+                    cleaned_lines.append(stripped)
+
+            flush_buffer()
+
            meaningful_lines = [
-                line.strip() for line in lines 
-                if len(line.strip()) > 30 and len(line.strip()) < 300
+                line for line in cleaned_lines
+                if len(line) > 30 and len(line) < 300
            ]

            data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]