From db44427c1ff3d941d8f13caa0b5c597433963f80 Mon Sep 17 00:00:00 2001
From: Jan Bader <c.github@jan.javil.eu>
Date: Sat, 4 Apr 2026 20:46:48 +0200
Subject: [PATCH] Ignore some UI prompts

---
 extractors/instagram_extractor.py | 45 +++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/extractors/instagram_extractor.py b/extractors/instagram_extractor.py
index 0ab7b55..776b4db 100644
--- a/extractors/instagram_extractor.py
+++ b/extractors/instagram_extractor.py
@@ -94,13 +94,39 @@ class InstagramExtractor:
             single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
             return len(single_tokenish) / len(lines) > 0.7
 
+        def _looks_like_ui_prompt(text: str) -> bool:
+            """Return True if text contains a UI-prompt phrase starting at a word boundary."""
+            import re  # function-scope import keeps this helper self-contained
+            blockers = (
+                "use of cookies",  # also covers "allow the use of cookies"
+                "cookies and similar technologies",
+                "meta products",
+                "safer experience",
+                "information we receive from cookies",
+                "accept all",
+                "only allow essential",
+                "log in",  # matched at word start only, so "blog in" is not a prompt
+                "login",  # also covers "save your login info"
+                "sign up",
+                "sign in",  # matched at word start only, so "design in" is not a prompt
+                "turn on notifications",
+                "not now",
+            )
+            pattern = r"\b(?:" + "|".join(map(re.escape, blockers)) + r")"
+            return re.search(pattern, text.lower()) is not None
+
         # Try to get caption/description
         try:
             # Look for caption text
             captions = page.query_selector_all('h1, h2, span')
             for caption in captions:
                 text = caption.inner_text().strip()
-                if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
+                if (
+                    len(text) > 20
+                    and len(text) < 500
+                    and not _looks_like_language_list(text)
+                    and not _looks_like_ui_prompt(text)
+                ):
                     if not data["description"]:
                         data["description"] = text
                     break
@@ -162,10 +188,19 @@ class InstagramExtractor:
         except Exception as e:
             print(f"⚠️  Could not extract page text: {e}")
 
-        # Generate key points from description
-        if data["description"]:
-            sentences = data["description"].split('.')[:3]
-            data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]
+        # Generate key points from description or content
+        base_text = ""
+        if data["description"] and not _looks_like_ui_prompt(data["description"]):
+            base_text = data["description"]
+        elif data["content"]:
+            base_text = data["content"]
+
+        if base_text:
+            sentences = re.split(r'(?<=[.!?])\s+', base_text.strip())
+            data["key_points"] = [
+                s.strip() for s in sentences
+                if 20 < len(s.strip()) < 200
+            ][:3]
 
         # Add URL-based tags
         parsed = urlparse(self.url)