Ignore some UI prompts

2026-04-04 20:46:48 +02:00
parent 99ba4f6ac8
commit db44427c1f
1 changed files with 40 additions and 5 deletions
@@ -94,13 +94,39 @@ class InstagramExtractor:
            single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
            return len(single_tokenish) / len(lines) > 0.7
        def _looks_like_ui_prompt(text: str) -> bool:
            """Return True when *text* contains a known UI-prompt phrase.

            The input is lower-cased and then checked, by plain substring
            match, against a fixed set of phrases (cookie-consent wording,
            login / sign-up wording, notification prompts). Any single hit
            is enough to classify the text as a UI prompt.
            """
            haystack = text.lower()
            ui_phrases = (
                "allow the use of cookies",
                "use of cookies",
                "cookies and similar technologies",
                "meta products",
                "safer experience",
                "information we receive from cookies",
                "accept all",
                "only allow essential",
                "log in",
                "login",
                "sign up",
                "sign in",
                "save your login info",
                "turn on notifications",
                "not now",
            )
            # Substring match: a phrase counts wherever it appears in the text.
            for phrase in ui_phrases:
                if phrase in haystack:
                    return True
            return False
        # Try to get caption/description
        try:
            # Look for caption text
            captions = page.query_selector_all('h1, h2, span')
            for caption in captions:
                text = caption.inner_text().strip()
-                if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
+                if (
                    len(text) > 20
                    and len(text) < 500
                    and not _looks_like_language_list(text)
                    and not _looks_like_ui_prompt(text)
                ):
                    if not data["description"]:
                        data["description"] = text
                    break
@@ -162,10 +188,19 @@ class InstagramExtractor:
        except Exception as e:
            print(f"⚠️  Could not extract page text: {e}")
-        # Generate key points from description
+        # Generate key points from description or content
-        if data["description"]:
+        base_text = ""
-            sentences = data["description"].split('.')[:3]
+        if data["description"] and not _looks_like_ui_prompt(data["description"]):
-            data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]
+            base_text = data["description"]
        elif data["content"]:
            base_text = data["content"]
        if base_text:
            sentences = re.split(r'(?<=[.!?])\s+', base_text.strip())
            data["key_points"] = [
                s.strip() for s in sentences
                if 20 < len(s.strip()) < 200
            ][:3]
        # Add URL-based tags
        parsed = urlparse(self.url)