From db44427c1ff3d941d8f13caa0b5c597433963f80 Mon Sep 17 00:00:00 2001 From: Jan Bader Date: Sat, 4 Apr 2026 20:46:48 +0200 Subject: [PATCH] Ignore some ui prompts --- extractors/instagram_extractor.py | 45 +++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/extractors/instagram_extractor.py b/extractors/instagram_extractor.py index 0ab7b55..776b4db 100644 --- a/extractors/instagram_extractor.py +++ b/extractors/instagram_extractor.py @@ -94,13 +94,39 @@ class InstagramExtractor: single_tokenish = [line for line in short_lines if len(line.split()) <= 2] return len(single_tokenish) / len(lines) > 0.7 + def _looks_like_ui_prompt(text: str) -> bool: + lowered = text.lower() + blockers = [ + "allow the use of cookies", + "use of cookies", + "cookies and similar technologies", + "meta products", + "safer experience", + "information we receive from cookies", + "accept all", + "only allow essential", + "log in", + "login", + "sign up", + "sign in", + "save your login info", + "turn on notifications", + "not now", + ] + return any(blocker in lowered for blocker in blockers) + # Try to get caption/description try: # Look for caption text captions = page.query_selector_all('h1, h2, span') for caption in captions: text = caption.inner_text().strip() - if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text): + if ( + len(text) > 20 + and len(text) < 500 + and not _looks_like_language_list(text) + and not _looks_like_ui_prompt(text) + ): if not data["description"]: data["description"] = text break @@ -162,10 +188,19 @@ class InstagramExtractor: except Exception as e: print(f"⚠️ Could not extract page text: {e}") - # Generate key points from description - if data["description"]: - sentences = data["description"].split('.')[:3] - data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20] + # Generate key points from description or content + base_text = "" + if data["description"] and not _looks_like_ui_prompt(data["description"]): + base_text = data["description"] + elif data["content"]: + base_text = data["content"] + + if base_text: + sentences = re.split(r'(?<=[.!?])\s+', base_text.strip()) + data["key_points"] = [ + s.strip() for s in sentences + if 20 < len(s.strip()) < 200 + ][:3] # Add URL-based tags parsed = urlparse(self.url)