Ignore some UI prompts

This commit is contained in:
Jan Bader
2026-04-04 20:46:48 +02:00
parent 99ba4f6ac8
commit db44427c1f

View File

@@ -94,13 +94,39 @@ class InstagramExtractor:
single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
return len(single_tokenish) / len(lines) > 0.7
def _looks_like_ui_prompt(text: str) -> bool:
    """Return True when *text* contains a known UI-prompt phrase.

    The phrases cover cookie-consent banners, login / sign-up dialogs and
    notification prompts, so such text is not mistaken for a caption.

    Matching is case-insensitive and anchored on word boundaries, so a
    phrase such as "log in" no longer fires inside "blog in", nor
    "sign in" inside "design inspiration". Whitespace runs (newlines,
    tabs, non-breaking spaces) are collapsed to single spaces first so a
    prompt that is split across lines is still recognised.

    Args:
        text: Visible text taken from a page element.

    Returns:
        True if any blocker phrase occurs in ``text`` as whole words,
        False otherwise (including for empty text).
    """
    # Local import keeps this helper self-contained wherever it is nested.
    import re

    blockers = (
        "allow the use of cookies",
        "use of cookies",
        "cookies and similar technologies",
        "meta products",
        "safer experience",
        "information we receive from cookies",
        "accept all",
        "only allow essential",
        "log in",
        "login",
        "sign up",
        "sign in",
        "save your login info",
        "turn on notifications",
        "not now",
    )

    # str.split() with no argument splits on any Unicode whitespace, so
    # NBSP and line breaks between the words of a prompt are normalised.
    normalized = " ".join(text.lower().split())

    # \b on both sides prevents matches that start or end mid-word.
    pattern = r"\b(?:" + "|".join(re.escape(phrase) for phrase in blockers) + r")\b"
    return re.search(pattern, normalized) is not None
# Try to get caption/description
try:
# Look for caption text
captions = page.query_selector_all('h1, h2, span')
for caption in captions:
text = caption.inner_text().strip()
if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
if (
len(text) > 20
and len(text) < 500
and not _looks_like_language_list(text)
and not _looks_like_ui_prompt(text)
):
if not data["description"]:
data["description"] = text
break
@@ -162,10 +188,19 @@ class InstagramExtractor:
except Exception as e:
print(f"⚠️ Could not extract page text: {e}")
# Generate key points from description
if data["description"]:
sentences = data["description"].split('.')[:3]
data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]
# Generate key points from description or content
base_text = ""
if data["description"] and not _looks_like_ui_prompt(data["description"]):
base_text = data["description"]
elif data["content"]:
base_text = data["content"]
if base_text:
sentences = re.split(r'(?<=[.!?])\s+', base_text.strip())
data["key_points"] = [
s.strip() for s in sentences
if 20 < len(s.strip()) < 200
][:3]
# Add URL-based tags
parsed = urlparse(self.url)