Ignore some UI prompts

This commit is contained in:
Jan Bader
2026-04-04 20:46:48 +02:00
parent 99ba4f6ac8
commit db44427c1f

View File

@@ -94,13 +94,39 @@ class InstagramExtractor:
single_tokenish = [line for line in short_lines if len(line.split()) <= 2] single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
return len(single_tokenish) / len(lines) > 0.7 return len(single_tokenish) / len(lines) > 0.7
def _looks_like_ui_prompt(text: str) -> bool:
lowered = text.lower()
blockers = [
"allow the use of cookies",
"use of cookies",
"cookies and similar technologies",
"meta products",
"safer experience",
"information we receive from cookies",
"accept all",
"only allow essential",
"log in",
"login",
"sign up",
"sign in",
"save your login info",
"turn on notifications",
"not now",
]
return any(blocker in lowered for blocker in blockers)
# Try to get caption/description # Try to get caption/description
try: try:
# Look for caption text # Look for caption text
captions = page.query_selector_all('h1, h2, span') captions = page.query_selector_all('h1, h2, span')
for caption in captions: for caption in captions:
text = caption.inner_text().strip() text = caption.inner_text().strip()
if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text): if (
len(text) > 20
and len(text) < 500
and not _looks_like_language_list(text)
and not _looks_like_ui_prompt(text)
):
if not data["description"]: if not data["description"]:
data["description"] = text data["description"] = text
break break
@@ -162,10 +188,19 @@ class InstagramExtractor:
except Exception as e: except Exception as e:
print(f"⚠️ Could not extract page text: {e}") print(f"⚠️ Could not extract page text: {e}")
# Generate key points from description # Generate key points from description or content
if data["description"]: base_text = ""
sentences = data["description"].split('.')[:3] if data["description"] and not _looks_like_ui_prompt(data["description"]):
data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20] base_text = data["description"]
elif data["content"]:
base_text = data["content"]
if base_text:
sentences = re.split(r'(?<=[.!?])\s+', base_text.strip())
data["key_points"] = [
s.strip() for s in sentences
if 20 < len(s.strip()) < 200
][:3]
# Add URL-based tags # Add URL-based tags
parsed = urlparse(self.url) parsed = urlparse(self.url)