Ignore some UI prompts
This commit is contained in:
@@ -94,13 +94,39 @@ class InstagramExtractor:
|
||||
single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
|
||||
return len(single_tokenish) / len(lines) > 0.7
|
||||
|
||||
def _looks_like_ui_prompt(text: str) -> bool:
|
||||
lowered = text.lower()
|
||||
blockers = [
|
||||
"allow the use of cookies",
|
||||
"use of cookies",
|
||||
"cookies and similar technologies",
|
||||
"meta products",
|
||||
"safer experience",
|
||||
"information we receive from cookies",
|
||||
"accept all",
|
||||
"only allow essential",
|
||||
"log in",
|
||||
"login",
|
||||
"sign up",
|
||||
"sign in",
|
||||
"save your login info",
|
||||
"turn on notifications",
|
||||
"not now",
|
||||
]
|
||||
return any(blocker in lowered for blocker in blockers)
|
||||
|
||||
# Try to get caption/description
|
||||
try:
|
||||
# Look for caption text
|
||||
captions = page.query_selector_all('h1, h2, span')
|
||||
for caption in captions:
|
||||
text = caption.inner_text().strip()
|
||||
if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
|
||||
if (
|
||||
len(text) > 20
|
||||
and len(text) < 500
|
||||
and not _looks_like_language_list(text)
|
||||
and not _looks_like_ui_prompt(text)
|
||||
):
|
||||
if not data["description"]:
|
||||
data["description"] = text
|
||||
break
|
||||
@@ -162,10 +188,19 @@ class InstagramExtractor:
|
||||
except Exception as e:
|
||||
print(f"⚠️ Could not extract page text: {e}")
|
||||
|
||||
# Generate key points from description
|
||||
if data["description"]:
|
||||
sentences = data["description"].split('.')[:3]
|
||||
data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]
|
||||
# Generate key points from description or content
|
||||
base_text = ""
|
||||
if data["description"] and not _looks_like_ui_prompt(data["description"]):
|
||||
base_text = data["description"]
|
||||
elif data["content"]:
|
||||
base_text = data["content"]
|
||||
|
||||
if base_text:
|
||||
sentences = re.split(r'(?<=[.!?])\s+', base_text.strip())
|
||||
data["key_points"] = [
|
||||
s.strip() for s in sentences
|
||||
if 20 < len(s.strip()) < 200
|
||||
][:3]
|
||||
|
||||
# Add URL-based tags
|
||||
parsed = urlparse(self.url)
|
||||
|
||||
Reference in New Issue
Block a user