Ignore some ui prompts
This commit is contained in:
@@ -94,13 +94,39 @@ class InstagramExtractor:
|
|||||||
single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
|
single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
|
||||||
return len(single_tokenish) / len(lines) > 0.7
|
return len(single_tokenish) / len(lines) > 0.7
|
||||||
|
|
||||||
|
def _looks_like_ui_prompt(text: str) -> bool:
|
||||||
|
lowered = text.lower()
|
||||||
|
blockers = [
|
||||||
|
"allow the use of cookies",
|
||||||
|
"use of cookies",
|
||||||
|
"cookies and similar technologies",
|
||||||
|
"meta products",
|
||||||
|
"safer experience",
|
||||||
|
"information we receive from cookies",
|
||||||
|
"accept all",
|
||||||
|
"only allow essential",
|
||||||
|
"log in",
|
||||||
|
"login",
|
||||||
|
"sign up",
|
||||||
|
"sign in",
|
||||||
|
"save your login info",
|
||||||
|
"turn on notifications",
|
||||||
|
"not now",
|
||||||
|
]
|
||||||
|
return any(blocker in lowered for blocker in blockers)
|
||||||
|
|
||||||
# Try to get caption/description
|
# Try to get caption/description
|
||||||
try:
|
try:
|
||||||
# Look for caption text
|
# Look for caption text
|
||||||
captions = page.query_selector_all('h1, h2, span')
|
captions = page.query_selector_all('h1, h2, span')
|
||||||
for caption in captions:
|
for caption in captions:
|
||||||
text = caption.inner_text().strip()
|
text = caption.inner_text().strip()
|
||||||
if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
|
if (
|
||||||
|
len(text) > 20
|
||||||
|
and len(text) < 500
|
||||||
|
and not _looks_like_language_list(text)
|
||||||
|
and not _looks_like_ui_prompt(text)
|
||||||
|
):
|
||||||
if not data["description"]:
|
if not data["description"]:
|
||||||
data["description"] = text
|
data["description"] = text
|
||||||
break
|
break
|
||||||
@@ -162,10 +188,19 @@ class InstagramExtractor:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ Could not extract page text: {e}")
|
print(f"⚠️ Could not extract page text: {e}")
|
||||||
|
|
||||||
# Generate key points from description
|
# Generate key points from description or content
|
||||||
if data["description"]:
|
base_text = ""
|
||||||
sentences = data["description"].split('.')[:3]
|
if data["description"] and not _looks_like_ui_prompt(data["description"]):
|
||||||
data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]
|
base_text = data["description"]
|
||||||
|
elif data["content"]:
|
||||||
|
base_text = data["content"]
|
||||||
|
|
||||||
|
if base_text:
|
||||||
|
sentences = re.split(r'(?<=[.!?])\s+', base_text.strip())
|
||||||
|
data["key_points"] = [
|
||||||
|
s.strip() for s in sentences
|
||||||
|
if 20 < len(s.strip()) < 200
|
||||||
|
][:3]
|
||||||
|
|
||||||
# Add URL-based tags
|
# Add URL-based tags
|
||||||
parsed = urlparse(self.url)
|
parsed = urlparse(self.url)
|
||||||
|
|||||||
Reference in New Issue
Block a user