Ignore language list

This commit is contained in:
Jan Bader
2026-04-04 20:41:00 +02:00
parent 75a4ab20fd
commit 99ba4f6ac8

View File

@@ -84,13 +84,23 @@ class InstagramExtractor:
"tags": ["instagram", "reel"],
}
def _looks_like_language_list(text: str) -> bool:
lines = [line.strip() for line in text.splitlines() if line.strip()]
if len(lines) < 8:
return False
short_lines = [line for line in lines if len(line) <= 20]
if len(short_lines) / len(lines) < 0.8:
return False
single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
return len(single_tokenish) / len(lines) > 0.7
# Try to get caption/description
try:
# Look for caption text
captions = page.query_selector_all('h1, h2, span')
for caption in captions:
text = caption.inner_text()
if len(text) > 20 and len(text) < 500:
text = caption.inner_text().strip()
if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
if not data["description"]:
data["description"] = text
break
@@ -120,9 +130,32 @@ class InstagramExtractor:
# Filter for meaningful content
lines = body_text.split('\n')
cleaned_lines = []
buffer = []
def flush_buffer():
    """Move the pending run of short lines into ``cleaned_lines``.

    The run is discarded instead when ``_looks_like_language_list`` flags
    it. Either way the pending run is emptied afterwards.
    """
    # Nothing pending: nothing to classify or emit.
    if not buffer:
        return
    candidate = "\n".join(buffer)
    flagged = _looks_like_language_list(candidate)
    if not flagged:
        cleaned_lines.extend(buffer)
    buffer.clear()
for line in lines:
stripped = line.strip()
if not stripped:
flush_buffer()
continue
if len(stripped) <= 24:
buffer.append(stripped)
else:
flush_buffer()
cleaned_lines.append(stripped)
flush_buffer()
meaningful_lines = [
line.strip() for line in lines
if len(line.strip()) > 30 and len(line.strip()) < 300
line for line in cleaned_lines
if len(line) > 30 and len(line) < 300
]
data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]