Ignore language list
This commit is contained in:
@@ -84,13 +84,23 @@ class InstagramExtractor:
|
|||||||
"tags": ["instagram", "reel"],
|
"tags": ["instagram", "reel"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _looks_like_language_list(text: str) -> bool:
|
||||||
|
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
||||||
|
if len(lines) < 8:
|
||||||
|
return False
|
||||||
|
short_lines = [line for line in lines if len(line) <= 20]
|
||||||
|
if len(short_lines) / len(lines) < 0.8:
|
||||||
|
return False
|
||||||
|
single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
|
||||||
|
return len(single_tokenish) / len(lines) > 0.7
|
||||||
|
|
||||||
# Try to get caption/description
|
# Try to get caption/description
|
||||||
try:
|
try:
|
||||||
# Look for caption text
|
# Look for caption text
|
||||||
captions = page.query_selector_all('h1, h2, span')
|
captions = page.query_selector_all('h1, h2, span')
|
||||||
for caption in captions:
|
for caption in captions:
|
||||||
text = caption.inner_text()
|
text = caption.inner_text().strip()
|
||||||
if len(text) > 20 and len(text) < 500:
|
if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
|
||||||
if not data["description"]:
|
if not data["description"]:
|
||||||
data["description"] = text
|
data["description"] = text
|
||||||
break
|
break
|
||||||
@@ -120,9 +130,32 @@ class InstagramExtractor:
|
|||||||
|
|
||||||
# Filter for meaningful content
|
# Filter for meaningful content
|
||||||
lines = body_text.split('\n')
|
lines = body_text.split('\n')
|
||||||
|
cleaned_lines = []
|
||||||
|
buffer = []
|
||||||
|
|
||||||
|
def flush_buffer():
    """Empty the pending run of buffered lines into ``cleaned_lines``.

    The buffered lines are joined with newlines and checked with
    ``_looks_like_language_list``; they are appended to ``cleaned_lines``
    only when the check fails. The buffer is cleared either way. Does
    nothing when the buffer is empty.
    """
    if not buffer:
        return
    pending_text = "\n".join(buffer)
    is_language_list = _looks_like_language_list(pending_text)
    if not is_language_list:
        cleaned_lines.extend(buffer)
    # Clear in place so the enclosing scope keeps using the same list object.
    buffer.clear()
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
if not stripped:
|
||||||
|
flush_buffer()
|
||||||
|
continue
|
||||||
|
if len(stripped) <= 24:
|
||||||
|
buffer.append(stripped)
|
||||||
|
else:
|
||||||
|
flush_buffer()
|
||||||
|
cleaned_lines.append(stripped)
|
||||||
|
|
||||||
|
flush_buffer()
|
||||||
|
|
||||||
meaningful_lines = [
|
meaningful_lines = [
|
||||||
line.strip() for line in lines
|
line for line in cleaned_lines
|
||||||
if len(line.strip()) > 30 and len(line.strip()) < 300
|
if len(line) > 30 and len(line) < 300
|
||||||
]
|
]
|
||||||
|
|
||||||
data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]
|
data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]
|
||||||
|
|||||||
Reference in New Issue
Block a user