Improve extraction

This commit is contained in:
Jan Bader
2026-04-04 21:16:28 +02:00
parent 40104dc0f9
commit 66e1c9e0e0

View File

@@ -11,6 +11,8 @@ Extracts:
Note: Instagram requires browser automation. Uses Playwright. Note: Instagram requires browser automation. Uses Playwright.
""" """
import html
import json
import re import re
import time import time
from typing import Dict, Any from typing import Dict, Any
@@ -122,7 +124,138 @@ class InstagramExtractor:
] ]
return any(blocker in lowered for blocker in blockers) return any(blocker in lowered for blocker in blockers)
# Try to get caption/description from meta tags and embedded JSON-LD first.
# FIX: do not reference the bare name `html` here — elsewhere in this
# function a local variable named `html` is assigned, which makes `html`
# a local name for the whole function scope and turns `html.unescape(...)`
# at this point into an UnboundLocalError. Bind the function locally instead.
try:
    from html import unescape as _unescape

    # og:description / description meta tags usually carry the caption.
    meta_desc = page.query_selector('meta[property="og:description"], meta[name="description"]')
    if meta_desc:
        text = (meta_desc.get_attribute("content") or "").strip()
        if text and not _looks_like_ui_prompt(text) and not _looks_like_language_list(text):
            data["description"] = text
    # Only replace the placeholder title, never one already extracted.
    meta_title = page.query_selector('meta[property="og:title"], meta[name="twitter:title"]')
    if meta_title and data["title"] == "Instagram Reel":
        title_text = (meta_title.get_attribute("content") or "").strip()
        if title_text:
            data["title"] = title_text
    if not data["description"]:
        # Fallback: regex over the raw HTML in case the selector engine
        # missed meta tags (e.g. injected outside <head>).
        html_source = page.content()
        patterns = [
            r'<meta[^>]+property="og:description"[^>]+content="([^"]+)"',
            r'<meta[^>]+name="description"[^>]+content="([^"]+)"',
            r'<meta[^>]+name="twitter:description"[^>]+content="([^"]+)"',
        ]
        for pattern in patterns:
            match = re.search(pattern, html_source, re.IGNORECASE)
            if match:
                text = _unescape(match.group(1)).strip()
                if text and not _looks_like_ui_prompt(text) and not _looks_like_language_list(text):
                    data["description"] = text
                    break

    def extract_from_obj(obj: Dict[str, Any]) -> None:
        # Pull description/author out of one JSON-LD object; first value wins,
        # existing data is never overwritten.
        if not isinstance(obj, dict):
            return
        desc = obj.get("description")
        if desc and not data["description"]:
            if not _looks_like_ui_prompt(desc) and not _looks_like_language_list(desc):
                data["description"] = desc.strip()
        author = obj.get("author")
        if author and data["author"] == "Unknown":
            # JSON-LD "author" may be an object, a list of objects, or a string.
            if isinstance(author, dict):
                name = author.get("name")
                if name:
                    data["author"] = name.strip()
            elif isinstance(author, list):
                for item in author:
                    if isinstance(item, dict) and item.get("name"):
                        data["author"] = item["name"].strip()
                        break
            elif isinstance(author, str):
                data["author"] = author.strip()

    # JSON-LD script blobs often carry both caption and author.
    # (extract_from_obj is defined once, above, instead of being
    # re-created on every loop iteration as before.)
    scripts = page.query_selector_all('script[type="application/ld+json"]')
    for script in scripts:
        raw = script.inner_text().strip()
        if not raw:
            continue
        try:
            payload = json.loads(raw)
        except Exception:
            continue
        if isinstance(payload, list):
            for obj in payload:
                extract_from_obj(obj)
        else:
            extract_from_obj(payload)
        if data["description"] and data["author"] != "Unknown":
            break
except Exception as e:
    print(f"⚠️ Could not extract meta/ld+json: {e}")
# Try to get caption/description from embedded shared data.
# FIX: the page source local used to be named `html`, shadowing the
# module-level `import html` for the entire enclosing function and making
# the earlier `html.unescape(...)` call raise UnboundLocalError.
# Renamed to `page_html`.
try:
    page_html = page.content()
    payloads = []
    # Legacy window._sharedData blob plus any __additionalDataLoaded payloads.
    shared_match = re.search(r'window\._sharedData\s*=\s*({.*?});</script>', page_html, re.DOTALL)
    if shared_match:
        payloads.append(shared_match.group(1))
    for match in re.finditer(r'__additionalDataLoaded\([^,]+,\s*({.*?})\);', page_html, re.DOTALL):
        payloads.append(match.group(1))

    def extract_from_media(media: Dict[str, Any]) -> None:
        # Copy author/caption out of one GraphQL media node; never
        # overwrite values that were already extracted.
        if not isinstance(media, dict):
            return
        if data["author"] == "Unknown":
            owner = media.get("owner") or {}
            if isinstance(owner, dict):
                name = owner.get("username") or owner.get("full_name")
                if name:
                    data["author"] = name.strip()
        caption_text = None
        # Preferred source: edge_media_to_caption.edges[0].node.text
        edge = media.get("edge_media_to_caption")
        if isinstance(edge, dict):
            edges = edge.get("edges") or []
            if edges:
                node = edges[0].get("node", {})
                if isinstance(node, dict):
                    caption_text = node.get("text")
        # Fallback source: media["caption"]["text"]
        if not caption_text and isinstance(media.get("caption"), dict):
            caption_text = media["caption"].get("text")
        if caption_text and not data["description"]:
            if not _looks_like_ui_prompt(caption_text) and not _looks_like_language_list(caption_text):
                data["description"] = caption_text.strip()

    def walk(obj: Any) -> None:
        # Depth-first search of the payload for shortcode_media nodes,
        # both under a "graphql" wrapper and inlined.
        if isinstance(obj, dict):
            graphql = obj.get("graphql")
            if isinstance(graphql, dict):
                extract_from_media(graphql.get("shortcode_media") or graphql.get("media"))
            if isinstance(obj.get("shortcode_media"), dict):
                extract_from_media(obj.get("shortcode_media"))
            for v in obj.values():
                walk(v)
        elif isinstance(obj, list):
            for item in obj:
                walk(item)

    for raw in payloads:
        try:
            parsed = json.loads(raw)
        except Exception:
            # Regex-captured blob may be truncated / not valid JSON; skip it.
            continue
        walk(parsed)
        if data["description"] and data["author"] != "Unknown":
            break
except Exception as e:
    print(f"⚠️ Could not extract shared data: {e}")
# Try to get caption/description from visible text
try: try:
# Look for caption text # Look for caption text
captions = page.query_selector_all('h1, h2, span') captions = page.query_selector_all('h1, h2, span')
@@ -158,44 +291,47 @@ class InstagramExtractor:
# Extract any visible text as content.
# (Reconstructed from the fused side-by-side diff lines; new-side code.)
try:
    if data["description"] and not _looks_like_ui_prompt(data["description"]):
        # A usable caption was already found; reuse it as the content.
        data["content"] = data["description"].strip()
    else:
        # Get all text content from the page body.
        body_text = page.inner_text('body')
        # Filter for meaningful content.
        lines = body_text.split('\n')
        cleaned_lines = []
        buffer = []

        def flush_buffer():
            # Emit buffered short lines unless together they look like a
            # language-picker list; drop obvious UI prompts.
            if buffer:
                block = "\n".join(buffer)
                if not _looks_like_language_list(block):
                    cleaned_lines.extend(
                        [line for line in buffer if not _looks_like_ui_prompt(line)]
                    )
            buffer.clear()

        for line in lines:
            stripped = line.strip()
            if not stripped:
                flush_buffer()
                continue
            if _looks_like_ui_prompt(stripped):
                continue
            if len(stripped) <= 24:
                # Short lines are buffered so runs of them can be vetted
                # as a group before being accepted.
                buffer.append(stripped)
            else:
                flush_buffer()
                cleaned_lines.append(stripped)
        flush_buffer()
        meaningful_lines = [
            line for line in cleaned_lines
            if len(line) > 30 and len(line) < 300
        ]
        # Cap at 10 paragraphs / 5000 chars.
        data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]
except Exception as e:
    print(f"⚠️ Could not extract page text: {e}")