Improve extraction

This commit is contained in:
Jan Bader
2026-04-04 21:16:28 +02:00
parent 40104dc0f9
commit 66e1c9e0e0

View File

@@ -11,6 +11,8 @@ Extracts:
Note: Instagram requires browser automation. Uses Playwright. Note: Instagram requires browser automation. Uses Playwright.
""" """
import html
import json
import re import re
import time import time
from typing import Dict, Any from typing import Dict, Any
@@ -122,7 +124,138 @@ class InstagramExtractor:
] ]
return any(blocker in lowered for blocker in blockers) return any(blocker in lowered for blocker in blockers)
# Try to get caption/description from meta tags and embedded JSON-LD first.
# FIX: do not reference the bare name `html` here — elsewhere in this
# function a local variable named `html` is assigned, which makes `html`
# a local name for the whole function scope and turns `html.unescape(...)`
# at this point into an UnboundLocalError. Bind the function locally instead.
try:
    from html import unescape as _unescape

    # og:description / description meta tags usually carry the caption.
    meta_desc = page.query_selector('meta[property="og:description"], meta[name="description"]')
    if meta_desc:
        text = (meta_desc.get_attribute("content") or "").strip()
        if text and not _looks_like_ui_prompt(text) and not _looks_like_language_list(text):
            data["description"] = text
    # Only replace the placeholder title, never one already extracted.
    meta_title = page.query_selector('meta[property="og:title"], meta[name="twitter:title"]')
    if meta_title and data["title"] == "Instagram Reel":
        title_text = (meta_title.get_attribute("content") or "").strip()
        if title_text:
            data["title"] = title_text
    if not data["description"]:
        # Fallback: regex over the raw HTML in case the selector engine
        # missed meta tags (e.g. injected outside <head>).
        html_source = page.content()
        patterns = [
            r'<meta[^>]+property="og:description"[^>]+content="([^"]+)"',
            r'<meta[^>]+name="description"[^>]+content="([^"]+)"',
            r'<meta[^>]+name="twitter:description"[^>]+content="([^"]+)"',
        ]
        for pattern in patterns:
            match = re.search(pattern, html_source, re.IGNORECASE)
            if match:
                text = _unescape(match.group(1)).strip()
                if text and not _looks_like_ui_prompt(text) and not _looks_like_language_list(text):
                    data["description"] = text
                    break

    def extract_from_obj(obj: Dict[str, Any]) -> None:
        # Pull description/author out of one JSON-LD object; first value wins,
        # existing data is never overwritten.
        if not isinstance(obj, dict):
            return
        desc = obj.get("description")
        if desc and not data["description"]:
            if not _looks_like_ui_prompt(desc) and not _looks_like_language_list(desc):
                data["description"] = desc.strip()
        author = obj.get("author")
        if author and data["author"] == "Unknown":
            # JSON-LD "author" may be an object, a list of objects, or a string.
            if isinstance(author, dict):
                name = author.get("name")
                if name:
                    data["author"] = name.strip()
            elif isinstance(author, list):
                for item in author:
                    if isinstance(item, dict) and item.get("name"):
                        data["author"] = item["name"].strip()
                        break
            elif isinstance(author, str):
                data["author"] = author.strip()

    # JSON-LD script blobs often carry both caption and author.
    # (extract_from_obj is defined once, above, instead of being
    # re-created on every loop iteration as before.)
    scripts = page.query_selector_all('script[type="application/ld+json"]')
    for script in scripts:
        raw = script.inner_text().strip()
        if not raw:
            continue
        try:
            payload = json.loads(raw)
        except Exception:
            continue
        if isinstance(payload, list):
            for obj in payload:
                extract_from_obj(obj)
        else:
            extract_from_obj(payload)
        if data["description"] and data["author"] != "Unknown":
            break
except Exception as e:
    print(f"⚠️ Could not extract meta/ld+json: {e}")
# Try to get caption/description from embedded shared data.
# FIX: the page source local used to be named `html`, shadowing the
# module-level `import html` for the entire enclosing function and making
# the earlier `html.unescape(...)` call raise UnboundLocalError.
# Renamed to `page_html`.
try:
    page_html = page.content()
    payloads = []
    # Legacy window._sharedData blob plus any __additionalDataLoaded payloads.
    shared_match = re.search(r'window\._sharedData\s*=\s*({.*?});</script>', page_html, re.DOTALL)
    if shared_match:
        payloads.append(shared_match.group(1))
    for match in re.finditer(r'__additionalDataLoaded\([^,]+,\s*({.*?})\);', page_html, re.DOTALL):
        payloads.append(match.group(1))

    def extract_from_media(media: Dict[str, Any]) -> None:
        # Copy author/caption out of one GraphQL media node; never
        # overwrite values that were already extracted.
        if not isinstance(media, dict):
            return
        if data["author"] == "Unknown":
            owner = media.get("owner") or {}
            if isinstance(owner, dict):
                name = owner.get("username") or owner.get("full_name")
                if name:
                    data["author"] = name.strip()
        caption_text = None
        # Preferred source: edge_media_to_caption.edges[0].node.text
        edge = media.get("edge_media_to_caption")
        if isinstance(edge, dict):
            edges = edge.get("edges") or []
            if edges:
                node = edges[0].get("node", {})
                if isinstance(node, dict):
                    caption_text = node.get("text")
        # Fallback source: media["caption"]["text"]
        if not caption_text and isinstance(media.get("caption"), dict):
            caption_text = media["caption"].get("text")
        if caption_text and not data["description"]:
            if not _looks_like_ui_prompt(caption_text) and not _looks_like_language_list(caption_text):
                data["description"] = caption_text.strip()

    def walk(obj: Any) -> None:
        # Depth-first search of the payload for shortcode_media nodes,
        # both under a "graphql" wrapper and inlined.
        if isinstance(obj, dict):
            graphql = obj.get("graphql")
            if isinstance(graphql, dict):
                extract_from_media(graphql.get("shortcode_media") or graphql.get("media"))
            if isinstance(obj.get("shortcode_media"), dict):
                extract_from_media(obj.get("shortcode_media"))
            for v in obj.values():
                walk(v)
        elif isinstance(obj, list):
            for item in obj:
                walk(item)

    for raw in payloads:
        try:
            parsed = json.loads(raw)
        except Exception:
            # Regex-captured blob may be truncated / not valid JSON; skip it.
            continue
        walk(parsed)
        if data["description"] and data["author"] != "Unknown":
            break
except Exception as e:
    print(f"⚠️ Could not extract shared data: {e}")
# Try to get caption/description from visible text
try: try:
# Look for caption text # Look for caption text
captions = page.query_selector_all('h1, h2, span') captions = page.query_selector_all('h1, h2, span')
@@ -158,44 +291,47 @@ class InstagramExtractor:
# Extract any visible text as content.
# (Reconstructed from the fused side-by-side diff lines; new-side code.)
try:
    if data["description"] and not _looks_like_ui_prompt(data["description"]):
        # A usable caption was already found; reuse it as the content.
        data["content"] = data["description"].strip()
    else:
        # Get all text content from the page body.
        body_text = page.inner_text('body')
        # Filter for meaningful content.
        lines = body_text.split('\n')
        cleaned_lines = []
        buffer = []

        def flush_buffer():
            # Emit buffered short lines unless together they look like a
            # language-picker list; drop obvious UI prompts.
            if buffer:
                block = "\n".join(buffer)
                if not _looks_like_language_list(block):
                    cleaned_lines.extend(
                        [line for line in buffer if not _looks_like_ui_prompt(line)]
                    )
            buffer.clear()

        for line in lines:
            stripped = line.strip()
            if not stripped:
                flush_buffer()
                continue
            if _looks_like_ui_prompt(stripped):
                continue
            if len(stripped) <= 24:
                # Short lines are buffered so runs of them can be vetted
                # as a group before being accepted.
                buffer.append(stripped)
            else:
                flush_buffer()
                cleaned_lines.append(stripped)
        flush_buffer()
        meaningful_lines = [
            line for line in cleaned_lines
            if len(line) > 30 and len(line) < 300
        ]
        # Cap at 10 paragraphs / 5000 chars.
        data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]
except Exception as e:
    print(f"⚠️ Could not extract page text: {e}")