From 0163767dd1e8f6b1d687904210d1decb13259d6c Mon Sep 17 00:00:00 2001
From: Jan Bader <c.github@jan.javil.eu>
Date: Sat, 4 Apr 2026 20:50:59 +0200
Subject: [PATCH] Add AI summarization

---
 README.md     |  6 ++++
 config.py     | 20 +++++++-----
 main.py       | 57 ++++++++++++++++++++++------------
 summarizer.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 140 insertions(+), 27 deletions(-)
 create mode 100644 summarizer.py

diff --git a/README.md b/README.md
index ddcb9e2..46d82ee 100644
--- a/README.md
+++ b/README.md
@@ -92,6 +92,12 @@ BROWSER_TIMEOUT=30000
 MAX_CONTENT_LENGTH=10000
 GENERATE_SUMMARY=true
 
+# OpenAI/OpenRouter
+OPENAI_API_KEY=your_key_here
+OPENAI_URL=https://openrouter.ai/api/v1/chat/completions
+OPENAI_MODEL=openai/gpt-4o-mini
+OPENAI_TIMEOUT=30
+
 # YouTube
 YOUTUBE_LANGUAGE=en
 
diff --git a/config.py b/config.py
index 614672c..727c756 100644
--- a/config.py
+++ b/config.py
@@ -12,31 +12,37 @@ load_dotenv()
 
 class Config:
     """Configuration settings for content extractor."""
-    
+
     # Obsidian vault path (default to common locations)
     OBSIDIAN_VAULT_PATH = os.getenv(
         "OBSIDIAN_VAULT_PATH",
         os.path.expanduser("~/Obsidian Vault")  # Default location
     )
-    
+
     # Browser settings (for Instagram and dynamic content)
     BROWSER_HEADLESS = os.getenv("BROWSER_HEADLESS", "true").lower() == "true"
     BROWSER_TIMEOUT = int(os.getenv("BROWSER_TIMEOUT", "30000"))  # 30 seconds
-    
+
     # Content extraction settings
     MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", "10000"))  # Max chars
     GENERATE_SUMMARY = os.getenv("GENERATE_SUMMARY", "true").lower() == "true"
-    
+
+    # OpenAI/OpenRouter settings
+    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+    OPENAI_URL = os.getenv("OPENAI_URL", "https://api.openai.com/v1/chat/completions")
+    OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
+    OPENAI_TIMEOUT = int(os.getenv("OPENAI_TIMEOUT", "30"))
+
     # YouTube settings
     YOUTUBE_LANGUAGE = os.getenv("YOUTUBE_LANGUAGE", "en")
-    
+
     # Instagram settings (requires browser automation)
     INSTAGRAM_WAIT_TIME = int(os.getenv("INSTAGRAM_WAIT_TIME", "5"))  # seconds
-    
+
     # Logging
     LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
     LOG_FILE = os.getenv("LOG_FILE", "content_extractor.log")
-    
+
     @classmethod
     def validate(cls):
         """Validate configuration."""
diff --git a/main.py b/main.py
index 986fe17..2bad7e9 100644
--- a/main.py
+++ b/main.py
@@ -23,6 +23,7 @@ from extractors.blog_extractor import BlogExtractor
 from extractors.instagram_extractor import InstagramExtractor
 from obsidian_writer import ObsidianWriter
 from config import Config
+from summarizer import summarize_text, SummarizationError
 
 
 def detect_source_type(url: str) -> str:
@@ -40,14 +41,14 @@ def detect_source_type(url: str) -> str:
 def extract_content(url: str, source_type: str) -> dict:
     """Extract content from URL based on source type."""
     print(f"🔍 Extracting content from {source_type}...")
-    
+
     if source_type == "youtube":
         extractor = YouTubeExtractor(url)
     elif source_type == "instagram":
         extractor = InstagramExtractor(url)
     else:
         extractor = BlogExtractor(url)
-    
+
     return extractor.extract()
 
 
@@ -84,24 +85,40 @@ def main():
         action="store_true",
         help="Generate a summary of the content"
     )
-    
+
     args = parser.parse_args()
-    
+
     # Detect source type
     source_type = detect_source_type(args.url)
     print(f"📌 Detected source type: {source_type}")
-    
+
     # Extract content
     try:
         content = extract_content(args.url, source_type)
     except Exception as e:
         print(f"❌ Extraction failed: {e}")
         sys.exit(1)
-    
+
     if not content:
         print("❌ No content could be extracted")
         sys.exit(1)
-    
+
+    # Generate AI summary + key points
+    if args.summarize or Config.GENERATE_SUMMARY:
+        source_text = "\n\n".join(
+            part for part in [content.get("description", ""), content.get("content", "")]
+            if part
+        ).strip()
+        if source_text:
+            try:
+                summary_result = summarize_text(source_text, max_points=3)
+                if summary_result.get("summary"):
+                    content["description"] = summary_result["summary"]
+                if summary_result.get("key_points"):
+                    content["key_points"] = summary_result["key_points"]
+            except SummarizationError as e:
+                print(f"⚠️  Summarization failed: {e}")
+
     # Generate output filename
     if args.output:
         filename = args.output
@@ -111,17 +128,17 @@ def main():
         filename = f"{title[:50]}_{datetime.now().strftime('%Y%m%d')}"
         # Sanitize filename
         filename = "".join(c for c in filename if c.isalnum() or c in " -_").strip()
-    
+
     # Create markdown content
     markdown = generate_markdown(content, source_type, args.url)
-    
+
     # Print preview
     print("\n" + "="*80)
     print("📝 EXTRACTED CONTENT PREVIEW")
     print("="*80)
     print(markdown[:2000] + "..." if len(markdown) > 2000 else markdown)
     print("="*80)
-    
+
     # Save to Obsidian
     if not args.no_save:
         writer = ObsidianWriter(args.obsidian_path)
@@ -129,25 +146,25 @@ def main():
         print(f"\n✅ Note saved to: {output_path}")
     else:
         print("\n⚠️  Note not saved (--no-save flag)")
-    
+
     return content
 
 
 def generate_markdown(content: dict, source_type: str, url: str) -> str:
     """Generate markdown content for Obsidian note."""
     lines = []
-    
+
     # Header
     lines.append(f"# {content.get('title', 'Untitled')}")
     lines.append("")
-    
+
     # Metadata
     lines.append("## Metadata")
     lines.append("")
     lines.append(f"- **Source**: {source_type.capitalize()}")
     lines.append(f"- **URL**: {url}")
     lines.append(f"- **Extracted**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    
+
     if content.get("author"):
         lines.append(f"- **Author**: {content.get('author')}")
     if content.get("duration"):
@@ -156,23 +173,23 @@ def generate_markdown(content: dict, source_type: str, url: str) -> str:
         lines.append(f"- **Published**: {content.get('publish_date')}")
     if content.get("views"):
         lines.append(f"- **Views**: {content.get('views')}")
-    
+
     lines.append("")
-    
+
     # Description/Summary
     if content.get("description"):
         lines.append("## Description")
         lines.append("")
         lines.append(content.get("description", ""))
         lines.append("")
-    
+
     # Main Content (transcript, article text, etc.)
     if content.get("content"):
         lines.append("## Content")
         lines.append("")
         lines.append(content.get("content", ""))
         lines.append("")
-    
+
     # Key Points/Summary
     if content.get("key_points"):
         lines.append("## Key Points")
@@ -180,7 +197,7 @@ def generate_markdown(content: dict, source_type: str, url: str) -> str:
         for point in content.get("key_points", []):
             lines.append(f"- {point}")
         lines.append("")
-    
+
     # Tags
     lines.append("---")
     lines.append("")
@@ -191,7 +208,7 @@ def generate_markdown(content: dict, source_type: str, url: str) -> str:
         tags = ["content-extractor", source_type, "notes"]
     lines.append(" ".join(f"#{tag}" for tag in tags))
     lines.append("")
-    
+
     return "\n".join(lines)
 
 
diff --git a/summarizer.py b/summarizer.py
new file mode 100644
index 0000000..2550f97
--- /dev/null
+++ b/summarizer.py
@@ -0,0 +1,84 @@
+"""
+OpenAI/OpenRouter summarizer utility.
+
+Uses OPENAI_API_KEY, OPENAI_URL, OPENAI_MODEL and OPENAI_TIMEOUT from environment (via Config).
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Dict, List
+
+import requests
+
+from config import Config
+
+
+class SummarizationError(RuntimeError):
+    """Raised when the API key is missing, the request fails, or the reply cannot be parsed."""
+
+
+def summarize_text(text: str, max_points: int = 3) -> Dict[str, List[str] | str]:
+    """
+    Summarize text via the configured chat-completions endpoint.
+
+    Returns ``{"summary": str, "key_points": [str, ...]}`` with at most
+    ``max_points`` key points; blank ``text`` returns empty values, no request.
+    Raises SummarizationError if the API key is unset, the request fails, or
+    the reply holds no usable JSON object.
+    """
+    if not text or not text.strip():
+        return {"summary": "", "key_points": []}
+
+    if not Config.OPENAI_API_KEY:
+        raise SummarizationError("OPENAI_API_KEY is not set")
+
+    payload = {
+        "model": Config.OPENAI_MODEL,
+        "messages": [
+            {
+                "role": "system",
+                "content": (
+                    "You are a precise summarizer. Return JSON only with keys "
+                    "`summary` and `key_points` (array of strings). Do not add extra keys."
+                ),
+            },
+            {
+                "role": "user",
+                "content": (
+                    "Summarize the following content in 2-4 sentences and provide "
+                    f"{max_points} key points.\n\n"
+                    f"CONTENT:\n{text}"
+                ),
+            },
+        ],
+        "temperature": 0.2,
+        "max_tokens": 400,
+    }
+
+    headers = {
+        "Authorization": f"Bearer {Config.OPENAI_API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+    try:
+        response = requests.post(
+            Config.OPENAI_URL, headers=headers, json=payload, timeout=Config.OPENAI_TIMEOUT
+        )
+        response.raise_for_status()
+        data = response.json()
+    except Exception as exc:
+        raise SummarizationError(f"Request failed: {exc}") from exc
+
+    try:
+        content = data["choices"][0]["message"]["content"]
+        # Models often fence the JSON or add prose around it; parse the outermost {...}.
+        result = json.loads(content[content.find("{") : content.rfind("}") + 1])
+        summary = (result.get("summary") or "").strip()
+        points = result.get("key_points") or []
+        if isinstance(points, str):
+            points = [points]  # iterating a bare string would yield single characters
+        key_points = [p.strip() for p in points if isinstance(p, str) and p.strip()]
+        return {"summary": summary, "key_points": key_points[:max(max_points, 0)]}
+    except Exception as exc:
+        raise SummarizationError(f"Invalid response format: {exc}") from exc