From 0163767dd1e8f6b1d687904210d1decb13259d6c Mon Sep 17 00:00:00 2001
From: Jan Bader
Date: Sat, 4 Apr 2026 20:50:59 +0200
Subject: [PATCH] Add AI summarization

---
Review notes (not part of the commit message):
 * Line structure was restored from a whitespace-collapsed copy, so the
   indentation of context lines and the width of the removed
   whitespace-only lines ("-    ") are inferred; verify against the
   target tree before applying.
 * Mis-encoded emoji in the print() strings were restored to UTF-8.
 * summarizer.py was hardened in review: the model reply is parsed from its
   outermost JSON object (tolerates code fences and preambles), wrongly
   typed fields are skipped, key points are capped at max_points, and the
   exception handlers are narrowed. The blob id on its "index" line
   predates this change.
 * NOTE(review): README pairs the OpenRouter URL with
   OPENAI_MODEL=gpt-4o-mini; confirm OpenRouter accepts that model id.

 README.md     |   6 +++
 config.py     |  20 ++++++---
 main.py       |  57 ++++++++++++++++---------
 summarizer.py | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 172 insertions(+), 27 deletions(-)
 create mode 100644 summarizer.py

diff --git a/README.md b/README.md
index ddcb9e2..46d82ee 100644
--- a/README.md
+++ b/README.md
@@ -92,6 +92,12 @@ BROWSER_TIMEOUT=30000
 MAX_CONTENT_LENGTH=10000
 GENERATE_SUMMARY=true
 
+# OpenAI/OpenRouter
+OPENAI_API_KEY=your_key_here
+OPENAI_URL=https://openrouter.ai/api/v1/chat/completions
+OPENAI_MODEL=gpt-4o-mini
+OPENAI_TIMEOUT=30
+
 # YouTube
 YOUTUBE_LANGUAGE=en
 
diff --git a/config.py b/config.py
index 614672c..727c756 100644
--- a/config.py
+++ b/config.py
@@ -12,31 +12,37 @@ load_dotenv()
 
 class Config:
     """Configuration settings for content extractor."""
-    
+
     # Obsidian vault path (default to common locations)
     OBSIDIAN_VAULT_PATH = os.getenv(
         "OBSIDIAN_VAULT_PATH",
         os.path.expanduser("~/Obsidian Vault")  # Default location
     )
-    
+
     # Browser settings (for Instagram and dynamic content)
     BROWSER_HEADLESS = os.getenv("BROWSER_HEADLESS", "true").lower() == "true"
     BROWSER_TIMEOUT = int(os.getenv("BROWSER_TIMEOUT", "30000"))  # 30 seconds
-    
+
     # Content extraction settings
     MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", "10000"))  # Max chars
     GENERATE_SUMMARY = os.getenv("GENERATE_SUMMARY", "true").lower() == "true"
-    
+
+    # OpenAI/OpenRouter settings
+    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
+    OPENAI_URL = os.getenv("OPENAI_URL", "https://api.openai.com/v1/chat/completions")
+    OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
+    OPENAI_TIMEOUT = int(os.getenv("OPENAI_TIMEOUT", "30"))
+
     # YouTube settings
     YOUTUBE_LANGUAGE = os.getenv("YOUTUBE_LANGUAGE", "en")
-    
+
     # Instagram settings (requires browser automation)
     INSTAGRAM_WAIT_TIME = int(os.getenv("INSTAGRAM_WAIT_TIME", "5"))  # seconds
-    
+
     # Logging
     LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
     LOG_FILE = os.getenv("LOG_FILE", "content_extractor.log")
-    
+
     @classmethod
     def validate(cls):
         """Validate configuration."""
diff --git a/main.py b/main.py
index 986fe17..2bad7e9 100644
--- a/main.py
+++ b/main.py
@@ -23,6 +23,7 @@ from extractors.blog_extractor import BlogExtractor
 from extractors.instagram_extractor import InstagramExtractor
 from obsidian_writer import ObsidianWriter
 from config import Config
+from summarizer import summarize_text, SummarizationError
 
 
 def detect_source_type(url: str) -> str:
@@ -40,14 +41,14 @@ def detect_source_type(url: str) -> str:
 def extract_content(url: str, source_type: str) -> dict:
     """Extract content from URL based on source type."""
     print(f"🔍 Extracting content from {source_type}...")
-    
+
     if source_type == "youtube":
         extractor = YouTubeExtractor(url)
     elif source_type == "instagram":
         extractor = InstagramExtractor(url)
     else:
         extractor = BlogExtractor(url)
-    
+
     return extractor.extract()
 
 
@@ -84,24 +85,40 @@ def main():
         action="store_true",
         help="Generate a summary of the content"
     )
-    
+
     args = parser.parse_args()
-    
+
     # Detect source type
     source_type = detect_source_type(args.url)
     print(f"📌 Detected source type: {source_type}")
-    
+
     # Extract content
     try:
         content = extract_content(args.url, source_type)
     except Exception as e:
         print(f"❌ Extraction failed: {e}")
         sys.exit(1)
-    
+
     if not content:
         print("❌ No content could be extracted")
         sys.exit(1)
-    
+
+    # Generate AI summary + key points
+    if args.summarize or Config.GENERATE_SUMMARY:
+        source_text = "\n\n".join(
+            part for part in [content.get("description", ""), content.get("content", "")]
+            if part
+        ).strip()
+        if source_text:
+            try:
+                summary_result = summarize_text(source_text, max_points=3)
+                if summary_result.get("summary"):
+                    content["description"] = summary_result["summary"]
+                if summary_result.get("key_points"):
+                    content["key_points"] = summary_result["key_points"]
+            except SummarizationError as e:
+                print(f"⚠️ Summarization failed: {e}")
+
     # Generate output filename
     if args.output:
         filename = args.output
@@ -111,17 +128,17 @@ def main():
         filename = f"{title[:50]}_{datetime.now().strftime('%Y%m%d')}"
     # Sanitize filename
     filename = "".join(c for c in filename if c.isalnum() or c in " -_").strip()
-    
+
     # Create markdown content
     markdown = generate_markdown(content, source_type, args.url)
-    
+
     # Print preview
     print("\n" + "="*80)
     print("📝 EXTRACTED CONTENT PREVIEW")
     print("="*80)
     print(markdown[:2000] + "..." if len(markdown) > 2000 else markdown)
     print("="*80)
-    
+
     # Save to Obsidian
     if not args.no_save:
         writer = ObsidianWriter(args.obsidian_path)
@@ -129,25 +146,25 @@ def main():
         print(f"\n✅ Note saved to: {output_path}")
     else:
         print("\n⚠️ Note not saved (--no-save flag)")
-    
+
     return content
 
 
 def generate_markdown(content: dict, source_type: str, url: str) -> str:
     """Generate markdown content for Obsidian note."""
     lines = []
-    
+
     # Header
     lines.append(f"# {content.get('title', 'Untitled')}")
     lines.append("")
-    
+
     # Metadata
     lines.append("## Metadata")
     lines.append("")
     lines.append(f"- **Source**: {source_type.capitalize()}")
     lines.append(f"- **URL**: {url}")
     lines.append(f"- **Extracted**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    
+
     if content.get("author"):
         lines.append(f"- **Author**: {content.get('author')}")
     if content.get("duration"):
@@ -156,23 +173,23 @@ def generate_markdown(content: dict, source_type: str, url: str) -> str:
         lines.append(f"- **Published**: {content.get('publish_date')}")
     if content.get("views"):
         lines.append(f"- **Views**: {content.get('views')}")
-    
+
     lines.append("")
-    
+
     # Description/Summary
     if content.get("description"):
         lines.append("## Description")
         lines.append("")
         lines.append(content.get("description", ""))
         lines.append("")
-    
+
     # Main Content (transcript, article text, etc.)
     if content.get("content"):
         lines.append("## Content")
         lines.append("")
         lines.append(content.get("content", ""))
         lines.append("")
-    
+
     # Key Points/Summary
     if content.get("key_points"):
         lines.append("## Key Points")
@@ -180,7 +197,7 @@ def generate_markdown(content: dict, source_type: str, url: str) -> str:
         for point in content.get("key_points", []):
             lines.append(f"- {point}")
         lines.append("")
-    
+
     # Tags
     lines.append("---")
     lines.append("")
@@ -191,7 +208,7 @@ def generate_markdown(content: dict, source_type: str, url: str) -> str:
         tags = ["content-extractor", source_type, "notes"]
     lines.append(" ".join(f"#{tag}" for tag in tags))
     lines.append("")
-    
+
     return "\n".join(lines)
 
 
diff --git a/summarizer.py b/summarizer.py
new file mode 100644
index 0000000..2550f97
--- /dev/null
+++ b/summarizer.py
@@ -0,0 +1,116 @@
+"""
+OpenAI/OpenRouter summarizer utility.
+
+Uses OPENAI_API_KEY and OPENAI_URL from environment (via Config).
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Dict, List
+
+import requests
+
+from config import Config
+
+
+class SummarizationError(RuntimeError):
+    """Raised when summarization fails."""
+
+
+def _extract_json_object(raw: str) -> dict:
+    """
+    Parse the JSON object embedded in a model reply.
+
+    Models often wrap the requested JSON in a Markdown code fence or add a
+    short preamble, so only the outermost ``{...}`` span is decoded.
+
+    Raises:
+        ValueError: if no decodable JSON object is present.
+    """
+    start, end = raw.find("{"), raw.rfind("}")
+    if start == -1 or end < start:
+        raise ValueError("no JSON object in model output")
+    return json.loads(raw[start:end + 1])
+
+
+def summarize_text(text: str, max_points: int = 3) -> Dict[str, List[str] | str]:
+    """
+    Summarize text into a short summary and key points.
+
+    Args:
+        text: Content to summarize; blank input short-circuits without a request.
+        max_points: Number of key points requested; also caps the returned list.
+
+    Returns:
+        {
+            "summary": "string",
+            "key_points": ["point 1", "point 2", ...]
+        }
+
+    Raises:
+        SummarizationError: if the API key is missing, the request fails, or
+            the reply does not contain the expected JSON object.
+    """
+    if not text or not text.strip():
+        return {"summary": "", "key_points": []}
+
+    if not Config.OPENAI_API_KEY:
+        raise SummarizationError("OPENAI_API_KEY is not set")
+
+    payload = {
+        "model": Config.OPENAI_MODEL,
+        "messages": [
+            {
+                "role": "system",
+                "content": (
+                    "You are a precise summarizer. Return JSON only with keys "
+                    "`summary` and `key_points` (array of strings). Do not add extra keys."
+                ),
+            },
+            {
+                "role": "user",
+                "content": (
+                    "Summarize the following content in 2-4 sentences and provide "
+                    f"{max_points} key points.\n\n"
+                    f"CONTENT:\n{text}"
+                ),
+            },
+        ],
+        "temperature": 0.2,
+        "max_tokens": 400,
+    }
+
+    headers = {
+        "Authorization": f"Bearer {Config.OPENAI_API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+    try:
+        response = requests.post(
+            Config.OPENAI_URL,
+            headers=headers,
+            json=payload,
+            timeout=Config.OPENAI_TIMEOUT,
+        )
+        response.raise_for_status()
+        data = response.json()
+    except (requests.RequestException, ValueError) as exc:
+        # response.json() signals a non-JSON body with a ValueError subclass.
+        raise SummarizationError(f"Request failed: {exc}") from exc
+
+    try:
+        result = _extract_json_object(data["choices"][0]["message"]["content"])
+    except (AttributeError, IndexError, KeyError, TypeError, ValueError) as exc:
+        raise SummarizationError(f"Invalid response format: {exc}") from exc
+
+    # Tolerate wrongly typed fields instead of failing the whole summary.
+    summary = result.get("summary")
+    points = result.get("key_points")
+    if not isinstance(points, list):
+        points = []
+    cleaned = [p.strip() for p in points if isinstance(p, str) and p.strip()]
+    return {
+        "summary": summary.strip() if isinstance(summary, str) else "",
+        "key_points": cleaned[:max(max_points, 0)],
+    }