Add AI summarization
This commit is contained in:
@@ -92,6 +92,12 @@ BROWSER_TIMEOUT=30000
|
|||||||
MAX_CONTENT_LENGTH=10000
|
MAX_CONTENT_LENGTH=10000
|
||||||
GENERATE_SUMMARY=true
|
GENERATE_SUMMARY=true
|
||||||
|
|
||||||
|
# OpenAI/OpenRouter
|
||||||
|
OPENAI_API_KEY=your_key_here
|
||||||
|
OPENAI_URL=https://openrouter.ai/api/v1/chat/completions
|
||||||
|
OPENAI_MODEL=gpt-4o-mini
|
||||||
|
OPENAI_TIMEOUT=30
|
||||||
|
|
||||||
# YouTube
|
# YouTube
|
||||||
YOUTUBE_LANGUAGE=en
|
YOUTUBE_LANGUAGE=en
|
||||||
|
|
||||||
|
|||||||
20
config.py
20
config.py
@@ -12,31 +12,37 @@ load_dotenv()
|
|||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
"""Configuration settings for content extractor."""
|
"""Configuration settings for content extractor."""
|
||||||
|
|
||||||
# Obsidian vault path (default to common locations)
|
# Obsidian vault path (default to common locations)
|
||||||
OBSIDIAN_VAULT_PATH = os.getenv(
|
OBSIDIAN_VAULT_PATH = os.getenv(
|
||||||
"OBSIDIAN_VAULT_PATH",
|
"OBSIDIAN_VAULT_PATH",
|
||||||
os.path.expanduser("~/Obsidian Vault") # Default location
|
os.path.expanduser("~/Obsidian Vault") # Default location
|
||||||
)
|
)
|
||||||
|
|
||||||
# Browser settings (for Instagram and dynamic content)
|
# Browser settings (for Instagram and dynamic content)
|
||||||
BROWSER_HEADLESS = os.getenv("BROWSER_HEADLESS", "true").lower() == "true"
|
BROWSER_HEADLESS = os.getenv("BROWSER_HEADLESS", "true").lower() == "true"
|
||||||
BROWSER_TIMEOUT = int(os.getenv("BROWSER_TIMEOUT", "30000")) # 30 seconds
|
BROWSER_TIMEOUT = int(os.getenv("BROWSER_TIMEOUT", "30000")) # 30 seconds
|
||||||
|
|
||||||
# Content extraction settings
|
# Content extraction settings
|
||||||
MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", "10000")) # Max chars
|
MAX_CONTENT_LENGTH = int(os.getenv("MAX_CONTENT_LENGTH", "10000")) # Max chars
|
||||||
GENERATE_SUMMARY = os.getenv("GENERATE_SUMMARY", "true").lower() == "true"
|
GENERATE_SUMMARY = os.getenv("GENERATE_SUMMARY", "true").lower() == "true"
|
||||||
|
|
||||||
|
# OpenAI/OpenRouter settings
|
||||||
|
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
||||||
|
OPENAI_URL = os.getenv("OPENAI_URL", "https://api.openai.com/v1/chat/completions")
|
||||||
|
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
|
||||||
|
OPENAI_TIMEOUT = int(os.getenv("OPENAI_TIMEOUT", "30"))
|
||||||
|
|
||||||
# YouTube settings
|
# YouTube settings
|
||||||
YOUTUBE_LANGUAGE = os.getenv("YOUTUBE_LANGUAGE", "en")
|
YOUTUBE_LANGUAGE = os.getenv("YOUTUBE_LANGUAGE", "en")
|
||||||
|
|
||||||
# Instagram settings (requires browser automation)
|
# Instagram settings (requires browser automation)
|
||||||
INSTAGRAM_WAIT_TIME = int(os.getenv("INSTAGRAM_WAIT_TIME", "5")) # seconds
|
INSTAGRAM_WAIT_TIME = int(os.getenv("INSTAGRAM_WAIT_TIME", "5")) # seconds
|
||||||
|
|
||||||
# Logging
|
# Logging
|
||||||
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
|
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
|
||||||
LOG_FILE = os.getenv("LOG_FILE", "content_extractor.log")
|
LOG_FILE = os.getenv("LOG_FILE", "content_extractor.log")
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def validate(cls):
|
def validate(cls):
|
||||||
"""Validate configuration."""
|
"""Validate configuration."""
|
||||||
|
|||||||
57
main.py
57
main.py
@@ -23,6 +23,7 @@ from extractors.blog_extractor import BlogExtractor
|
|||||||
from extractors.instagram_extractor import InstagramExtractor
|
from extractors.instagram_extractor import InstagramExtractor
|
||||||
from obsidian_writer import ObsidianWriter
|
from obsidian_writer import ObsidianWriter
|
||||||
from config import Config
|
from config import Config
|
||||||
|
from summarizer import summarize_text, SummarizationError
|
||||||
|
|
||||||
|
|
||||||
def detect_source_type(url: str) -> str:
|
def detect_source_type(url: str) -> str:
|
||||||
@@ -40,14 +41,14 @@ def detect_source_type(url: str) -> str:
|
|||||||
def extract_content(url: str, source_type: str) -> dict:
    """Dispatch to the extractor matching *source_type* and return its payload.

    Anything that is not YouTube or Instagram falls through to the generic
    blog extractor.
    """
    print(f"🔍 Extracting content from {source_type}...")

    # Map known source types to their extractor classes; BlogExtractor is the
    # catch-all default.
    extractor_classes = {
        "youtube": YouTubeExtractor,
        "instagram": InstagramExtractor,
    }
    extractor = extractor_classes.get(source_type, BlogExtractor)(url)
    return extractor.extract()
|
||||||
|
|
||||||
|
|
||||||
@@ -84,24 +85,40 @@ def main():
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Generate a summary of the content"
|
help="Generate a summary of the content"
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Detect source type
|
# Detect source type
|
||||||
source_type = detect_source_type(args.url)
|
source_type = detect_source_type(args.url)
|
||||||
print(f"📌 Detected source type: {source_type}")
|
print(f"📌 Detected source type: {source_type}")
|
||||||
|
|
||||||
# Extract content
|
# Extract content
|
||||||
try:
|
try:
|
||||||
content = extract_content(args.url, source_type)
|
content = extract_content(args.url, source_type)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"❌ Extraction failed: {e}")
|
print(f"❌ Extraction failed: {e}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if not content:
|
if not content:
|
||||||
print("❌ No content could be extracted")
|
print("❌ No content could be extracted")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Generate AI summary + key points
|
||||||
|
if args.summarize or Config.GENERATE_SUMMARY:
|
||||||
|
source_text = "\n\n".join(
|
||||||
|
part for part in [content.get("description", ""), content.get("content", "")]
|
||||||
|
if part
|
||||||
|
).strip()
|
||||||
|
if source_text:
|
||||||
|
try:
|
||||||
|
summary_result = summarize_text(source_text, max_points=3)
|
||||||
|
if summary_result.get("summary"):
|
||||||
|
content["description"] = summary_result["summary"]
|
||||||
|
if summary_result.get("key_points"):
|
||||||
|
content["key_points"] = summary_result["key_points"]
|
||||||
|
except SummarizationError as e:
|
||||||
|
print(f"⚠️ Summarization failed: {e}")
|
||||||
|
|
||||||
# Generate output filename
|
# Generate output filename
|
||||||
if args.output:
|
if args.output:
|
||||||
filename = args.output
|
filename = args.output
|
||||||
@@ -111,17 +128,17 @@ def main():
|
|||||||
filename = f"{title[:50]}_{datetime.now().strftime('%Y%m%d')}"
|
filename = f"{title[:50]}_{datetime.now().strftime('%Y%m%d')}"
|
||||||
# Sanitize filename
|
# Sanitize filename
|
||||||
filename = "".join(c for c in filename if c.isalnum() or c in " -_").strip()
|
filename = "".join(c for c in filename if c.isalnum() or c in " -_").strip()
|
||||||
|
|
||||||
# Create markdown content
|
# Create markdown content
|
||||||
markdown = generate_markdown(content, source_type, args.url)
|
markdown = generate_markdown(content, source_type, args.url)
|
||||||
|
|
||||||
# Print preview
|
# Print preview
|
||||||
print("\n" + "="*80)
|
print("\n" + "="*80)
|
||||||
print("📝 EXTRACTED CONTENT PREVIEW")
|
print("📝 EXTRACTED CONTENT PREVIEW")
|
||||||
print("="*80)
|
print("="*80)
|
||||||
print(markdown[:2000] + "..." if len(markdown) > 2000 else markdown)
|
print(markdown[:2000] + "..." if len(markdown) > 2000 else markdown)
|
||||||
print("="*80)
|
print("="*80)
|
||||||
|
|
||||||
# Save to Obsidian
|
# Save to Obsidian
|
||||||
if not args.no_save:
|
if not args.no_save:
|
||||||
writer = ObsidianWriter(args.obsidian_path)
|
writer = ObsidianWriter(args.obsidian_path)
|
||||||
@@ -129,25 +146,25 @@ def main():
|
|||||||
print(f"\n✅ Note saved to: {output_path}")
|
print(f"\n✅ Note saved to: {output_path}")
|
||||||
else:
|
else:
|
||||||
print("\n⚠️ Note not saved (--no-save flag)")
|
print("\n⚠️ Note not saved (--no-save flag)")
|
||||||
|
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
def generate_markdown(content: dict, source_type: str, url: str) -> str:
|
def generate_markdown(content: dict, source_type: str, url: str) -> str:
|
||||||
"""Generate markdown content for Obsidian note."""
|
"""Generate markdown content for Obsidian note."""
|
||||||
lines = []
|
lines = []
|
||||||
|
|
||||||
# Header
|
# Header
|
||||||
lines.append(f"# {content.get('title', 'Untitled')}")
|
lines.append(f"# {content.get('title', 'Untitled')}")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
# Metadata
|
# Metadata
|
||||||
lines.append("## Metadata")
|
lines.append("## Metadata")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
lines.append(f"- **Source**: {source_type.capitalize()}")
|
lines.append(f"- **Source**: {source_type.capitalize()}")
|
||||||
lines.append(f"- **URL**: {url}")
|
lines.append(f"- **URL**: {url}")
|
||||||
lines.append(f"- **Extracted**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
lines.append(f"- **Extracted**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||||
|
|
||||||
if content.get("author"):
|
if content.get("author"):
|
||||||
lines.append(f"- **Author**: {content.get('author')}")
|
lines.append(f"- **Author**: {content.get('author')}")
|
||||||
if content.get("duration"):
|
if content.get("duration"):
|
||||||
@@ -156,23 +173,23 @@ def generate_markdown(content: dict, source_type: str, url: str) -> str:
|
|||||||
lines.append(f"- **Published**: {content.get('publish_date')}")
|
lines.append(f"- **Published**: {content.get('publish_date')}")
|
||||||
if content.get("views"):
|
if content.get("views"):
|
||||||
lines.append(f"- **Views**: {content.get('views')}")
|
lines.append(f"- **Views**: {content.get('views')}")
|
||||||
|
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
# Description/Summary
|
# Description/Summary
|
||||||
if content.get("description"):
|
if content.get("description"):
|
||||||
lines.append("## Description")
|
lines.append("## Description")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
lines.append(content.get("description", ""))
|
lines.append(content.get("description", ""))
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
# Main Content (transcript, article text, etc.)
|
# Main Content (transcript, article text, etc.)
|
||||||
if content.get("content"):
|
if content.get("content"):
|
||||||
lines.append("## Content")
|
lines.append("## Content")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
lines.append(content.get("content", ""))
|
lines.append(content.get("content", ""))
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
# Key Points/Summary
|
# Key Points/Summary
|
||||||
if content.get("key_points"):
|
if content.get("key_points"):
|
||||||
lines.append("## Key Points")
|
lines.append("## Key Points")
|
||||||
@@ -180,7 +197,7 @@ def generate_markdown(content: dict, source_type: str, url: str) -> str:
|
|||||||
for point in content.get("key_points", []):
|
for point in content.get("key_points", []):
|
||||||
lines.append(f"- {point}")
|
lines.append(f"- {point}")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
# Tags
|
# Tags
|
||||||
lines.append("---")
|
lines.append("---")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
@@ -191,7 +208,7 @@ def generate_markdown(content: dict, source_type: str, url: str) -> str:
|
|||||||
tags = ["content-extractor", source_type, "notes"]
|
tags = ["content-extractor", source_type, "notes"]
|
||||||
lines.append(" ".join(f"#{tag}" for tag in tags))
|
lines.append(" ".join(f"#{tag}" for tag in tags))
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
84
summarizer.py
Normal file
84
summarizer.py
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
"""
|
||||||
|
OpenAI/OpenRouter summarizer utility.
|
||||||
|
|
||||||
|
Uses OPENAI_API_KEY and OPENAI_URL from environment (via Config).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from config import Config
|
||||||
|
|
||||||
|
|
||||||
|
class SummarizationError(RuntimeError):
    """Signals that summarization could not produce a result.

    Raised for a missing API key, a failed HTTP request, or a model
    response that does not match the expected JSON shape.
    """
|
||||||
|
|
||||||
|
|
||||||
|
def summarize_text(text: str, max_points: int = 3) -> Dict[str, List[str] | str]:
    """
    Summarize text into a short summary and key points using the configured
    OpenAI-compatible chat-completions endpoint.

    Args:
        text: Content to summarize. Blank/whitespace-only input short-circuits
            to an empty result without an API call.
        max_points: Number of key points requested from the model.

    Returns:
        {
            "summary": "string",
            "key_points": ["point 1", "point 2", ...]
        }

    Raises:
        SummarizationError: if the API key is missing, the HTTP request fails,
            or the response cannot be parsed into the expected JSON shape.
    """
    if not text or not text.strip():
        # Nothing to summarize — skip the (billable) API round trip entirely.
        return {"summary": "", "key_points": []}

    if not Config.OPENAI_API_KEY:
        raise SummarizationError("OPENAI_API_KEY is not set")

    payload = {
        "model": Config.OPENAI_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a precise summarizer. Return JSON only with keys "
                    "`summary` and `key_points` (array of strings). Do not add extra keys."
                ),
            },
            {
                "role": "user",
                "content": (
                    "Summarize the following content in 2-4 sentences and provide "
                    f"{max_points} key points.\n\n"
                    f"CONTENT:\n{text}"
                ),
            },
        ],
        "temperature": 0.2,  # low temperature for stable, focused summaries
        "max_tokens": 400,
    }

    headers = {
        "Authorization": f"Bearer {Config.OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }

    try:
        response = requests.post(
            Config.OPENAI_URL,
            headers=headers,
            json=payload,
            timeout=Config.OPENAI_TIMEOUT,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as exc:
        raise SummarizationError(f"Request failed: {exc}") from exc

    try:
        content = data["choices"][0]["message"]["content"].strip()
        result = json.loads(_strip_code_fence(content))
        # Be defensive about the model's JSON: `summary` may be null or
        # missing, and `key_points` may contain non-string entries.
        summary = (result.get("summary") or "").strip()
        key_points = [
            point.strip()
            for point in (result.get("key_points") or [])
            if isinstance(point, str) and point.strip()
        ]
        return {"summary": summary, "key_points": key_points}
    except Exception as exc:
        raise SummarizationError(f"Invalid response format: {exc}") from exc


def _strip_code_fence(raw: str) -> str:
    """Remove a surrounding Markdown ``` / ```json fence, if present.

    Models frequently wrap JSON output in a code fence even when asked for
    raw JSON; json.loads would choke on the backticks.
    """
    stripped = raw.strip()
    if stripped.startswith("```"):
        stripped = stripped.strip("`").strip()
        if stripped.lower().startswith("json"):
            stripped = stripped[4:].lstrip()
    return stripped
Reference in New Issue
Block a user