feat: Initial commit - Content Extractor for YouTube, Instagram, and blogs

- YouTube extraction with transcript support
- Instagram reel extraction via browser automation
- Blog/article web scraping
- Auto-save to Obsidian vaults
- Smart key point generation
- Configurable via .env file
- Quick extract shell script

Tech stack: Python, requests, beautifulsoup4, playwright, youtube-transcript-api
This commit is contained in:
naki
2026-03-05 13:02:58 +05:30
commit c997e764b5
12 changed files with 1302 additions and 0 deletions

199
main.py Normal file
View File

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Content Extractor - Extract key information from URLs and save to Obsidian
Supports:
- YouTube videos (transcripts, descriptions, metadata)
- Blog posts & articles (web scraping)
- Instagram reels (via browser automation)
- Generic URLs (text extraction)
Usage:
python main.py <url> [--obsidian-path <path>] [--output <filename>]
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
from typing import Optional
from extractors.youtube_extractor import YouTubeExtractor
from extractors.blog_extractor import BlogExtractor
from extractors.instagram_extractor import InstagramExtractor
from obsidian_writer import ObsidianWriter
from config import Config
def detect_source_type(url: str) -> str:
    """Classify a URL so the right extractor can be chosen.

    Args:
        url: The URL supplied on the command line.

    Returns:
        One of ``"youtube"``, ``"instagram"``, or ``"blog"`` (the
        catch-all for any unrecognized URL).
    """
    if "youtube.com" in url or "youtu.be" in url:
        return "youtube"
    # All instagram.com URLs (reels, posts, profiles) go to the Instagram
    # extractor. The original separate "/reel" branch was redundant: both
    # branches returned the same value, so it has been collapsed.
    if "instagram.com" in url:
        return "instagram"
    return "blog"
def extract_content(url: str, source_type: str) -> dict:
    """Run the extractor matching *source_type* against *url*.

    Any source type other than "youtube" or "instagram" falls through to
    the blog extractor, mirroring detect_source_type()'s catch-all.

    Returns the dict produced by the extractor's ``extract()`` method.
    """
    print(f"🔍 Extracting content from {source_type}...")
    # Dispatch table instead of an if/elif chain; BlogExtractor is the default.
    extractor_cls = {
        "youtube": YouTubeExtractor,
        "instagram": InstagramExtractor,
    }.get(source_type, BlogExtractor)
    return extractor_cls(url).extract()
def main():
    """Command-line entry point.

    Parses arguments, detects the URL's source type, runs the matching
    extractor, prints a truncated preview of the generated note, and
    (unless --no-save was given) writes the note into the Obsidian vault.

    Returns:
        The raw extracted-content dict.
    """
    parser = argparse.ArgumentParser(
        description="Extract content from URLs and save to Obsidian notes"
    )
    parser.add_argument("url", help="URL to extract content from")
    parser.add_argument(
        "--obsidian-path",
        type=str,
        default=Config.OBSIDIAN_VAULT_PATH,
        help="Path to Obsidian vault",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Output filename (without .md extension)",
    )
    parser.add_argument(
        "--folder",
        type=str,
        default="Content Extractor",
        help="Folder in Obsidian vault to save notes",
    )
    parser.add_argument(
        "--no-save",
        action="store_true",
        help="Only print extracted content, don't save to Obsidian",
    )
    parser.add_argument(
        "--summarize",
        action="store_true",
        help="Generate a summary of the content",
    )
    args = parser.parse_args()

    # Work out which extractor should handle this URL.
    kind = detect_source_type(args.url)
    print(f"📌 Detected source type: {kind}")

    # Extraction can fail for many reasons (network, missing transcript,
    # blocked page) — report and exit non-zero rather than traceback.
    try:
        extracted = extract_content(args.url, kind)
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        sys.exit(1)

    if not extracted:
        print("❌ No content could be extracted")
        sys.exit(1)

    # Pick the note filename: an explicit --output wins; otherwise derive
    # one from the content title (truncated) plus today's date.
    if args.output:
        note_name = args.output
    else:
        title = extracted.get("title", "Untitled")
        note_name = f"{title[:50]}_{datetime.now().strftime('%Y%m%d')}"

    # Keep only characters that are safe in a filename.
    note_name = "".join(c for c in note_name if c.isalnum() or c in " -_").strip()

    note_body = generate_markdown(extracted, kind, args.url)

    # Show a capped preview so long transcripts don't flood the terminal.
    print("\n" + "="*80)
    print("📝 EXTRACTED CONTENT PREVIEW")
    print("="*80)
    print(note_body[:2000] + "..." if len(note_body) > 2000 else note_body)
    print("="*80)

    if args.no_save:
        print("\n⚠️ Note not saved (--no-save flag)")
    else:
        writer = ObsidianWriter(args.obsidian_path)
        saved_at = writer.save_note(note_body, note_name, args.folder)
        print(f"\n✅ Note saved to: {saved_at}")

    return extracted
def generate_markdown(content: dict, source_type: str, url: str) -> str:
    """Render the extracted *content* dict as an Obsidian markdown note.

    Always emits the title header, a Metadata section, and a Tags footer;
    the Description, Content, and Key Points sections appear only when the
    corresponding key is present and truthy.

    Args:
        content: Extractor output (keys like "title", "author", "content").
        source_type: The detected source kind ("youtube", "instagram", "blog").
        url: The original URL, echoed into the metadata.

    Returns:
        The complete note body as a single newline-joined string.
    """
    out = []
    add = out.append

    # Title header.
    add(f"# {content.get('title', 'Untitled')}")
    add("")

    # Metadata: always source/URL/timestamp, plus any optional fields
    # the extractor happened to supply, in a fixed order.
    add("## Metadata")
    add("")
    add(f"- **Source**: {source_type.capitalize()}")
    add(f"- **URL**: {url}")
    add(f"- **Extracted**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    for key, label in (
        ("author", "Author"),
        ("duration", "Duration"),
        ("publish_date", "Published"),
        ("views", "Views"),
    ):
        value = content.get(key)
        if value:
            add(f"- **{label}**: {value}")
    add("")

    # Optional free-text sections, each rendered under its own heading.
    for key, heading in (("description", "Description"), ("content", "Content")):
        text = content.get(key)
        if text:
            add(f"## {heading}")
            add("")
            add(text)
            add("")

    # Bullet list of key points, when the extractor produced any.
    bullets = content.get("key_points")
    if bullets:
        add("## Key Points")
        add("")
        for item in bullets:
            add(f"- {item}")
        add("")

    # Tags footer; fall back to a generic tag set when none were extracted.
    add("---")
    add("")
    add("## Tags")
    add("")
    tag_list = content.get("tags", []) or ["content-extractor", source_type, "notes"]
    add(" ".join(f"#{tag}" for tag in tag_list))
    add("")
    return "\n".join(out)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()