feat: Initial commit - Content Extractor for YouTube, Instagram, and blogs

- YouTube extraction with transcript support
- Instagram reel extraction via browser automation
- Blog/article web scraping
- Auto-save to Obsidian vaults
- Smart key point generation
- Configurable via .env file
- Quick extract shell script

Tech stack: Python, requests, beautifulsoup4, playwright, youtube-transcript-api
This commit is contained in:
naki
2026-03-05 13:02:58 +05:30
commit c997e764b5
12 changed files with 1302 additions and 0 deletions

199
main.py Normal file
View File

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
"""
Content Extractor - Extract key information from URLs and save to Obsidian
Supports:
- YouTube videos (transcripts, descriptions, metadata)
- Blog posts & articles (web scraping)
- Instagram reels (via browser automation)
- Generic URLs (text extraction)
Usage:
python main.py <url> [--obsidian-path <path>] [--output <filename>]
"""
import argparse
import sys
from pathlib import Path
from datetime import datetime
from typing import Optional
from extractors.youtube_extractor import YouTubeExtractor
from extractors.blog_extractor import BlogExtractor
from extractors.instagram_extractor import InstagramExtractor
from obsidian_writer import ObsidianWriter
from config import Config
def detect_source_type(url: str) -> str:
    """Classify a URL so the right extractor can be chosen.

    Args:
        url: The URL supplied on the command line.

    Returns:
        One of ``"youtube"``, ``"instagram"``, or ``"blog"`` (the
        catch-all for any unrecognized URL).
    """
    if "youtube.com" in url or "youtu.be" in url:
        return "youtube"
    # All instagram.com URLs (reels, posts, profiles) go to the Instagram
    # extractor. The original separate "/reel" branch was redundant: both
    # branches returned the same value, so it has been collapsed.
    if "instagram.com" in url:
        return "instagram"
    return "blog"
def extract_content(url: str, source_type: str) -> dict:
    """Run the extractor matching *source_type* against *url*.

    Any source type other than "youtube" or "instagram" falls through to
    the blog extractor, mirroring detect_source_type()'s catch-all.

    Returns the dict produced by the extractor's ``extract()`` method.
    """
    print(f"🔍 Extracting content from {source_type}...")
    # Dispatch table instead of an if/elif chain; BlogExtractor is the default.
    extractor_cls = {
        "youtube": YouTubeExtractor,
        "instagram": InstagramExtractor,
    }.get(source_type, BlogExtractor)
    return extractor_cls(url).extract()
def main():
    """Command-line entry point.

    Parses arguments, detects the URL's source type, runs the matching
    extractor, prints a truncated preview of the generated note, and
    (unless --no-save was given) writes the note into the Obsidian vault.

    Returns:
        The raw extracted-content dict.
    """
    parser = argparse.ArgumentParser(
        description="Extract content from URLs and save to Obsidian notes"
    )
    parser.add_argument("url", help="URL to extract content from")
    parser.add_argument(
        "--obsidian-path",
        type=str,
        default=Config.OBSIDIAN_VAULT_PATH,
        help="Path to Obsidian vault",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Output filename (without .md extension)",
    )
    parser.add_argument(
        "--folder",
        type=str,
        default="Content Extractor",
        help="Folder in Obsidian vault to save notes",
    )
    parser.add_argument(
        "--no-save",
        action="store_true",
        help="Only print extracted content, don't save to Obsidian",
    )
    parser.add_argument(
        "--summarize",
        action="store_true",
        help="Generate a summary of the content",
    )
    args = parser.parse_args()

    # Work out which extractor should handle this URL.
    kind = detect_source_type(args.url)
    print(f"📌 Detected source type: {kind}")

    # Extraction can fail for many reasons (network, missing transcript,
    # blocked page) — report and exit non-zero rather than traceback.
    try:
        extracted = extract_content(args.url, kind)
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        sys.exit(1)

    if not extracted:
        print("❌ No content could be extracted")
        sys.exit(1)

    # Pick the note filename: an explicit --output wins; otherwise derive
    # one from the content title (truncated) plus today's date.
    if args.output:
        note_name = args.output
    else:
        title = extracted.get("title", "Untitled")
        note_name = f"{title[:50]}_{datetime.now().strftime('%Y%m%d')}"

    # Keep only characters that are safe in a filename.
    note_name = "".join(c for c in note_name if c.isalnum() or c in " -_").strip()

    note_body = generate_markdown(extracted, kind, args.url)

    # Show a capped preview so long transcripts don't flood the terminal.
    print("\n" + "="*80)
    print("📝 EXTRACTED CONTENT PREVIEW")
    print("="*80)
    print(note_body[:2000] + "..." if len(note_body) > 2000 else note_body)
    print("="*80)

    if args.no_save:
        print("\n⚠️ Note not saved (--no-save flag)")
    else:
        writer = ObsidianWriter(args.obsidian_path)
        saved_at = writer.save_note(note_body, note_name, args.folder)
        print(f"\n✅ Note saved to: {saved_at}")

    return extracted
def generate_markdown(content: dict, source_type: str, url: str) -> str:
    """Render the extracted *content* dict as an Obsidian markdown note.

    Always emits the title header, a Metadata section, and a Tags footer;
    the Description, Content, and Key Points sections appear only when the
    corresponding key is present and truthy.

    Args:
        content: Extractor output (keys like "title", "author", "content").
        source_type: The detected source kind ("youtube", "instagram", "blog").
        url: The original URL, echoed into the metadata.

    Returns:
        The complete note body as a single newline-joined string.
    """
    out = []
    add = out.append

    # Title header.
    add(f"# {content.get('title', 'Untitled')}")
    add("")

    # Metadata: always source/URL/timestamp, plus any optional fields
    # the extractor happened to supply, in a fixed order.
    add("## Metadata")
    add("")
    add(f"- **Source**: {source_type.capitalize()}")
    add(f"- **URL**: {url}")
    add(f"- **Extracted**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    for key, label in (
        ("author", "Author"),
        ("duration", "Duration"),
        ("publish_date", "Published"),
        ("views", "Views"),
    ):
        value = content.get(key)
        if value:
            add(f"- **{label}**: {value}")
    add("")

    # Optional free-text sections, each rendered under its own heading.
    for key, heading in (("description", "Description"), ("content", "Content")):
        text = content.get(key)
        if text:
            add(f"## {heading}")
            add("")
            add(text)
            add("")

    # Bullet list of key points, when the extractor produced any.
    bullets = content.get("key_points")
    if bullets:
        add("## Key Points")
        add("")
        for item in bullets:
            add(f"- {item}")
        add("")

    # Tags footer; fall back to a generic tag set when none were extracted.
    add("---")
    add("")
    add("## Tags")
    add("")
    tag_list = content.get("tags", []) or ["content-extractor", source_type, "notes"]
    add(" ".join(f"#{tag}" for tag in tag_list))
    add("")
    return "\n".join(out)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()