feat: Initial commit - Content Extractor for YouTube, Instagram, and blogs

- YouTube extraction with transcript support
- Instagram reel extraction via browser automation
- Blog/article web scraping
- Auto-save to Obsidian vaults
- Smart key point generation
- Configurable via .env file
- Quick extract shell script

Tech stack: Python, requests, beautifulsoup4, playwright, youtube-transcript-api
This commit is contained in:
naki
2026-03-05 13:02:58 +05:30
commit c997e764b5
12 changed files with 1302 additions and 0 deletions

View File

@@ -0,0 +1,224 @@
"""
Blog/Article Extractor
Extracts:
- Title, author, publish date
- Main article content
- Tags/categories
- Summary
"""
import re
from typing import Dict, Any, Optional
from urllib.parse import urlparse
try:
import requests
from bs4 import BeautifulSoup
except ImportError:
requests = None
BeautifulSoup = None
class BlogExtractor:
    """Extract structured content from a blog post or article URL.

    The page is fetched eagerly in ``__init__``; call :meth:`extract` to get a
    dict with title, description, author, publish date, main content, generated
    key points, and tags.

    Raises:
        ImportError: if the optional ``requests`` / ``beautifulsoup4``
            dependencies are missing (they are imported at module level
            inside a try/except).
        Exception: if the page cannot be fetched.
    """

    # Cap on extracted article length (characters); matches the original
    # hard-coded 10000 limit.
    MAX_CONTENT_CHARS = 10000

    def __init__(self, url: str):
        self.url = url
        self.html: Optional[str] = None
        self.soup = None  # BeautifulSoup tree, built lazily by _parse_html()
        # Cache for _get_content(): extraction prunes the soup in place and
        # is invoked twice per extract() (directly and via key points).
        self._content: Optional[str] = None
        self._fetch_page()

    def _fetch_page(self) -> None:
        """Download the page HTML into ``self.html``.

        Raises:
            ImportError: if ``requests`` is not installed.
            Exception: wrapping any network/HTTP failure (status errors
                included via ``raise_for_status``).
        """
        if requests is None:
            raise ImportError("requests not installed. Run: pip install requests")
        # A desktop browser UA avoids trivial bot-blocking on many blogs.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        try:
            response = requests.get(self.url, headers=headers, timeout=30)
            response.raise_for_status()
            self.html = response.text
        except Exception as e:
            # Keep the broad Exception type existing callers may catch, but
            # chain the cause so the original traceback is preserved.
            raise Exception(f"Failed to fetch page: {str(e)}") from e

    def _parse_html(self) -> None:
        """Parse ``self.html`` into a BeautifulSoup tree (idempotent).

        Raises:
            ImportError: if ``beautifulsoup4`` is not installed.
        """
        if BeautifulSoup is None:
            raise ImportError("beautifulsoup4 not installed. Run: pip install beautifulsoup4")
        if self.soup is None:
            try:
                self.soup = BeautifulSoup(self.html, 'lxml')
            except Exception:
                # lxml is an optional extra; fall back to the stdlib parser
                # instead of failing with bs4.FeatureNotFound.
                self.soup = BeautifulSoup(self.html, 'html.parser')

    def extract(self) -> Dict[str, Any]:
        """Extract all content from the page as a flat dict."""
        self._parse_html()
        content = {
            "title": self._get_title(),
            "description": self._get_description(),
            "author": self._get_author(),
            "publish_date": self._get_publish_date(),
            "content": self._get_content(),
            "key_points": self._generate_key_points(),
            "tags": self._get_tags(),
        }
        return content

    def _get_title(self) -> str:
        """Get the page title, preferring structured metadata over markup."""
        # Try Open Graph title first
        og_title = self.soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()
        # Try Twitter card title
        twitter_title = self.soup.find('meta', attrs={'name': 'twitter:title'})
        if twitter_title and twitter_title.get('content'):
            return twitter_title['content'].strip()
        # Try h1 tag
        h1 = self.soup.find('h1')
        if h1:
            return h1.get_text().strip()
        # Fallback to <title> tag
        title_tag = self.soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()
        return "Untitled Article"

    def _get_description(self) -> str:
        """Get the page description from OG or standard meta tags ('' if absent)."""
        # Try Open Graph description
        og_desc = self.soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()
        # Try meta description
        meta_desc = self.soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()
        return ""

    def _get_author(self) -> str:
        """Get the article author from meta tags or a byline ("Unknown" if absent)."""
        # Try Open Graph author
        og_author = self.soup.find('meta', property='article:author')
        if og_author and og_author.get('content'):
            return og_author['content'].strip()
        # Try meta author
        meta_author = self.soup.find('meta', attrs={'name': 'author'})
        if meta_author and meta_author.get('content'):
            return meta_author['content'].strip()
        # Try to find author in a byline/author-classed element
        byline = self.soup.find(class_=re.compile(r'byline|author', re.I))
        if byline:
            return byline.get_text().strip()
        return "Unknown"

    def _get_publish_date(self) -> str:
        """Get the publish date as YYYY-MM-DD ("Unknown" if absent)."""
        # Try Open Graph publish time
        og_time = self.soup.find('meta', property='article:published_time')
        if og_time and og_time.get('content'):
            return og_time['content'][:10]  # YYYY-MM-DD
        # Try meta publish date
        meta_time = self.soup.find('meta', attrs={'name': 'date'})
        if meta_time and meta_time.get('content'):
            return meta_time['content'][:10]
        # Try time tag
        time_tag = self.soup.find('time')
        if time_tag and time_tag.get('datetime'):
            return time_tag['datetime'][:10]
        return "Unknown"

    def _get_content(self) -> str:
        """Extract the main article text (capped at MAX_CONTENT_CHARS).

        NOTE: this prunes script/style/nav/header/footer/aside elements from
        ``self.soup`` in place, so the result is cached and later calls return
        the first extraction instead of re-walking the mutated tree.
        """
        if self._content is not None:
            return self._content
        # Remove unwanted elements
        for element in self.soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()
        # Try to find main content area, most specific first
        content_areas = [
            self.soup.find('article'),
            self.soup.find(class_=re.compile(r'article|content|post|entry', re.I)),
            self.soup.find(id=re.compile(r'article|content|post', re.I)),
            self.soup.find('main'),
        ]
        content_elem = next((elem for elem in content_areas if elem), None)
        if content_elem:
            # Get paragraphs from content area
            paragraphs = content_elem.find_all('p')
        else:
            # Fallback to all paragraphs
            paragraphs = self.soup.find_all('p')
        # Extract text from paragraphs, dropping short boilerplate snippets
        text_parts = []
        for p in paragraphs:
            text = p.get_text().strip()
            if len(text) > 50:  # Filter out short paragraphs
                text_parts.append(text)
        # Join and clean
        content = "\n\n".join(text_parts)
        content = re.sub(r'\n{3,}', '\n\n', content)  # Remove excessive newlines
        self._content = content[:self.MAX_CONTENT_CHARS]  # Limit length
        return self._content

    def _generate_key_points(self) -> List[str]:
        """Generate up to 5 key points from the opening sentences of the content."""
        content = self._get_content()
        if not content:
            return []
        # Extract first few sentences as key points
        sentences = re.split(r'[.!?]+', content)
        key_points = []
        for sentence in sentences[:5]:
            sentence = sentence.strip()
            # Keep only sentence-length candidates (not fragments or walls of text)
            if 30 < len(sentence) < 200:
                key_points.append(sentence + '.')
        return key_points

    def _get_tags(self) -> List[str]:
        """Get article tags/categories plus a domain-derived tag (max 10, deduped)."""
        tags = []
        # Try Open Graph article tags
        og_tags = self.soup.find_all('meta', property='article:tag')
        for tag in og_tags:
            if tag.get('content'):
                tags.append(tag['content'].lower().replace(' ', '-'))
        # Try to find tag elements
        tag_elements = self.soup.find_all(class_=re.compile(r'tag|category|label', re.I))
        for elem in tag_elements[:5]:  # Limit to 5
            text = elem.get_text().strip().lower()
            if len(text) < 30:
                tags.append(text.replace(' ', '-'))
        # Add domain-based tag (e.g. "example" for www.example.com)
        domain = urlparse(self.url).netloc
        if domain:
            tags.append(domain.replace('www.', '').split('.')[0])
        # De-duplicate preserving first-seen order — list(set(...)) gave a
        # non-deterministic order across runs.
        return list(dict.fromkeys(tags))[:10]