Features: YouTube extraction with transcript support; Instagram reel extraction via browser automation; blog/article web scraping; auto-save to Obsidian vaults; smart key-point generation; configurable via .env file; quick-extract shell script. Tech stack: Python, requests, beautifulsoup4, playwright, youtube-transcript-api.
225 lines · 7.4 KiB · Python
"""
|
|
Blog/Article Extractor
|
|
|
|
Extracts:
|
|
- Title, author, publish date
|
|
- Main article content
|
|
- Tags/categories
|
|
- Summary
|
|
"""
|
|
|
|
import re
|
|
from typing import Dict, Any, Optional
|
|
from urllib.parse import urlparse
|
|
|
|
try:
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
requests = None
|
|
BeautifulSoup = None
|
|
|
|
|
|
class BlogExtractor:
    """Extract structured content from a blog post or article URL.

    The page is fetched once at construction time; HTML parsing happens
    lazily on the first call to :meth:`extract`.

    Attributes:
        url: The article URL passed to the constructor.
        html: Raw HTML of the fetched page (set by ``_fetch_page``).
        soup: Lazily built BeautifulSoup tree (``None`` until parsed).
    """

    def __init__(self, url: str):
        """Store *url* and fetch the page immediately.

        Raises:
            ImportError: if ``requests`` is not installed.
            RuntimeError: if the page cannot be fetched (network error,
                timeout, or non-2xx status).
        """
        self.url = url
        self.html: Optional[str] = None
        self.soup = None
        # Memoized article body. _get_content() destructively strips
        # script/nav/header/... nodes out of the soup, so its extraction
        # pass must run at most once per instance.
        self._content_cache: Optional[str] = None
        self._fetch_page()

    def _fetch_page(self):
        """Download the page and store its HTML in ``self.html``."""
        if requests is None:
            raise ImportError("requests not installed. Run: pip install requests")

        # A browser-like User-Agent avoids trivial bot blocking.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        try:
            response = requests.get(self.url, headers=headers, timeout=30)
            response.raise_for_status()
            self.html = response.text
        except Exception as e:
            # RuntimeError is a subclass of Exception, so callers catching
            # Exception still work; `from e` preserves the original cause.
            raise RuntimeError(f"Failed to fetch page: {str(e)}") from e

    def _parse_html(self):
        """Build the BeautifulSoup tree from ``self.html`` (idempotent)."""
        if BeautifulSoup is None:
            raise ImportError("beautifulsoup4 not installed. Run: pip install beautifulsoup4")

        if self.soup is None:
            try:
                self.soup = BeautifulSoup(self.html, 'lxml')
            except ValueError:
                # bs4 raises FeatureNotFound (a ValueError subclass) when the
                # optional lxml parser is absent; fall back to the stdlib parser
                # instead of crashing.
                self.soup = BeautifulSoup(self.html, 'html.parser')

    def extract(self) -> Dict[str, Any]:
        """Extract all content from the page.

        Returns:
            Dict with keys: ``title``, ``description``, ``author``,
            ``publish_date``, ``content``, ``key_points``, ``tags``.
        """
        self._parse_html()

        # NOTE: title/description/author/date are evaluated before
        # "content", because _get_content() strips <header>/<nav>/... from
        # the shared soup (dict values are evaluated in order).
        return {
            "title": self._get_title(),
            "description": self._get_description(),
            "author": self._get_author(),
            "publish_date": self._get_publish_date(),
            "content": self._get_content(),
            "key_points": self._generate_key_points(),
            "tags": self._get_tags(),
        }

    def _get_title(self) -> str:
        """Get the page title, preferring social-media metadata.

        Order: og:title -> twitter:title -> first <h1> -> <title> -> fallback.
        """
        og_title = self.soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        twitter_title = self.soup.find('meta', attrs={'name': 'twitter:title'})
        if twitter_title and twitter_title.get('content'):
            return twitter_title['content'].strip()

        h1 = self.soup.find('h1')
        if h1:
            return h1.get_text().strip()

        title_tag = self.soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        return "Untitled Article"

    def _get_description(self) -> str:
        """Get the page description (og:description, then meta description)."""
        og_desc = self.soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        meta_desc = self.soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        return ""

    def _get_author(self) -> str:
        """Get the article author (article:author, meta author, then byline)."""
        og_author = self.soup.find('meta', property='article:author')
        if og_author and og_author.get('content'):
            return og_author['content'].strip()

        meta_author = self.soup.find('meta', attrs={'name': 'author'})
        if meta_author and meta_author.get('content'):
            return meta_author['content'].strip()

        # Heuristic: first element whose class mentions "byline" or "author".
        byline = self.soup.find(class_=re.compile(r'byline|author', re.I))
        if byline:
            return byline.get_text().strip()

        return "Unknown"

    def _get_publish_date(self) -> str:
        """Get the publish date as YYYY-MM-DD, or "Unknown"."""
        og_time = self.soup.find('meta', property='article:published_time')
        if og_time and og_time.get('content'):
            return og_time['content'][:10]  # ISO timestamps start YYYY-MM-DD

        meta_time = self.soup.find('meta', attrs={'name': 'date'})
        if meta_time and meta_time.get('content'):
            return meta_time['content'][:10]

        time_tag = self.soup.find('time')
        if time_tag and time_tag.get('datetime'):
            return time_tag['datetime'][:10]

        return "Unknown"

    def _get_content(self) -> str:
        """Extract the main article body (memoized after the first call)."""
        if self._content_cache is not None:
            return self._content_cache

        # Remove boilerplate elements. This mutates the shared soup, which
        # is exactly why the result is cached above.
        for element in self.soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()

        # Candidate content containers, most specific first.
        candidates = (
            self.soup.find('article'),
            self.soup.find(class_=re.compile(r'article|content|post|entry', re.I)),
            self.soup.find(id=re.compile(r'article|content|post', re.I)),
            self.soup.find('main'),
        )
        content_elem = next((elem for elem in candidates if elem), None)

        # Fall back to every <p> on the page if no container matched.
        paragraphs = content_elem.find_all('p') if content_elem else self.soup.find_all('p')

        # Keep only substantial paragraphs (>50 chars) to skip captions/noise.
        text_parts = [
            text
            for text in (p.get_text().strip() for p in paragraphs)
            if len(text) > 50
        ]

        content = "\n\n".join(text_parts)
        content = re.sub(r'\n{3,}', '\n\n', content)  # collapse excess newlines

        self._content_cache = content[:10000]  # cap length for downstream use
        return self._content_cache

    def _generate_key_points(self) -> list:
        """Generate up to 5 key points from the opening sentences of the body."""
        content = self._get_content()

        if not content:
            return []

        sentences = re.split(r'[.!?]+', content)
        key_points = []

        # Keep sentences of a "key point" length: long enough to carry
        # meaning, short enough to scan.
        for sentence in sentences[:5]:
            sentence = sentence.strip()
            if len(sentence) > 30 and len(sentence) < 200:
                key_points.append(sentence + '.')

        return key_points

    def _get_tags(self) -> list:
        """Collect tags from article:tag metas, visible tag elements, and the domain."""
        tags = []

        # <meta property="article:tag"> entries.
        for tag in self.soup.find_all('meta', property='article:tag'):
            if tag.get('content'):
                tags.append(tag['content'].lower().replace(' ', '-'))

        # Visible tag/category/label elements (first 5 only).
        for elem in self.soup.find_all(class_=re.compile(r'tag|category|label', re.I))[:5]:
            text = elem.get_text().strip().lower()
            if len(text) < 30:
                tags.append(text.replace(' ', '-'))

        # Domain-based tag, e.g. "example" for www.example.com.
        domain = urlparse(self.url).netloc
        if domain:
            tags.append(domain.replace('www.', '').split('.')[0])

        # Deduplicate preserving first-seen order — list(set(...)) made both
        # the order and which 10 survived the cap nondeterministic.
        return list(dict.fromkeys(tags))[:10]