feat: Initial commit - Content Extractor for YouTube, Instagram, and blogs

- YouTube extraction with transcript support
- Instagram reel extraction via browser automation
- Blog/article web scraping
- Auto-save to Obsidian vaults
- Smart key point generation
- Configurable via .env file
- Quick extract shell script

Tech stack: Python, requests, beautifulsoup4, playwright, youtube-transcript-api
This commit is contained in:
naki
2026-03-05 13:02:58 +05:30
commit c997e764b5
12 changed files with 1302 additions and 0 deletions

View File

@@ -0,0 +1,224 @@
"""
Blog/Article Extractor
Extracts:
- Title, author, publish date
- Main article content
- Tags/categories
- Summary
"""
import re
from typing import Dict, Any, Optional
from urllib.parse import urlparse
try:
import requests
from bs4 import BeautifulSoup
except ImportError:
requests = None
BeautifulSoup = None
class BlogExtractor:
    """Extract structured content from a blog post or article URL.

    The page is fetched eagerly in ``__init__``; call :meth:`extract` to get a
    dict with title, description, author, publish date, main content, generated
    key points, and tags.

    Raises:
        ImportError: if the optional ``requests`` / ``beautifulsoup4``
            dependencies are missing (they are imported at module level
            inside a try/except).
        Exception: if the page cannot be fetched.
    """

    # Cap on extracted article length (characters); matches the original
    # hard-coded 10000 limit.
    MAX_CONTENT_CHARS = 10000

    def __init__(self, url: str):
        self.url = url
        self.html: Optional[str] = None
        self.soup = None  # BeautifulSoup tree, built lazily by _parse_html()
        # Cache for _get_content(): extraction prunes the soup in place and
        # is invoked twice per extract() (directly and via key points).
        self._content: Optional[str] = None
        self._fetch_page()

    def _fetch_page(self) -> None:
        """Download the page HTML into ``self.html``.

        Raises:
            ImportError: if ``requests`` is not installed.
            Exception: wrapping any network/HTTP failure (status errors
                included via ``raise_for_status``).
        """
        if requests is None:
            raise ImportError("requests not installed. Run: pip install requests")
        # A desktop browser UA avoids trivial bot-blocking on many blogs.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        try:
            response = requests.get(self.url, headers=headers, timeout=30)
            response.raise_for_status()
            self.html = response.text
        except Exception as e:
            # Keep the broad Exception type existing callers may catch, but
            # chain the cause so the original traceback is preserved.
            raise Exception(f"Failed to fetch page: {str(e)}") from e

    def _parse_html(self) -> None:
        """Parse ``self.html`` into a BeautifulSoup tree (idempotent).

        Raises:
            ImportError: if ``beautifulsoup4`` is not installed.
        """
        if BeautifulSoup is None:
            raise ImportError("beautifulsoup4 not installed. Run: pip install beautifulsoup4")
        if self.soup is None:
            try:
                self.soup = BeautifulSoup(self.html, 'lxml')
            except Exception:
                # lxml is an optional extra; fall back to the stdlib parser
                # instead of failing with bs4.FeatureNotFound.
                self.soup = BeautifulSoup(self.html, 'html.parser')

    def extract(self) -> Dict[str, Any]:
        """Extract all content from the page as a flat dict."""
        self._parse_html()
        content = {
            "title": self._get_title(),
            "description": self._get_description(),
            "author": self._get_author(),
            "publish_date": self._get_publish_date(),
            "content": self._get_content(),
            "key_points": self._generate_key_points(),
            "tags": self._get_tags(),
        }
        return content

    def _get_title(self) -> str:
        """Get the page title, preferring structured metadata over markup."""
        # Try Open Graph title first
        og_title = self.soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()
        # Try Twitter card title
        twitter_title = self.soup.find('meta', attrs={'name': 'twitter:title'})
        if twitter_title and twitter_title.get('content'):
            return twitter_title['content'].strip()
        # Try h1 tag
        h1 = self.soup.find('h1')
        if h1:
            return h1.get_text().strip()
        # Fallback to <title> tag
        title_tag = self.soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()
        return "Untitled Article"

    def _get_description(self) -> str:
        """Get the page description from OG or standard meta tags ('' if absent)."""
        # Try Open Graph description
        og_desc = self.soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()
        # Try meta description
        meta_desc = self.soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()
        return ""

    def _get_author(self) -> str:
        """Get the article author from meta tags or a byline ("Unknown" if absent)."""
        # Try Open Graph author
        og_author = self.soup.find('meta', property='article:author')
        if og_author and og_author.get('content'):
            return og_author['content'].strip()
        # Try meta author
        meta_author = self.soup.find('meta', attrs={'name': 'author'})
        if meta_author and meta_author.get('content'):
            return meta_author['content'].strip()
        # Try to find author in a byline/author-classed element
        byline = self.soup.find(class_=re.compile(r'byline|author', re.I))
        if byline:
            return byline.get_text().strip()
        return "Unknown"

    def _get_publish_date(self) -> str:
        """Get the publish date as YYYY-MM-DD ("Unknown" if absent)."""
        # Try Open Graph publish time
        og_time = self.soup.find('meta', property='article:published_time')
        if og_time and og_time.get('content'):
            return og_time['content'][:10]  # YYYY-MM-DD
        # Try meta publish date
        meta_time = self.soup.find('meta', attrs={'name': 'date'})
        if meta_time and meta_time.get('content'):
            return meta_time['content'][:10]
        # Try time tag
        time_tag = self.soup.find('time')
        if time_tag and time_tag.get('datetime'):
            return time_tag['datetime'][:10]
        return "Unknown"

    def _get_content(self) -> str:
        """Extract the main article text (capped at MAX_CONTENT_CHARS).

        NOTE: this prunes script/style/nav/header/footer/aside elements from
        ``self.soup`` in place, so the result is cached and later calls return
        the first extraction instead of re-walking the mutated tree.
        """
        if self._content is not None:
            return self._content
        # Remove unwanted elements
        for element in self.soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()
        # Try to find main content area, most specific first
        content_areas = [
            self.soup.find('article'),
            self.soup.find(class_=re.compile(r'article|content|post|entry', re.I)),
            self.soup.find(id=re.compile(r'article|content|post', re.I)),
            self.soup.find('main'),
        ]
        content_elem = next((elem for elem in content_areas if elem), None)
        if content_elem:
            # Get paragraphs from content area
            paragraphs = content_elem.find_all('p')
        else:
            # Fallback to all paragraphs
            paragraphs = self.soup.find_all('p')
        # Extract text from paragraphs, dropping short boilerplate snippets
        text_parts = []
        for p in paragraphs:
            text = p.get_text().strip()
            if len(text) > 50:  # Filter out short paragraphs
                text_parts.append(text)
        # Join and clean
        content = "\n\n".join(text_parts)
        content = re.sub(r'\n{3,}', '\n\n', content)  # Remove excessive newlines
        self._content = content[:self.MAX_CONTENT_CHARS]  # Limit length
        return self._content

    def _generate_key_points(self) -> List[str]:
        """Generate up to 5 key points from the opening sentences of the content."""
        content = self._get_content()
        if not content:
            return []
        # Extract first few sentences as key points
        sentences = re.split(r'[.!?]+', content)
        key_points = []
        for sentence in sentences[:5]:
            sentence = sentence.strip()
            # Keep only sentence-length candidates (not fragments or walls of text)
            if 30 < len(sentence) < 200:
                key_points.append(sentence + '.')
        return key_points

    def _get_tags(self) -> List[str]:
        """Get article tags/categories plus a domain-derived tag (max 10, deduped)."""
        tags = []
        # Try Open Graph article tags
        og_tags = self.soup.find_all('meta', property='article:tag')
        for tag in og_tags:
            if tag.get('content'):
                tags.append(tag['content'].lower().replace(' ', '-'))
        # Try to find tag elements
        tag_elements = self.soup.find_all(class_=re.compile(r'tag|category|label', re.I))
        for elem in tag_elements[:5]:  # Limit to 5
            text = elem.get_text().strip().lower()
            if len(text) < 30:
                tags.append(text.replace(' ', '-'))
        # Add domain-based tag (e.g. "example" for www.example.com)
        domain = urlparse(self.url).netloc
        if domain:
            tags.append(domain.replace('www.', '').split('.')[0])
        # De-duplicate preserving first-seen order — list(set(...)) gave a
        # non-deterministic order across runs.
        return list(dict.fromkeys(tags))[:10]