Features: YouTube extraction with transcript support; Instagram reel extraction via browser automation; blog/article web scraping; auto-save to Obsidian vaults; smart key-point generation; configurable via .env file; quick-extract shell script. Tech stack: Python, requests, beautifulsoup4, playwright, youtube-transcript-api.
225 lines · 7.4 KiB · Python
"""
|
|
Blog/Article Extractor
|
|
|
|
Extracts:
|
|
- Title, author, publish date
|
|
- Main article content
|
|
- Tags/categories
|
|
- Summary
|
|
"""
|
|
|
|
import re
|
|
from typing import Dict, Any, Optional
|
|
from urllib.parse import urlparse
|
|
|
|
try:
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
requests = None
|
|
BeautifulSoup = None
|
|
|
|
|
|
class BlogExtractor:
    """Extract structured content from a blog post or article URL.

    The page is fetched once at construction time; HTML parsing happens
    lazily on the first call to :meth:`extract`.

    Attributes:
        url: The article URL passed to the constructor.
        html: Raw HTML of the fetched page (set by ``_fetch_page``).
        soup: Lazily built BeautifulSoup tree (``None`` until parsed).
    """

    def __init__(self, url: str):
        """Store *url* and fetch the page immediately.

        Raises:
            ImportError: if ``requests`` is not installed.
            RuntimeError: if the page cannot be fetched (network error,
                timeout, or non-2xx status).
        """
        self.url = url
        self.html: Optional[str] = None
        self.soup = None
        # Memoized article body. _get_content() destructively strips
        # script/nav/header/... nodes out of the soup, so its extraction
        # pass must run at most once per instance.
        self._content_cache: Optional[str] = None
        self._fetch_page()

    def _fetch_page(self):
        """Download the page and store its HTML in ``self.html``."""
        if requests is None:
            raise ImportError("requests not installed. Run: pip install requests")

        # A browser-like User-Agent avoids trivial bot blocking.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        try:
            response = requests.get(self.url, headers=headers, timeout=30)
            response.raise_for_status()
            self.html = response.text
        except Exception as e:
            # RuntimeError is a subclass of Exception, so callers catching
            # Exception still work; `from e` preserves the original cause.
            raise RuntimeError(f"Failed to fetch page: {str(e)}") from e

    def _parse_html(self):
        """Build the BeautifulSoup tree from ``self.html`` (idempotent)."""
        if BeautifulSoup is None:
            raise ImportError("beautifulsoup4 not installed. Run: pip install beautifulsoup4")

        if self.soup is None:
            try:
                self.soup = BeautifulSoup(self.html, 'lxml')
            except ValueError:
                # bs4 raises FeatureNotFound (a ValueError subclass) when the
                # optional lxml parser is absent; fall back to the stdlib parser
                # instead of crashing.
                self.soup = BeautifulSoup(self.html, 'html.parser')

    def extract(self) -> Dict[str, Any]:
        """Extract all content from the page.

        Returns:
            Dict with keys: ``title``, ``description``, ``author``,
            ``publish_date``, ``content``, ``key_points``, ``tags``.
        """
        self._parse_html()

        # NOTE: title/description/author/date are evaluated before
        # "content", because _get_content() strips <header>/<nav>/... from
        # the shared soup (dict values are evaluated in order).
        return {
            "title": self._get_title(),
            "description": self._get_description(),
            "author": self._get_author(),
            "publish_date": self._get_publish_date(),
            "content": self._get_content(),
            "key_points": self._generate_key_points(),
            "tags": self._get_tags(),
        }

    def _get_title(self) -> str:
        """Get the page title, preferring social-media metadata.

        Order: og:title -> twitter:title -> first <h1> -> <title> -> fallback.
        """
        og_title = self.soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        twitter_title = self.soup.find('meta', attrs={'name': 'twitter:title'})
        if twitter_title and twitter_title.get('content'):
            return twitter_title['content'].strip()

        h1 = self.soup.find('h1')
        if h1:
            return h1.get_text().strip()

        title_tag = self.soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        return "Untitled Article"

    def _get_description(self) -> str:
        """Get the page description (og:description, then meta description)."""
        og_desc = self.soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        meta_desc = self.soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        return ""

    def _get_author(self) -> str:
        """Get the article author (article:author, meta author, then byline)."""
        og_author = self.soup.find('meta', property='article:author')
        if og_author and og_author.get('content'):
            return og_author['content'].strip()

        meta_author = self.soup.find('meta', attrs={'name': 'author'})
        if meta_author and meta_author.get('content'):
            return meta_author['content'].strip()

        # Heuristic: first element whose class mentions "byline" or "author".
        byline = self.soup.find(class_=re.compile(r'byline|author', re.I))
        if byline:
            return byline.get_text().strip()

        return "Unknown"

    def _get_publish_date(self) -> str:
        """Get the publish date as YYYY-MM-DD, or "Unknown"."""
        og_time = self.soup.find('meta', property='article:published_time')
        if og_time and og_time.get('content'):
            return og_time['content'][:10]  # ISO timestamps start YYYY-MM-DD

        meta_time = self.soup.find('meta', attrs={'name': 'date'})
        if meta_time and meta_time.get('content'):
            return meta_time['content'][:10]

        time_tag = self.soup.find('time')
        if time_tag and time_tag.get('datetime'):
            return time_tag['datetime'][:10]

        return "Unknown"

    def _get_content(self) -> str:
        """Extract the main article body (memoized after the first call)."""
        if self._content_cache is not None:
            return self._content_cache

        # Remove boilerplate elements. This mutates the shared soup, which
        # is exactly why the result is cached above.
        for element in self.soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()

        # Candidate content containers, most specific first.
        candidates = (
            self.soup.find('article'),
            self.soup.find(class_=re.compile(r'article|content|post|entry', re.I)),
            self.soup.find(id=re.compile(r'article|content|post', re.I)),
            self.soup.find('main'),
        )
        content_elem = next((elem for elem in candidates if elem), None)

        # Fall back to every <p> on the page if no container matched.
        paragraphs = content_elem.find_all('p') if content_elem else self.soup.find_all('p')

        # Keep only substantial paragraphs (>50 chars) to skip captions/noise.
        text_parts = [
            text
            for text in (p.get_text().strip() for p in paragraphs)
            if len(text) > 50
        ]

        content = "\n\n".join(text_parts)
        content = re.sub(r'\n{3,}', '\n\n', content)  # collapse excess newlines

        self._content_cache = content[:10000]  # cap length for downstream use
        return self._content_cache

    def _generate_key_points(self) -> list:
        """Generate up to 5 key points from the opening sentences of the body."""
        content = self._get_content()

        if not content:
            return []

        sentences = re.split(r'[.!?]+', content)
        key_points = []

        # Keep sentences of a "key point" length: long enough to carry
        # meaning, short enough to scan.
        for sentence in sentences[:5]:
            sentence = sentence.strip()
            if len(sentence) > 30 and len(sentence) < 200:
                key_points.append(sentence + '.')

        return key_points

    def _get_tags(self) -> list:
        """Collect tags from article:tag metas, visible tag elements, and the domain."""
        tags = []

        # <meta property="article:tag"> entries.
        for tag in self.soup.find_all('meta', property='article:tag'):
            if tag.get('content'):
                tags.append(tag['content'].lower().replace(' ', '-'))

        # Visible tag/category/label elements (first 5 only).
        for elem in self.soup.find_all(class_=re.compile(r'tag|category|label', re.I))[:5]:
            text = elem.get_text().strip().lower()
            if len(text) < 30:
                tags.append(text.replace(' ', '-'))

        # Domain-based tag, e.g. "example" for www.example.com.
        domain = urlparse(self.url).netloc
        if domain:
            tags.append(domain.replace('www.', '').split('.')[0])

        # Deduplicate preserving first-seen order — list(set(...)) made both
        # the order and which 10 survived the cap nondeterministic.
        return list(dict.fromkeys(tags))[:10]