"""
Blog/Article Extractor

Extracts:
- Title, author, publish date
- Main article content
- Tags/categories
- Summary
"""

import re
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    # Both dependencies are optional at import time; each method that
    # needs one raises a helpful ImportError instead.
    requests = None
    BeautifulSoup = None


class BlogExtractor:
    """Extract content from blog posts and articles.

    Fetches ``url`` eagerly on construction; call :meth:`extract` to get
    a dict with title, description, author, publish date, main content,
    key points, and tags.
    """

    def __init__(self, url: str):
        self.url = url
        self.html: Optional[str] = None
        self.soup = None  # BeautifulSoup tree, built lazily by _parse_html()
        self._content_cache: Optional[str] = None  # memoized _get_content() result
        self._fetch_page()

    def _fetch_page(self) -> None:
        """Fetch the webpage and store its HTML in ``self.html``.

        Raises:
            ImportError: if ``requests`` is not installed.
            Exception: if the HTTP request fails (cause is chained).
        """
        if requests is None:
            raise ImportError("requests not installed. Run: pip install requests")

        # A realistic desktop User-Agent avoids trivial bot blocking.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        try:
            response = requests.get(self.url, headers=headers, timeout=30)
            response.raise_for_status()
            self.html = response.text
        except Exception as e:
            # Chain the original exception so the root cause stays visible.
            raise Exception(f"Failed to fetch page: {str(e)}") from e

    def _parse_html(self) -> None:
        """Parse ``self.html`` into a BeautifulSoup tree (idempotent).

        Raises:
            ImportError: if ``beautifulsoup4`` is not installed.
        """
        if BeautifulSoup is None:
            raise ImportError("beautifulsoup4 not installed. Run: pip install beautifulsoup4")
        if self.soup is None:
            try:
                # Prefer the fast lxml parser when it is available...
                self.soup = BeautifulSoup(self.html, 'lxml')
            except Exception:
                # ...but fall back to the stdlib parser so a missing
                # lxml install does not break extraction entirely.
                self.soup = BeautifulSoup(self.html, 'html.parser')

    def extract(self) -> Dict[str, Any]:
        """Extract all content from the page as a flat dict."""
        self._parse_html()
        # Collect tags BEFORE _get_content() strips nav/header/footer/aside
        # from the tree -- tag/category links often live in those regions.
        tags = self._get_tags()
        return {
            "title": self._get_title(),
            "description": self._get_description(),
            "author": self._get_author(),
            "publish_date": self._get_publish_date(),
            "content": self._get_content(),
            "key_points": self._generate_key_points(),
            "tags": tags,
        }

    def _get_title(self) -> str:
        """Get page title, trying OG / Twitter / <h1> / <title> in order."""
        og_title = self.soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content'].strip()

        twitter_title = self.soup.find('meta', attrs={'name': 'twitter:title'})
        if twitter_title and twitter_title.get('content'):
            return twitter_title['content'].strip()

        h1 = self.soup.find('h1')
        if h1:
            return h1.get_text().strip()

        title_tag = self.soup.find('title')
        if title_tag:
            return title_tag.get_text().strip()

        return "Untitled Article"

    def _get_description(self) -> str:
        """Get page description from OG or standard meta tags ('' if none)."""
        og_desc = self.soup.find('meta', property='og:description')
        if og_desc and og_desc.get('content'):
            return og_desc['content'].strip()

        meta_desc = self.soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            return meta_desc['content'].strip()

        return ""

    def _get_author(self) -> str:
        """Get article author from meta tags or a byline element."""
        og_author = self.soup.find('meta', property='article:author')
        if og_author and og_author.get('content'):
            return og_author['content'].strip()

        meta_author = self.soup.find('meta', attrs={'name': 'author'})
        if meta_author and meta_author.get('content'):
            return meta_author['content'].strip()

        # Heuristic: any element whose class mentions "byline" or "author".
        byline = self.soup.find(class_=re.compile(r'byline|author', re.I))
        if byline:
            return byline.get_text().strip()

        return "Unknown"

    def _get_publish_date(self) -> str:
        """Get publish date as YYYY-MM-DD, or "Unknown"."""
        og_time = self.soup.find('meta', property='article:published_time')
        if og_time and og_time.get('content'):
            return og_time['content'][:10]  # YYYY-MM-DD

        meta_time = self.soup.find('meta', attrs={'name': 'date'})
        if meta_time and meta_time.get('content'):
            return meta_time['content'][:10]

        time_tag = self.soup.find('time')
        if time_tag and time_tag.get('datetime'):
            return time_tag['datetime'][:10]

        return "Unknown"

    def _get_content(self) -> str:
        """Extract the main article text (memoized, capped at 10000 chars).

        Note: destructively removes script/style/nav/header/footer/aside
        elements from ``self.soup``. The result is cached so the pruning
        and tree walk happen at most once per instance.
        """
        if self._content_cache is not None:
            return self._content_cache

        # Remove boilerplate elements that never hold article text.
        for element in self.soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            element.decompose()

        # Try common containers for the main content area, most specific first.
        content_areas = [
            self.soup.find('article'),
            self.soup.find(class_=re.compile(r'article|content|post|entry', re.I)),
            self.soup.find(id=re.compile(r'article|content|post', re.I)),
            self.soup.find('main'),
        ]
        content_elem = next((elem for elem in content_areas if elem), None)

        if content_elem:
            paragraphs = content_elem.find_all('p')
        else:
            # Fallback: scan every paragraph on the page.
            paragraphs = self.soup.find_all('p')

        text_parts = []
        for p in paragraphs:
            text = p.get_text().strip()
            if len(text) > 50:  # Filter out short/navigational paragraphs
                text_parts.append(text)

        content = "\n\n".join(text_parts)
        content = re.sub(r'\n{3,}', '\n\n', content)  # Collapse excessive newlines

        self._content_cache = content[:10000]  # Limit length
        return self._content_cache

    def _generate_key_points(self) -> List[str]:
        """Generate key points: up to 5 medium-length opening sentences."""
        content = self._get_content()
        if not content:
            return []

        sentences = re.split(r'[.!?]+', content)
        key_points = []
        for sentence in sentences[:5]:
            sentence = sentence.strip()
            # Keep only sentences of a readable, summary-like length.
            if 30 < len(sentence) < 200:
                key_points.append(sentence + '.')
        return key_points

    def _get_tags(self) -> List[str]:
        """Get article tags/categories (deduplicated, at most 10)."""
        tags = []

        # Open Graph article tags.
        og_tags = self.soup.find_all('meta', property='article:tag')
        for tag in og_tags:
            if tag.get('content'):
                tags.append(tag['content'].lower().replace(' ', '-'))

        # Elements whose class suggests a tag/category/label.
        tag_elements = self.soup.find_all(class_=re.compile(r'tag|category|label', re.I))
        for elem in tag_elements[:5]:  # Limit to 5
            text = elem.get_text().strip().lower()
            if len(text) < 30:
                tags.append(text.replace(' ', '-'))

        # Domain-based tag, e.g. "example" for www.example.com.
        domain = urlparse(self.url).netloc
        if domain:
            tags.append(domain.replace('www.', '').split('.')[0])

        # dict.fromkeys dedups while preserving first-seen order
        # (deterministic, unlike list(set(...))).
        return list(dict.fromkeys(tags))[:10]