# backblaze-invoices-downloader/extractors/youtube_extractor.py

"""
YouTube Video Extractor

Extracts:
- Title, description, author
- Transcript/captions
- Duration, views, publish date
- Tags/categories
"""

import re
from typing import Optional, Dict, Any
from urllib.parse import urlparse, parse_qs

try:
    from youtube_transcript_api import YouTubeTranscriptApi
    from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
except ImportError:
    YouTubeTranscriptApi = None

try:
    from pytubefix import YouTube  # More reliable than pytube
except ImportError:
    try:
        from pytube import YouTube
    except ImportError:
        YouTube = None


class YouTubeExtractor:
    """Extract metadata and transcript text from a single YouTube video.

    Metadata (title, author, duration, ...) is read from a pytube/pytubefix
    ``YouTube`` object; the transcript is read through youtube-transcript-api.
    Every ``_get_*`` accessor is best-effort: when the underlying library is
    missing or the lookup fails, it returns a placeholder value instead of
    raising, so one unavailable field never discards the others.
    """

    # Upper bound on the number of transcript characters returned.
    MAX_TRANSCRIPT_CHARS = 10000
    # Upper bounds on the number of key points and tags returned.
    MAX_KEY_POINTS = 5
    MAX_TAGS = 10

    # Prefixes of the placeholder strings produced by ``_get_transcript`` when
    # no transcript could be obtained. Matching these exact prefixes (rather
    # than any leading "[") keeps real captions such as "[Music] ..." usable.
    _TRANSCRIPT_PLACEHOLDERS = (
        "[Transcript not available",
        "[No transcript available]",
    )

    _VIDEO_ID = r'([a-zA-Z0-9_-]{11})'
    # Tried in order; the first match wins.
    _VIDEO_ID_PATTERNS = (
        # watch URLs (``v`` may appear anywhere in the query) and short links
        re.compile(
            r'(?:youtube(?:-nocookie)?\.com/watch\?(?:[^#\s]*&)?v=|youtu\.be/)'
            + _VIDEO_ID
        ),
        # path-style URLs: /embed/ID, /v/ID, /shorts/ID, /live/ID
        re.compile(
            r'youtube(?:-nocookie)?\.com/(?:embed|v|shorts|live)/' + _VIDEO_ID
        ),
    )

    def __init__(self, url: str):
        """Store ``url`` and parse its video ID.

        Raises:
            ValueError: if no video ID can be found in ``url``.
        """
        self.url = url
        self.video_id = self._extract_video_id(url)
        # Created lazily by ``_init_youtube`` so construction does no I/O.
        self.youtube = None

    def _extract_video_id(self, url: str) -> str:
        """Return the 11-character video ID found in ``url``.

        Recognizes ``watch?v=`` (with ``v`` at any position in the query
        string), ``youtu.be/``, ``/embed/``, ``/v/``, ``/shorts/`` and
        ``/live/`` links, including the ``youtube-nocookie.com`` host.

        Raises:
            ValueError: if none of the known URL shapes match.
        """
        for pattern in self._VIDEO_ID_PATTERNS:
            match = pattern.search(url)
            if match:
                return match.group(1)

        raise ValueError(f"Could not extract YouTube video ID from: {url}")

    def _init_youtube(self):
        """Create the ``YouTube`` client on first use.

        Raises:
            ImportError: if a client is needed but neither pytubefix nor
                pytube could be imported.
        """
        # An existing client (lazily created earlier, or supplied by the
        # caller) is reused as-is and does not require the import to succeed.
        if self.youtube is not None:
            return

        if YouTube is None:
            raise ImportError("pytube or pytubefix not installed. Run: pip install pytubefix")

        self.youtube = YouTube(self.url)

    def extract(self) -> Dict[str, Any]:
        """Extract all content from the video.

        Returns:
            A dict with the keys ``title``, ``description``, ``author``,
            ``duration``, ``publish_date``, ``views``, ``content`` (the
            transcript text), ``key_points`` and ``tags``.

        Raises:
            ImportError: if neither pytubefix nor pytube is installed.
        """
        self._init_youtube()

        # Fetch the transcript once and reuse it for the key points; fetching
        # is a network round trip.
        transcript = self._get_transcript()

        return {
            "title": self._get_title(),
            "description": self._get_description(),
            "author": self._get_author(),
            "duration": self._get_duration(),
            "publish_date": self._get_publish_date(),
            "views": self._get_views(),
            "content": transcript,
            "key_points": self._generate_key_points(transcript),
            "tags": self._get_tags(),
        }

    def _metadata(self, attribute: str, default: Any = None) -> Any:
        """Read ``attribute`` from the YouTube client, or return ``default``.

        The client's attributes may fail on access, so any exception raised
        while initializing the client or reading the attribute is
        deliberately converted into ``default``.
        """
        try:
            self._init_youtube()
            return getattr(self.youtube, attribute)
        except Exception:
            return default

    def _get_title(self) -> str:
        """Get the video title, or ``"Video <id>"`` when unavailable."""
        return self._metadata("title", f"Video {self.video_id}")

    def _get_description(self) -> str:
        """Get the video description, or ``""`` when unavailable."""
        return self._metadata("description") or ""

    def _get_author(self) -> str:
        """Get the author/channel name, or ``"Unknown"`` when unavailable."""
        return self._metadata("author", "Unknown")

    def _get_duration(self) -> str:
        """Get the duration as ``H:MM:SS`` (or ``M:SS`` under one hour).

        Returns ``"Unknown"`` when the length is missing, not numeric, or
        negative.
        """
        try:
            total_seconds = int(self._metadata("length"))
        except (TypeError, ValueError):
            return "Unknown"
        if total_seconds < 0:
            return "Unknown"

        minutes, secs = divmod(total_seconds, 60)
        hours, minutes = divmod(minutes, 60)
        if hours > 0:
            return f"{hours}:{minutes:02d}:{secs:02d}"
        return f"{minutes}:{secs:02d}"

    def _get_publish_date(self) -> str:
        """Get the publish date as ``YYYY-MM-DD``, or ``"Unknown"``."""
        published = self._metadata("publish_date")
        if not published:
            return "Unknown"
        try:
            return published.strftime("%Y-%m-%d")
        except (AttributeError, TypeError, ValueError):
            return "Unknown"

    def _get_views(self) -> str:
        """Get the view count, abbreviated with ``K``/``M`` suffixes.

        Counts of at least 1,000 are shown in thousands and counts of at
        least 1,000,000 in millions, each with one decimal place. Returns
        ``"Unknown"`` when the count is missing or not numeric.
        """
        try:
            views = int(self._metadata("views"))
        except (TypeError, ValueError):
            return "Unknown"

        # Inclusive bounds: exactly 1,000,000 is "1.0M", not "1000.0K".
        if views >= 1_000_000:
            return f"{views / 1_000_000:.1f}M"
        if views >= 1_000:
            return f"{views / 1_000:.1f}K"
        return str(views)

    @staticmethod
    def _pick_transcript(available: list) -> Any:
        """Choose the preferred transcript from ``available``.

        Preference order: exact ``en``, then a regional English variant
        (``en-US``, ``en-GB``, ...), then the first transcript listed.
        Returns ``None`` when ``available`` is empty.
        """
        exact = [t for t in available if t.language_code == 'en']
        regional = [
            t for t in available
            if str(t.language_code).lower().startswith('en-')
        ]
        candidates = exact or regional or available
        return candidates[0] if candidates else None

    def _get_transcript(self) -> str:
        """Get the transcript as one whitespace-normalized string.

        The text is truncated to ``MAX_TRANSCRIPT_CHARS`` characters. When no
        transcript can be obtained, a bracketed placeholder describing the
        reason is returned instead of raising.
        """
        if YouTubeTranscriptApi is None:
            return "[Transcript not available - youtube-transcript-api not installed]"

        try:
            api = YouTubeTranscriptApi()
            # Current releases expose ``list`` on an instance; older releases
            # are assumed to expose ``list_transcripts`` instead.
            list_transcripts = getattr(api, "list", None) or api.list_transcripts
            available = list(list_transcripts(self.video_id))

            transcript = self._pick_transcript(available)
            if transcript is None:
                return "[No transcript available]"

            fetched = transcript.fetch()

            # Newer API returns an object with ``snippets``; the older format
            # is an iterable of dicts with a ``text`` key.
            if hasattr(fetched, 'snippets'):
                pieces = [snippet.text for snippet in fetched.snippets]
            else:
                pieces = [entry['text'] for entry in fetched]

            full_text = " ".join(pieces).replace("\n", " ").strip()
            return full_text[:self.MAX_TRANSCRIPT_CHARS]
        except Exception as e:
            # Best-effort: a missing transcript must not abort extraction.
            return f"[Transcript not available: {e}]"

    def _generate_key_points(self, transcript: Optional[str] = None) -> list:
        """Generate key points from the transcript (simple extraction).

        Args:
            transcript: Transcript text to summarize. When omitted, the
                transcript is fetched with ``_get_transcript``.

        Returns:
            Up to ``MAX_KEY_POINTS`` of the earliest ``.``-delimited segments
            that are longer than 20 characters, each terminated with a
            period. Empty when the transcript is empty or is one of the
            "not available" placeholders.
        """
        if transcript is None:
            transcript = self._get_transcript()

        if not transcript or transcript.startswith(self._TRANSCRIPT_PLACEHOLDERS):
            return []

        # Filter before truncating so short fragments do not use up slots.
        sentences = (part.strip() for part in transcript.split('.'))
        key_points = [s + '.' for s in sentences if len(s) > 20]

        return key_points[:self.MAX_KEY_POINTS]

    def _get_tags(self) -> list:
        """Get up to ``MAX_TAGS`` video keywords.

        Returns ``[]`` when the video has no keywords, and the generic
        ``["youtube", "video"]`` when keywords cannot be read at all.
        """
        fallback = ["youtube", "video"]
        missing = object()  # distinguishes "lookup failed" from "no keywords"

        keywords = self._metadata("keywords", missing)
        if keywords is missing:
            return fallback
        if not keywords:
            return []
        try:
            return keywords[:self.MAX_TAGS]
        except TypeError:
            return fallback