# HH/apps/social/scrapers/youtube.py

"""
YouTube comment scraper using YouTube Data API v3.
"""
import logging
from typing import Any, Dict, List, Optional

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

from .base import BaseScraper


class YouTubeScraper(BaseScraper):
    """
    Scraper for YouTube comments using YouTube Data API v3.

    Only the top-level comment of each comment thread is extracted;
    replies are not read (threads are requested with ``part="snippet"``).
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize YouTube scraper.

        Args:
            config: Dictionary with 'api_key' and optionally 'channel_id'

        Raises:
            ValueError: If 'api_key' is missing or empty.
        """
        super().__init__(config)

        # Create the logger first: the channel_id warning below needs it,
        # and it must exist regardless of what the base class sets up.
        self.logger = logging.getLogger(self.__class__.__name__)

        self.api_key = config.get('api_key')
        if not self.api_key:
            raise ValueError(
                "YouTube API key is required. "
                "Set YOUTUBE_API_KEY in your .env file."
            )

        # channel_id is optional here; it can still be supplied per call to
        # scrape_comments().
        self.channel_id = config.get('channel_id')
        if not self.channel_id:
            self.logger.warning(
                "YouTube channel_id not provided. "
                "Set YOUTUBE_CHANNEL_ID in your .env file to specify which channel to scrape."
            )

        self.youtube = build('youtube', 'v3', developerKey=self.api_key)

    def scrape_comments(self, channel_id: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
        """
        Scrape top-level comments from a YouTube channel.

        Pages through ``commentThreads.list`` until no ``nextPageToken`` is
        returned. Any error stops pagination; the comments collected up to
        that point are still returned (best-effort, errors are only logged).

        Args:
            channel_id: YouTube channel ID to scrape comments from. Falls
                back to the 'channel_id' given in the constructor config.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            List of standardized comment dictionaries

        Raises:
            ValueError: If no channel ID is available from either source.
        """
        channel_id = channel_id or self.channel_id
        if not channel_id:
            raise ValueError("Channel ID is required")

        all_comments: List[Dict[str, Any]] = []
        next_page_token = None

        self.logger.info(f"Starting YouTube comment extraction for channel: {channel_id}")

        while True:
            try:
                # Get comment threads (top-level comments only)
                request = self.youtube.commentThreads().list(
                    part="snippet",
                    allThreadsRelatedToChannelId=channel_id,
                    maxResults=100,
                    pageToken=next_page_token,
                    textFormat="plainText"
                )
                response = request.execute()

                # Process each comment thread; items that fail extraction
                # come back as None and are skipped.
                for item in response.get('items', []):
                    comment = self._extract_top_level_comment(item)
                    if comment:
                        all_comments.append(comment)

                # Check for more pages
                next_page_token = response.get('nextPageToken')
                if not next_page_token:
                    break

                self.logger.info(f"Fetched {len(all_comments)} comments so far...")

            except HttpError as e:
                # Both branches abort pagination; only the log message differs.
                if e.resp.status in (403, 429):
                    self.logger.error("YouTube API quota exceeded or access forbidden")
                else:
                    self.logger.error(f"YouTube API error: {e}")
                break
            except Exception as e:
                # Deliberate catch-all: keep whatever was collected so far.
                self.logger.error(f"Unexpected error scraping YouTube: {e}")
                break

        self.logger.info(f"Completed YouTube scraping. Total comments: {len(all_comments)}")
        return all_comments

    def _extract_top_level_comment(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Extract and standardize a top-level comment from YouTube API response.

        Args:
            item: YouTube API comment thread item

        Returns:
            The result of ``self._standardize_comment`` for this thread's
            top-level comment, or None if extraction failed (the error is
            logged, not raised).
        """
        try:
            thread_snippet = item['snippet']
            top_level = thread_snippet['topLevelComment']
            comment_snippet = top_level['snippet']

            # Get video ID (post_id); .get() because it may be absent.
            video_id = thread_snippet.get('videoId')

            # NOTE(review): _parse_timestamp and _standardize_comment are not
            # defined in this class; presumably provided by BaseScraper.
            comment_data = {
                'comment_id': top_level['id'],
                'comments': comment_snippet.get('textDisplay', ''),
                'author': comment_snippet.get('authorDisplayName', ''),
                'published_at': self._parse_timestamp(comment_snippet.get('publishedAt')),
                'like_count': comment_snippet.get('likeCount', 0),
                'reply_count': thread_snippet.get('totalReplyCount', 0),
                'post_id': video_id,
                'media_url': f"https://www.youtube.com/watch?v={video_id}" if video_id else None,
                'raw_data': item
            }

            return self._standardize_comment(comment_data)

        except Exception as e:
            # Best-effort: one malformed item must not abort the whole scrape.
            self.logger.error(f"Error extracting YouTube comment: {e}")
            return None