""" YouTube comment scraper using YouTube Data API v3. """ import logging from typing import List, Dict, Any from googleapiclient.discovery import build from googleapiclient.errors import HttpError from .base import BaseScraper class YouTubeScraper(BaseScraper): """ Scraper for YouTube comments using YouTube Data API v3. Extracts top-level comments only (no replies). """ def __init__(self, config: Dict[str, Any]): """ Initialize YouTube scraper. Args: config: Dictionary with 'api_key' and optionally 'channel_id' """ super().__init__(config) self.api_key = config.get('api_key') if not self.api_key: raise ValueError( "YouTube API key is required. " "Set YOUTUBE_API_KEY in your .env file." ) self.channel_id = config.get('channel_id') if not self.channel_id: self.logger.warning( "YouTube channel_id not provided. " "Set YOUTUBE_CHANNEL_ID in your .env file to specify which channel to scrape." ) self.youtube = build('youtube', 'v3', developerKey=self.api_key) self.logger = logging.getLogger(self.__class__.__name__) def scrape_comments(self, channel_id: str = None, **kwargs) -> List[Dict[str, Any]]: """ Scrape top-level comments from a YouTube channel. Args: channel_id: YouTube channel ID to scrape comments from Returns: List of standardized comment dictionaries """ channel_id = channel_id or self.config.get('channel_id') if not channel_id: raise ValueError("Channel ID is required") all_comments = [] next_page_token = None self.logger.info(f"Starting YouTube comment extraction for channel: {channel_id}") while True: try: # Get comment threads (top-level comments only) request = self.youtube.commentThreads().list( part="snippet", allThreadsRelatedToChannelId=channel_id, maxResults=100, pageToken=next_page_token, textFormat="plainText" ) response = request.execute() # Process each comment thread for item in response.get('items', []): comment = self._extract_top_level_comment(item) if comment: all_comments.append(comment) # Check for more pages next_page_token = response.get('nextPageToken') if not next_page_token: break self.logger.info(f"Fetched {len(all_comments)} comments so far...") except HttpError as e: if e.resp.status in [403, 429]: self.logger.error("YouTube API quota exceeded or access forbidden") break else: self.logger.error(f"YouTube API error: {e}") break except Exception as e: self.logger.error(f"Unexpected error scraping YouTube: {e}") break self.logger.info(f"Completed YouTube scraping. Total comments: {len(all_comments)}") return all_comments def _extract_top_level_comment(self, item: Dict[str, Any]) -> Dict[str, Any]: """ Extract and standardize a top-level comment from YouTube API response. Args: item: YouTube API comment thread item Returns: Standardized comment dictionary """ try: top_level_comment = item['snippet']['topLevelComment']['snippet'] comment_id = item['snippet']['topLevelComment']['id'] # Get video ID (post_id) video_id = item['snippet'].get('videoId') comment_data = { 'comment_id': comment_id, 'comments': top_level_comment.get('textDisplay', ''), 'author': top_level_comment.get('authorDisplayName', ''), 'published_at': self._parse_timestamp(top_level_comment.get('publishedAt')), 'like_count': top_level_comment.get('likeCount', 0), 'reply_count': item['snippet'].get('totalReplyCount', 0), 'post_id': video_id, 'media_url': f"https://www.youtube.com/watch?v={video_id}" if video_id else None, 'raw_data': item } return self._standardize_comment(comment_data) except Exception as e: self.logger.error(f"Error extracting YouTube comment: {e}") return None