135 lines
4.9 KiB
Python
135 lines
4.9 KiB
Python
"""
YouTube comment scraper using YouTube Data API v3.
"""
|
|
import logging
from typing import List, Dict, Any, Optional

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

from .base import BaseScraper
|
|
|
|
|
|
class YouTubeScraper(BaseScraper):
    """
    Scraper for YouTube comments using YouTube Data API v3.

    Extracts top-level comments only (no replies).
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize YouTube scraper.

        Args:
            config: Dictionary with 'api_key' and optionally 'channel_id'

        Raises:
            ValueError: If 'api_key' is missing or empty.
        """
        super().__init__(config)

        # Bind the logger before anything below tries to log: the missing
        # channel_id branch emits a warning, which previously ran before
        # self.logger had been assigned in this method.
        self.logger = logging.getLogger(self.__class__.__name__)

        self.api_key = config.get('api_key')
        if not self.api_key:
            raise ValueError(
                "YouTube API key is required. "
                "Set YOUTUBE_API_KEY in your .env file."
            )

        # A missing channel_id is not fatal here; scrape_comments() accepts
        # one per call and raises if none is available at that point.
        self.channel_id = config.get('channel_id')
        if not self.channel_id:
            self.logger.warning(
                "YouTube channel_id not provided. "
                "Set YOUTUBE_CHANNEL_ID in your .env file to specify which channel to scrape."
            )

        self.youtube = build('youtube', 'v3', developerKey=self.api_key)

    def scrape_comments(self, channel_id: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
        """
        Scrape top-level comments from a YouTube channel.

        Pages through the commentThreads endpoint until no 'nextPageToken'
        is returned. If a request fails, the error is logged and the
        comments collected so far are returned instead of raising.

        Args:
            channel_id: YouTube channel ID to scrape comments from. Falls
                back to the 'channel_id' entry of self.config when omitted.
            **kwargs: Accepted but not used by this method.

        Returns:
            List of standardized comment dictionaries

        Raises:
            ValueError: If no channel ID is given and none is configured.
        """
        # NOTE(review): self.config is not set in this class; presumably
        # BaseScraper.__init__ stores the config dict there - confirm.
        channel_id = channel_id or self.config.get('channel_id')
        if not channel_id:
            raise ValueError("Channel ID is required")

        all_comments = []
        next_page_token = None

        self.logger.info(f"Starting YouTube comment extraction for channel: {channel_id}")

        while True:
            try:
                # Get comment threads (top-level comments only)
                request = self.youtube.commentThreads().list(
                    part="snippet",
                    allThreadsRelatedToChannelId=channel_id,
                    maxResults=100,
                    pageToken=next_page_token,
                    textFormat="plainText"
                )
                response = request.execute()

                # Process each comment thread; items that fail extraction
                # come back as None and are skipped.
                for item in response.get('items', []):
                    comment = self._extract_top_level_comment(item)
                    if comment:
                        all_comments.append(comment)

                # Check for more pages
                next_page_token = response.get('nextPageToken')
                if not next_page_token:
                    break

                self.logger.info(f"Fetched {len(all_comments)} comments so far...")

            except HttpError as e:
                # Every HTTP error ends pagination; 403/429 only get a more
                # specific log message.
                if e.resp.status in (403, 429):
                    self.logger.error("YouTube API quota exceeded or access forbidden")
                else:
                    self.logger.error(f"YouTube API error: {e}")
                break
            except Exception as e:
                # Best-effort boundary: keep whatever was fetched so far.
                self.logger.error(f"Unexpected error scraping YouTube: {e}")
                break

        self.logger.info(f"Completed YouTube scraping. Total comments: {len(all_comments)}")
        return all_comments

    def _extract_top_level_comment(self, item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Extract and standardize a top-level comment from YouTube API response.

        Args:
            item: YouTube API comment thread item

        Returns:
            Standardized comment dictionary, or None if the item could not
            be processed (the error is logged).
        """
        try:
            thread_snippet = item['snippet']
            top_level = thread_snippet['topLevelComment']
            top_level_comment = top_level['snippet']
            comment_id = top_level['id']

            # Get video ID (post_id)
            video_id = thread_snippet.get('videoId')

            comment_data = {
                'comment_id': comment_id,
                'comments': top_level_comment.get('textDisplay', ''),
                'author': top_level_comment.get('authorDisplayName', ''),
                # _parse_timestamp / _standardize_comment are not defined in
                # this class; presumably inherited from BaseScraper.
                'published_at': self._parse_timestamp(top_level_comment.get('publishedAt')),
                'like_count': top_level_comment.get('likeCount', 0),
                'reply_count': thread_snippet.get('totalReplyCount', 0),
                'post_id': video_id,
                'media_url': f"https://www.youtube.com/watch?v={video_id}" if video_id else None,
                'raw_data': item,
            }

            return self._standardize_comment(comment_data)

        except Exception as e:
            # Deliberately broad: one malformed item must not abort the
            # whole scrape, so it is logged and skipped by the caller.
            self.logger.error(f"Error extracting YouTube comment: {e}")
            return None
|