""" Facebook comment scraper using Facebook Graph API. """ import logging import requests from typing import List, Dict, Any from .base import BaseScraper class FacebookScraper(BaseScraper): """ Scraper for Facebook comments using Facebook Graph API. Extracts comments from posts. """ BASE_URL = "https://graph.facebook.com/v19.0" def __init__(self, config: Dict[str, Any]): """ Initialize Facebook scraper. Args: config: Dictionary with 'access_token' and optionally 'page_id' """ super().__init__(config) self.access_token = config.get('access_token') if not self.access_token: raise ValueError( "Facebook access token is required. " "Set FACEBOOK_ACCESS_TOKEN in your .env file." ) self.page_id = config.get('page_id') if not self.page_id: self.logger.warning( "Facebook page_id not provided. " "Set FACEBOOK_PAGE_ID in your .env file to specify which page to scrape." ) self.logger = logging.getLogger(self.__class__.__name__) def scrape_comments(self, page_id: str = None, **kwargs) -> List[Dict[str, Any]]: """ Scrape comments from all posts on a Facebook page. Args: page_id: Facebook page ID to scrape comments from Returns: List of standardized comment dictionaries """ page_id = page_id or self.page_id if not page_id: raise ValueError("Facebook page ID is required") all_comments = [] self.logger.info(f"Starting Facebook comment extraction for page: {page_id}") # Get all posts from the page posts = self._fetch_all_posts(page_id) self.logger.info(f"Found {len(posts)} posts to process") # Get comments for each post for post in posts: post_id = post['id'] post_comments = self._fetch_post_comments(post_id, post) all_comments.extend(post_comments) self.logger.info(f"Fetched {len(post_comments)} comments for post {post_id}") self.logger.info(f"Completed Facebook scraping. Total comments: {len(all_comments)}") return all_comments def _fetch_all_posts(self, page_id: str) -> List[Dict[str, Any]]: """ Fetch all posts from a Facebook page. Args: page_id: Facebook page ID Returns: List of post dictionaries """ url = f"{self.BASE_URL}/{page_id}/feed" params = { 'access_token': self.access_token, 'fields': 'id,message,created_time,permalink_url' } all_posts = [] while url: try: response = requests.get(url, params=params) data = response.json() if 'error' in data: self.logger.error(f"Facebook API error: {data['error']['message']}") break all_posts.extend(data.get('data', [])) # Check for next page url = data.get('paging', {}).get('next') params = {} # Next URL already contains params except Exception as e: self.logger.error(f"Error fetching posts: {e}") break return all_posts def _fetch_post_comments(self, post_id: str, post_data: Dict[str, Any]) -> List[Dict[str, Any]]: """ Fetch all comments for a specific Facebook post. Args: post_id: Facebook post ID post_data: Post data dictionary Returns: List of standardized comment dictionaries """ url = f"{self.BASE_URL}/{post_id}/comments" params = { 'access_token': self.access_token, 'fields': 'id,message,from,created_time,like_count' } all_comments = [] while url: try: response = requests.get(url, params=params) data = response.json() if 'error' in data: self.logger.error(f"Facebook API error: {data['error']['message']}") break # Process comments for comment_data in data.get('data', []): comment = self._extract_comment(comment_data, post_id, post_data) if comment: all_comments.append(comment) # Check for next page url = data.get('paging', {}).get('next') params = {} # Next URL already contains params except Exception as e: self.logger.error(f"Error fetching comments for post {post_id}: {e}") break return all_comments def _extract_comment(self, comment_data: Dict[str, Any], post_id: str, post_data: Dict[str, Any]) -> Dict[str, Any]: """ Extract and standardize a Facebook comment. Args: comment_data: Facebook API comment data post_id: Post ID post_data: Post data dictionary Returns: Standardized comment dictionary """ try: from_data = comment_data.get('from', {}) comment = { 'comment_id': comment_data['id'], 'comments': comment_data.get('message', ''), 'author': from_data.get('name', ''), 'published_at': self._parse_timestamp(comment_data.get('created_time')), 'like_count': comment_data.get('like_count', 0), 'reply_count': 0, # Facebook API doesn't provide reply count easily 'post_id': post_id, 'media_url': post_data.get('permalink_url'), 'raw_data': comment_data } return self._standardize_comment(comment) except Exception as e: self.logger.error(f"Error extracting Facebook comment: {e}") return None