188 lines
6.3 KiB
Python
188 lines
6.3 KiB
Python
"""
|
|
Facebook comment scraper using Facebook Graph API.
|
|
"""
|
|
import logging
|
|
import requests
|
|
from typing import List, Dict, Any
|
|
|
|
from .base import BaseScraper
|
|
|
|
|
|
class FacebookScraper(BaseScraper):
|
|
"""
|
|
Scraper for Facebook comments using Facebook Graph API.
|
|
Extracts comments from posts.
|
|
"""
|
|
|
|
BASE_URL = "https://graph.facebook.com/v19.0"
|
|
|
|
def __init__(self, config: Dict[str, Any]):
|
|
"""
|
|
Initialize Facebook scraper.
|
|
|
|
Args:
|
|
config: Dictionary with 'access_token' and optionally 'page_id'
|
|
"""
|
|
super().__init__(config)
|
|
self.access_token = config.get('access_token')
|
|
if not self.access_token:
|
|
raise ValueError(
|
|
"Facebook access token is required. "
|
|
"Set FACEBOOK_ACCESS_TOKEN in your .env file."
|
|
)
|
|
|
|
self.page_id = config.get('page_id')
|
|
if not self.page_id:
|
|
self.logger.warning(
|
|
"Facebook page_id not provided. "
|
|
"Set FACEBOOK_PAGE_ID in your .env file to specify which page to scrape."
|
|
)
|
|
|
|
self.logger = logging.getLogger(self.__class__.__name__)
|
|
|
|
def scrape_comments(self, page_id: str = None, **kwargs) -> List[Dict[str, Any]]:
|
|
"""
|
|
Scrape comments from all posts on a Facebook page.
|
|
|
|
Args:
|
|
page_id: Facebook page ID to scrape comments from
|
|
|
|
Returns:
|
|
List of standardized comment dictionaries
|
|
"""
|
|
page_id = page_id or self.page_id
|
|
if not page_id:
|
|
raise ValueError("Facebook page ID is required")
|
|
|
|
all_comments = []
|
|
|
|
self.logger.info(f"Starting Facebook comment extraction for page: {page_id}")
|
|
|
|
# Get all posts from the page
|
|
posts = self._fetch_all_posts(page_id)
|
|
self.logger.info(f"Found {len(posts)} posts to process")
|
|
|
|
# Get comments for each post
|
|
for post in posts:
|
|
post_id = post['id']
|
|
post_comments = self._fetch_post_comments(post_id, post)
|
|
all_comments.extend(post_comments)
|
|
self.logger.info(f"Fetched {len(post_comments)} comments for post {post_id}")
|
|
|
|
self.logger.info(f"Completed Facebook scraping. Total comments: {len(all_comments)}")
|
|
return all_comments
|
|
|
|
def _fetch_all_posts(self, page_id: str) -> List[Dict[str, Any]]:
|
|
"""
|
|
Fetch all posts from a Facebook page.
|
|
|
|
Args:
|
|
page_id: Facebook page ID
|
|
|
|
Returns:
|
|
List of post dictionaries
|
|
"""
|
|
url = f"{self.BASE_URL}/{page_id}/feed"
|
|
params = {
|
|
'access_token': self.access_token,
|
|
'fields': 'id,message,created_time,permalink_url'
|
|
}
|
|
|
|
all_posts = []
|
|
while url:
|
|
try:
|
|
response = requests.get(url, params=params)
|
|
data = response.json()
|
|
|
|
if 'error' in data:
|
|
self.logger.error(f"Facebook API error: {data['error']['message']}")
|
|
break
|
|
|
|
all_posts.extend(data.get('data', []))
|
|
|
|
# Check for next page
|
|
url = data.get('paging', {}).get('next')
|
|
params = {} # Next URL already contains params
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Error fetching posts: {e}")
|
|
break
|
|
|
|
return all_posts
|
|
|
|
def _fetch_post_comments(self, post_id: str, post_data: Dict[str, Any]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Fetch all comments for a specific Facebook post.
|
|
|
|
Args:
|
|
post_id: Facebook post ID
|
|
post_data: Post data dictionary
|
|
|
|
Returns:
|
|
List of standardized comment dictionaries
|
|
"""
|
|
url = f"{self.BASE_URL}/{post_id}/comments"
|
|
params = {
|
|
'access_token': self.access_token,
|
|
'fields': 'id,message,from,created_time,like_count'
|
|
}
|
|
|
|
all_comments = []
|
|
while url:
|
|
try:
|
|
response = requests.get(url, params=params)
|
|
data = response.json()
|
|
|
|
if 'error' in data:
|
|
self.logger.error(f"Facebook API error: {data['error']['message']}")
|
|
break
|
|
|
|
# Process comments
|
|
for comment_data in data.get('data', []):
|
|
comment = self._extract_comment(comment_data, post_id, post_data)
|
|
if comment:
|
|
all_comments.append(comment)
|
|
|
|
# Check for next page
|
|
url = data.get('paging', {}).get('next')
|
|
params = {} # Next URL already contains params
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Error fetching comments for post {post_id}: {e}")
|
|
break
|
|
|
|
return all_comments
|
|
|
|
def _extract_comment(self, comment_data: Dict[str, Any], post_id: str, post_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Extract and standardize a Facebook comment.
|
|
|
|
Args:
|
|
comment_data: Facebook API comment data
|
|
post_id: Post ID
|
|
post_data: Post data dictionary
|
|
|
|
Returns:
|
|
Standardized comment dictionary
|
|
"""
|
|
try:
|
|
from_data = comment_data.get('from', {})
|
|
|
|
comment = {
|
|
'comment_id': comment_data['id'],
|
|
'comments': comment_data.get('message', ''),
|
|
'author': from_data.get('name', ''),
|
|
'published_at': self._parse_timestamp(comment_data.get('created_time')),
|
|
'like_count': comment_data.get('like_count', 0),
|
|
'reply_count': 0, # Facebook API doesn't provide reply count easily
|
|
'post_id': post_id,
|
|
'media_url': post_data.get('permalink_url'),
|
|
'raw_data': comment_data
|
|
}
|
|
|
|
return self._standardize_comment(comment)
|
|
|
|
except Exception as e:
|
|
self.logger.error(f"Error extracting Facebook comment: {e}")
|
|
return None
|