HH/apps/social/scrapers/facebook.py
2026-01-15 14:31:58 +03:00

188 lines
6.3 KiB
Python

"""
Facebook comment scraper using Facebook Graph API.
"""
import logging
from typing import Any, Dict, List, Optional

import requests

from .base import BaseScraper
class FacebookScraper(BaseScraper):
    """
    Scraper for Facebook comments using Facebook Graph API.

    Walks the feed of a page, then collects the comments of every post found
    and converts each one into a standardized comment dictionary.
    """

    BASE_URL = "https://graph.facebook.com/v19.0"

    # Seconds to wait on a single Graph API request. Without a timeout,
    # requests.get() may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize Facebook scraper.

        Args:
            config: Dictionary with 'access_token' and optionally 'page_id'

        Raises:
            ValueError: If 'access_token' is missing or empty.
        """
        super().__init__(config)

        # The logger has to exist before the page_id check below, which
        # emits a warning through it.
        self.logger = logging.getLogger(self.__class__.__name__)

        self.access_token = config.get('access_token')
        if not self.access_token:
            raise ValueError(
                "Facebook access token is required. "
                "Set FACEBOOK_ACCESS_TOKEN in your .env file."
            )

        # A missing page_id is not fatal here: callers may still pass one
        # explicitly to scrape_comments().
        self.page_id = config.get('page_id')
        if not self.page_id:
            self.logger.warning(
                "Facebook page_id not provided. "
                "Set FACEBOOK_PAGE_ID in your .env file to specify which page to scrape."
            )

    def scrape_comments(self, page_id: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
        """
        Scrape comments from all posts on a Facebook page.

        Args:
            page_id: Facebook page ID to scrape comments from. Falls back to
                the 'page_id' given in the config when omitted.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            List of standardized comment dictionaries

        Raises:
            ValueError: If no page ID is available from either source.
        """
        page_id = page_id or self.page_id
        if not page_id:
            raise ValueError("Facebook page ID is required")

        all_comments = []
        self.logger.info(f"Starting Facebook comment extraction for page: {page_id}")

        # Get all posts from the page
        posts = self._fetch_all_posts(page_id)
        self.logger.info(f"Found {len(posts)} posts to process")

        # Get comments for each post
        for post in posts:
            post_id = post.get('id')
            if not post_id:
                # Stay best-effort like the fetch helpers: one malformed
                # post must not abort the whole run.
                self.logger.warning("Skipping Facebook post without an id")
                continue
            post_comments = self._fetch_post_comments(post_id, post)
            all_comments.extend(post_comments)
            self.logger.info(f"Fetched {len(post_comments)} comments for post {post_id}")

        self.logger.info(f"Completed Facebook scraping. Total comments: {len(all_comments)}")
        return all_comments

    def _fetch_paginated(self, url: str, params: Dict[str, Any], context: str) -> List[Dict[str, Any]]:
        """
        Collect the 'data' items of every page of a Graph API listing.

        Follows 'paging.next' links until there are none left. Any API error
        or exception is logged and ends the walk; items gathered up to that
        point are still returned.

        Args:
            url: URL of the first page
            params: Query parameters for the first request
            context: Short description of what is being fetched, used in the
                error log message

        Returns:
            List of raw item dictionaries from all pages that were read
        """
        items = []
        while url:
            try:
                response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
                data = response.json()
                if 'error' in data:
                    self.logger.error(f"Facebook API error: {data['error']['message']}")
                    break
                items.extend(data.get('data', []))
                # Check for next page
                url = data.get('paging', {}).get('next')
                params = {}  # Next URL already contains params
            except Exception as e:
                # Deliberately broad: network failures, timeouts, invalid JSON
                # and unexpected payload shapes all end this listing only.
                self.logger.error(f"Error fetching {context}: {e}")
                break
        return items

    def _fetch_all_posts(self, page_id: str) -> List[Dict[str, Any]]:
        """
        Fetch all posts from a Facebook page.

        Args:
            page_id: Facebook page ID

        Returns:
            List of post dictionaries
        """
        url = f"{self.BASE_URL}/{page_id}/feed"
        params = {
            'access_token': self.access_token,
            'fields': 'id,message,created_time,permalink_url'
        }
        return self._fetch_paginated(url, params, "posts")

    def _fetch_post_comments(self, post_id: str, post_data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Fetch all comments for a specific Facebook post.

        Args:
            post_id: Facebook post ID
            post_data: Post data dictionary

        Returns:
            List of standardized comment dictionaries
        """
        url = f"{self.BASE_URL}/{post_id}/comments"
        params = {
            'access_token': self.access_token,
            'fields': 'id,message,from,created_time,like_count'
        }
        raw_comments = self._fetch_paginated(url, params, f"comments for post {post_id}")

        # _extract_comment returns None for comments it could not convert;
        # those are dropped from the result.
        extracted = (
            self._extract_comment(raw, post_id, post_data)
            for raw in raw_comments
        )
        return [comment for comment in extracted if comment]

    def _extract_comment(self, comment_data: Dict[str, Any], post_id: str, post_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Extract and standardize a Facebook comment.

        Args:
            comment_data: Facebook API comment data
            post_id: Post ID
            post_data: Post data dictionary

        Returns:
            Standardized comment dictionary, or None if extraction failed
            (the failure is logged).
        """
        try:
            from_data = comment_data.get('from', {})
            comment = {
                'comment_id': comment_data['id'],
                'comments': comment_data.get('message', ''),
                'author': from_data.get('name', ''),
                'published_at': self._parse_timestamp(comment_data.get('created_time')),
                'like_count': comment_data.get('like_count', 0),
                'reply_count': 0,  # Facebook API doesn't provide reply count easily
                'post_id': post_id,
                'media_url': post_data.get('permalink_url'),
                'raw_data': comment_data
            }
            return self._standardize_comment(comment)
        except Exception as e:
            self.logger.error(f"Error extracting Facebook comment: {e}")
            return None