"""Base scraper class for social media platforms."""

import logging
from abc import ABC, abstractmethod
from typing import List, Dict, Any
from datetime import datetime


class BaseScraper(ABC):
    """Abstract base class for social media scrapers.

    All platform-specific scrapers should inherit from this class and
    implement :meth:`scrape_comments`.
    """

    # Formats tried, in order, by ``_parse_timestamp``. The first four are the
    # original set and keep their priority; the fractional-second variants are
    # appended so that inputs which already parsed keep producing the same
    # output. Subclasses may extend this tuple with platform-specific formats.
    _TIMESTAMP_FORMATS = (
        '%Y-%m-%dT%H:%M:%S%z',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
        '%Y-%m-%dT%H:%M:%S.%f%z',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%d %H:%M:%S.%f',
    )

    def __init__(self, config: Dict[str, Any]):
        """Initialize the scraper with configuration.

        Args:
            config: Dictionary containing platform-specific configuration.
        """
        self.config = config
        # Logger is named after the concrete subclass so log records can be
        # attributed to the platform scraper that emitted them.
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    def scrape_comments(self, **kwargs) -> List[Dict[str, Any]]:
        """Scrape comments from the platform.

        Returns:
            List of dictionaries containing comment data with standardized
            fields:

            - comment_id: Unique comment ID from the platform
            - comments: Comment text
            - author: Author name/username
            - published_at: Publication timestamp (ISO format)
            - like_count: Number of likes
            - reply_count: Number of replies
            - post_id: ID of the post/media
            - media_url: URL to associated media (if applicable)
            - raw_data: Complete raw data from platform API
        """
        pass

    def _standardize_comment(self, comment_data: Dict[str, Any]) -> Dict[str, Any]:
        """Standardize comment data format.

        The base implementation returns ``comment_data`` unchanged (the same
        object, not a copy). Subclasses can override this method to handle
        platform-specific formatting.

        Args:
            comment_data: Raw comment data from platform API.

        Returns:
            Standardized comment dictionary.
        """
        return comment_data

    def _parse_timestamp(self, timestamp_str: str) -> str:
        """Parse a platform timestamp into an ISO 8601 string.

        Each format in ``_TIMESTAMP_FORMATS`` is tried in order and the first
        one that matches wins. Timestamps with fractional seconds (for example
        ``2024-01-15T10:30:00.123Z``) are recognized as well as whole-second
        ones.

        Args:
            timestamp_str: Platform-specific timestamp string.

        Returns:
            The ``datetime.isoformat()`` rendering of the parsed timestamp, or
            ``timestamp_str`` unchanged when no known format matches or the
            value cannot be parsed at all (for example, it is not a string).
        """
        try:
            for fmt in self._TIMESTAMP_FORMATS:
                try:
                    return datetime.strptime(timestamp_str, fmt).isoformat()
                except ValueError:
                    # Wrong format for this input; try the next candidate.
                    continue
            # If no format matches, return as-is
            return timestamp_str
        except Exception as e:
            # Deliberately best-effort: a non-string value makes ``strptime``
            # raise ``TypeError``; log it and hand the input back rather than
            # failing the whole scrape over one bad timestamp.
            self.logger.warning(f"Failed to parse timestamp {timestamp_str}: {e}")
            return timestamp_str