2026-01-15 14:31:58 +03:00

87 lines
2.7 KiB
Python

"""
Base scraper class for social media platforms.
"""
import logging
from abc import ABC, abstractmethod
from typing import List, Dict, Any
from datetime import datetime
class BaseScraper(ABC):
    """
    Abstract base class for social media scrapers.

    All platform-specific scrapers should inherit from this class and
    implement :meth:`scrape_comments`. The base class stores the supplied
    configuration, provides a logger named after the concrete subclass, and
    offers small helpers for normalising comment data.
    """

    # strptime patterns tried in order by ``_parse_timestamp``. Kept at class
    # level so the tuple is built once and subclasses can extend it.
    _TIMESTAMP_FORMATS = (
        '%Y-%m-%dT%H:%M:%S%z',
        # On Python 3.7+ ``%z`` already accepts a literal "Z", so this entry
        # only matters on older interpreters.
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
        # Fractional-second variants, e.g. "2024-01-15T10:30:00.123Z".
        '%Y-%m-%dT%H:%M:%S.%f%z',
        '%Y-%m-%dT%H:%M:%S.%f',
    )

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the scraper with configuration.

        Args:
            config: Dictionary containing platform-specific configuration.
                It is stored as-is on ``self.config``.
        """
        self.config = config
        # Name the logger after the concrete subclass so log records show
        # which scraper emitted them.
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    def scrape_comments(self, **kwargs) -> List[Dict[str, Any]]:
        """
        Scrape comments from the platform.

        Args:
            **kwargs: Platform-specific options defined by each subclass.

        Returns:
            List of dictionaries containing comment data with standardized fields:
                - comment_id: Unique comment ID from the platform
                - comments: Comment text
                - author: Author name/username
                - published_at: Publication timestamp (ISO format)
                - like_count: Number of likes
                - reply_count: Number of replies
                - post_id: ID of the post/media
                - media_url: URL to associated media (if applicable)
                - raw_data: Complete raw data from platform API
        """

    def _standardize_comment(self, comment_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Standardize comment data format.

        The base implementation returns ``comment_data`` unchanged (the same
        object, not a copy). Subclasses can override this method to handle
        platform-specific formatting.

        Args:
            comment_data: Raw comment data from platform API.

        Returns:
            Standardized comment dictionary.
        """
        return comment_data

    def _parse_timestamp(self, timestamp_str: str) -> str:
        """
        Parse platform timestamp to ISO format.

        Each pattern in ``_TIMESTAMP_FORMATS`` is tried in order and the first
        match is returned via ``datetime.isoformat()``. Parsing is best-effort:
        this method does not raise for unparseable input.

        Args:
            timestamp_str: Platform-specific timestamp string.

        Returns:
            ISO formatted timestamp string, or ``timestamp_str`` unchanged when
            no known format matches. A non-string value is logged as a warning
            and returned unchanged.
        """
        for fmt in self._TIMESTAMP_FORMATS:
            try:
                return datetime.strptime(timestamp_str, fmt).isoformat()
            except ValueError:
                # This pattern does not match; try the next one.
                continue
            except TypeError as e:
                # strptime rejects non-string input (e.g. None); no other
                # pattern can succeed, so stop here.
                self.logger.warning(
                    "Failed to parse timestamp %s: %s", timestamp_str, e
                )
                return timestamp_str
        # If no format matches, return as-is
        return timestamp_str