87 lines
2.7 KiB
Python
87 lines
2.7 KiB
Python
"""
|
|
Base scraper class for social media platforms.
|
|
"""
|
|
import logging
|
|
from abc import ABC, abstractmethod
|
|
from typing import List, Dict, Any
|
|
from datetime import datetime
|
|
|
|
|
|
class BaseScraper(ABC):
    """
    Abstract base class for social media scrapers.

    All platform-specific scrapers should inherit from this class and
    implement ``scrape_comments``. The class cannot be instantiated
    directly because ``scrape_comments`` is abstract.
    """

    # strptime patterns tried, in order, by ``_parse_timestamp``.
    # A trailing ``Z`` is rewritten to ``+0000`` before matching, so the
    # ``%z`` patterns cover UTC-designated timestamps on every Python
    # version and no literal-``Z`` pattern is needed.
    # Subclasses may override this tuple to support additional formats.
    _TIMESTAMP_FORMATS = (
        '%Y-%m-%dT%H:%M:%S%z',
        '%Y-%m-%dT%H:%M:%S.%f%z',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
    )

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize the scraper with configuration.

        Stores the configuration on the instance and creates a logger
        named after the concrete subclass.

        Args:
            config: Dictionary containing platform-specific configuration
        """
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)

    @abstractmethod
    def scrape_comments(self, **kwargs) -> List[Dict[str, Any]]:
        """
        Scrape comments from the platform.

        Args:
            **kwargs: Options interpreted by the concrete subclass; this
                base class does not define any.

        Returns:
            List of dictionaries containing comment data with standardized fields:
            - comment_id: Unique comment ID from the platform
            - comments: Comment text
            - author: Author name/username
            - published_at: Publication timestamp (ISO format)
            - like_count: Number of likes
            - reply_count: Number of replies
            - post_id: ID of the post/media
            - media_url: URL to associated media (if applicable)
            - raw_data: Complete raw data from platform API
        """

    def _standardize_comment(self, comment_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Standardize comment data format.

        The base implementation is a pass-through: it returns the very
        same dictionary it was given. Subclasses can override this method
        to handle platform-specific formatting.

        Args:
            comment_data: Raw comment data from platform API

        Returns:
            Standardized comment dictionary
        """
        return comment_data

    def _parse_timestamp(self, timestamp_str: str) -> str:
        """
        Parse platform timestamp to ISO format.

        Parsing is best-effort: the input is tried against each pattern in
        ``_TIMESTAMP_FORMATS`` and the first match is returned via
        ``datetime.isoformat()``. Input that cannot be parsed is returned
        unchanged rather than raising.

        Args:
            timestamp_str: Platform-specific timestamp string

        Returns:
            ISO formatted timestamp string, or ``timestamp_str`` itself
            (unchanged) when it is not a string or matches no known format
        """
        # Non-string input (e.g. None) cannot be handed to strptime; keep
        # the best-effort contract by warning and returning it untouched.
        if not isinstance(timestamp_str, str):
            self.logger.warning(
                "Failed to parse timestamp %s: expected str, got %s",
                timestamp_str,
                type(timestamp_str).__name__,
            )
            return timestamp_str

        candidate = timestamp_str.strip()

        # Spell the UTC designator as an explicit offset so that ``%z``
        # matches it regardless of interpreter version and the parsed
        # value stays timezone-aware.
        if candidate.endswith('Z'):
            candidate = candidate[:-1] + '+0000'

        for fmt in self._TIMESTAMP_FORMATS:
            try:
                parsed = datetime.strptime(candidate, fmt)
            except ValueError:
                continue
            return parsed.isoformat()

        # If no format matches, return as-is
        self.logger.debug(
            "No known timestamp format matched %r; returning it unchanged",
            timestamp_str,
        )
        return timestamp_str
|