"""
|
|
Analysis service for orchestrating AI-powered comment analysis.
|
|
Coordinates between SocialMediaComment model and OpenRouter service.
|
|
"""
|
|
import logging
|
|
from typing import List, Dict, Any, Optional
|
|
from decimal import Decimal
|
|
from datetime import datetime, timedelta
|
|
|
|
from django.conf import settings
|
|
from django.utils import timezone
|
|
from django.db import models
|
|
|
|
from ..models import SocialMediaComment
|
|
from .openrouter_service import OpenRouterService
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class AnalysisService:
    """
    Service for managing AI analysis of social media comments.

    Handles batching, filtering, and updating comments with analysis results.
    """

    def __init__(self):
        """Initialize the analysis service."""
        self.openrouter_service = OpenRouterService()
        self.batch_size = getattr(settings, 'ANALYSIS_BATCH_SIZE', 10)

        if not self.openrouter_service.is_configured():
            logger.warning("OpenRouter service not properly configured")
        else:
            logger.info(f"Analysis service initialized (batch_size: {self.batch_size})")
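
    # The batch size above is read from Django settings via getattr(); a sketch of the
    # corresponding setting (the default of 10 applies when it is not defined):
    #
    #     # settings.py
    #     ANALYSIS_BATCH_SIZE = 25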

    def analyze_pending_comments(
        self,
        limit: Optional[int] = None,
        platform: Optional[str] = None,
        hours_ago: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Analyze comments that haven't been analyzed yet.

        Args:
            limit: Maximum number of comments to analyze
            platform: Filter by platform (optional)
            hours_ago: Only analyze comments scraped in the last N hours

        Returns:
            Dictionary with analysis statistics
        """
        if not self.openrouter_service.is_configured():
            logger.error("OpenRouter service not configured")
            return {
                'success': False,
                'error': 'OpenRouter service not configured',
                'analyzed': 0,
                'failed': 0,
                'skipped': 0
            }

        # Build queryset for unanalyzed comments: ai_analysis is NULL or an empty dict
        queryset = SocialMediaComment.objects.filter(
            Q(ai_analysis__isnull=True) | Q(ai_analysis={})
        )

        if platform:
            queryset = queryset.filter(platform=platform)

        if hours_ago:
            cutoff_time = timezone.now() - timedelta(hours=hours_ago)
            queryset = queryset.filter(scraped_at__gte=cutoff_time)

        if limit:
            queryset = queryset[:limit]

        # Fetch comments
        comments = list(queryset)

        if not comments:
            logger.info("No pending comments to analyze")
            return {
                'success': True,
                'analyzed': 0,
                'failed': 0,
                'skipped': 0,
                'message': 'No pending comments to analyze'
            }

        logger.info(f"Found {len(comments)} pending comments to analyze")

        # Process in batches
        analyzed_count = 0
        failed_count = 0
        skipped_count = 0

        for i in range(0, len(comments), self.batch_size):
            batch = comments[i:i + self.batch_size]
            logger.info(f"Processing batch {i // self.batch_size + 1} ({len(batch)} comments)")

            # Prepare batch for API
            batch_data = [
                {
                    'id': comment.id,
                    'text': comment.comments
                }
                for comment in batch
            ]

            # Analyze batch
            result = self.openrouter_service.analyze_comments(batch_data)

            if result.get('success'):
                # Update comments with analysis results
                for analysis in result.get('analyses', []):
                    comment_id = analysis.get('comment_id')
                    try:
                        comment = SocialMediaComment.objects.get(id=comment_id)

                        # Build the bilingual analysis structure
                        ai_analysis = {
                            'sentiment': analysis.get('sentiment', {}),
                            'summaries': analysis.get('summaries', {}),
                            'keywords': analysis.get('keywords', {}),
                            'topics': analysis.get('topics', {}),
                            'entities': analysis.get('entities', []),
                            'emotions': analysis.get('emotions', {}),
                            'metadata': {
                                **result.get('metadata', {}),
                                'analyzed_at': timezone.now().isoformat()
                            }
                        }

                        # Store the bilingual analysis on the comment
                        comment.ai_analysis = ai_analysis
                        comment.save()

                        analyzed_count += 1
                        logger.debug(f"Updated comment {comment_id} with bilingual analysis")

                    except SocialMediaComment.DoesNotExist:
                        logger.warning(f"Comment {comment_id} not found")
                        failed_count += 1
                    except Exception as e:
                        logger.error(f"Error updating comment {comment_id}: {e}")
                        failed_count += 1
            else:
                error = result.get('error', 'Unknown error')
                logger.error(f"Batch analysis failed: {error}")
                failed_count += len(batch)

        # Comments fetched but not covered by any API response are counted as skipped
        skipped_count = len(comments) - analyzed_count - failed_count

        logger.info(
            f"Analysis complete: {analyzed_count} analyzed, "
            f"{failed_count} failed, {skipped_count} skipped"
        )

        return {
            'success': True,
            'analyzed': analyzed_count,
            'failed': failed_count,
            'skipped': skipped_count,
            'total': len(comments)
        }
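
    # Illustrative shape of the ai_analysis JSON stored above (an assumption pieced
    # together from the keys written in analyze_pending_comments() and the paths read
    # back in get_analysis_statistics() and get_top_keywords(); the OpenRouter service
    # defines the actual contents and the second language key):
    #
    #     {
    #         "sentiment": {"classification": {"en": "positive", ...}, "confidence": 0.92},
    #         "summaries": {"en": "Short English summary", ...},
    #         "keywords": {"en": ["pricing", "delivery"], ...},
    #         "topics": {...},
    #         "entities": [...],
    #         "emotions": {...},
    #         "metadata": {"analyzed_at": "2025-01-01T00:00:00+00:00", ...}
    #     }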

    def analyze_comments_by_platform(self, platform: str, limit: int = 100) -> Dict[str, Any]:
        """
        Analyze comments from a specific platform.

        Args:
            platform: Platform name (e.g., 'youtube', 'facebook')
            limit: Maximum number of comments to analyze

        Returns:
            Dictionary with analysis statistics
        """
        logger.info(f"Analyzing comments from platform: {platform}")
        return self.analyze_pending_comments(limit=limit, platform=platform)

    def analyze_recent_comments(self, hours: int = 24, limit: int = 100) -> Dict[str, Any]:
        """
        Analyze comments scraped in the last N hours.

        Args:
            hours: Number of hours to look back
            limit: Maximum number of comments to analyze

        Returns:
            Dictionary with analysis statistics
        """
        logger.info(f"Analyzing comments from the last {hours} hours")
        return self.analyze_pending_comments(limit=limit, hours_ago=hours)

    def get_analysis_statistics(
        self,
        platform: Optional[str] = None,
        days: int = 30
    ) -> Dict[str, Any]:
        """
        Get statistics about analyzed comments using the ai_analysis structure.

        Args:
            platform: Filter by platform (optional)
            days: Number of days to look back

        Returns:
            Dictionary with analysis statistics
        """
        cutoff_date = timezone.now() - timedelta(days=days)

        queryset = SocialMediaComment.objects.filter(
            scraped_at__gte=cutoff_date
        )

        if platform:
            queryset = queryset.filter(platform=platform)

        total_comments = queryset.count()

        # Count analyzed comments (those with ai_analysis populated)
        analyzed_comments = 0
        sentiment_counts = {'positive': 0, 'negative': 0, 'neutral': 0}
        confidence_scores = []

        for comment in queryset:
            if comment.ai_analysis:
                analyzed_comments += 1
                sentiment = comment.ai_analysis.get('sentiment', {}).get('classification', {}).get('en', 'neutral')
                if sentiment in sentiment_counts:
                    sentiment_counts[sentiment] += 1
                confidence = comment.ai_analysis.get('sentiment', {}).get('confidence', 0)
                if confidence:
                    confidence_scores.append(confidence)

        # Calculate average confidence
        avg_confidence = sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0

        return {
            'total_comments': total_comments,
            'analyzed_comments': analyzed_comments,
            'unanalyzed_comments': total_comments - analyzed_comments,
            'analysis_rate': (analyzed_comments / total_comments * 100) if total_comments > 0 else 0,
            'sentiment_distribution': sentiment_counts,
            'average_confidence': float(avg_confidence),
            'platform': platform or 'all'
        }
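
    # Example return value (a sketch with made-up numbers; the key names match the dict
    # built above):
    #
    #     >>> AnalysisService().get_analysis_statistics(platform='youtube', days=7)
    #     {'total_comments': 120, 'analyzed_comments': 90, 'unanalyzed_comments': 30,
    #      'analysis_rate': 75.0,
    #      'sentiment_distribution': {'positive': 40, 'negative': 20, 'neutral': 30},
    #      'average_confidence': 0.84, 'platform': 'youtube'}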

    def reanalyze_comment(self, comment_id: int) -> Dict[str, Any]:
        """
        Re-analyze a specific comment.

        Args:
            comment_id: ID of the comment to re-analyze

        Returns:
            Dictionary with result
        """
        try:
            comment = SocialMediaComment.objects.get(id=comment_id)
        except SocialMediaComment.DoesNotExist:
            return {
                'success': False,
                'error': f'Comment {comment_id} not found'
            }

        if not self.openrouter_service.is_configured():
            return {
                'success': False,
                'error': 'OpenRouter service not configured'
            }

        # Prepare single comment for analysis
        batch_data = [{'id': comment.id, 'text': comment.comments}]

        # Analyze
        result = self.openrouter_service.analyze_comments(batch_data)

        if result.get('success'):
            analysis = result.get('analyses', [{}])[0] if result.get('analyses') else {}

            # Build the bilingual analysis structure
            ai_analysis = {
                'sentiment': analysis.get('sentiment', {}),
                'summaries': analysis.get('summaries', {}),
                'keywords': analysis.get('keywords', {}),
                'topics': analysis.get('topics', {}),
                'entities': analysis.get('entities', []),
                'emotions': analysis.get('emotions', {}),
                'metadata': {
                    **result.get('metadata', {}),
                    'analyzed_at': timezone.now().isoformat()
                }
            }

            # Update comment with the bilingual analysis structure
            comment.ai_analysis = ai_analysis
            comment.save()

            sentiment_en = ai_analysis.get('sentiment', {}).get('classification', {}).get('en')
            confidence_val = ai_analysis.get('sentiment', {}).get('confidence', 0)

            return {
                'success': True,
                'comment_id': comment_id,
                'sentiment': sentiment_en,
                'confidence': float(confidence_val)
            }
        else:
            return {
                'success': False,
                'error': result.get('error', 'Unknown error')
            }
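
    # Example results (a sketch; the sentiment and confidence values come from whatever
    # the OpenRouter service returns for that comment):
    #
    #     >>> AnalysisService().reanalyze_comment(42)
    #     {'success': True, 'comment_id': 42, 'sentiment': 'positive', 'confidence': 0.91}
    #
    #     >>> AnalysisService().reanalyze_comment(999999)
    #     {'success': False, 'error': 'Comment 999999 not found'}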

    def get_top_keywords(
        self,
        platform: Optional[str] = None,
        limit: int = 20,
        days: int = 30
    ) -> List[Dict[str, Any]]:
        """
        Get the most common keywords from analyzed comments using the ai_analysis structure.

        Args:
            platform: Filter by platform (optional)
            limit: Maximum number of keywords to return
            days: Number of days to look back

        Returns:
            List of keyword dictionaries with 'keyword' and 'count' keys
        """
        cutoff_date = timezone.now() - timedelta(days=days)

        queryset = SocialMediaComment.objects.filter(
            scraped_at__gte=cutoff_date,
            ai_analysis__isnull=False
        ).exclude(ai_analysis={})

        if platform:
            queryset = queryset.filter(platform=platform)

        # Count keywords from ai_analysis
        keyword_counts = {}
        for comment in queryset:
            keywords_en = comment.ai_analysis.get('keywords', {}).get('en', [])
            for keyword in keywords_en:
                keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1

        # Sort by count and return the top N
        sorted_keywords = sorted(
            keyword_counts.items(),
            key=lambda x: x[1],
            reverse=True
        )[:limit]

        return [
            {'keyword': keyword, 'count': count}
            for keyword, count in sorted_keywords
        ]
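

# Example usage (illustrative sketch; assumes a Django shell or management command and
# that the import path below matches the project layout, which is not shown here):
#
#     from app.services.analysis_service import AnalysisService
#
#     service = AnalysisService()
#     stats = service.analyze_recent_comments(hours=6, limit=50)
#     print(f"analyzed={stats['analyzed']} failed={stats['failed']} skipped={stats['skipped']}")
#
#     top = service.get_top_keywords(platform='youtube', limit=10, days=7)
#     for item in top:
#         print(item['keyword'], item['count'])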