# HH/apps/social/services/analysis_service.py
"""
Analysis service for orchestrating AI-powered comment analysis.
Coordinates between SocialMediaComment model and OpenRouter service.
"""
import logging
from typing import List, Dict, Any, Optional
from datetime import timedelta

from django.conf import settings
from django.db.models import Q
from django.utils import timezone

from ..models import SocialMediaComment
from .openrouter_service import OpenRouterService
logger = logging.getLogger(__name__)
class AnalysisService:
"""
Service for managing AI analysis of social media comments.
Handles batching, filtering, and updating comments with analysis results.
"""
def __init__(self):
"""Initialize the analysis service."""
self.openrouter_service = OpenRouterService()
self.batch_size = getattr(settings, 'ANALYSIS_BATCH_SIZE', 10)
if not self.openrouter_service.is_configured():
logger.warning("OpenRouter service not properly configured")
else:
logger.info(f"Analysis service initialized (batch_size: {self.batch_size})")
def analyze_pending_comments(
self,
limit: Optional[int] = None,
platform: Optional[str] = None,
hours_ago: Optional[int] = None
) -> Dict[str, Any]:
"""
Analyze comments that haven't been analyzed yet.
Args:
limit: Maximum number of comments to analyze
platform: Filter by platform (optional)
hours_ago: Only analyze comments scraped in the last N hours
Returns:
Dictionary with analysis statistics
"""
if not self.openrouter_service.is_configured():
logger.error("OpenRouter service not configured")
return {
'success': False,
'error': 'OpenRouter service not configured',
'analyzed': 0,
'failed': 0,
'skipped': 0
}
        # Build queryset for unanalyzed comments: ai_analysis is either NULL
        # or an empty dict, so the two conditions are OR-ed with Q objects
        queryset = SocialMediaComment.objects.filter(
            Q(ai_analysis__isnull=True) | Q(ai_analysis={})
        )
if platform:
queryset = queryset.filter(platform=platform)
if hours_ago:
cutoff_time = timezone.now() - timedelta(hours=hours_ago)
queryset = queryset.filter(scraped_at__gte=cutoff_time)
if limit:
queryset = queryset[:limit]
# Fetch comments
comments = list(queryset)
if not comments:
logger.info("No pending comments to analyze")
return {
'success': True,
'analyzed': 0,
'failed': 0,
'skipped': 0,
'message': 'No pending comments to analyze'
}
logger.info(f"Found {len(comments)} pending comments to analyze")
# Process in batches
analyzed_count = 0
failed_count = 0
skipped_count = 0
for i in range(0, len(comments), self.batch_size):
batch = comments[i:i + self.batch_size]
logger.info(f"Processing batch {i//self.batch_size + 1} ({len(batch)} comments)")
# Prepare batch for API
batch_data = [
{
'id': comment.id,
'text': comment.comments
}
for comment in batch
]
# Analyze batch
result = self.openrouter_service.analyze_comments(batch_data)
if result.get('success'):
# Update comments with analysis results
for analysis in result.get('analyses', []):
                    comment_id = analysis.get('comment_id')
                    try:
                        comment = SocialMediaComment.objects.get(id=comment_id)
# Build new bilingual analysis structure
ai_analysis = {
'sentiment': analysis.get('sentiment', {}),
'summaries': analysis.get('summaries', {}),
'keywords': analysis.get('keywords', {}),
'topics': analysis.get('topics', {}),
'entities': analysis.get('entities', []),
'emotions': analysis.get('emotions', {}),
'metadata': {
**result.get('metadata', {}),
'analyzed_at': timezone.now().isoformat()
}
}
# Update with bilingual analysis structure
comment.ai_analysis = ai_analysis
comment.save()
analyzed_count += 1
logger.debug(f"Updated comment {comment_id} with bilingual analysis")
                    except SocialMediaComment.DoesNotExist:
                        logger.warning(f"Comment {comment_id} not found")
                        failed_count += 1
                    except Exception as e:
                        logger.error(f"Error updating comment {comment_id}: {e}")
                        failed_count += 1
else:
error = result.get('error', 'Unknown error')
logger.error(f"Batch analysis failed: {error}")
failed_count += len(batch)
        # Whatever was neither updated nor failed counts as skipped
        # (the API returned no analysis entry for it)
        skipped_count = len(comments) - analyzed_count - failed_count
logger.info(
f"Analysis complete: {analyzed_count} analyzed, "
f"{failed_count} failed, {skipped_count} skipped"
)
return {
'success': True,
'analyzed': analyzed_count,
'failed': failed_count,
'skipped': skipped_count,
'total': len(comments)
}
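
    # Usage sketch (hypothetical caller such as a management command or a
    # shell session; not part of this module):
    #
    #   service = AnalysisService()
    #   stats = service.analyze_pending_comments(limit=50, platform='youtube')
    #   if stats['success']:
    #       print(f"{stats['analyzed']} analyzed, {stats['failed']} failed")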
def analyze_comments_by_platform(self, platform: str, limit: int = 100) -> Dict[str, Any]:
"""
Analyze comments from a specific platform.
Args:
platform: Platform name (e.g., 'youtube', 'facebook')
limit: Maximum number of comments to analyze
Returns:
Dictionary with analysis statistics
"""
logger.info(f"Analyzing comments from platform: {platform}")
return self.analyze_pending_comments(limit=limit, platform=platform)
def analyze_recent_comments(self, hours: int = 24, limit: int = 100) -> Dict[str, Any]:
"""
Analyze comments scraped in the last N hours.
Args:
hours: Number of hours to look back
limit: Maximum number of comments to analyze
Returns:
Dictionary with analysis statistics
"""
logger.info(f"Analyzing comments from last {hours} hours")
return self.analyze_pending_comments(limit=limit, hours_ago=hours)
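
    # Scheduling sketch, assuming Celery (this project may use a different
    # scheduler; the task below is hypothetical):
    #
    #   from celery import shared_task
    #
    #   @shared_task
    #   def analyze_recent_comments_task():
    #       return AnalysisService().analyze_recent_comments(hours=24, limit=100)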
def get_analysis_statistics(
self,
platform: Optional[str] = None,
days: int = 30
) -> Dict[str, Any]:
"""
Get statistics about analyzed comments using ai_analysis structure.
Args:
platform: Filter by platform (optional)
days: Number of days to look back
Returns:
Dictionary with analysis statistics
"""
cutoff_date = timezone.now() - timedelta(days=days)
queryset = SocialMediaComment.objects.filter(
scraped_at__gte=cutoff_date
)
if platform:
queryset = queryset.filter(platform=platform)
total_comments = queryset.count()
# Count analyzed comments (those with ai_analysis populated)
analyzed_comments = 0
sentiment_counts = {'positive': 0, 'negative': 0, 'neutral': 0}
confidence_scores = []
for comment in queryset:
if comment.ai_analysis:
analyzed_comments += 1
                sentiment = (
                    comment.ai_analysis.get('sentiment', {})
                    .get('classification', {})
                    .get('en', 'neutral')
                )
if sentiment in sentiment_counts:
sentiment_counts[sentiment] += 1
confidence = comment.ai_analysis.get('sentiment', {}).get('confidence', 0)
if confidence:
confidence_scores.append(confidence)
# Calculate average confidence
avg_confidence = sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0
return {
'total_comments': total_comments,
'analyzed_comments': analyzed_comments,
'unanalyzed_comments': total_comments - analyzed_comments,
'analysis_rate': (analyzed_comments / total_comments * 100) if total_comments > 0 else 0,
'sentiment_distribution': sentiment_counts,
'average_confidence': float(avg_confidence),
'platform': platform or 'all'
}
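
    # Example return value (numbers are illustrative only):
    #
    #   {
    #       'total_comments': 120,
    #       'analyzed_comments': 90,
    #       'unanalyzed_comments': 30,
    #       'analysis_rate': 75.0,
    #       'sentiment_distribution': {'positive': 40, 'negative': 20, 'neutral': 30},
    #       'average_confidence': 0.87,
    #       'platform': 'youtube',
    #   }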
def reanalyze_comment(self, comment_id: int) -> Dict[str, Any]:
"""
Re-analyze a specific comment.
Args:
comment_id: ID of the comment to re-analyze
Returns:
Dictionary with result
"""
try:
comment = SocialMediaComment.objects.get(id=comment_id)
except SocialMediaComment.DoesNotExist:
return {
'success': False,
'error': f'Comment {comment_id} not found'
}
if not self.openrouter_service.is_configured():
return {
'success': False,
'error': 'OpenRouter service not configured'
}
# Prepare single comment for analysis
batch_data = [{'id': comment.id, 'text': comment.comments}]
# Analyze
result = self.openrouter_service.analyze_comments(batch_data)
if result.get('success'):
            analyses = result.get('analyses') or [{}]
            analysis = analyses[0]
# Build new bilingual analysis structure
ai_analysis = {
'sentiment': analysis.get('sentiment', {}),
'summaries': analysis.get('summaries', {}),
'keywords': analysis.get('keywords', {}),
'topics': analysis.get('topics', {}),
'entities': analysis.get('entities', []),
'emotions': analysis.get('emotions', {}),
'metadata': {
**result.get('metadata', {}),
'analyzed_at': timezone.now().isoformat()
}
}
# Update comment with bilingual analysis structure
comment.ai_analysis = ai_analysis
comment.save()
sentiment_en = ai_analysis.get('sentiment', {}).get('classification', {}).get('en')
confidence_val = ai_analysis.get('sentiment', {}).get('confidence', 0)
return {
'success': True,
'comment_id': comment_id,
'sentiment': sentiment_en,
'confidence': float(confidence_val)
}
else:
return {
'success': False,
'error': result.get('error', 'Unknown error')
}
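
    # Usage sketch:
    #
    #   result = AnalysisService().reanalyze_comment(42)
    #   if result['success']:
    #       print(result['sentiment'], result['confidence'])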
def get_top_keywords(
self,
platform: Optional[str] = None,
limit: int = 20,
days: int = 30
) -> List[Dict[str, Any]]:
"""
Get most common keywords from analyzed comments using ai_analysis structure.
Args:
platform: Filter by platform (optional)
limit: Maximum number of keywords to return
days: Number of days to look back
Returns:
List of keyword dictionaries with 'keyword' and 'count' keys
"""
cutoff_date = timezone.now() - timedelta(days=days)
queryset = SocialMediaComment.objects.filter(
scraped_at__gte=cutoff_date,
ai_analysis__isnull=False
).exclude(ai_analysis={})
if platform:
queryset = queryset.filter(platform=platform)
# Count keywords from ai_analysis
keyword_counts = {}
for comment in queryset:
keywords_en = comment.ai_analysis.get('keywords', {}).get('en', [])
for keyword in keywords_en:
keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
# Sort by count and return top N
sorted_keywords = sorted(
keyword_counts.items(),
key=lambda x: x[1],
reverse=True
)[:limit]
return [
{'keyword': keyword, 'count': count}
for keyword, count in sorted_keywords
]
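
# Shape of the bilingual ai_analysis JSON this service writes and reads,
# inferred from the accessors above (values are illustrative; only the 'en'
# key appears in this file, so the second language key is not shown):
#
#   {
#       'sentiment': {'classification': {'en': 'positive', ...}, 'confidence': 0.92},
#       'summaries': {'en': '...', ...},
#       'keywords': {'en': ['delivery', 'price'], ...},
#       'topics': {'en': ['customer service'], ...},
#       'entities': [...],
#       'emotions': {...},
#       'metadata': {'analyzed_at': '2026-01-15T11:31:58+00:00', ...},
#   }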