365 lines
13 KiB
Python
365 lines
13 KiB
Python
"""
|
|
Analysis service for orchestrating AI-powered comment analysis.
|
|
Coordinates between SocialMediaComment model and OpenRouter service.
|
|
"""
|
|
import logging
|
|
from typing import List, Dict, Any, Optional
|
|
from decimal import Decimal
|
|
from datetime import datetime, timedelta
|
|
|
|
from django.conf import settings
|
|
from django.utils import timezone
|
|
from django.db import models
|
|
|
|
from ..models import SocialMediaComment
|
|
from .openrouter_service import OpenRouterService
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class AnalysisService:
|
|
"""
|
|
Service for managing AI analysis of social media comments.
|
|
Handles batching, filtering, and updating comments with analysis results.
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""Initialize the analysis service."""
|
|
self.openrouter_service = OpenRouterService()
|
|
self.batch_size = getattr(settings, 'ANALYSIS_BATCH_SIZE', 10)
|
|
|
|
if not self.openrouter_service.is_configured():
|
|
logger.warning("OpenRouter service not properly configured")
|
|
else:
|
|
logger.info(f"Analysis service initialized (batch_size: {self.batch_size})")
|
|
|
|
def analyze_pending_comments(
|
|
self,
|
|
limit: Optional[int] = None,
|
|
platform: Optional[str] = None,
|
|
hours_ago: Optional[int] = None
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Analyze comments that haven't been analyzed yet.
|
|
|
|
Args:
|
|
limit: Maximum number of comments to analyze
|
|
platform: Filter by platform (optional)
|
|
hours_ago: Only analyze comments scraped in the last N hours
|
|
|
|
Returns:
|
|
Dictionary with analysis statistics
|
|
"""
|
|
if not self.openrouter_service.is_configured():
|
|
logger.error("OpenRouter service not configured")
|
|
return {
|
|
'success': False,
|
|
'error': 'OpenRouter service not configured',
|
|
'analyzed': 0,
|
|
'failed': 0,
|
|
'skipped': 0
|
|
}
|
|
|
|
# Build queryset for unanalyzed comments (check if ai_analysis is empty)
|
|
# Using Q() for complex filtering (NULL or empty dict)
|
|
from django.db.models import Q
|
|
queryset = SocialMediaComment.objects.filter(
|
|
Q(ai_analysis__isnull=True) | Q(ai_analysis={})
|
|
)
|
|
|
|
if platform:
|
|
queryset = queryset.filter(platform=platform)
|
|
|
|
if hours_ago:
|
|
cutoff_time = timezone.now() - timedelta(hours=hours_ago)
|
|
queryset = queryset.filter(scraped_at__gte=cutoff_time)
|
|
|
|
if limit:
|
|
queryset = queryset[:limit]
|
|
|
|
# Fetch comments
|
|
comments = list(queryset)
|
|
|
|
if not comments:
|
|
logger.info("No pending comments to analyze")
|
|
return {
|
|
'success': True,
|
|
'analyzed': 0,
|
|
'failed': 0,
|
|
'skipped': 0,
|
|
'message': 'No pending comments to analyze'
|
|
}
|
|
|
|
logger.info(f"Found {len(comments)} pending comments to analyze")
|
|
|
|
# Process in batches
|
|
analyzed_count = 0
|
|
failed_count = 0
|
|
skipped_count = 0
|
|
|
|
for i in range(0, len(comments), self.batch_size):
|
|
batch = comments[i:i + self.batch_size]
|
|
logger.info(f"Processing batch {i//self.batch_size + 1} ({len(batch)} comments)")
|
|
|
|
# Prepare batch for API
|
|
batch_data = [
|
|
{
|
|
'id': comment.id,
|
|
'text': comment.comments
|
|
}
|
|
for comment in batch
|
|
]
|
|
|
|
# Analyze batch
|
|
result = self.openrouter_service.analyze_comments(batch_data)
|
|
|
|
if result.get('success'):
|
|
# Update comments with analysis results
|
|
for analysis in result.get('analyses', []):
|
|
try:
|
|
comment_id = analysis.get('comment_id')
|
|
comment = SocialMediaComment.objects.get(id=comment_id)
|
|
|
|
# Build new bilingual analysis structure
|
|
ai_analysis = {
|
|
'sentiment': analysis.get('sentiment', {}),
|
|
'summaries': analysis.get('summaries', {}),
|
|
'keywords': analysis.get('keywords', {}),
|
|
'topics': analysis.get('topics', {}),
|
|
'entities': analysis.get('entities', []),
|
|
'emotions': analysis.get('emotions', {}),
|
|
'metadata': {
|
|
**result.get('metadata', {}),
|
|
'analyzed_at': timezone.now().isoformat()
|
|
}
|
|
}
|
|
|
|
# Update with bilingual analysis structure
|
|
comment.ai_analysis = ai_analysis
|
|
comment.save()
|
|
|
|
analyzed_count += 1
|
|
logger.debug(f"Updated comment {comment_id} with bilingual analysis")
|
|
|
|
except SocialMediaComment.DoesNotExist:
|
|
logger.warning(f"Comment {analysis.get('comment_id')} not found")
|
|
failed_count += 1
|
|
except Exception as e:
|
|
logger.error(f"Error updating comment {comment_id}: {e}")
|
|
failed_count += 1
|
|
else:
|
|
error = result.get('error', 'Unknown error')
|
|
logger.error(f"Batch analysis failed: {error}")
|
|
failed_count += len(batch)
|
|
|
|
# Calculate skipped (comments that were analyzed during processing)
|
|
skipped_count = len(comments) - analyzed_count - failed_count
|
|
|
|
logger.info(
|
|
f"Analysis complete: {analyzed_count} analyzed, "
|
|
f"{failed_count} failed, {skipped_count} skipped"
|
|
)
|
|
|
|
return {
|
|
'success': True,
|
|
'analyzed': analyzed_count,
|
|
'failed': failed_count,
|
|
'skipped': skipped_count,
|
|
'total': len(comments)
|
|
}
|
|
|
|
def analyze_comments_by_platform(self, platform: str, limit: int = 100) -> Dict[str, Any]:
|
|
"""
|
|
Analyze comments from a specific platform.
|
|
|
|
Args:
|
|
platform: Platform name (e.g., 'youtube', 'facebook')
|
|
limit: Maximum number of comments to analyze
|
|
|
|
Returns:
|
|
Dictionary with analysis statistics
|
|
"""
|
|
logger.info(f"Analyzing comments from platform: {platform}")
|
|
return self.analyze_pending_comments(limit=limit, platform=platform)
|
|
|
|
def analyze_recent_comments(self, hours: int = 24, limit: int = 100) -> Dict[str, Any]:
|
|
"""
|
|
Analyze comments scraped in the last N hours.
|
|
|
|
Args:
|
|
hours: Number of hours to look back
|
|
limit: Maximum number of comments to analyze
|
|
|
|
Returns:
|
|
Dictionary with analysis statistics
|
|
"""
|
|
logger.info(f"Analyzing comments from last {hours} hours")
|
|
return self.analyze_pending_comments(limit=limit, hours_ago=hours)
|
|
|
|
def get_analysis_statistics(
|
|
self,
|
|
platform: Optional[str] = None,
|
|
days: int = 30
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Get statistics about analyzed comments using ai_analysis structure.
|
|
|
|
Args:
|
|
platform: Filter by platform (optional)
|
|
days: Number of days to look back
|
|
|
|
Returns:
|
|
Dictionary with analysis statistics
|
|
"""
|
|
cutoff_date = timezone.now() - timedelta(days=days)
|
|
|
|
queryset = SocialMediaComment.objects.filter(
|
|
scraped_at__gte=cutoff_date
|
|
)
|
|
|
|
if platform:
|
|
queryset = queryset.filter(platform=platform)
|
|
|
|
total_comments = queryset.count()
|
|
|
|
# Count analyzed comments (those with ai_analysis populated)
|
|
analyzed_comments = 0
|
|
sentiment_counts = {'positive': 0, 'negative': 0, 'neutral': 0}
|
|
confidence_scores = []
|
|
|
|
for comment in queryset:
|
|
if comment.ai_analysis:
|
|
analyzed_comments += 1
|
|
sentiment = comment.ai_analysis.get('sentiment', {}).get('classification', {}).get('en', 'neutral')
|
|
if sentiment in sentiment_counts:
|
|
sentiment_counts[sentiment] += 1
|
|
confidence = comment.ai_analysis.get('sentiment', {}).get('confidence', 0)
|
|
if confidence:
|
|
confidence_scores.append(confidence)
|
|
|
|
# Calculate average confidence
|
|
avg_confidence = sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0
|
|
|
|
return {
|
|
'total_comments': total_comments,
|
|
'analyzed_comments': analyzed_comments,
|
|
'unanalyzed_comments': total_comments - analyzed_comments,
|
|
'analysis_rate': (analyzed_comments / total_comments * 100) if total_comments > 0 else 0,
|
|
'sentiment_distribution': sentiment_counts,
|
|
'average_confidence': float(avg_confidence),
|
|
'platform': platform or 'all'
|
|
}
|
|
|
|
def reanalyze_comment(self, comment_id: int) -> Dict[str, Any]:
|
|
"""
|
|
Re-analyze a specific comment.
|
|
|
|
Args:
|
|
comment_id: ID of the comment to re-analyze
|
|
|
|
Returns:
|
|
Dictionary with result
|
|
"""
|
|
try:
|
|
comment = SocialMediaComment.objects.get(id=comment_id)
|
|
except SocialMediaComment.DoesNotExist:
|
|
return {
|
|
'success': False,
|
|
'error': f'Comment {comment_id} not found'
|
|
}
|
|
|
|
if not self.openrouter_service.is_configured():
|
|
return {
|
|
'success': False,
|
|
'error': 'OpenRouter service not configured'
|
|
}
|
|
|
|
# Prepare single comment for analysis
|
|
batch_data = [{'id': comment.id, 'text': comment.comments}]
|
|
|
|
# Analyze
|
|
result = self.openrouter_service.analyze_comments(batch_data)
|
|
|
|
if result.get('success'):
|
|
analysis = result.get('analyses', [{}])[0] if result.get('analyses') else {}
|
|
|
|
# Build new bilingual analysis structure
|
|
ai_analysis = {
|
|
'sentiment': analysis.get('sentiment', {}),
|
|
'summaries': analysis.get('summaries', {}),
|
|
'keywords': analysis.get('keywords', {}),
|
|
'topics': analysis.get('topics', {}),
|
|
'entities': analysis.get('entities', []),
|
|
'emotions': analysis.get('emotions', {}),
|
|
'metadata': {
|
|
**result.get('metadata', {}),
|
|
'analyzed_at': timezone.now().isoformat()
|
|
}
|
|
}
|
|
|
|
# Update comment with bilingual analysis structure
|
|
comment.ai_analysis = ai_analysis
|
|
comment.save()
|
|
|
|
sentiment_en = ai_analysis.get('sentiment', {}).get('classification', {}).get('en')
|
|
confidence_val = ai_analysis.get('sentiment', {}).get('confidence', 0)
|
|
|
|
return {
|
|
'success': True,
|
|
'comment_id': comment_id,
|
|
'sentiment': sentiment_en,
|
|
'confidence': float(confidence_val)
|
|
}
|
|
else:
|
|
return {
|
|
'success': False,
|
|
'error': result.get('error', 'Unknown error')
|
|
}
|
|
|
|
def get_top_keywords(
|
|
self,
|
|
platform: Optional[str] = None,
|
|
limit: int = 20,
|
|
days: int = 30
|
|
) -> List[Dict[str, Any]]:
|
|
"""
|
|
Get most common keywords from analyzed comments using ai_analysis structure.
|
|
|
|
Args:
|
|
platform: Filter by platform (optional)
|
|
limit: Maximum number of keywords to return
|
|
days: Number of days to look back
|
|
|
|
Returns:
|
|
List of keyword dictionaries with 'keyword' and 'count' keys
|
|
"""
|
|
cutoff_date = timezone.now() - timedelta(days=days)
|
|
|
|
queryset = SocialMediaComment.objects.filter(
|
|
scraped_at__gte=cutoff_date,
|
|
ai_analysis__isnull=False
|
|
).exclude(ai_analysis={})
|
|
|
|
if platform:
|
|
queryset = queryset.filter(platform=platform)
|
|
|
|
# Count keywords from ai_analysis
|
|
keyword_counts = {}
|
|
for comment in queryset:
|
|
keywords_en = comment.ai_analysis.get('keywords', {}).get('en', [])
|
|
for keyword in keywords_en:
|
|
keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
|
|
|
|
# Sort by count and return top N
|
|
sorted_keywords = sorted(
|
|
keyword_counts.items(),
|
|
key=lambda x: x[1],
|
|
reverse=True
|
|
)[:limit]
|
|
|
|
return [
|
|
{'keyword': keyword, 'count': count}
|
|
for keyword, count in sorted_keywords
|
|
]
|