HH/apps/core/management/commands/translate_po.py

#!/usr/bin/env python3
"""
AI-Powered PO File Translator using LiteLLM with OpenRouter

This command translates .po files using LiteLLM with OpenRouter as the provider,
matching the project's AI service architecture. Optimized for Arabic translation
with healthcare/medical context.

Usage:
    python manage.py translate_po locale/ar/LC_MESSAGES/django.po --lang Arabic
    python manage.py translate_po locale/ar/LC_MESSAGES/django.po --lang Arabic --model openrouter/nvidia/nemotron-3-super-120b-a12b:free
    python manage.py translate_po locale/ar/LC_MESSAGES/django.po --lang Arabic --dry-run
"""

import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any, Optional, Tuple

import polib
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from litellm import completion
from litellm.exceptions import RateLimitError, APIConnectionError, Timeout

# Import project AIService for configuration consistency
from apps.core.ai_service import AIService


class Command(BaseCommand):
    help = "Translate .po file entries using LiteLLM with OpenRouter (optimized for Arabic)"

    def add_arguments(self, parser):
        parser.add_argument("po_file_path", type=str, help="Path to the .po file")
        parser.add_argument("--lang", type=str, default="Arabic", help="Target language (default: Arabic)")
        parser.add_argument("--batch-size", type=int, default=5, help="Entries per API call (default: 5)")
        parser.add_argument("--workers", type=int, default=2, help="Concurrent threads (default: 2)")
        parser.add_argument(
            "--model",
            type=str,
            default=None,
            help="Model to use (default: from settings.AI_MODEL or AIService.DEFAULT_MODEL)",
        )
        parser.add_argument("--temperature", type=float, default=0.2, help="Temperature for translation (default: 0.2)")
        parser.add_argument("--fix-fuzzy", action="store_true", help="Include entries marked as fuzzy")
        parser.add_argument("--dry-run", action="store_true", help="Preview translations without saving")
        parser.add_argument("--skip-validation", action="store_true", help="Skip Arabic text validation")
        parser.add_argument(
            "--context", type=str, default="healthcare", help="Context for translation (default: healthcare)"
        )

    def handle(self, *args, **options):
        # Setup configuration
        self.setup_config(options)

        # Load PO file
        po = self.load_po_file(options["po_file_path"])
        if not po:
            return

        # Filter entries to translate
        entries_to_process = self.filter_entries(po, options["fix_fuzzy"])
        total = len(entries_to_process)

        if total == 0:
            self.stdout.write(self.style.SUCCESS("No entries to translate."))
            return

        self.stdout.write(self.style.SUCCESS(f"Found {total} entries to translate to {self.target_lang}"))
        self.stdout.write(f"Using model: {self.model}")
        self.stdout.write(f"Batch size: {self.batch_size}, Workers: {self.workers}")

        if options["dry_run"]:
            self.stdout.write(self.style.WARNING("DRY RUN MODE - No changes will be saved"))

        # Process batches
        success_count = self.process_batches(entries_to_process, po, options)

        # Save if not dry run
        if not options["dry_run"]:
            po.save()
            self.stdout.write(self.style.SUCCESS(f"\n✓ Saved {options['po_file_path']}"))

        self.stdout.write(self.style.SUCCESS(f"\nComplete! Translated {success_count}/{total} entries successfully."))

    def setup_config(self, options: Dict[str, Any]) -> None:
        """Populate instance configuration from CLI options, settings and AIService.

        Args:
            options: Parsed command options as passed to ``handle``.

        Raises:
            CommandError: If ``--batch-size`` or ``--workers`` is less than 1.
        """
        # Reject sizes that would otherwise surface later as an obscure
        # ValueError (zero) or silently translate nothing (negative batch size).
        if options["batch_size"] < 1:
            raise CommandError("--batch-size must be at least 1")
        if options["workers"] < 1:
            raise CommandError("--workers must be at least 1")

        # Get API configuration from AIService (consistent with project)
        self.api_key = AIService.OPENROUTER_API_KEY
        self.base_url = AIService.OPENROUTER_BASE_URL

        # Export for LiteLLM. os.environ only accepts strings, so an unset
        # value is skipped rather than raising TypeError or overwriting a
        # variable that was already exported in the shell.
        for env_name, value in (
            ("OPENROUTER_API_KEY", self.api_key),
            ("OPENROUTER_API_BASE", self.base_url),
        ):
            if value:
                os.environ[env_name] = str(value)

        if not self.api_key:
            self.stderr.write(
                self.style.WARNING(
                    "AIService.OPENROUTER_API_KEY is empty; relying on any OPENROUTER_API_KEY "
                    "already set in the environment"
                )
            )

        # Model precedence: --model, then settings.AI_MODEL, then the AIService default
        self.model = options["model"] or getattr(settings, "AI_MODEL", None) or AIService.DEFAULT_MODEL

        # A bare model name (no "/" at all) gets the openrouter/ prefix; names
        # that already contain a provider segment are left untouched.
        if "/" not in self.model:
            self.model = f"openrouter/{self.model}"

        self.temperature = options["temperature"]
        self.batch_size = options["batch_size"]
        self.workers = options["workers"]
        self.target_lang = options["lang"]
        self.context = options["context"]
        self.skip_validation = options["skip_validation"]
        self.dry_run = options["dry_run"]

    def load_po_file(self, file_path: str) -> Optional[polib.POFile]:
        """Load and return PO file"""
        self.stdout.write(f"Loading {file_path}...")
        try:
            po = polib.pofile(file_path)
            total_entries = len(po)
            translated = len([e for e in po if e.msgstr.strip() and "fuzzy" not in e.flags])
            self.stdout.write(f"  Total entries: {total_entries}")
            self.stdout.write(f"  Already translated: {translated}")
            return po
        except Exception as e:
            self.stderr.write(self.style.ERROR(f"Could not load file: {e}"))
            return None

    def filter_entries(self, po: polib.POFile, fix_fuzzy: bool) -> List[polib.POEntry]:
        """Filter entries that need translation"""
        entries = []
        for entry in po:
            if entry.obsolete:
                continue
            if not entry.msgstr.strip():
                entries.append(entry)
            elif fix_fuzzy and "fuzzy" in entry.flags:
                entries.append(entry)
        return entries

    def process_batches(self, entries: List[polib.POEntry], po: polib.POFile, options: Dict[str, Any]) -> int:
        """Process all batches with threading"""
        batches = list(self.chunked(entries, self.batch_size))
        total_batches = len(batches)
        success_count = 0

        with ThreadPoolExecutor(max_workers=self.workers) as executor:
            future_to_batch = {
                executor.submit(self.process_batch, batch, i + 1, total_batches): batch
                for i, batch in enumerate(batches)
            }

            for future in as_completed(future_to_batch):
                batch = future_to_batch[future]
                try:
                    success, msg = future.result()
                    if success:
                        success_count += len(batch)
                    else:
                        self.stderr.write(self.style.WARNING(f"Batch failed: {msg}"))
                except Exception as e:
                    self.stderr.write(self.style.ERROR(f"Batch error: {e}"))

                # Auto-save every 3 batches (if not dry run)
                if not self.dry_run and (success_count // self.batch_size) % 3 == 0:
                    po.save()
                    self.stdout.write(f"  Auto-saved progress...")

        return success_count

    def process_batch(self, batch_entries: List[polib.POEntry], batch_num: int, total_batches: int) -> Tuple[bool, str]:
        """Translate one batch of entries with a single model call.

        Rate-limit, connection and timeout errors are retried with
        exponential backoff (2s, then 4s); any other failure ends the batch
        immediately. In dry-run mode translations are printed, not stored.

        Args:
            batch_entries: Entries to translate together.
            batch_num: Number of this batch, used for progress output.
            total_batches: Total number of batches, used for progress output.

        Returns:
            ``(True, "Success")`` when every entry received a translation,
            otherwise ``(False, reason)``.
        """
        # Plural entries are sent with both source forms so the model sees them.
        texts = [
            {"singular": entry.msgid, "plural": entry.msgid_plural, "context": entry.msgctxt or ""}
            if entry.msgid_plural
            else {"text": entry.msgid, "context": entry.msgctxt or ""}
            for entry in batch_entries
        ]

        system_prompt = self.build_system_prompt()
        user_prompt = self.build_user_prompt(texts)

        max_retries = 3
        for attempt in range(max_retries):
            try:
                response = completion(
                    model=self.model,
                    messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
                    temperature=self.temperature,
                    max_tokens=2000,
                    api_base=self.base_url,
                    api_key=self.api_key,
                    timeout=60,
                )

                content = response.choices[0].message.content
                translations = self.parse_response(content, len(batch_entries))

                if not translations:
                    return False, "Failed to parse translations"

                # msgstr must be text; reject numbers, nulls or nested objects.
                if not all(isinstance(trans, str) for trans in translations):
                    return False, "Model returned non-string translations"

                # Update entries with translations
                for entry, trans in zip(batch_entries, translations):
                    if self.dry_run:
                        self.stdout.write(f"  [DRY-RUN] {entry.msgid[:50]}... -> {trans[:50]}...")
                        continue

                    if entry.msgid_plural:
                        # polib writes plural entries from msgstr_plural and
                        # ignores msgstr when it is populated, so the
                        # translation has to go into the plural slots.
                        for form in list(entry.msgstr_plural) or [0, 1]:
                            entry.msgstr_plural[form] = trans
                        # One string cannot be correct for every plural form;
                        # keep the entry flagged for human review.
                        if "fuzzy" not in entry.flags:
                            entry.flags.append("fuzzy")
                        continue

                    entry.msgstr = trans
                    if "fuzzy" in entry.flags:
                        entry.flags.remove("fuzzy")

                self.stdout.write(f"  Batch {batch_num}/{total_batches} ✓")
                return True, "Success"

            except (RateLimitError, APIConnectionError, Timeout) as e:
                # Give up right away after the last attempt instead of
                # sleeping through a backoff that no retry will follow.
                if attempt == max_retries - 1:
                    return False, f"API error after retries: {e}"
                wait_time = 2 ** (attempt + 1)
                self.stderr.write(self.style.WARNING(f"  Retry {attempt + 1}/{max_retries} after {wait_time}s: {e}"))
                time.sleep(wait_time)
            except Exception as e:
                return False, f"Unexpected error: {e}"

        return False, "Max retries exceeded"

    def build_system_prompt(self) -> str:
        """Build system prompt optimized for Arabic healthcare translation"""
        base_prompt = f"""You are a professional translator specializing in {self.context} software localization.

CRITICAL RULES:
1. Translate from English to {self.target_lang}
2. Return ONLY a JSON array of translated strings
3. Preserve ALL variables exactly: %(name)s, {{variable}}, %s, etc.
4. Preserve HTML tags (<strong>, <a>, etc.) - do not translate them
5. Preserve newlines (\\n) and formatting
6. Maintain the same array length and order as input
"""

        # Add Arabic-specific instructions
        if self.target_lang.lower() in ["arabic", "ar"]:
            base_prompt += """

ARABIC TRANSLATION REQUIREMENTS:
- Use Modern Standard Arabic (الفصحى) - formal, professional
- Use healthcare/medical terminology appropriate for Saudi Arabian hospitals
- Maintain professional, respectful tone suitable for patient experience management
- Ensure grammatical correctness and natural flow
- Do not use colloquial Arabic (العامية)
- Numbers and dates should follow Arabic conventions where appropriate
- Keep English technical terms if no standard Arabic equivalent exists (e.g., API, URL)
"""

        return base_prompt

    def build_user_prompt(self, texts: List[Dict]) -> str:
        """Build user prompt with texts to translate"""
        return f"""Translate these texts to {self.target_lang}.

Input format: JSON array with text and optional context
Output format: JSON array of translated strings only

Texts to translate:
{json.dumps(texts, ensure_ascii=False, indent=2)}

Return ONLY the JSON array of translations."""

    def parse_response(self, content: str, expected_count: int) -> Optional[List[str]]:
        """Parse and validate API response"""
        try:
            # Clean markdown code blocks
            content = content.strip()
            if content.startswith("```json"):
                content = content[7:]
            elif content.startswith("```"):
                content = content[3:]
            if content.endswith("```"):
                content = content[:-3]
            content = content.strip()

            translations = json.loads(content)

            # Validate
            if not isinstance(translations, list):
                self.stderr.write(self.style.ERROR("Response is not a JSON array"))
                return None

            if len(translations) != expected_count:
                self.stderr.write(
                    self.style.ERROR(f"Count mismatch: expected {expected_count}, got {len(translations)}")
                )
                return None

            # Validate Arabic text if needed
            if not self.skip_validation and self.target_lang.lower() in ["arabic", "ar"]:
                translations = [self.validate_arabic(t) for t in translations]

            return translations

        except json.JSONDecodeError as e:
            self.stderr.write(self.style.ERROR(f"JSON parse error: {e}"))
            return None

    def validate_arabic(self, text: str) -> str:
        """Validate and clean Arabic text"""
        if not text:
            return text

        # Check for Arabic characters
        arabic_pattern = re.compile(r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]")

        if not arabic_pattern.search(text):
            # No Arabic characters found - mark for review
            self.stderr.write(self.style.WARNING(f"Translation may not be in Arabic: {text[:50]}..."))

        # Clean excessive whitespace
        text = re.sub(r"\s+", " ", text).strip()

        return text

    @staticmethod
    def chunked(iterable: List, n: int):
        """Split iterable into chunks of size n"""
        for i in range(0, len(iterable), n):
            yield iterable[i : i + n]