HH/apps/core/management/commands/translate_po.py
2026-04-08 17:13:35 +03:00

328 lines
13 KiB
Python

#!/usr/bin/env python3
"""
AI-powered PO file translator using LiteLLM with OpenRouter.

This command translates .po files through LiteLLM, with OpenRouter as the
provider, matching the project's AI service architecture. It is optimized for
Arabic translation in a healthcare/medical context.

Usage:
    python manage.py translate_po locale/ar/LC_MESSAGES/django.po --lang Arabic
    python manage.py translate_po locale/ar/LC_MESSAGES/django.po --lang Arabic --model openrouter/nvidia/nemotron-3-super-120b-a12b:free
    python manage.py translate_po locale/ar/LC_MESSAGES/django.po --lang Arabic --dry-run
"""
import os
import json
import time
import re
import polib
from typing import List, Dict, Any, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from django.core.management.base import BaseCommand
from django.conf import settings
from litellm import completion
from litellm.exceptions import RateLimitError, APIConnectionError, Timeout
# Import project AIService for configuration consistency
from apps.core.ai_service import AIService
class Command(BaseCommand):
    """Management command that fills untranslated .po entries via an LLM.

    Entries are grouped into batches, each batch is sent to the model through
    LiteLLM's ``completion`` on a thread pool, and the returned JSON array of
    strings is written back onto the polib entries.
    """

    help = "Translate .po file entries using LiteLLM with OpenRouter (optimized for Arabic)"

    # Attempts made per batch before it is reported as failed.
    _MAX_RETRIES = 3
    # The file is written to disk after this many successfully translated batches.
    _AUTOSAVE_EVERY = 3
    # Lower-cased --lang values that enable the Arabic prompt section and validation.
    _ARABIC_LANGS = ("arabic", "ar")
    # Arabic, Arabic Supplement, Arabic Extended-A and Presentation Forms A/B.
    _ARABIC_CHARS = re.compile(r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]")
    # Horizontal whitespace only: newlines are significant in PO strings.
    _HORIZONTAL_WS = re.compile(r"[ \t]+")

    def add_arguments(self, parser):
        """Register the command-line options of the command."""
        parser.add_argument("po_file_path", type=str, help="Path to the .po file")
        parser.add_argument("--lang", type=str, default="Arabic", help="Target language (default: Arabic)")
        parser.add_argument("--batch-size", type=int, default=5, help="Entries per API call (default: 5)")
        parser.add_argument("--workers", type=int, default=2, help="Concurrent threads (default: 2)")
        parser.add_argument(
            "--model",
            type=str,
            default=None,
            help="Model to use (default: from settings.AI_MODEL or AIService.DEFAULT_MODEL)",
        )
        parser.add_argument("--temperature", type=float, default=0.2, help="Temperature for translation (default: 0.2)")
        parser.add_argument("--fix-fuzzy", action="store_true", help="Include entries marked as fuzzy")
        parser.add_argument("--dry-run", action="store_true", help="Preview translations without saving")
        parser.add_argument("--skip-validation", action="store_true", help="Skip Arabic text validation")
        parser.add_argument(
            "--context", type=str, default="healthcare", help="Context for translation (default: healthcare)"
        )

    def handle(self, *args, **options):
        """Run the command: configure, load, translate and (unless dry-run) save."""
        self.setup_config(options)

        po = self.load_po_file(options["po_file_path"])
        # POFile is a list subclass, so an empty (but valid) file is falsy;
        # only a failed load, signalled by None, should abort here.
        if po is None:
            return

        entries_to_process = self.filter_entries(po, options["fix_fuzzy"])
        total = len(entries_to_process)
        if total == 0:
            self.stdout.write(self.style.SUCCESS("No entries to translate."))
            return

        self.stdout.write(self.style.SUCCESS(f"Found {total} entries to translate to {self.target_lang}"))
        self.stdout.write(f"Using model: {self.model}")
        self.stdout.write(f"Batch size: {self.batch_size}, Workers: {self.workers}")
        if options["dry_run"]:
            self.stdout.write(self.style.WARNING("DRY RUN MODE - No changes will be saved"))

        success_count = self.process_batches(entries_to_process, po, options)

        if not options["dry_run"]:
            po.save()
            self.stdout.write(self.style.SUCCESS(f"\n✓ Saved {options['po_file_path']}"))
        self.stdout.write(self.style.SUCCESS(f"\nComplete! Translated {success_count}/{total} entries successfully."))

    def setup_config(self, options: Dict[str, Any]) -> None:
        """Copy the parsed options and the AIService/settings values onto ``self``.

        Args:
            options: The option dictionary Django passes to ``handle``.
        """
        self.api_key = AIService.OPENROUTER_API_KEY
        self.base_url = AIService.OPENROUTER_BASE_URL

        # os.environ only accepts strings; an unset credential would raise
        # TypeError here, so export only what is configured and warn otherwise.
        for variable, value in (
            ("OPENROUTER_API_KEY", self.api_key),
            ("OPENROUTER_API_BASE", self.base_url),
        ):
            if value:
                os.environ[variable] = value
            else:
                self.stderr.write(self.style.WARNING(f"{variable} is not configured; API calls may fail"))

        self.model = options["model"] or getattr(settings, "AI_MODEL", None) or AIService.DEFAULT_MODEL
        # Bare model names get the provider prefix; anything that already
        # contains a "/" (including "openrouter/...") is left untouched.
        if "/" not in self.model:
            self.model = f"openrouter/{self.model}"

        self.temperature = options["temperature"]
        # Zero or negative values would crash range(), the thread pool or the
        # batching arithmetic, so clamp both to at least 1.
        self.batch_size = max(1, options["batch_size"])
        self.workers = max(1, options["workers"])
        self.target_lang = options["lang"]
        self.context = options["context"]
        self.skip_validation = options["skip_validation"]
        self.dry_run = options["dry_run"]

    def load_po_file(self, file_path: str) -> Optional[polib.POFile]:
        """Load the PO file and print a short summary.

        Returns:
            The parsed file, or ``None`` (after reporting on stderr) when
            polib could not load it.
        """
        self.stdout.write(f"Loading {file_path}...")
        try:
            po = polib.pofile(file_path)
        except Exception as e:
            self.stderr.write(self.style.ERROR(f"Could not load file: {e}"))
            return None

        translated = sum(1 for e in po if self._has_translation(e) and "fuzzy" not in e.flags)
        self.stdout.write(f" Total entries: {len(po)}")
        self.stdout.write(f" Already translated: {translated}")
        return po

    @staticmethod
    def _has_translation(entry: polib.POEntry) -> bool:
        """Return True when the entry already carries non-blank translated text.

        Plural entries keep their text in ``msgstr_plural``; looking only at
        ``msgstr`` would make every plural entry appear untranslated.
        """
        plural_forms = entry.msgstr_plural
        if entry.msgid_plural and plural_forms:
            return all(form.strip() for form in plural_forms.values())
        return bool(entry.msgstr.strip())

    def filter_entries(self, po: polib.POFile, fix_fuzzy: bool) -> List[polib.POEntry]:
        """Select the entries that should be sent for translation.

        Obsolete entries are skipped. An entry is selected when it has no
        translation yet, or when ``fix_fuzzy`` is set and it is flagged fuzzy.
        """
        return [
            entry
            for entry in po
            if not entry.obsolete
            and (not self._has_translation(entry) or (fix_fuzzy and "fuzzy" in entry.flags))
        ]

    def process_batches(self, entries: List[polib.POEntry], po: polib.POFile, options: Dict[str, Any]) -> int:
        """Translate all entries batch by batch on a thread pool.

        Args:
            entries: Entries to translate.
            po: The file the entries belong to; saved periodically.
            options: Command options (unused here, kept for the call signature).

        Returns:
            The number of entries that belong to successfully processed batches.
        """
        batches = list(self.chunked(entries, self.batch_size))
        total_batches = len(batches)
        success_count = 0
        successful_batches = 0

        with ThreadPoolExecutor(max_workers=self.workers) as executor:
            future_to_batch = {
                executor.submit(self.process_batch, batch, number, total_batches): batch
                for number, batch in enumerate(batches, start=1)
            }
            for future in as_completed(future_to_batch):
                batch = future_to_batch[future]
                try:
                    success, msg = future.result()
                except Exception as e:
                    self.stderr.write(self.style.ERROR(f"Batch error: {e}"))
                    continue
                if not success:
                    self.stderr.write(self.style.WARNING(f"Batch failed: {msg}"))
                    continue

                success_count += len(batch)
                successful_batches += 1
                # Count batches, not entries: an entry-based test fires at
                # zero successes and drifts when the last batch is short.
                # NOTE(review): worker threads may still be updating other
                # entries while this save runs - confirm that is acceptable.
                if not self.dry_run and successful_batches % self._AUTOSAVE_EVERY == 0:
                    po.save()
                    self.stdout.write(" Auto-saved progress...")
        return success_count

    def process_batch(self, batch_entries: List[polib.POEntry], batch_num: int, total_batches: int) -> Tuple[bool, str]:
        """Translate one batch and write the results onto its entries.

        Rate-limit, connection and timeout errors are retried with an
        exponential back-off; any other exception fails the batch at once.

        Returns:
            ``(True, "Success")`` or ``(False, <reason>)``.
        """
        texts = []
        for entry in batch_entries:
            if entry.msgid_plural:
                texts.append({"singular": entry.msgid, "plural": entry.msgid_plural, "context": entry.msgctxt or ""})
            else:
                texts.append({"text": entry.msgid, "context": entry.msgctxt or ""})

        system_prompt = self.build_system_prompt()
        user_prompt = self.build_user_prompt(texts)
        max_retries = self._MAX_RETRIES

        for attempt in range(1, max_retries + 1):
            try:
                response = completion(
                    model=self.model,
                    messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
                    temperature=self.temperature,
                    max_tokens=2000,
                    api_base=self.base_url,
                    api_key=self.api_key,
                    timeout=60,
                )
                content = response.choices[0].message.content
                translations = self.parse_response(content, len(batch_entries))
                if not translations:
                    return False, "Failed to parse translations"

                for entry, trans in zip(batch_entries, translations):
                    if self.dry_run:
                        self.stdout.write(f" [DRY-RUN] {entry.msgid[:50]}... -> {trans[:50]}...")
                    else:
                        self._apply_translation(entry, trans)
                self.stdout.write(f" Batch {batch_num}/{total_batches}")
                return True, "Success"
            except (RateLimitError, APIConnectionError, Timeout) as e:
                # Give up immediately after the last attempt instead of
                # sleeping through one more back-off period first.
                if attempt == max_retries:
                    return False, f"API error after retries: {e}"
                wait_time = 2**attempt
                self.stderr.write(self.style.WARNING(f" Retry {attempt}/{max_retries} after {wait_time}s: {e}"))
                time.sleep(wait_time)
            except Exception as e:
                # Worker boundary: report the failure instead of killing the pool.
                return False, f"Unexpected error: {e}"
        return False, "Max retries exceeded"

    @staticmethod
    def _apply_translation(entry: polib.POEntry, translation: str) -> None:
        """Store ``translation`` on ``entry`` and clear its fuzzy flag.

        For plural entries whose ``msgstr_plural`` is populated, polib writes
        those forms rather than ``msgstr``, so the text has to go there.
        NOTE(review): the prompt asks for one string per entry, so every
        plural form receives the same text; refine by hand where the target
        language needs distinct forms.
        """
        if entry.msgid_plural and entry.msgstr_plural:
            entry.msgstr_plural = {index: translation for index in entry.msgstr_plural}
        else:
            entry.msgstr = translation
        if "fuzzy" in entry.flags:
            entry.flags.remove("fuzzy")

    def build_system_prompt(self) -> str:
        """Return the system prompt, with extra rules when the target is Arabic."""
        base_prompt = f"""You are a professional translator specializing in {self.context} software localization.
CRITICAL RULES:
1. Translate from English to {self.target_lang}
2. Return ONLY a JSON array of translated strings
3. Preserve ALL variables exactly: %(name)s, {{variable}}, %s, etc.
4. Preserve HTML tags (<strong>, <a>, etc.) - do not translate them
5. Preserve newlines (\\n) and formatting
6. Maintain the same array length and order as input
"""
        if self.target_lang.lower() in self._ARABIC_LANGS:
            base_prompt += """
ARABIC TRANSLATION REQUIREMENTS:
- Use Modern Standard Arabic (الفصحى) - formal, professional
- Use healthcare/medical terminology appropriate for Saudi Arabian hospitals
- Maintain professional, respectful tone suitable for patient experience management
- Ensure grammatical correctness and natural flow
- Do not use colloquial Arabic (العامية)
- Numbers and dates should follow Arabic conventions where appropriate
- Keep English technical terms if no standard Arabic equivalent exists (e.g., API, URL)
"""
        return base_prompt

    def build_user_prompt(self, texts: List[Dict]) -> str:
        """Return the user prompt embedding ``texts`` as a JSON array."""
        return f"""Translate these texts to {self.target_lang}.
Input format: JSON array with text and optional context
Output format: JSON array of translated strings only
Texts to translate:
{json.dumps(texts, ensure_ascii=False, indent=2)}
Return ONLY the JSON array of translations."""

    @staticmethod
    def _decode_json(text: str) -> Any:
        """Decode ``text`` as JSON, tolerating chatter around a JSON array.

        If the text as a whole is not valid JSON, the span from the first
        ``[`` to the last ``]`` is tried before giving up.

        Raises:
            json.JSONDecodeError: When neither attempt yields valid JSON.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("["), text.rfind("]")
            if start == -1 or end <= start:
                raise
            return json.loads(text[start : end + 1])

    def parse_response(self, content: str, expected_count: int) -> Optional[List[str]]:
        """Extract the list of translations from the raw model reply.

        Args:
            content: Reply text; may be wrapped in a Markdown code fence.
            expected_count: Number of strings the reply must contain.

        Returns:
            The translations, or ``None`` (after reporting on stderr) when the
            reply is not a JSON array of exactly ``expected_count`` strings.
        """
        # The reply content can be None; treat that like an empty reply.
        text = (content or "").strip()
        if text.startswith("```json"):
            text = text[7:]
        elif text.startswith("```"):
            text = text[3:]
        if text.endswith("```"):
            text = text[:-3]
        text = text.strip()

        try:
            translations = self._decode_json(text)
        except json.JSONDecodeError as e:
            self.stderr.write(self.style.ERROR(f"JSON parse error: {e}"))
            return None

        if not isinstance(translations, list):
            self.stderr.write(self.style.ERROR("Response is not a JSON array"))
            return None
        if len(translations) != expected_count:
            self.stderr.write(
                self.style.ERROR(f"Count mismatch: expected {expected_count}, got {len(translations)}")
            )
            return None
        # Later steps slice the items and store them as msgstr, so anything
        # other than a string would fail later with an unrelated error.
        if not all(isinstance(item, str) for item in translations):
            self.stderr.write(self.style.ERROR("Response contains non-string items"))
            return None

        if not self.skip_validation and self.target_lang.lower() in self._ARABIC_LANGS:
            translations = [self.validate_arabic(item) for item in translations]
        return translations

    def validate_arabic(self, text: str) -> str:
        """Warn when ``text`` contains no Arabic characters and tidy its spacing.

        Only runs of spaces/tabs are collapsed and trimmed. Newlines are kept,
        because the prompt asks the model to preserve them and gettext expects
        msgid and msgstr to agree on a trailing newline.
        """
        if not text:
            return text
        if not self._ARABIC_CHARS.search(text):
            self.stderr.write(self.style.WARNING(f"Translation may not be in Arabic: {text[:50]}..."))
        return self._HORIZONTAL_WS.sub(" ", text).strip(" \t")

    @staticmethod
    def chunked(iterable: List, n: int):
        """Yield consecutive slices of ``iterable`` holding at most ``n`` items."""
        for i in range(0, len(iterable), n):
            yield iterable[i : i + n]