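"""Translate gettext .po files with an OpenAI-compatible LLM provider.

Example invocations (the script file name is illustrative):

    python translate_po.py locale/django.po --lang French --provider openai
    python translate_po.py locale/django.po --lang zh-CN --provider ollama --model llama3 --fix-fuzzy
"""
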
import argparse
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import polib
from openai import OpenAI, APIConnectionError, RateLimitError


# --- Terminal Colors ---

class Colors:
    HEADER = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'


def print_success(msg): print(f"{Colors.GREEN}{msg}{Colors.ENDC}")
def print_warning(msg): print(f"{Colors.WARNING}{msg}{Colors.ENDC}")
def print_error(msg): print(f"{Colors.FAIL}{msg}{Colors.ENDC}")
def print_info(msg): print(f"{Colors.BLUE}{msg}{Colors.ENDC}")


# --- Provider Configurations ---

class ProviderFactory:
    @staticmethod
    def get_client(provider_name, api_key=None, base_url=None):
        """
        Returns a configured OpenAI client and default model based on the provider.
        """
        provider_name = provider_name.lower()

        if provider_name == 'glm':
            return OpenAI(
                api_key=api_key or os.environ.get('ZAI_API_KEY'),
                base_url="https://api.z.ai/api/coding/paas/v4/"
            ), "glm-4.6"

        elif provider_name == 'ollama':
            # Ollama acts like OpenAI but locally
            return OpenAI(
                api_key='ollama',  # Required but ignored by Ollama
                base_url=base_url or "http://localhost:11434/v1"
            ), "llama3"  # Default model, user can override with --model

        elif provider_name == 'openai':
            return OpenAI(
                api_key=api_key or os.environ.get('OPENAI_API_KEY')
            ), "gpt-4o-mini"

        else:
            raise ValueError(f"Unknown provider: {provider_name}")


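# Illustrative use of the factory (a sketch, not executed by this script):
#
#   client, default_model = ProviderFactory.get_client('ollama')
#   # -> client pointed at http://localhost:11434/v1, default_model == "llama3"

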
# --- Main Logic ---

def translate_po_file(args):

    # 1. Setup Provider
    try:
        client, default_model = ProviderFactory.get_client(args.provider, args.api_key, args.api_base)
        model_name = args.model or default_model
    except Exception as e:
        print_error(f"Configuration Error: {e}")
        return

    # 2. Load PO File
    print_info(f"Loading file: {args.path}")
    try:
        po = polib.pofile(args.path)
    except Exception as e:
        print_error(f"Could not load file: {e}")
        return

    # 3. Filter Entries
    entries_to_process = []
    for entry in po:
        if entry.obsolete:
            continue

        is_empty = not entry.msgstr.strip()
        is_fuzzy = 'fuzzy' in entry.flags

        if is_empty or (args.fix_fuzzy and is_fuzzy):
            entries_to_process.append(entry)

    total_entries = len(entries_to_process)
    print_success(f"Target: {args.lang} | Provider: {args.provider} | Model: {model_name}")
    print_success(f"Found {total_entries} entries to translate.")

    if total_entries == 0:
        print_success("Nothing to do.")
        return

    # 4. Special Handling for Local Ollama
    # Local models struggle with concurrent requests, so force workers=1.
    if args.provider == 'ollama' and args.workers > 1:
        print_warning("⚠️ Warning: Using multiple workers with Ollama can crash local GPUs.")
        print_warning("   Forcing workers=1 for stability.")
        args.workers = 1

    # 5. Batch Processing Helper
    def chunked(iterable, n):
        for i in range(0, len(iterable), n):
            yield iterable[i:i + n]

    batches = list(chunked(entries_to_process, args.batch_size))
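    # For example, 25 entries with --batch-size 10 yield three batches of
    # sizes 10, 10 and 5: list(chunked(list(range(25)), 10)).
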
    # 6. Worker Function
    def process_batch(batch_entries):
        texts = [e.msgid for e in batch_entries]

        # System prompt: critical for JSON enforcement
        system_prompt = (
            "You are a professional technical translator. "
            "You will receive a JSON list of English strings. "
            "Translate them accurately. "
            "IMPORTANT Rules:\n"
            "1. Return ONLY a valid JSON list of strings.\n"
            "2. Preserve Python format placeholders (%(count)s, {name}, %s) exactly.\n"
            "3. Do not translate HTML tags.\n"
            "4. Do NOT output markdown (like ```json), just raw JSON."
        )

        user_prompt = (
            f"Translate these texts into {args.lang}:\n"
            f"{json.dumps(texts, ensure_ascii=False)}"
        )

        attempts = 0
        max_retries = 3

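        # Illustrative request payload for a two-entry batch (hypothetical content):
        #   Translate these texts into French:
        #   ["Hello", "%(count)s items"]
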
        while attempts < max_retries:
            try:
                # response_format is only passed for OpenAI; sending it as None
                # to other backends may be rejected, so build kwargs conditionally.
                request_kwargs = {
                    "model": model_name,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ],
                    "temperature": 0.1,  # Low temperature for near-deterministic output
                }
                if args.provider == "openai":
                    request_kwargs["response_format"] = {"type": "json_object"}

                completion = client.chat.completions.create(**request_kwargs)

                content = completion.choices[0].message.content or ""

                # Cleanup: some local models love adding markdown fences despite instructions
                content = content.replace('```json', '').replace('```', '').strip()

                # Flexible JSON parsing
                try:
                    data = json.loads(content)
                    # Handle cases where the model returns {"translations": [...]} instead of [...]
                    if isinstance(data, dict):
                        # Use the first list value found in the object
                        translations = next((v for v in data.values() if isinstance(v, list)), None)
                        if translations is None:
                            return False, f"Could not find list in JSON object: {data}"
                    else:
                        translations = data
                except json.JSONDecodeError:
                    return False, f"Invalid JSON received: {content[:50]}..."

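                # Either shape is accepted above: '["Bonjour"]' and
                # '{"translations": ["Bonjour"]}' both yield translations == ["Bonjour"].
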
                if not isinstance(translations, list) or len(translations) != len(batch_entries):
                    got = len(translations) if isinstance(translations, list) else 'invalid'
                    return False, f"Count mismatch: sent {len(batch_entries)}, got {got}"

                # Apply translations with type checking
                for entry, translation in zip(batch_entries, translations):

                    # Guard against AttributeError ('dict' object has no attribute
                    # 'splitlines') inside polib when a model returns non-string items.
                    if isinstance(translation, dict):
                        # Grab the first string value in the dict (e.g., {"text": "Hello"})
                        extracted = next((str(v) for v in translation.values() if isinstance(v, str)), None)
                        translation = extracted if extracted else str(translation)
                    elif not isinstance(translation, str):
                        # Convert any other type (e.g., bool or int) to string
                        translation = str(translation)

                    entry.msgstr = translation
                    if 'fuzzy' in entry.flags:
                        entry.flags.remove('fuzzy')

                return True, "Success"

            except (RateLimitError, APIConnectionError) as e:
                attempts += 1
                time.sleep(2 ** attempts)  # Exponential backoff: 2s, 4s, 8s
                if attempts == max_retries:
                    return False, f"API Error: {e}"
            except Exception as e:
                return False, f"Unexpected: {e}"

    # 7. Execution Loop
    success_count = 0
    print_info(f"Processing {len(batches)} batches...")

    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        future_to_batch = {executor.submit(process_batch, batch): batch for batch in batches}

        for i, future in enumerate(as_completed(future_to_batch)):
            batch = future_to_batch[future]
            success, msg = future.result()

            if success:
                success_count += len(batch)
                print_success(f"[{i+1}/{len(batches)}] Batch done.")
            else:
                print_warning(f"[{i+1}/{len(batches)}] Batch failed: {msg}")

            # Auto-save every 5 completed batches so progress survives interruptions
            if (i + 1) % 5 == 0:
                po.save()

    # 8. Final Save
    po.save()
    print_success("\n------------------------------------------------")
    print_success(f"Finished! Translated {success_count}/{total_entries} entries.")
    print_success(f"File saved: {args.path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Translate .po files using AI providers (Z.ai, Ollama, OpenAI)")

    parser.add_argument('path', type=str, help='Path to the .po file')
    parser.add_argument('--lang', type=str, required=True, help='Target language (e.g., "French", "zh-CN")')

    # Provider Settings
    parser.add_argument('--provider', type=str, default='ollama', choices=['glm', 'ollama', 'openai'], help='AI provider to use')
    parser.add_argument('--model', type=str, help='Model name (e.g., glm-4.6, llama3, gpt-4o-mini). Defaults vary by provider.')
    parser.add_argument('--api-key', type=str, help='API key (optional if the matching env var is set)')
    parser.add_argument('--api-base', type=str, help='Custom API base URL (useful for custom Ollama ports)')

    # Performance Settings
    parser.add_argument('--batch-size', type=int, default=10, help='Entries per request. Keep low (5-10) for local models.')
    parser.add_argument('--workers', type=int, default=3, help='Parallel threads. Forced to 1 when using Ollama.')

    # Logic Settings
    parser.add_argument('--fix-fuzzy', action='store_true', help='Re-translate entries marked as fuzzy')

    args = parser.parse_args()
    translate_po_file(args)