OmniRoute/scripts/validate_translation.py

#!/usr/bin/env python3
"""
OmniRoute i18n Translation Validator
Script for comparing source (en.json) with any translation
Detects missing translations and source changes needing updates

Usage:
    python validate_translation.py              # Uses TRANSLATION_LANG env or --lang argument
    python validate_translation.py --lang cs    # Validate Czech (cs.json)
    python validate_translation.py -l de       # Validate German (de.json)
    TRANSLATION_LANG=fr python validate_translation.py  # Validate French

Environment variables:
    TRANSLATION_LANG    Target language code (e.g., cs, de, fr)
"""

import json
import sys
import os
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any
import argparse

# Colors (ANSI)
RED = "\033[0;31m"
GREEN = "\033[0;32m"
YELLOW = "\033[1;33m"
BLUE = "\033[0;34m"
NC = "\033[0m"

# Configuration - find repo root relative to this script
_script_dir = Path(__file__).parent.resolve()
# If script is in scripts/ subfolder, go up one level to repo root
if _script_dir.name == "scripts":
    SCRIPT_DIR = _script_dir.parent
else:
    SCRIPT_DIR = _script_dir

MESSAGES_DIR = SCRIPT_DIR / "src" / "i18n" / "messages"
SOURCE_FILE = MESSAGES_DIR / "en.json"


# Get target language from env or argument
def get_target_lang() -> str:
    """Get target language from ENV or CLI argument."""
    # First check environment variable
    env_lang = os.environ.get("TRANSLATION_LANG")
    if env_lang:
        return env_lang

    # Then check command line argument (will be set in main)
    if hasattr(get_target_lang, "cli_lang"):
        return get_target_lang.cli_lang

    # Default to cs for backwards compatibility
    return "cs"


# Keys that should NOT be translated (technical terms, proper names, etc.)
# Loaded from external file for easier maintenance
_UNTRANSLATABLE_KEYS_FILE = _script_dir / "i18n" / "untranslatable-keys.json"
if _UNTRANSLATABLE_KEYS_FILE.exists():
    with open(_UNTRANSLATABLE_KEYS_FILE, "r", encoding="utf-8") as _f:
        UNTRANSLATABLE_KEYS = set(json.load(_f).get("keys", []))
else:
    UNTRANSLATABLE_KEYS = set()


def print_header(msg: str) -> None:
    print(f"\n{BLUE}{'=' * 50}{NC}")
    print(f"{BLUE}{msg}{NC}")
    print(f"{BLUE}{'=' * 50}{NC}")


def print_success(msg: str) -> None:
    print(f"{GREEN}✓ {msg}{NC}")


def print_warning(msg: str) -> None:
    print(f"{YELLOW}⚠ {msg}{NC}")


def print_error(msg: str) -> None:
    print(f"{RED}✗ {msg}{NC}")


def load_json(path: Path) -> Dict:
    """Load JSON file"""
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print_error(f"Invalid JSON in {path}: {e}")
        sys.exit(1)


def get_all_keys(obj: Any, prefix: str = "") -> Set[str]:
    """Recursively get all leaf keys from JSON object"""
    keys = set()
    if isinstance(obj, dict):
        for key, value in obj.items():
            new_prefix = f"{prefix}.{key}" if prefix else key
            if isinstance(value, dict):
                keys.update(get_all_keys(value, new_prefix))
            elif isinstance(value, list):
                # Handle arrays - check first element for structure
                if value and isinstance(value[0], dict):
                    for i, item in enumerate(value):
                        keys.update(get_all_keys(item, f"{new_prefix}[{i}]"))
                else:
                    keys.add(new_prefix)
            else:
                keys.add(new_prefix)
    return keys


def find_missing_keys(source: Dict, trans: Dict) -> Set[str]:
    """Keys in source but not in translation"""
    source_keys = get_all_keys(source)
    trans_keys = get_all_keys(trans)
    return source_keys - trans_keys


def find_extra_keys(source: Dict, trans: Dict) -> Set[str]:
    """Keys in translation but not in source"""
    source_keys = get_all_keys(source)
    trans_keys = get_all_keys(trans)
    return trans_keys - source_keys


def get_value_by_path(obj: Dict, path: str) -> Any:
    """Get value from nested dict using dot notation"""
    keys = path.replace("[", ".").replace("]", "").split(".")
    current = obj
    for key in keys:
        if key.isdigit():
            idx = int(key)
            if isinstance(current, list) and idx < len(current):
                current = current[idx]
            else:
                return None
        else:
            if isinstance(current, dict) and key in current:
                current = current[key]
            else:
                return None
    return current


def find_untranslated(source: Dict, trans: Dict) -> Set[str]:
    """Keys where source value equals translation (not translated), excluding untranslatable keys"""
    source_keys = get_all_keys(source)
    untranslated = set()

    for key in source_keys:
        # Skip keys that are in the untranslatable list
        if key in UNTRANSLATABLE_KEYS:
            continue

        source_val = get_value_by_path(source, key)
        trans_val = get_value_by_path(trans, key)

        if source_val is not None and source_val == trans_val:
            untranslated.add(key)

    return untranslated


def find_placeholder_issues(source: Dict, trans: Dict) -> List[Tuple[str, str, str]]:
    """
    Find placeholder mismatches between source and translation.
    Only checks top-level placeholders like {count}, {day}, NOT ICU inner content.
    Returns list of (key, source_placeholder, trans_placeholder)
    """
    source_keys = get_all_keys(source)
    issues = []

    for key in source_keys:
        source_val = get_value_by_path(source, key)
        trans_val = get_value_by_path(trans, key)

        if source_val is None or trans_val is None:
            continue

        if not isinstance(source_val, str) or not isinstance(trans_val, str):
            continue

        # Only extract top-level placeholders: {name}, {count}, {day}, NOT {# X} inside ICU
        import re

        # Extract variable names from placeholders (e.g., 'name' from '{name}' or 'count' from '{count, plural, ...}')
        # This avoids false positives on ICU strings where the internal text is translated.
        placeholder_regex = r"\{\s*([a-zA-Z][a-zA-Z0-9_]*)"
        source_placeholders = set(re.findall(placeholder_regex, source_val))
        trans_placeholders = set(re.findall(placeholder_regex, trans_val))

        # Check for missing placeholders
        missing = source_placeholders - trans_placeholders
        if missing:
            issues.append((key, str(source_placeholders), str(trans_placeholders)))

    return issues


def compare_category(
    source: Dict, trans: Dict, category: str
) -> Tuple[bool, List[str]]:
    """Compare a specific category, return (complete, missing_keys)"""
    if category not in source:
        return False, [f"Category '{category}' not in source"]

    if category not in trans:
        return False, [f"Category '{category}' missing in translation"]

    source_keys = get_all_keys(source[category])
    trans_keys = get_all_keys(trans[category])
    missing = source_keys - trans_keys

    return len(missing) == 0, list(missing)


def get_translation_file() -> Path:
    """Get the translation file path based on target language."""
    lang = get_target_lang()
    return MESSAGES_DIR / f"{lang}.json"


def generate_report():
    """Generate full translation report"""
    translation_file = get_translation_file()
    print_header("OmniRoute Translation Report")
    print(f"Source: {SOURCE_FILE}")
    print(f"Translation: {translation_file}\n")

    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    # Count keys
    source_count = len(get_all_keys(source))
    trans_count = len(get_all_keys(trans))

    print(f"{BLUE}Key Statistics:{NC}")
    print(f"  Source keys: {source_count}")
    print(f"  Translation keys: {trans_count}\n")

    # Missing keys
    print_header("Missing Translations")
    missing = find_missing_keys(source, trans)
    if missing:
        print(f"{RED}Found {len(missing)} missing keys:{NC}")
        for key in sorted(missing)[:50]:  # Limit output
            print(f"  - {key}")
        if len(missing) > 50:
            print(f"  ... and {len(missing) - 50} more")
    else:
        print_success("No missing translations!")

    # Extra keys
    print_header("Extra Keys")
    extra = find_extra_keys(source, trans)
    if extra:
        print(f"{YELLOW}Found {len(extra)} extra keys:{NC}")
        for key in sorted(extra)[:50]:
            print(f"  - {key}")
    else:
        print_success("No extra keys!")

    # Untranslated
    print_header("Untranslated Keys (same as source)")
    untranslated = find_untranslated(source, trans)
    if untranslated:
        print(f"{YELLOW}Found {len(untranslated)} untranslated keys:{NC}")
        for key in sorted(untranslated)[:50]:
            print(f"  - {key}")
        if len(untranslated) > 50:
            print(f"  ... and {len(untranslated) - 50} more")
    else:
        print_success("All keys appear to be translated!")

    # Placeholder issues
    print_header("Placeholder Mismatches")
    placeholder_issues = find_placeholder_issues(source, trans)
    if placeholder_issues:
        print(f"{YELLOW}Found {len(placeholder_issues)} placeholder mismatches:{NC}")
        for key, src_ph, trans_ph in placeholder_issues[:20]:
            print(f"  - {key}")
            print(f"    Source: {src_ph}")
            print(f"    Trans:  {trans_ph}")
        if len(placeholder_issues) > 20:
            print(f"  ... and {len(placeholder_issues) - 20} more")
    else:
        print_success("All placeholders match!")

    # Per-category status
    print_header("Per-Category Status")
    for category in sorted(source.keys()):
        complete, missing = compare_category(source, trans, category)
        if complete:
            print_success(f"{category} - complete")
        else:
            print_error(f"{category} - missing {len(missing)} keys")

    # Summary
    print_header("Summary")
    if not missing and not extra and not untranslated:
        print(f"{GREEN}🎉 Translation is fully synchronized!{NC}")
        return 0
    else:
        print(f"{YELLOW}Translation needs attention:{NC}")
        print(f"  - Missing: {len(missing)}")
        print(f"  - Extra: {len(extra)}")
        print(f"  - Untranslated: {len(untranslated)}")
        return 0


def quick_check() -> int:
    """Quick check - just show counts"""
    translation_file = get_translation_file()
    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    missing = find_missing_keys(source, trans)
    untranslated = find_untranslated(source, trans)

    print(f"Missing: {len(missing)}")
    print(f"Untranslated: {len(untranslated)}")
    print(f"Ignored (UNTRANSLATABLE_KEYS): {len(UNTRANSLATABLE_KEYS)}")

    # Exit codes:
    # 0 = OK
    # 1 = generic error
    # 2 = missing string in translation
    # 3 = untranslated (soft warning - not a failure)
    if missing:
        print_warning(f"{len(missing)} missing keys (non-critical)")
        return 0
    # untranslated is a soft warning, not a failure - translations exist, just not localized
    if untranslated:
        print_warning(f"{len(untranslated)} untranslated keys (non-critical)")
        return 0
    return 0


def show_diff(category: str) -> int:
    """Show detailed diff for a category"""
    translation_file = get_translation_file()
    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    if category not in source:
        print_error(f"Category '{category}' not found in source")
        print("Available categories:")
        for cat in sorted(source.keys()):
            print(f"  - {cat}")
        return 1

    print_header(f"Diff for category: {category}")

    print(f"{BLUE}{'Key':<30} | {'Source':<25} | {'Translation':<25}{NC}")
    print("-" * 85)

    source_keys = get_all_keys(source[category])

    for key in sorted(source_keys):
        source_val = get_value_by_path(source[category], key)
        trans_val = get_value_by_path(trans.get(category, {}), key)

        # Truncate long values
        source_str = str(source_val)[:25] if source_val else "(null)"
        trans_str = str(trans_val)[:25] if trans_val else "(missing)"

        if source_val == trans_val:
            status = f"{YELLOW}(same){NC}"
        elif trans_val is None:
            status = f"{RED}(missing){NC}"
        else:
            status = f"{GREEN}(ok){NC}"

        print(f"{key:<30} | {source_str:<25} | {trans_str:<25} {status}")

    return 0


def export_csv(output_file: str) -> int:
    """Export to CSV"""
    translation_file = get_translation_file()
    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    print_header(f"Exporting to CSV: {output_file}")

    source_keys = get_all_keys(source)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("key,source_value,translation_value,status\n")

        for key in sorted(source_keys):
            source_val = get_value_by_path(source, key)
            trans_val = get_value_by_path(trans, key)

            # Escape commas
            source_str = str(source_val).replace(",", ";")
            trans_str = str(trans_val).replace(",", ";") if trans_val else ""

            if trans_val is None:
                status = "MISSING"
            elif source_val == trans_val:
                status = "UNTRANSLATED"
            else:
                status = "OK"

            f.write(f'"{key}","{source_str}","{trans_str}",{status}\n')

    print_success(f"Exported to {output_file}")
    return 0


def export_markdown(output_file: str) -> int:
    """Export all keys to separate Markdown files - translated and untranslated"""
    translation_file = get_translation_file()
    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    print_header(f"Exporting to Markdown: {output_file}")

    source_keys = get_all_keys(source)
    missing = find_missing_keys(source, trans)
    untranslated = find_untranslated(source, trans)

    # Separate translated and untranslated
    translated_keys = []
    untranslated_sorted = sorted(untranslated)

    for key in sorted(source_keys):
        if key not in missing and key not in untranslated:
            translated_keys.append(key)

    translated_count = len(translated_keys)
    untranslated_count = len(untranslated_sorted)

    # Export untranslated (main output file)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("# Nepřeložené klíče (Untranslated Keys)\n\n")
        f.write(f"Zdroj: `{SOURCE_FILE.name}` | Překlad: `{TRANSLATION_FILE.name}`\n\n")

        f.write(f"**Celkem: {untranslated_count} nepreložených klíčů**\n\n")

        f.write("| # | Klíč (Key) | Originál | Nepřeloženo |\n")
        f.write("|---|------------|----------|------------|\n")

        for i, key in enumerate(untranslated_sorted, 1):
            source_val = get_value_by_path(source, key)
            trans_val = get_value_by_path(trans, key)

            source_str = str(source_val).replace("|", "\\|")[:60]
            trans_str = str(trans_val).replace("|", "\\|")[:60]

            f.write(f"| {i} | `{key}` | {source_str} | {trans_str} |\n")

        f.write("\n## Shrnutí (Summary)\n\n")
        f.write(f"- Celkem klíčů: {len(source_keys)}\n")
        f.write(f"- Chybějících: {len(missing)}\n")
        f.write(f"- Nepřeložených: {untranslated_count}\n")
        f.write(f"- Přeložených: {translated_count}\n")

    # Export translated to separate file
    translated_file = output_file.replace(".md", "_translated.md")
    translation_filename = translation_file.name
    with open(translated_file, "w", encoding="utf-8") as f:
        f.write("# Přeložené klíče (Translated Keys)\n\n")
        f.write(f"Zdroj: `{SOURCE_FILE.name}` | Překlad: `{translation_filename}`\n\n")

        f.write(f"**Celkem: {translated_count} přeložených klíčů**\n\n")

        f.write("| # | Klíč (Key) | Originál | Překlad |\n")
        f.write("|---|------------|----------|---------|\n")

        for i, key in enumerate(translated_keys, 1):
            source_val = get_value_by_path(source, key)
            trans_val = get_value_by_path(trans, key)

            source_str = str(source_val).replace("|", "\\|")[:40]
            trans_str = str(trans_val).replace("|", "\\|")[:40]

            f.write(f"| {i} | `{key}` | {source_str} | {trans_str} |\n")

    print_success(f"Exported: {output_file} ({untranslated_count} keys)")
    print_success(f"Exported: {translated_file} ({translated_count} keys)")
    return 0


def usage():
    print("""
OmniRoute i18n Translation Validator

Usage: validate_translation.py [command] [options]

Options:
  -l, --lang <code>    Target language code (e.g., cs, de, fr)
                      Default: cs or TRANSLATION_LANG env variable

Commands:
  (default)        Generate full report
  quick            Quick check - just show counts
  diff <category>  Show detailed diff for a category
  csv [file]       Export to CSV (default: translation_report.csv)
  md [file]        Export to Markdown (default: translation_report.md)

Examples:
  python validate_translation.py                 # Full report (default: cs)
  python validate_translation.py --lang de        # Validate German
  python validate_translation.py -l fr            # Validate French
  TRANSLATION_LANG=es python validate_translation.py  # Validate Spanish
  python validate_translation.py quick            # Quick status check
  python validate_translation.py diff common      # Diff common category
  python validate_translation.py csv              # Export to CSV
  python validate_translation.py md               # Export to Markdown
  python validate_translation.py fix              # Auto-generate missing keys from en.json
""")


def fix_missing_keys() -> int:
    """Auto-generate missing keys by copying from en.json"""
    source_file = SOURCE_FILE
    translation_file = get_translation_file()

    source = load_json(source_file)
    trans = load_json(translation_file)

    # Get all keys recursively
    def get_all_keys(obj, prefix=''):
        keys = []
        if isinstance(obj, dict):
            for k, v in obj.items():
                full_key = f"{prefix}.{k}" if prefix else k
                if isinstance(v, dict):
                    keys.extend(get_all_keys(v, full_key))
                else:
                    keys.append(full_key)
        return keys

    source_keys = set(get_all_keys(source))
    trans_keys = set(get_all_keys(trans))

    missing = source_keys - trans_keys

    if not missing:
        print_success("No missing keys - translation file is complete!")
        return 0

    print(f"Found {len(missing)} missing keys")

    # Add missing keys to translation
    for key in missing:
        parts = key.split('.')
        current = trans
        for i, part in enumerate(parts[:-1]):
            if part not in current:
                current[part] = {}
            current = current[part]

        # Get value from source
        src = source
        for part in parts:
            src = src.get(part, {})

        # Set the value (use English as fallback)
        current[parts[-1]] = src if isinstance(src, str) else key

    # Write back
    with open(translation_file, 'w', encoding='utf-8') as f:
        json.dump(trans, f, ensure_ascii=False, indent=2)
        f.write('\n')

    print_success(f"Added {len(missing)} missing keys to {translation_file.name}")
    return 0


def main():
    # Parse global arguments first
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("-l", "--lang", dest="lang", default=None)
    parser.add_argument("command", nargs="?")
    parser.add_argument("arg", nargs="?")

    # Parse known args only to allow commands to handle their own args
    args, _ = parser.parse_known_args()

    # Set language from argument or use default
    if args.lang:
        get_target_lang.cli_lang = args.lang
    elif not os.environ.get("TRANSLATION_LANG"):
        # Default to cs for backwards compatibility
        get_target_lang.cli_lang = "cs"

    # Check if translation file exists
    translation_file = get_translation_file()
    if not translation_file.exists():
        print_error(f"Translation file not found: {translation_file}")
        print(f"Available languages:")
        for f in sorted(MESSAGES_DIR.glob("*.json")):
            if f.name != "en.json":
                print(f"  - {f.stem}")
        return 1

    # Execute command
    if not args.command or args.command in ("help", "--help", "-h"):
        return generate_report()

    if args.command == "quick":
        return quick_check()
    elif args.command == "diff":
        if not args.arg:
            print_error("Please specify category")
            usage()
            return 1
        return show_diff(args.arg)
    elif args.command == "csv":
        output = args.arg if args.arg else "translation_report.csv"
        return export_csv(output)
    elif args.command == "md":
        output = args.arg if args.arg else "translation_report.md"
        return export_markdown(output)
    elif args.command == "fix":
        return fix_missing_keys()
    else:
        print_error(f"Unknown command: {args.command}")
        usage()
        return 1


if __name__ == "__main__":
    sys.exit(main())