"""Urdu-specific translation handlers.

Handles:
- Urdu-specific translation prompt (Nastaliq script, spoken Urdu vocabulary)
- Urdu → Devanagari transliteration for TTS (Chatterbox needs Devanagari)
- Devanagari → Urdu script conversion for captions
"""
import json
import re
import unicodedata

from ._shared import build_client, parse_json_array, bedrock_converse, MODEL, log_llm_call


# ── Public dispatcher hooks ──────────────────────────────────────────────────

def get_translation_prompt() -> str:
    """Build the system prompt used when translating numbered lines into Urdu.

    The prompt is assembled from five consecutive sections (role, language
    rules, duration constraint, TTS compatibility, output format) that are
    concatenated without any separator.
    """
    role = (
        "You are a professional voice-over translator for commonly spoken Urdu. "
        "Translate the following numbered lines into Urdu (Nastaliq/Arabic script).\n\n"
    )

    # Vocabulary and register constraints.
    language_rules = (
        "LANGUAGE RULES:\n"
        "- Use ONLY everyday spoken Urdu — the kind heard on Pakistani news, dramas, and streets.\n"
        "- Use Urdu, Persian, and Arabic-origin vocabulary ONLY. "
        "NEVER use Sanskrit-origin Hindi words (e.g. use محبت not پیار, زندگی not جیون, "
        "وقت not سمے, لیکن not پرنتو, اگر not یدی).\n"
        "- Keep it natural and conversational, not literary or formal.\n"
        "- NEVER insert English words, interjections, or filler sounds (Oh, Ah, Hmm, Well, So). "
        "Translate ALL such expressions into Urdu equivalents.\n\n"
    )

    # Each input line carries a "[N.Ns]" duration the spoken output must fit.
    duration_rules = (
        "CRITICAL — DURATION CONSTRAINT:\n"
        "Each line shows its spoken duration in brackets (e.g. [4.6s]). "
        "The translation will be spoken by TTS and MUST fit within that duration.\n"
        "STRICT RULE: Your translation MUST have FEWER words than the original English. "
        "If the English has 10 words, aim for 7-8 Urdu words maximum.\n"
        "Every word must earn its place — if removing a word doesn't lose core meaning, remove it. "
        "Paraphrase aggressively. Use shorter synonyms. Merge clauses. "
        "A concise translation that fits the time is ALWAYS better than a complete one that overflows.\n\n"
    )

    tts_rules = (
        "TTS COMPATIBILITY — IMPORTANT:\n"
        "The TTS model struggles with long sentences that have multiple commas or clauses. "
        "Restructure into short, direct sentences — but the TOTAL text must still fit the duration shown in brackets. "
        "Do NOT add extra words or content when restructuring. The goal is simpler phrasing, not more text.\n"
        "Each output line is still ONE item in the array (one per input line). "
        "You may use multiple short sentences within that single line, but it must all fit the original duration.\n\n"
    )

    # Output contract: a bare JSON array, one string per input line.
    output_rules = (
        "Write ONLY in Urdu script (Nastaliq/Arabic script). "
        "Return ONLY a JSON array of translated strings, in order, no extra text. "
        "Do NOT include the duration prefix or numbering in the output — only the translated text itself. "
        'Example input: 1. [3.0s] Hello\n2. [2.5s] Goodbye '
        'Example output: ["سلام", "خدا حافظ"]'
    )

    sections = [role, language_rules, duration_rules, tts_rules, output_rules]
    return "".join(sections)


def get_fallback_mode() -> str:
    """Name the fallback translation backend for Urdu.

    Always returns "bedrock": Urdu falls back to Bedrock rather than
    Google Translate.
    """
    fallback_backend = "bedrock"
    return fallback_backend


# Matches a standalone English filler word (case-insensitive), plus one optional
# trailing '.', '!' or ',' and any whitespace after it.
# The closing \b is required: without it the pattern also consumed the start of
# longer Latin words ("Sony" -> "ny", "Ahmed" -> "med").
_ENGLISH_FILLERS = re.compile(
    r'\b(Oh|Ah|Hmm|Well|So|Right|Okay|OK|Um|Uh|Hey|Wow|Ooh|Aah)\b[\.\!\,]?\s*',
    re.IGNORECASE,
)


def post_translate(segments: list[dict]) -> list[dict]:
    """Apply Urdu-specific clean-up to translated segments, then prepare TTS text.

    For every segment, 'translated_text' (treated as "" when the key is
    missing) has all matches of the English-filler pattern removed and its
    surrounding whitespace trimmed; the cleaned value is written back in place.
    The segments are then handed to transliterate_to_devanagari, and its
    return value is returned. Captions keep using 'translated_text', which is
    already in Urdu script.
    """
    for segment in segments:
        without_fillers = _ENGLISH_FILLERS.sub("", segment.get("translated_text", ""))
        segment["translated_text"] = without_fillers.strip()

    # Transliteration runs on the cleaned text.
    return transliterate_to_devanagari(segments)


# ── Transliteration: Urdu → Devanagari (for TTS) ────────────────────────────

# Per-character Urdu → Devanagari lookup used for the rough conversion.
# Plain alif ('ا') is intentionally absent: its output depends on its position
# in the word and is decided in _urdu_to_rough_devanagari.
# Entries mapped to '' are dropped from the output.
_URDU_TO_DEVA = {
    'آ': 'आ', 'ب': 'ब', 'پ': 'प', 'ت': 'त', 'ٹ': 'ट', 'ث': 'स',
    'ج': 'ज', 'چ': 'च', 'ح': 'ह', 'خ': 'ख़', 'د': 'द', 'ڈ': 'ड',
    'ذ': 'ज़', 'ر': 'र', 'ڑ': 'ड़', 'ز': 'ज़', 'ژ': 'झ', 'س': 'स',
    'ش': 'श', 'ص': 'स', 'ض': 'ज़', 'ط': 'त', 'ظ': 'ज़', 'ع': 'अ',
    'غ': 'ग़', 'ف': 'फ़', 'ق': 'क़', 'ک': 'क', 'ك': 'क', 'گ': 'ग',
    'ل': 'ल', 'م': 'म', 'ن': 'न', 'ں': 'ं', 'و': 'व', 'ہ': 'ह',
    'ه': 'ह', 'ھ': '्ह', 'ی': 'य', 'ي': 'य', 'ے': 'े', 'ئ': 'इ',
    'َ': 'ा', 'ِ': 'ि', 'ُ': 'ु', 'ٰ': 'ा', 'ّ': '्', 'ً': 'न',
    'ٔ': '', 'ء': '', 'ؓ': '', '۔': '।', '،': ',', '؟': '?', '؛': ';',
}


def _urdu_to_rough_devanagari(text: str) -> str:
    """Map Urdu script to Devanagari one character at a time.

    Consonants are mapped through _URDU_TO_DEVA; characters not in the table
    pass through unchanged. Short vowels come out missing or wrong because
    Urdu script does not mark them explicitly, so the result is only a rough
    draft.

    Alif is position-dependent: at the start of a word it becomes the
    independent vowel 'अ', elsewhere the matra 'ा'. A word start is the
    beginning of the string or any position preceded by whitespace or
    punctuation (newline, tab, quotes, brackets, '،', '۔', ...). Treating only
    a literal space as a boundary would leave a dangling matra after those.

    Args:
        text: Urdu-script text.

    Returns:
        The rough Devanagari rendering of ``text``.
    """
    pieces = []
    previous = None
    for ch in text:
        if ch == 'ا':
            at_word_start = (
                previous is None
                or previous.isspace()
                # Unicode general categories P* (punctuation) and Z* (separators).
                or unicodedata.category(previous)[0] in "PZ"
            )
            pieces.append('अ' if at_word_start else 'ा')
        else:
            pieces.append(_URDU_TO_DEVA.get(ch, ch))
        previous = ch

    # Fix a common edge case: ئ + ے (e.g. in بروئے) maps to 'इ' + 'े',
    # which is collapsed into the single vowel 'ए'.
    return ''.join(pieces).replace('इे', 'ए')


def _polish_devanagari_vowels(client, model, numbered, expected_count, max_attempts=2):
    """Ask an LLM to fix only the vowels of rough Devanagari, keeping the vocabulary.

    Args:
        client: Object exposing ``chat.completions.create(...)``; the response
            is read via ``response.choices[0].message.content``.
        model: Model identifier forwarded to the client.
        numbered: User message containing the numbered Urdu / rough-Devanagari pairs.
        expected_count: Number of items the returned JSON array must contain.
        max_attempts: Maximum number of LLM calls before giving up.

    Returns:
        A list of ``expected_count`` strings on success, or ``None`` when every
        attempt failed (call error, unparsable JSON, wrong item count,
        non-string items, or Urdu script detected in the sampled output).
    """
    prompt = (
        "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
        "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
        "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari.\n\n"
        "STRICT RULES:\n"
        "- Do NOT change, replace, or translate ANY word. Keep every single Urdu word exactly.\n"
        "- Only add or fix vowel matras (ा ि ी ु ू े ै ो ौ ं ँ)\n"
        "- Add nuqta dots where needed: क़ ख़ ग़ ज़ फ़\n"
        "- Add halant (्) for conjuncts where needed\n\n"
        "EXAMPLES:\n"
        "Urdu: محبت | rough: महबत | fixed: मोहब्बत\n"
        "Urdu: استعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
        "Urdu: حکمت | rough: हकमत | fixed: हिकमत\n"
        "Urdu: طاقت | rough: ताक़त | fixed: ताक़त\n"
        "Urdu: ہمدردی | rough: हमदरदय | fixed: हमदर्दी\n"
        "Urdu: پیروی | rough: पयरवय | fixed: पैरवी\n"
        "Urdu: کریم | rough: करयम | fixed: करीम\n\n"
        "Return ONLY a JSON array of corrected Devanagari strings, in order, one per input."
    )
    # Prepended after the model answers in Urdu script; added at most once so
    # repeated retries do not keep growing the prompt.
    strict_prefix = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n"

    for attempt in range(1, max_attempts + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": numbered},
                ],
                temperature=0.1,
            )
            raw = response.choices[0].message.content.strip()
            log_llm_call(
                step="urdu_vowel_polish", provider="pollinations", model=model,
                system_prompt=prompt, user_prompt=numbered,
                response=raw, temperature=0.1,
            )

            try:
                polished_list = parse_json_array(raw)
            except (json.JSONDecodeError, ValueError):
                print(f"[urdu] Attempt {attempt}: Could not parse response as JSON")
                continue

            if len(polished_list) != expected_count:
                print(f"[urdu] Attempt {attempt}: Got {len(polished_list)} items, expected {expected_count}")
                continue

            # Every item must be a string. Checking the whole list stops nested
            # objects or numbers later in the array from being returned as text.
            if not all(isinstance(item, str) for item in polished_list):
                print(f"[urdu] Attempt {attempt}: Response contains non-string items")
                continue

            # Quick check on the first three items for Arabic-block characters,
            # i.e. the model answered in Urdu script instead of Devanagari.
            sample = " ".join(polished_list[:3])
            if any('\u0600' <= ch <= '\u06FF' for ch in sample):
                print(f"[urdu] Attempt {attempt}: Output still contains Urdu script — retrying")
                if not prompt.startswith(strict_prefix):
                    prompt = strict_prefix + prompt
                continue

            return polished_list

        except Exception as e:
            # Best-effort: any client/logging failure counts as a failed attempt.
            print(f"[urdu] LLM error on attempt {attempt}: {e}")

    return None


def transliterate_to_devanagari(segments: list[dict]) -> list[dict]:
    """Convert Urdu-script translations to Devanagari for TTS.

    Sets 'tts_text' on every segment (in place) using a hybrid approach:

    1. Deterministic character mapping produces rough Devanagari.
    2. An LLM (Pollinations first, then Bedrock with up to two attempts)
       polishes the vowels.
    3. If every polishing attempt fails, the rough Devanagari is used as is.

    Polished items are checked individually before use. A non-string item is
    replaced by that segment's rough text. An item that still contains
    Arabic-block characters is passed through the deterministic mapping, so
    leftover Urdu script is converted without discarding the whole batch.

    Args:
        segments: Segment dicts; 'translated_text' is read (missing → "").

    Returns:
        The same ``segments`` list, with 'tts_text' added to each segment
        (returned untouched when empty).
    """
    if not segments:
        return segments

    print("[urdu] Starting Hybrid Urdu → Devanagari conversion...")

    # Step 1: Deterministic mapping to rough Devanagari
    urdu_texts = [seg.get("translated_text", "") for seg in segments]
    rough_texts = [_urdu_to_rough_devanagari(urdu) for urdu in urdu_texts]

    expected = len(segments)
    numbered = "\n".join(
        f"{i + 1}. Urdu: {urdu}\n   Rough: {rough}"
        for i, (urdu, rough) in enumerate(zip(urdu_texts, rough_texts))
    )

    def has_urdu_script(text):
        # True when any character lies in the Arabic Unicode block.
        return any('\u0600' <= ch <= '\u06FF' for ch in text)

    def apply_polished(polished):
        # Assign tts_text per segment, repairing unusable items individually.
        for seg, deva_text, rough in zip(segments, polished, rough_texts):
            if not isinstance(deva_text, str):
                deva_text = rough
            elif has_urdu_script(deva_text):
                deva_text = _urdu_to_rough_devanagari(deva_text)
            seg["tts_text"] = deva_text

    # Try Pollinations. Client construction is guarded too, so a failure there
    # still reaches the Bedrock fallback instead of aborting the conversion.
    try:
        client = build_client()
        polished_list = _polish_devanagari_vowels(client, MODEL, numbered, expected)
    except Exception as e:
        print(f"[urdu] WARNING: Pollinations polish unavailable ({e})")
        polished_list = None

    if polished_list:
        apply_polished(polished_list)
        print("[urdu] Urdu → Devanagari hybrid transliteration complete ✓")
        return segments

    print("[urdu] Pollinations Polish failed — trying Bedrock fallback...")

    # Bedrock Fallback
    system_prompt = (
        "You are a Devanagari spelling corrector for Urdu text. Below are Urdu sentences with ROUGH "
        "character-by-character Devanagari conversions. Consonants are correct but vowels are wrong/missing.\n\n"
        "YOUR ONLY JOB: Fix vowels to make readable Urdu in Devanagari. Do NOT change/replace/translate ANY word.\n\n"
        "EXAMPLES:\nمحبت | rough: महबत | fixed: मोहब्बत\nاستعمال | rough: असतअमाल | fixed: इस्तेमाल\n"
        "حکمت | rough: हकमत | fixed: हिकमत\nہمدردی | rough: हमदरदय | fixed: हमदर्दी\n\n"
        "Return ONLY a JSON array of corrected Devanagari strings."
    )

    for attempt in range(1, 3):
        # The try is per attempt so one failed call does not cancel the retry.
        try:
            raw = bedrock_converse(system_prompt, numbered, step="urdu_vowel_polish_bedrock")

            try:
                polished_list = parse_json_array(raw)
            except (json.JSONDecodeError, ValueError):
                print(f"[urdu] Bedrock attempt {attempt}: Could not parse response")
                continue

            if len(polished_list) != expected:
                print(f"[urdu] Bedrock attempt {attempt}: Got {len(polished_list)} items, expected {expected}")
                continue

            if not all(isinstance(item, str) for item in polished_list):
                print(f"[urdu] Bedrock attempt {attempt}: Response contains non-string items")
                continue

            # Quick check on the first three items: Urdu script here suggests
            # the model ignored the output script, so retry with a stricter prompt.
            if has_urdu_script(" ".join(polished_list[:3])):
                print(f"[urdu] Bedrock attempt {attempt}: Output contains Urdu script — retrying")
                system_prompt = "CRITICAL: OUTPUT MUST BE DEVANAGARI ONLY. NO ARABIC/URDU SCRIPT.\n\n" + system_prompt
                continue

            apply_polished(polished_list)
            print("[urdu] Urdu → Devanagari transliteration (Bedrock) complete ✓")
            return segments

        except Exception as e:
            print(f"[urdu] WARNING: Bedrock fallback failed on attempt {attempt} ({e})")

    print("[urdu] WARNING: All polishing failed. Falling back to rough Devanagari.")
    for seg, r_text in zip(segments, rough_texts):
        seg["tts_text"] = r_text
    return segments


# ── Transliteration: Devanagari → Urdu script (for captions) ────────────────

def transliterate_to_urdu_script(segments: list[dict]) -> list[dict]:
    """Convert Devanagari Urdu translations to Urdu (Nastaliq/Arabic) script for subtitles.

    Pollinations is tried first and Bedrock second. Bedrock is used whenever
    the first provider fails for any reason: client construction, the call
    itself, JSON parsing, or a wrong item count. On success 'caption_text' is
    set on every segment in place. If both providers fail the segments are
    returned without 'caption_text'.

    Args:
        segments: Segment dicts; 'translated_text' is read (missing → "").

    Returns:
        The same ``segments`` list (returned untouched when empty).
    """
    if not segments:
        return segments

    expected = len(segments)
    numbered = "\n".join(
        f"{i + 1}. {seg.get('translated_text', '')}" for i, seg in enumerate(segments)
    )

    system_prompt = (
        "You are a script converter. Convert the following Devanagari Urdu text into Urdu script (Nastaliq/Arabic script). "
        "This is NOT translation — the language is already Urdu, just written in Devanagari. "
        "Convert it to proper Urdu script preserving every word exactly.\n\n"
        "Return ONLY a JSON array of converted strings, in order, no extra text. "
        "Do NOT include numbering in the output."
    )

    urdu_list = None
    success_message = "[urdu] Urdu script transliteration complete ✓"

    # Primary provider: Pollinations.
    try:
        client = build_client()
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": numbered},
            ],
            temperature=0.1,
        )
        raw = response.choices[0].message.content.strip()
        log_llm_call(
            step="urdu_script_convert", provider="pollinations", model=MODEL,
            system_prompt=system_prompt, user_prompt=numbered,
            response=raw, temperature=0.1,
        )
        candidate = parse_json_array(raw)
        if len(candidate) == expected:
            urdu_list = candidate
        else:
            # A miscounted answer is unusable, so it falls through to Bedrock
            # like any other primary failure.
            print(f"[urdu] WARNING: Urdu script returned {len(candidate)} items, expected {expected} — trying Bedrock...")
    except Exception as e:
        print(f"[urdu] Pollinations transliteration failed ({e}) — trying Bedrock...")

    # Fallback provider: Bedrock.
    if urdu_list is None:
        success_message = "[urdu] Urdu script transliteration (Bedrock) complete ✓"
        try:
            raw = bedrock_converse(system_prompt, numbered, step="urdu_script_convert_bedrock")
            candidate = parse_json_array(raw)
            if len(candidate) == expected:
                urdu_list = candidate
            else:
                print(f"[urdu] WARNING: Bedrock Urdu script returned {len(candidate)} items, expected {expected}. Using Devanagari for captions")
        except Exception as e2:
            print(f"[urdu] WARNING: Bedrock transliteration also failed ({e2}), using Devanagari for captions")

    if urdu_list is None:
        # Best-effort: leave segments without 'caption_text'.
        return segments

    for seg, urdu_text in zip(segments, urdu_list):
        seg["caption_text"] = urdu_text

    print(success_message)
    return segments