"""
app.py
──────
Interfaz web Gradio para el sistema RAG de corrección de castellano s.XVI.

Arranque:
    python app.py

Requiere:
    - .env con OPENAI_API_KEY
    - (opcional) corpus en ./corpus/
"""

import hashlib
import json
import os

import gradio as gr
from dotenv import load_dotenv

from knowledge_base import SAMPLE_PAIRS
from corpus_loader import CorpusLoader
from vector_store import VectorStore
from rag_corrector import RAGCorrector
from evaluator import Evaluator

# Load variables from a local .env file (see the module docstring) before any
# of the os.getenv() lookups below run.
load_dotenv()

# ── Initialisation ────────────────────────────────────────────────────────────

print(" Inicializando Scriptorium RAG...")

# NOTE(review): `vs` is assigned again further down in this module, after
# DEMO_EXAMPLES — confirm this first instance is still needed.
vs = VectorStore()

# Load the corpus from disk (if it exists) and combine it with the built-in
# sample pairs. The corpus directory comes from CORPUS_PATH, default ./corpus.
loader = CorpusLoader(os.getenv("CORPUS_PATH", "./corpus"))
disk_pairs = loader.load()
all_pairs  = SAMPLE_PAIRS + disk_pairs

# Index everything in the vector store (ChromaDB, per the original comment).
vs.index(all_pairs)

# Shared service objects used by the Gradio callbacks defined below.
corrector = RAGCorrector(vs)
evaluator = Evaluator()

print(f" Sistema listo. Documentos en vector store: {vs.count()}")

# ── Demonstration examples ────────────────────────────────────────────────────

# Sample HTR inputs offered as one-click examples in the correction tab
# (passed to gr.Examples). They are input data: keep the spelling exactly as is.
DEMO_EXAMPLES = [
    "q̃ fizo merçed al dho lugar de las alcaualas del anno de mill e quinientos",
    "el escriuano del cabildo faze fe y da testimouio verdadero de todo lo sobredho",
    "en la muy noble çibdad de burgos a veynte dias del mes de março anno dho",
    "yo juan de la torre vezino desta uilla de toledo otorgo e conosco",
    "sepan quantos esta carta de poder vieren como yo pero lopez vezino dela villa",
    "fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
]

# Module-level state for the active embedding backend. cambiar_embedding()
# rebinds `vs`, `corrector` and `current_embed_model` when the user switches.
current_embed_model = "openai"
vs = VectorStore(embedding_model=current_embed_model)
vs.index(all_pairs)

# `vs` was just rebound to a new store, so the corrector must be rebuilt on top
# of it. Otherwise retrieval keeps using the earlier store object while
# add_to_corpus() and the status counters operate on this one.
corrector = RAGCorrector(vs)

def cambiar_embedding(embed_model: str):
    """Switch the active embedding model and rebuild the dependent objects.

    A new ``VectorStore`` is created for ``embed_model``; if it reports zero
    documents it is populated from ``all_pairs``. A new ``RAGCorrector`` is
    then built on top of it.

    The module globals ``vs``, ``corrector`` and ``current_embed_model`` are
    only rebound once every step has succeeded, so a failed switch leaves the
    previous configuration fully intact and the same model can be retried.

    Args:
        embed_model: Identifier of the embedding backend selected in the UI.

    Returns:
        A Markdown status message describing the outcome.
    """
    global vs, corrector, current_embed_model

    if embed_model == current_embed_model:
        return f"ℹ Ya estás usando **{embed_model}**"

    try:
        # Work on locals first; nothing global changes until all steps pass.
        new_vs = VectorStore(embedding_model=embed_model)

        # Index only when the collection for this model is still empty.
        if new_vs.count() == 0:
            new_vs.index(all_pairs)
            msg = f" Re-indexado con **{embed_model}** · {new_vs.count()} docs"
        else:
            msg = f" Cargado índice existente **{embed_model}** · {new_vs.count()} docs"

        # The corrector must be rebuilt so it retrieves from the new store.
        new_corrector = RAGCorrector(new_vs)
    except Exception as e:
        return f" Error cambiando embedding: {e}"

    # Commit the switch atomically from the caller's point of view.
    vs = new_vs
    corrector = new_corrector
    current_embed_model = embed_model
    return msg


# ── Función principal ─────────────────────────────────────────────────────────

def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
    """Correct an HTR transcription with the RAG corrector and render the panels.

    Args:
        htr_text: Raw HTR text to correct.
        top_k: Number of corpus documents to retrieve; coerced with ``int()``
            before being passed to the corrector.
        mostrar_prompt: When true, the system and user prompts are rendered
            into the last returned value.
        model: LLM identifier forwarded to ``corrector.correct``.

    Returns:
        A 6-tuple ``(corrected, docs_md, analysis_md, diff_md, status,
        prompt_md)``. On validation or API failure every element is an empty
        string except ``status``, which carries the message.
    """

    def _failure(message: str):
        # The click handler is wired to six output components, so every exit
        # path has to yield exactly six values.
        return "", "", "", "", message, ""

    if not htr_text.strip():
        return _failure(" Introduce un texto HTR para corregir.")

    if not os.getenv("OPENAI_API_KEY"):
        return _failure(" Falta OPENAI_API_KEY en el fichero .env")

    try:
        result = corrector.correct(htr_text, top_k=int(top_k), model=model)
    except Exception as e:
        return _failure(f" Error al llamar a la API: {e}")

    corrected = result["corrected"]
    retrieved = result["retrieved"]
    htr_errors = result["htr_errors"]
    grafia_w = result["grafia_warns"]

    # ── Retrieved-documents panel ─────────────────────────────────────────────
    docs_parts = [f"### Top-{len(retrieved)} documentos recuperados\n\n"]
    for i, doc in enumerate(retrieved, 1):
        docs_parts.append(
            f"**{i}. [{doc['type']} · {doc['region']} · {doc['date']}]**  "
            f"*similitud: {doc['score']}*\n\n"
            f"- **HTR:** `{doc['htr']}`\n"
            f"- **GT:**  `{doc['gt']}`\n"
        )
        if doc["corrections"]:
            docs_parts.append(f"- **Correcciones:** {', '.join(doc['corrections'])}\n")
        docs_parts.append("\n---\n")
    docs_md = "".join(docs_parts)

    # ── Analysis panel ────────────────────────────────────────────────────────
    analysis_parts = ["### Análisis del texto\n\n"]

    if htr_errors:
        analysis_parts.append("**⚠ Posibles errores HTR detectados:**\n")
        analysis_parts.extend(
            f"- `{err['htr']}` → `{err['gt']}`: {err['context']}  \n  *Ej: {err['example']}*\n"
            for err in htr_errors
        )
        analysis_parts.append("\n")

    if grafia_w:
        analysis_parts.append("**✦ Alertas de grafía (NO modernizar):**\n")
        analysis_parts.extend(
            f"- `{warn['modern']}` → mantener `{warn['ancient']}`: {warn['rule']}\n"
            for warn in grafia_w
        )
        analysis_parts.append("\n")

    if not htr_errors and not grafia_w:
        analysis_parts.append("*No se detectaron patrones conocidos de error en el texto.*\n")
    analysis_md = "".join(analysis_parts)

    # ── Visual diff ───────────────────────────────────────────────────────────
    # Words are compared position by position; when one text is longer, the
    # missing side is shown as an em dash.
    orig_words = htr_text.split()
    corr_words = corrected.split()
    diff_parts = []
    changed = 0
    for i in range(max(len(orig_words), len(corr_words))):
        o = orig_words[i] if i < len(orig_words) else "—"
        c = corr_words[i] if i < len(corr_words) else "—"
        if o != c:
            diff_parts.append(f"~~{o}~~ → **{c}**")
            changed += 1
        else:
            diff_parts.append(c)
    diff_md = (
        "### Diferencias HTR → Corregido\n\n"
        + " ".join(diff_parts)
        + f"\n\n*{changed} palabra(s) modificada(s) de {len(orig_words)} totales.*"
    )

    status = f" Corrección completada con **{result['model']}** · {vs.count()} docs en índice"

    # ── Prompt (optional) ─────────────────────────────────────────────────────
    prompt_visible = ""
    if mostrar_prompt:
        prompt_visible = (
            "### System Prompt\n\n"
            f"```\n{result.get('_system', '(ver rag_corrector.py)')}\n```\n\n"
            "### User Prompt (dinámico)\n\n"
            f"```\n{result['prompt']}\n```"
        )

    return corrected, docs_md, analysis_md, diff_md, status, prompt_visible

def evaluar_par(htr_text: str, gt_text: str):
    """Run the RAG correction on ``htr_text`` and report metrics against ``gt_text``.

    Args:
        htr_text: Raw HTR text to correct.
        gt_text: Reference groundtruth the correction is scored against.

    Returns:
        A Markdown report with a CER/WER before/after table, the modernism
        detector summary and the corrected text. If either input is blank a
        warning string is returned; any exception is turned into an error
        string.
    """
    if not (htr_text.strip() and gt_text.strip()):
        return "⚠ Introduce tanto el texto HTR como el groundtruth."
    try:
        outcome = corrector.correct(htr_text)
        scores = evaluator.evaluate_pair(htr_text, outcome["corrected"], gt_text)
        modernism = scores["modernism"]

        # Collect the report fragments and join them once at the end.
        fragments = [
            "### Métricas de evaluación\n\n",
            "| Métrica | Antes (HTR) | Después (RAG) | Mejora |\n",
            "|---------|------------|---------------|--------|\n",
            f"| **CER** | {scores['cer_before']:.2%} | {scores['cer_after']:.2%} | {scores['cer_improvement']:+.2%} |\n",
            f"| **WER** | {scores['wer_before']:.2%} | {scores['wer_after']:.2%} | {scores['wer_improvement']:+.2%} |\n\n",
            f"**Detector de modernismos:** score={modernism['score']:.2f}  ",
            f"({modernism['count']} problema(s) detectado(s))\n",
        ]

        flagged = modernism["issues"]
        if flagged:
            fragments.append("\nFormas modernas introducidas incorrectamente:\n")
            fragments.extend(
                f"- `{item['modern']}` (debería ser `{item['ancient']}`): {item['rule']}\n"
                for item in flagged
            )

        fragments.append(f"\n**Texto corregido por RAG:**\n> {outcome['corrected']}")
        return "".join(fragments)
    except Exception as err:
        return f" Error: {err}"


def add_to_corpus(htr_text: str, gt_text: str, doc_type: str, region: str, date: str, caligrafia: str):
    """Add a user-supplied HTR/groundtruth pair to the active vector store.

    The pair id is derived from a SHA-256 digest of the stripped HTR text.
    Python's built-in ``hash()`` is salted per process for strings, so it
    would give the same text a different id after every restart and defeat
    duplicate detection.

    Args:
        htr_text: Raw HTR text (required).
        gt_text: Corrected groundtruth (required).
        doc_type: Document type; ``"desconocido"`` when empty.
        region: Region of origin; ``"desconocida"`` when empty.
        date: Date of the document; empty string when not given.
        caligrafia: Script style; ``"desconocida"`` when empty.

    Returns:
        A status message: a warning if a required field is blank, a
        confirmation (or "already existed" notice) based on the truthiness of
        ``vs.index``'s return value, or an error string if anything raises.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "⚠ HTR y GT son obligatorios."
    try:
        htr_clean = htr_text.strip()
        # Stable across processes; hashes the same text that is stored below.
        digest = hashlib.sha256(htr_clean.encode("utf-8")).digest()
        pair_number = int.from_bytes(digest[:8], "big") % 100000
        pair_id = f"user_{pair_number:05d}"
        new_pair = {
            "id": pair_id,
            "htr": htr_clean,
            "gt": gt_text.strip(),
            "type": doc_type or "desconocido",
            "region": region or "desconocida",
            "date": date or "",
            "caligrafia": caligrafia or "desconocida",
            "corrections": [],
            "source": "user_added",
        }
        added = vs.index([new_pair])
        if added:
            return f" Par añadido al corpus con id `{pair_id}`. Total: {vs.count()} docs."
        return f" Par ya existía en el corpus (id: `{pair_id}`)."
    except Exception as e:
        return f" Error: {e}"


# ── Interfaz Gradio ───────────────────────────────────────────────────────────

# Build the whole Gradio UI. Components are created in layout order inside the
# nested context managers, and callbacks are wired right after the components
# they use. The resulting app object is bound to `demo`.
with gr.Blocks(
    title="Scriptorium RAG",
    theme=gr.themes.Base(
        primary_hue="amber",
        secondary_hue="stone",
        neutral_hue="stone",
        font=gr.themes.GoogleFont("IM Fell English"),
    ),
    css="""
    .header { text-align: center; padding: 20px 0 10px; }
    .header h1 { font-size: 2.2em; color: #92400e; letter-spacing: 0.15em; }
    .header p  { color: #78716c; font-style: italic; }
    .status-bar { font-size: 0.85em; padding: 6px 12px; border-radius: 6px; }
    """,
) as demo:

    # ── Header ────────────────────────────────────────────────────────────────
    gr.HTML("""
    <div class="header">
      <h1>RAG CODEX for Historical Spanish</h1>
      <p>RAG system of Spanish correction from the 16th century</p>
    </div>
    """)

    with gr.Tabs():

        # ── Tab 1: Correction ─────────────────────────────────────────────────
        with gr.TabItem(" HTR Correction"):
            with gr.Row():
                # Left column: input text, retrieval/model options, examples.
                with gr.Column(scale=2):
                    htr_input = gr.Textbox(
                        label="HTR text (recognizer input)",
                        placeholder="Paste the HTR result here…",
                        lines=6,
                    )
                    with gr.Row():
                        top_k_slider = gr.Slider(
                            minimum=1, maximum=10, value=5, step=1,
                            label="Documents retrieved (k)",
                        )
                        model_selector = gr.Dropdown(
                            label="Modelo LLM",
                            choices=[
                                "llama-3.3-70b-versatile",
                                "openai/gpt-oss-120b",
                            ],
                            value="llama-3.3-70b-versatile",
                        )
                        # The selected value is passed as-is to
                        # cambiar_embedding() via the .change() handler below.
                        embedding_selector = gr.Dropdown(
                            label="Modelo de Embedding",
                            choices=[
                                "openai",    # text-embedding-3-small
                                "mpnet",     # paraphrase-multilingual-mpnet-base-v2
                                "mt5-base fine-tuned",
                            ],
                            value="openai",
                        )

                        show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
                    btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")

                    gr.Examples(
                        examples=DEMO_EXAMPLES,
                        inputs=htr_input,
                        label="Demonstration examples",
                    )

                # Right column: corrected text and the status line.
                with gr.Column(scale=2):
                    corrected_out = gr.Textbox(
                        label="Corrected text (RAG output)",
                        lines=6,
                        interactive=False,
                    )
                    status_out = gr.Markdown(elem_classes=["status-bar"])

            with gr.Row():
                with gr.Column():
                    docs_out = gr.Markdown(label="Documents recovered from the corpus")
                with gr.Column():
                    analysis_out = gr.Markdown(label="Pattern analysis")

            diff_out = gr.Markdown(label="Word-by-word differences")
            prompt_out = gr.Markdown(label="Prompt sent to the LLM", visible=True)

            # corregir() is wired to six output components, in this order.
            btn_corregir.click(
                fn=corregir,
                inputs=[htr_input, top_k_slider, show_prompt, model_selector],
                outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
            )

            # Changing the embedding dropdown calls cambiar_embedding() and
            # shows its returned status message here.
            embed_status = gr.Markdown()
            embedding_selector.change(
                fn=cambiar_embedding,
                inputs=[embedding_selector],
                outputs=[embed_status],
            )

        # ── Tab 2: Evaluation ─────────────────────────────────────────────────
        with gr.TabItem(" Evaluation with GT"):
            gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")
            with gr.Row():
                eval_htr = gr.Textbox(label="HTR text", lines=4)
                eval_gt  = gr.Textbox(label="Groundtruth (reference)", lines=4)
            btn_eval = gr.Button("Evaluate", variant="primary")
            eval_out = gr.Markdown()
            btn_eval.click(fn=evaluar_par, inputs=[eval_htr, eval_gt], outputs=eval_out)

        # ── Tab 3: Add to corpus ──────────────────────────────────────────────
        with gr.TabItem("➕ Add to corpus"):
            gr.Markdown("Add new pairs to the vector store to improve the RAG continuously.")
            with gr.Row():
                add_htr    = gr.Textbox(label="Texto HTR", lines=4)
                add_gt     = gr.Textbox(label="Groundtruth corregido", lines=4)
            with gr.Row():
                add_type   = gr.Textbox(label="Document type", placeholder="notarial / judicial / eclesiastico")
                add_region = gr.Textbox(label="Region", placeholder="Castilla, Andalucía…")
                add_date   = gr.Textbox(label="Date", placeholder="1542")
                add_caligrafia = gr.Dropdown(
                    label="Caligrafía",
                    choices=["desconocida", "procesal", "encadenada", "italica"],
                    value="desconocida",
                ) 
            btn_add  = gr.Button("Add to corpus", variant="primary")
            add_out  = gr.Markdown()
            # Input order matches add_to_corpus()'s positional parameters.
            btn_add.click(
                fn=add_to_corpus,
                inputs=[add_htr, add_gt, add_type, add_region, add_date, add_caligrafia],
                outputs=add_out,
            )

        # ── Tab 4: System info ────────────────────────────────────────────────
        # The f-string below is evaluated once, while the UI is being built, so
        # the counts shown are a snapshot taken at startup.
        # NOTE(review): the text mentions OPENAI_MODEL / GPT-4o and
        # text-embedding-3-small, while the correction tab offers other LLM and
        # embedding choices — confirm this description is still accurate.
        with gr.TabItem("ℹ System"):
            gr.Markdown(f"""
## System status

- **Modelo LLM:** {os.getenv('OPENAI_MODEL', 'gpt-4o')}
- **Vector store:** ChromaDB (persistente en `{os.getenv('CHROMA_PATH','./chroma_db')}`)
- **Documentos indexados:** {vs.count()}
- **Corpus cargado desde disco:** {len(disk_pairs)} pares
- **Pares de ejemplo embebidos:** {len(SAMPLE_PAIRS)}

## Arquitectura

```
Texto HTR
   │
   ├─► Detector de patrones HTR (knowledge_base.py)
   ├─► Detector de grafías modernas (knowledge_base.py)
   │
   ├─► Embedding (text-embedding-3-small)
   │        │
   │        └─► Búsqueda top-k en ChromaDB ──► Few-shot dinámico
   │
   └─► Prompt constructor ──► GPT-4o ──► Texto corregido
```

## Formato del corpus

Para añadir tu corpus, crea `./corpus/` con ficheros JSON:
```json
[
  {{"id": "doc001", "htr": "texto htr...", "gt": "groundtruth...",
    "type": "notarial", "region": "Castilla", "date": "1542"}},
  ...
]
```
O CSV con columnas: `id, htr, gt, type, region, date`
""")

if __name__ == "__main__":
    # Basic-auth credentials. They can be overridden through the environment so
    # the well-known defaults need not be used on a server that listens on all
    # interfaces. The previous hard-coded values remain as the fallback.
    auth_user = os.getenv("SCRIPTORIUM_AUTH_USER", "admin")
    auth_password = os.getenv("SCRIPTORIUM_AUTH_PASSWORD", "admin")
    if (auth_user, auth_password) == ("admin", "admin"):
        print(
            " Aviso: usando credenciales por defecto (admin/admin); "
            "define SCRIPTORIUM_AUTH_USER y SCRIPTORIUM_AUTH_PASSWORD."
        )

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        auth=(auth_user, auth_password),
        share=False,
        show_error=True,
    )