"""
app.py
──────
Gradio web interface for the RAG system that corrects 16th-century Castilian
HTR output.

Start-up:
    python app.py

Requires:
    - .env with OPENAI_API_KEY
    - (optional) corpus in ./corpus/
    - (optional) APP_AUTH_USER / APP_AUTH_PASSWORD to override the login
"""
import hashlib
import json
import os
from itertools import zip_longest

import gradio as gr
from dotenv import load_dotenv

from knowledge_base import SAMPLE_PAIRS
from corpus_loader import CorpusLoader
from vector_store import VectorStore
from rag_corrector import RAGCorrector
from evaluator import Evaluator

load_dotenv()

# ── Initialisation ────────────────────────────────────────────────────────────
print(" Inicializando Scriptorium RAG...")

# Load the corpus from disk (if present) plus the embedded sample pairs.
loader = CorpusLoader(os.getenv("CORPUS_PATH", "./corpus"))
disk_pairs = loader.load()
all_pairs = SAMPLE_PAIRS + disk_pairs

# Module-level state for the active vector store. A single store is created
# and indexed so that `vs`, `corrector` and `current_embed_model` always
# describe the same object (previously `vs` was rebound to a second store
# after the corrector had been built on the first one).
current_embed_model = "openai"
vs = VectorStore(embedding_model=current_embed_model)
vs.index(all_pairs)

corrector = RAGCorrector(vs)
evaluator = Evaluator()
print(f" Sistema listo. Documentos en vector store: {vs.count()}")

# ── Demonstration examples ────────────────────────────────────────────────────
DEMO_EXAMPLES = [
    "q̃ fizo merçed al dho lugar de las alcaualas del anno de mill e quinientos",
    "el escriuano del cabildo faze fe y da testimouio verdadero de todo lo sobredho",
    "en la muy noble çibdad de burgos a veynte dias del mes de março anno dho",
    "yo juan de la torre vezino desta uilla de toledo otorgo e conosco",
    "sepan quantos esta carta de poder vieren como yo pero lopez vezino dela villa",
    "fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
]


def cambiar_embedding(embed_model: str) -> str:
    """Switch the active embedding model and rebuild the corrector.

    Builds a new ``VectorStore`` for ``embed_model``, indexes ``all_pairs``
    into it when it reports zero documents, and creates a new
    ``RAGCorrector`` on top of it.

    Args:
        embed_model: Identifier passed to ``VectorStore(embedding_model=...)``.

    Returns:
        A Markdown status message (success, no-op, or error).
    """
    global vs, corrector, current_embed_model

    if embed_model == current_embed_model:
        return f"ℹ Ya estás usando **{embed_model}**"

    try:
        new_vs = VectorStore(embedding_model=embed_model)
        # Only index when the collection is empty.
        if new_vs.count() == 0:
            new_vs.index(all_pairs)
            msg = f" Re-indexado con **{embed_model}** · {new_vs.count()} docs"
        else:
            msg = f" Cargado índice existente **{embed_model}** · {new_vs.count()} docs"
        new_corrector = RAGCorrector(new_vs)
    except Exception as e:  # UI boundary: report instead of crashing the app
        return f" Error cambiando embedding: {e}"

    # Commit the switch only after everything succeeded, so a failed attempt
    # leaves the previous store/corrector/model name fully intact and the
    # user can retry the same model.
    vs, corrector, current_embed_model = new_vs, new_corrector, embed_model
    return msg


# ── Rendering helpers for the correction tab ──────────────────────────────────
def _empty_outputs(status: str) -> tuple:
    """Return the 6-tuple expected by the correction outputs with only a status."""
    # Order: corrected, docs, analysis, diff, status, prompt.
    return "", "", "", "", status, ""


def _render_retrieved(retrieved) -> str:
    """Render the retrieved documents as a Markdown list."""
    parts = [f"### Top-{len(retrieved)} documentos recuperados\n\n"]
    for i, doc in enumerate(retrieved, 1):
        parts.append(
            f"**{i}. [{doc['type']} · {doc['region']} · {doc['date']}]** "
            f"*similitud: {doc['score']}*\n\n"
            f"- **HTR:** `{doc['htr']}`\n"
            f"- **GT:** `{doc['gt']}`\n"
        )
        if doc["corrections"]:
            parts.append(f"- **Correcciones:** {', '.join(doc['corrections'])}\n")
        parts.append("\n---\n")
    return "".join(parts)


def _render_analysis(htr_errors, grafia_w) -> str:
    """Render detected HTR error patterns and spelling warnings as Markdown."""
    parts = ["### Análisis del texto\n\n"]
    if htr_errors:
        parts.append("**⚠ Posibles errores HTR detectados:**\n")
        parts.extend(
            f"- `{e['htr']}` → `{e['gt']}`: {e['context']} \n *Ej: {e['example']}*\n"
            for e in htr_errors
        )
        parts.append("\n")
    if grafia_w:
        parts.append("**✦ Alertas de grafía (NO modernizar):**\n")
        parts.extend(
            f"- `{g['modern']}` → mantener `{g['ancient']}`: {g['rule']}\n"
            for g in grafia_w
        )
        parts.append("\n")
    if not htr_errors and not grafia_w:
        parts.append("*No se detectaron patrones conocidos de error en el texto.*\n")
    return "".join(parts)


def _render_diff(htr_text: str, corrected: str) -> str:
    """Render a positional word-by-word diff between the HTR and corrected text."""
    orig_words = htr_text.split()
    corr_words = corrected.split()
    diff_parts = []
    changed = 0
    # Words are compared by position; the shorter side is padded with "—".
    for o, c in zip_longest(orig_words, corr_words, fillvalue="—"):
        if o != c:
            diff_parts.append(f"~~{o}~~ → **{c}**")
            changed += 1
        else:
            diff_parts.append(c)
    return (
        "### Diferencias HTR → Corregido\n\n"
        + " ".join(diff_parts)
        + f"\n\n*{changed} palabra(s) modificada(s) de {len(orig_words)} totales.*"
    )


def _render_prompt(result) -> str:
    """Render the system and user prompts carried in ``result`` as Markdown."""
    return (
        "### System Prompt\n\n"
        f"```\n{result.get('_system', '(ver rag_corrector.py)')}\n```\n\n"
        "### User Prompt (dinámico)\n\n"
        f"```\n{result['prompt']}\n```"
    )


# ── Main function ─────────────────────────────────────────────────────────────
def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
    """Correct an HTR text with the RAG corrector and build all UI panels.

    Args:
        htr_text: Raw HTR text to correct.
        top_k: Number of documents to retrieve (coerced with ``int``).
        mostrar_prompt: When true, include the prompts in the last output.
        model: LLM identifier forwarded to ``corrector.correct``.

    Returns:
        A 6-tuple ``(corrected, docs_md, analysis_md, diff_md, status,
        prompt_md)``. Every path returns exactly six values because the
        click handler is wired to six output components.
    """
    if not htr_text.strip():
        return _empty_outputs(" Introduce un texto HTR para corregir.")
    if not os.getenv("OPENAI_API_KEY"):
        return _empty_outputs(" Falta OPENAI_API_KEY en el fichero .env")

    try:
        result = corrector.correct(htr_text, top_k=int(top_k), model=model)
    except Exception as e:  # UI boundary: surface the failure in the status bar
        return _empty_outputs(f" Error al llamar a la API: {e}")

    corrected = result["corrected"]
    docs_md = _render_retrieved(result["retrieved"])
    analysis_md = _render_analysis(result["htr_errors"], result["grafia_warns"])
    diff_md = _render_diff(htr_text, corrected)
    status = f" Corrección completada con **{result['model']}** · {vs.count()} docs en índice"
    prompt_visible = _render_prompt(result) if mostrar_prompt else ""

    return corrected, docs_md, analysis_md, diff_md, status, prompt_visible


def evaluar_par(htr_text: str, gt_text: str) -> str:
    """Correct ``htr_text`` and report metrics against ``gt_text``.

    Args:
        htr_text: Raw HTR text.
        gt_text: Reference transcription (groundtruth).

    Returns:
        A Markdown report with CER/WER before and after correction and the
        modernism detector output, or an error/validation message.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "⚠ Introduce tanto el texto HTR como el groundtruth."

    try:
        result = corrector.correct(htr_text)
        m = evaluator.evaluate_pair(htr_text, result["corrected"], gt_text)
        mod = m["modernism"]

        report = (
            f"### Métricas de evaluación\n\n"
            f"| Métrica | Antes (HTR) | Después (RAG) | Mejora |\n"
            f"|---------|------------|---------------|--------|\n"
            f"| **CER** | {m['cer_before']:.2%} | {m['cer_after']:.2%} | {m['cer_improvement']:+.2%} |\n"
            f"| **WER** | {m['wer_before']:.2%} | {m['wer_after']:.2%} | {m['wer_improvement']:+.2%} |\n\n"
            f"**Detector de modernismos:** score={mod['score']:.2f} "
            f"({mod['count']} problema(s) detectado(s))\n"
        )
        if mod["issues"]:
            report += "\nFormas modernas introducidas incorrectamente:\n"
            report += "".join(
                f"- `{iss['modern']}` (debería ser `{iss['ancient']}`): {iss['rule']}\n"
                for iss in mod["issues"]
            )
        report += f"\n**Texto corregido por RAG:**\n> {result['corrected']}"
        return report
    except Exception as e:  # UI boundary
        return f" Error: {e}"


def add_to_corpus(htr_text: str, gt_text: str, doc_type: str, region: str,
                  date: str, caligrafia: str) -> str:
    """Index a new HTR/groundtruth pair into the active vector store.

    Args:
        htr_text: HTR text (required).
        gt_text: Groundtruth text (required).
        doc_type: Document type; falls back to ``"desconocido"``.
        region: Region; falls back to ``"desconocida"``.
        date: Date string; falls back to ``""``.
        caligrafia: Script style; falls back to ``"desconocida"``.

    Returns:
        A Markdown status message.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "⚠ HTR y GT son obligatorios."

    try:
        htr_clean = htr_text.strip()
        # The built-in hash() of a str is salted per process, so it cannot
        # identify the same text across restarts. A content digest gives a
        # stable id, which the "already existed" branch below relies on.
        digest = hashlib.sha256(htr_clean.encode("utf-8")).hexdigest()
        pair_id = f"user_{digest[:12]}"
        new_pair = {
            "id": pair_id,
            "htr": htr_clean,
            "gt": gt_text.strip(),
            "type": doc_type or "desconocido",
            "region": region or "desconocida",
            "date": date or "",
            "caligrafia": caligrafia or "desconocida",
            "corrections": [],
            "source": "user_added",
        }
        # Presumably a truthy return means at least one new document was
        # stored — confirm against VectorStore.index.
        added = vs.index([new_pair])
        if added:
            return f" Par añadido al corpus con id `{pair_id}`. Total: {vs.count()} docs."
        return f" Par ya existía en el corpus (id: `{pair_id}`)."
    except Exception as e:  # UI boundary
        return f" Error: {e}"


# ── Gradio interface ──────────────────────────────────────────────────────────
with gr.Blocks(
    title="Scriptorium RAG",
    theme=gr.themes.Base(
        primary_hue="amber",
        secondary_hue="stone",
        neutral_hue="stone",
        font=gr.themes.GoogleFont("IM Fell English"),
    ),
    css="""
    .header { text-align: center; padding: 20px 0 10px; }
    .header h1 { font-size: 2.2em; color: #92400e; letter-spacing: 0.15em; }
    .header p { color: #78716c; font-style: italic; }
    .status-bar { font-size: 0.85em; padding: 6px 12px; border-radius: 6px; }
    """,
) as demo:

    # ── Header ────────────────────────────────────────────────────────────────
    # NOTE(review): the CSS above styles `.header h1` / `.header p`, but this
    # markup contains no such elements — confirm whether tags were lost.
    gr.HTML("""

RAG CODEX for Historical Spanish

RAG system of Spanish correction from the 16th century

""")

    with gr.Tabs():

        # ── Tab 1: Correction ─────────────────────────────────────────────────
        with gr.TabItem(" HTR Correction"):
            with gr.Row():
                with gr.Column(scale=2):
                    htr_input = gr.Textbox(
                        label="HTR text (recognizer input)",
                        placeholder="Paste the HTR result here…",
                        lines=6,
                    )
                    with gr.Row():
                        top_k_slider = gr.Slider(
                            minimum=1, maximum=10, value=5, step=1,
                            label="Documents retrieved (k)",
                        )
                        model_selector = gr.Dropdown(
                            label="Modelo LLM",
                            choices=[
                                "llama-3.3-70b-versatile",
                                "openai/gpt-oss-120b",
                            ],
                            value="llama-3.3-70b-versatile",
                        )
                        embedding_selector = gr.Dropdown(
                            label="Modelo de Embedding",
                            choices=[
                                "openai",  # text-embedding-3-small
                                "mpnet",   # paraphrase-multilingual-mpnet-base-v2
                                "mt5-base fine-tuned",
                            ],
                            value="openai",
                        )
                        show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
                    btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")

                    gr.Examples(
                        examples=DEMO_EXAMPLES,
                        inputs=htr_input,
                        label="Demonstration examples",
                    )

                with gr.Column(scale=2):
                    corrected_out = gr.Textbox(
                        label="Corrected text (RAG output)",
                        lines=6,
                        interactive=False,
                    )
                    status_out = gr.Markdown(elem_classes=["status-bar"])

            with gr.Row():
                with gr.Column():
                    docs_out = gr.Markdown(label="Documents recovered from the corpus")
                with gr.Column():
                    analysis_out = gr.Markdown(label="Pattern analysis")

            diff_out = gr.Markdown(label="Word-by-word differences")
            prompt_out = gr.Markdown(label="Prompt sent to the LLM", visible=True)

            # Six outputs: `corregir` must return six values on every path.
            btn_corregir.click(
                fn=corregir,
                inputs=[htr_input, top_k_slider, show_prompt, model_selector],
                outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
            )

            embed_status = gr.Markdown()
            embedding_selector.change(
                fn=cambiar_embedding,
                inputs=[embedding_selector],
                outputs=[embed_status],
            )

        # ── Tab 2: Evaluation ─────────────────────────────────────────────────
        with gr.TabItem(" Evaluation with GT"):
            gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")
            with gr.Row():
                eval_htr = gr.Textbox(label="HTR text", lines=4)
                eval_gt = gr.Textbox(label="Groundtruth (reference)", lines=4)
            btn_eval = gr.Button("Evaluate", variant="primary")
            eval_out = gr.Markdown()
            btn_eval.click(fn=evaluar_par, inputs=[eval_htr, eval_gt], outputs=eval_out)

        # ── Tab 3: Add to corpus ──────────────────────────────────────────────
        with gr.TabItem("➕ Add to corpus"):
            gr.Markdown("Add new pairs to the vector store to improve the RAG continuously.")
            with gr.Row():
                add_htr = gr.Textbox(label="Texto HTR", lines=4)
                add_gt = gr.Textbox(label="Groundtruth corregido", lines=4)
            with gr.Row():
                add_type = gr.Textbox(label="Document type", placeholder="notarial / judicial / eclesiastico")
                add_region = gr.Textbox(label="Region", placeholder="Castilla, Andalucía…")
                add_date = gr.Textbox(label="Date", placeholder="1542")
                add_caligrafia = gr.Dropdown(
                    label="Caligrafía",
                    choices=["desconocida", "procesal", "encadenada", "italica"],
                    value="desconocida",
                )
            btn_add = gr.Button("Add to corpus", variant="primary")
            add_out = gr.Markdown()
            btn_add.click(
                fn=add_to_corpus,
                inputs=[add_htr, add_gt, add_type, add_region, add_date, add_caligrafia],
                outputs=add_out,
            )

        # ── Tab 4: System info ────────────────────────────────────────────────
        # The values below are evaluated once, when the UI is built.
        with gr.TabItem("ℹ System"):
            gr.Markdown(f"""
## System status
- **Modelo LLM:** {os.getenv('OPENAI_MODEL', 'gpt-4o')}
- **Vector store:** ChromaDB (persistente en `{os.getenv('CHROMA_PATH','./chroma_db')}`)
- **Documentos indexados:** {vs.count()}
- **Corpus cargado desde disco:** {len(disk_pairs)} pares
- **Pares de ejemplo embebidos:** {len(SAMPLE_PAIRS)}

## Arquitectura
```
Texto HTR
    │
    ├─► Detector de patrones HTR (knowledge_base.py)
    ├─► Detector de grafías modernas (knowledge_base.py)
    │
    ├─► Embedding (text-embedding-3-small)
    │       │
    │       └─► Búsqueda top-k en ChromaDB ──► Few-shot dinámico
    │
    └─► Prompt constructor ──► GPT-4o ──► Texto corregido
```

## Formato del corpus
Para añadir tu corpus, crea `./corpus/` con ficheros JSON:
```json
[
  {{"id": "doc001", "htr": "texto htr...", "gt": "groundtruth...", "type": "notarial", "region": "Castilla", "date": "1542"}},
  ...
]
```
O CSV con columnas: `id, htr, gt, type, region, date`
""")


if __name__ == "__main__":
    # Credentials can be overridden from the environment; the historical
    # default is kept for backward compatibility but flagged, because the
    # server listens on all interfaces.
    auth_user = os.getenv("APP_AUTH_USER", "admin")
    auth_password = os.getenv("APP_AUTH_PASSWORD", "admin")
    if (auth_user, auth_password) == ("admin", "admin"):
        print("⚠ Usando credenciales por defecto (admin/admin); "
              "define APP_AUTH_USER y APP_AUTH_PASSWORD en .env")

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        auth=(auth_user, auth_password),  # basic authentication
        share=False,
        show_error=True,
    )