# NSF-RAG-Codex / app.py
# Author: Alexander Sanchez
# Commit: mt5-base added (8e14964)
"""
app.py
──────
Interfaz web Gradio para el sistema RAG de correcciΓ³n de castellano s.XVI.
Arranque:
python app.py
Requiere:
- .env con OPENAI_API_KEY
- (opcional) corpus en ./corpus/
"""
# Standard library
import hashlib
import json
import os

# Third-party
import gradio as gr
from dotenv import load_dotenv

# Project modules
from knowledge_base import SAMPLE_PAIRS
from corpus_loader import CorpusLoader
from vector_store import VectorStore
from rag_corrector import RAGCorrector
from evaluator import Evaluator
# Read OPENAI_API_KEY (and optional CORPUS_PATH / CHROMA_PATH) from .env.
load_dotenv()

# ── Initialization ────────────────────────────────────────────────────────────
print(" Inicializando Scriptorium RAG...")
vs = VectorStore()

# Load corpus pairs from disk (if the directory exists) plus the embedded
# sample pairs shipped with the code.
loader = CorpusLoader(os.getenv("CORPUS_PATH", "./corpus"))
disk_pairs = loader.load()
all_pairs = SAMPLE_PAIRS + disk_pairs

# Index everything into ChromaDB.
vs.index(all_pairs)

corrector = RAGCorrector(vs)
evaluator = Evaluator()
print(f" Sistema listo. Documentos en vector store: {vs.count()}")
# ── Demo examples ─────────────────────────────────────────────────────────────
# Raw HTR transcriptions of 16th-century Spanish used to pre-fill the input
# box, including typical recognizer artefacts (abbreviations such as "q̃",
# "dho") and period spellings that must NOT be modernized.
DEMO_EXAMPLES = [
    "q̃ fizo merçed al dho lugar de las alcaualas del anno de mill e quinientos",
    "el escriuano del cabildo faze fe y da testimouio verdadero de todo lo sobredho",
    "en la muy noble Γ§ibdad de burgos a veynte dias del mes de marΓ§o anno dho",
    "yo juan de la torre vezino desta uilla de toledo otorgo e conosco",
    "sepan quantos esta carta de poder vieren como yo pero lopez vezino dela villa",
    "fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
]
# Name of the embedding model behind the active vector store; kept in sync
# by cambiar_embedding() when the user switches models in the UI.
current_embed_model = "openai"
vs = VectorStore(embedding_model="openai")
vs.index(all_pairs)
# Fix: rebind the corrector to this vector store.  Without this line the
# RAGCorrector created during startup keeps referencing the earlier
# VectorStore() instance, so corrections would query a stale index until
# the user happened to switch embedding models.
corrector = RAGCorrector(vs)
def cambiar_embedding(embed_model: str) -> str:
    """Switch the active embedding model used by the vector store.

    Builds (or reopens) the vector store for *embed_model*, indexes the
    corpus when the collection is empty, and recreates the corrector on
    top of it.  Returns a markdown status message for the UI.

    Fix: the module-level ``current_embed_model`` is now updated only
    AFTER the new store is ready.  Previously it was set before the
    ``VectorStore`` constructor ran, so a failed switch left the tracker
    claiming a model that was never activated — the early-return guard
    then blocked any retry of that model.
    """
    global vs, corrector, current_embed_model
    if embed_model == current_embed_model:
        return f"β„Ή Ya estΓ‘s usando **{embed_model}**"
    try:
        new_vs = VectorStore(embedding_model=embed_model)
        # Index only when the (persistent) collection is empty.
        if new_vs.count() == 0:
            new_vs.index(all_pairs)
            msg = f" Re-indexado con **{embed_model}** Β· {new_vs.count()} docs"
        else:
            msg = f" Cargado Γ­ndice existente **{embed_model}** Β· {new_vs.count()} docs"
        # Commit the new state atomically: store, corrector, then tracker.
        vs = new_vs
        corrector = RAGCorrector(vs)
        current_embed_model = embed_model
        return msg
    except Exception as e:
        return f" Error cambiando embedding: {e}"
# ── Main correction handler ───────────────────────────────────────────────────
def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
    """Run the RAG correction pipeline on one HTR text.

    Args:
        htr_text: raw HTR output to correct.
        top_k: number of similar corpus documents to retrieve.
        mostrar_prompt: when True, also render the prompts sent to the LLM.
        model: LLM identifier forwarded to the corrector.

    Returns:
        A 6-tuple matching the click handler's outputs, in order:
        (corrected text, retrieved-docs markdown, analysis markdown,
         diff markdown, status markdown, prompt markdown).

    Fix: the error paths previously returned only 5 values while the
    success path (and the Gradio ``outputs`` list) has 6 — every return
    now yields a 6-tuple so Gradio never sees a length mismatch.
    """
    if not htr_text.strip():
        return "", "", "", "", " Introduce un texto HTR para corregir.", ""
    if not os.getenv("OPENAI_API_KEY"):
        return "", "", "", "", " Falta OPENAI_API_KEY en el fichero .env", ""
    try:
        result = corrector.correct(htr_text, top_k=int(top_k), model=model)
    except Exception as e:
        return "", "", "", "", f" Error al llamar a la API: {e}", ""

    corrected = result["corrected"]
    retrieved = result["retrieved"]
    htr_errors = result["htr_errors"]
    grafia_w = result["grafia_warns"]

    # ── Retrieved documents panel ─────────────────────────────────────────
    docs_md = f"### Top-{len(retrieved)} documentos recuperados\n\n"
    for i, doc in enumerate(retrieved, 1):
        docs_md += (
            f"**{i}. [{doc['type']} Β· {doc['region']} Β· {doc['date']}]** "
            f"*similitud: {doc['score']}*\n\n"
            f"- **HTR:** `{doc['htr']}`\n"
            f"- **GT:** `{doc['gt']}`\n"
        )
        if doc["corrections"]:
            docs_md += f"- **Correcciones:** {', '.join(doc['corrections'])}\n"
        docs_md += "\n---\n"

    # ── Analysis panel ────────────────────────────────────────────────────
    analysis_md = "### AnΓ‘lisis del texto\n\n"
    if htr_errors:
        analysis_md += "**⚠ Posibles errores HTR detectados:**\n"
        for e in htr_errors:
            analysis_md += f"- `{e['htr']}` β†’ `{e['gt']}`: {e['context']} \n *Ej: {e['example']}*\n"
        analysis_md += "\n"
    if grafia_w:
        analysis_md += "**✦ Alertas de grafía (NO modernizar):**\n"
        for g in grafia_w:
            analysis_md += f"- `{g['modern']}` β†’ mantener `{g['ancient']}`: {g['rule']}\n"
        analysis_md += "\n"
    if not htr_errors and not grafia_w:
        analysis_md += "*No se detectaron patrones conocidos de error en el texto.*\n"

    # Word-by-word visual diff (naive positional alignment; words merely
    # shifted by an insertion/deletion will all show as changed).
    diff_md = "### Diferencias HTR β†’ Corregido\n\n"
    orig_words = htr_text.split()
    corr_words = corrected.split()
    diff_parts = []
    max_len = max(len(orig_words), len(corr_words))
    changed = 0
    for i in range(max_len):
        o = orig_words[i] if i < len(orig_words) else "β€”"
        c = corr_words[i] if i < len(corr_words) else "β€”"
        if o != c:
            diff_parts.append(f"~~{o}~~ β†’ **{c}**")
            changed += 1
        else:
            diff_parts.append(c)
    diff_md += " ".join(diff_parts)
    diff_md += f"\n\n*{changed} palabra(s) modificada(s) de {len(orig_words)} totales.*"

    # ── Status line + optional prompt dump ────────────────────────────────
    status = f" CorrecciΓ³n completada con **{result['model']}** Β· {vs.count()} docs en Γ­ndice"
    prompt_visible = ""
    if mostrar_prompt:
        prompt_visible = (
            "### System Prompt\n\n"
            f"```\n{result.get('_system', '(ver rag_corrector.py)')}\n```\n\n"
            "### User Prompt (dinΓ‘mico)\n\n"
            f"```\n{result['prompt']}\n```"
        )
    return corrected, docs_md, analysis_md, diff_md, status, prompt_visible
def evaluar_par(htr_text: str, gt_text: str) -> str:
    """Correct *htr_text* with the RAG pipeline and score it against *gt_text*.

    Returns a markdown report with CER/WER before/after correction and any
    modern spellings the corrector introduced, or an error message.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "⚠ Introduce tanto el texto HTR como el groundtruth."
    try:
        rag_result = corrector.correct(htr_text)
        m = evaluator.evaluate_pair(htr_text, rag_result["corrected"], gt_text)
        mod = m["modernism"]
        # Assemble the report as parts and join once at the end.
        parts = [
            "### MΓ©tricas de evaluaciΓ³n\n\n",
            "| MΓ©trica | Antes (HTR) | DespuΓ©s (RAG) | Mejora |\n",
            "|---------|------------|---------------|--------|\n",
            f"| **CER** | {m['cer_before']:.2%} | {m['cer_after']:.2%} | {m['cer_improvement']:+.2%} |\n",
            f"| **WER** | {m['wer_before']:.2%} | {m['wer_after']:.2%} | {m['wer_improvement']:+.2%} |\n\n",
            f"**Detector de modernismos:** score={mod['score']:.2f} ",
            f"({mod['count']} problema(s) detectado(s))\n",
        ]
        if mod["issues"]:
            parts.append("\nFormas modernas introducidas incorrectamente:\n")
            parts.extend(
                f"- `{iss['modern']}` (deberΓ­a ser `{iss['ancient']}`): {iss['rule']}\n"
                for iss in mod["issues"]
            )
        parts.append(f"\n**Texto corregido por RAG:**\n> {rag_result['corrected']}")
        return "".join(parts)
    except Exception as e:
        return f" Error: {e}"
def add_to_corpus(htr_text: str, gt_text: str, doc_type: str, region: str, date: str, caligrafia: str) -> str:
    """Index a user-supplied HTR/groundtruth pair into the vector store.

    Missing metadata fields fall back to "desconocido"/"desconocida"/"".
    Returns a markdown status message for the UI.

    Fix: the pair id previously came from the builtin ``hash()``, which is
    salted per process (PYTHONHASHSEED) since Python 3.3 — the same text got
    a different id on every app restart, so duplicate detection in the
    persistent store never worked across runs.  A content digest is stable.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "⚠ HTR y GT son obligatorios."
    try:
        digest = int(hashlib.sha1(htr_text.encode("utf-8")).hexdigest(), 16)
        pair_id = f"user_{digest % 100000:05d}"
        new_pair = {
            "id": pair_id,
            "htr": htr_text.strip(),
            "gt": gt_text.strip(),
            "type": doc_type or "desconocido",
            "region": region or "desconocida",
            "date": date or "",
            "caligrafia": caligrafia or "desconocida",
            "corrections": [],
            "source": "user_added",
        }
        # vs.index() reports whether the pair was actually added (dedup by id).
        added = vs.index([new_pair])
        if added:
            return f" Par aΓ±adido al corpus con id `{pair_id}`. Total: {vs.count()} docs."
        else:
            return f" Par ya existΓ­a en el corpus (id: `{pair_id}`)."
    except Exception as e:
        return f" Error: {e}"
# ── Gradio interface ──────────────────────────────────────────────────────────
# Declarative layout: four tabs — HTR correction, evaluation against
# groundtruth, corpus growth, and static system info.
with gr.Blocks(
    title="Scriptorium RAG",
    theme=gr.themes.Base(
        primary_hue="amber",
        secondary_hue="stone",
        neutral_hue="stone",
        font=gr.themes.GoogleFont("IM Fell English"),
    ),
    css="""
.header { text-align: center; padding: 20px 0 10px; }
.header h1 { font-size: 2.2em; color: #92400e; letter-spacing: 0.15em; }
.header p { color: #78716c; font-style: italic; }
.status-bar { font-size: 0.85em; padding: 6px 12px; border-radius: 6px; }
""",
) as demo:
    # ── Header banner ─────────────────────────────────────────────────────────
    gr.HTML("""
<div class="header">
<h1>RAG CODEX for Historical Spanish</h1>
<p>RAG system of Spanish correction from the 16th century</p>
</div>
""")
    with gr.Tabs():
        # ── Tab 1: HTR correction ─────────────────────────────────────────────
        with gr.TabItem(" HTR Correction"):
            with gr.Row():
                with gr.Column(scale=2):
                    htr_input = gr.Textbox(
                        label="HTR text (recognizer input)",
                        placeholder="Paste the HTR result here…",
                        lines=6,
                    )
                    with gr.Row():
                        top_k_slider = gr.Slider(
                            minimum=1, maximum=10, value=5, step=1,
                            label="Documents retrieved (k)",
                        )
                        model_selector = gr.Dropdown(
                            label="Modelo LLM",
                            choices=[
                                "llama-3.3-70b-versatile",
                                "openai/gpt-oss-120b",
                            ],
                            value="llama-3.3-70b-versatile",
                        )
                        embedding_selector = gr.Dropdown(
                            label="Modelo de Embedding",
                            choices=[
                                "openai",  # text-embedding-3-small
                                "mpnet",  # paraphrase-multilingual-mpnet-base-v2
                                "mt5-base fine-tuned",
                            ],
                            value="openai",
                        )
                    show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
                    btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")
                    gr.Examples(
                        examples=DEMO_EXAMPLES,
                        inputs=htr_input,
                        label="Demonstration examples",
                    )
                with gr.Column(scale=2):
                    corrected_out = gr.Textbox(
                        label="Corrected text (RAG output)",
                        lines=6,
                        interactive=False,
                    )
                    status_out = gr.Markdown(elem_classes=["status-bar"])
            with gr.Row():
                with gr.Column():
                    docs_out = gr.Markdown(label="Documents recovered from the corpus")
                with gr.Column():
                    analysis_out = gr.Markdown(label="Pattern analysis")
            diff_out = gr.Markdown(label="Word-by-word differences")
            prompt_out = gr.Markdown(label="Prompt sent to the LLM", visible=True)
            # corregir must return one value per output component, in order.
            btn_corregir.click(
                fn=corregir,
                inputs=[htr_input, top_k_slider, show_prompt, model_selector],
                outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
            )
            embed_status = gr.Markdown()
            # Switching the embedding model rebuilds the vector store and
            # corrector (see cambiar_embedding above).
            embedding_selector.change(
                fn=cambiar_embedding,
                inputs=[embedding_selector],
                outputs=[embed_status],
            )
        # ── Tab 2: Evaluation against groundtruth ─────────────────────────────
        with gr.TabItem(" Evaluation with GT"):
            gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")
            with gr.Row():
                eval_htr = gr.Textbox(label="HTR text", lines=4)
                eval_gt = gr.Textbox(label="Groundtruth (reference)", lines=4)
            btn_eval = gr.Button("Evaluate", variant="primary")
            eval_out = gr.Markdown()
            btn_eval.click(fn=evaluar_par, inputs=[eval_htr, eval_gt], outputs=eval_out)
        # ── Tab 3: Add pairs to the corpus ────────────────────────────────────
        with gr.TabItem("βž• Add to corpus"):
            gr.Markdown("Add new pairs to the vector store to improve the RAG continuously.")
            with gr.Row():
                add_htr = gr.Textbox(label="Texto HTR", lines=4)
                add_gt = gr.Textbox(label="Groundtruth corregido", lines=4)
            with gr.Row():
                add_type = gr.Textbox(label="Document type", placeholder="notarial / judicial / eclesiastico")
                add_region = gr.Textbox(label="Region", placeholder="Castilla, AndalucΓ­a…")
                add_date = gr.Textbox(label="Date", placeholder="1542")
                add_caligrafia = gr.Dropdown(
                    label="CaligrafΓ­a",
                    choices=["desconocida", "procesal", "encadenada", "italica"],
                    value="desconocida",
                )
            btn_add = gr.Button("Add to corpus", variant="primary")
            add_out = gr.Markdown()
            btn_add.click(
                fn=add_to_corpus,
                inputs=[add_htr, add_gt, add_type, add_region, add_date, add_caligrafia],
                outputs=add_out,
            )
        # ── Tab 4: System info ────────────────────────────────────────────────
        with gr.TabItem("β„Ή System"):
            # NOTE(review): this f-string is evaluated once at import time, so
            # the document count shown here does not refresh as pairs are added.
            gr.Markdown(f"""
## System status
- **Modelo LLM:** {os.getenv('OPENAI_MODEL', 'gpt-4o')}
- **Vector store:** ChromaDB (persistente en `{os.getenv('CHROMA_PATH','./chroma_db')}`)
- **Documentos indexados:** {vs.count()}
- **Corpus cargado desde disco:** {len(disk_pairs)} pares
- **Pares de ejemplo embebidos:** {len(SAMPLE_PAIRS)}
## Arquitectura
```
Texto HTR
β”‚
β”œβ”€β–Ί Detector de patrones HTR (knowledge_base.py)
β”œβ”€β–Ί Detector de grafΓ­as modernas (knowledge_base.py)
β”‚
β”œβ”€β–Ί Embedding (text-embedding-3-small)
β”‚ β”‚
β”‚ └─► BΓΊsqueda top-k en ChromaDB ──► Few-shot dinΓ‘mico
β”‚
└─► Prompt constructor ──► GPT-4o ──► Texto corregido
```
## Formato del corpus
Para aΓ±adir tu corpus, crea `./corpus/` con ficheros JSON:
```json
[
{{"id": "doc001", "htr": "texto htr...", "gt": "groundtruth...",
"type": "notarial", "region": "Castilla", "date": "1542"}},
...
]
```
O CSV con columnas: `id, htr, gt, type, region, date`
""")
if __name__ == "__main__":
    # Security: the original hard-coded admin/admin credentials.  Allow
    # overriding user/password (and port) via environment variables; the
    # admin/admin fallback is kept for backward compatibility but is only
    # acceptable for local demos — set GRADIO_USER/GRADIO_PASS in production.
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("GRADIO_PORT", "7860")),
        auth=(
            os.getenv("GRADIO_USER", "admin"),
            os.getenv("GRADIO_PASS", "admin"),
        ),
        share=False,
        show_error=True,
    )