# NSF-RAG-Codex / app.py
# Author: Alexander Sanchez
# Commit: mt5-base added (8e14964)
"""
app.py
──────
Interfaz web Gradio para el sistema RAG de correcciΓ³n de castellano s.XVI.
Arranque:
python app.py
Requiere:
- .env con OPENAI_API_KEY
- (opcional) corpus en ./corpus/
"""
# Standard library
import hashlib
import json
import os

# Third-party
import gradio as gr
from dotenv import load_dotenv

# Project modules
from knowledge_base import SAMPLE_PAIRS
from corpus_loader import CorpusLoader
from vector_store import VectorStore
from rag_corrector import RAGCorrector
from evaluator import Evaluator
# Read OPENAI_API_KEY (and optional CORPUS_PATH / CHROMA_PATH) from .env.
load_dotenv()

# ── Initialization ────────────────────────────────────────────────────────────
print(" Inicializando Scriptorium RAG...")
vs = VectorStore()

# Load corpus pairs from disk (if the directory exists) plus the embedded
# sample pairs shipped with the code.
loader = CorpusLoader(os.getenv("CORPUS_PATH", "./corpus"))
disk_pairs = loader.load()
all_pairs = SAMPLE_PAIRS + disk_pairs

# Index everything into ChromaDB.
vs.index(all_pairs)

corrector = RAGCorrector(vs)
evaluator = Evaluator()
print(f" Sistema listo. Documentos en vector store: {vs.count()}")
# ── Demo examples ─────────────────────────────────────────────────────────────
# Raw HTR transcriptions of 16th-century Spanish used to pre-fill the input
# box, including typical recognizer artefacts (abbreviations such as "q̃",
# "dho") and period spellings that must NOT be modernized.
DEMO_EXAMPLES = [
    "q̃ fizo merçed al dho lugar de las alcaualas del anno de mill e quinientos",
    "el escriuano del cabildo faze fe y da testimouio verdadero de todo lo sobredho",
    "en la muy noble Γ§ibdad de burgos a veynte dias del mes de marΓ§o anno dho",
    "yo juan de la torre vezino desta uilla de toledo otorgo e conosco",
    "sepan quantos esta carta de poder vieren como yo pero lopez vezino dela villa",
    "fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
]
# Name of the embedding model behind the active vector store; kept in sync
# by cambiar_embedding() when the user switches models in the UI.
current_embed_model = "openai"
vs = VectorStore(embedding_model="openai")
vs.index(all_pairs)
# Fix: rebind the corrector to this vector store.  Without this line the
# RAGCorrector created during startup keeps referencing the earlier
# VectorStore() instance, so corrections would query a stale index until
# the user happened to switch embedding models.
corrector = RAGCorrector(vs)
def cambiar_embedding(embed_model: str) -> str:
    """Switch the active embedding model used by the vector store.

    Builds (or reopens) the vector store for *embed_model*, indexes the
    corpus when the collection is empty, and recreates the corrector on
    top of it.  Returns a markdown status message for the UI.

    Fix: the module-level ``current_embed_model`` is now updated only
    AFTER the new store is ready.  Previously it was set before the
    ``VectorStore`` constructor ran, so a failed switch left the tracker
    claiming a model that was never activated — the early-return guard
    then blocked any retry of that model.
    """
    global vs, corrector, current_embed_model
    if embed_model == current_embed_model:
        return f"β„Ή Ya estΓ‘s usando **{embed_model}**"
    try:
        new_vs = VectorStore(embedding_model=embed_model)
        # Index only when the (persistent) collection is empty.
        if new_vs.count() == 0:
            new_vs.index(all_pairs)
            msg = f" Re-indexado con **{embed_model}** Β· {new_vs.count()} docs"
        else:
            msg = f" Cargado Γ­ndice existente **{embed_model}** Β· {new_vs.count()} docs"
        # Commit the new state atomically: store, corrector, then tracker.
        vs = new_vs
        corrector = RAGCorrector(vs)
        current_embed_model = embed_model
        return msg
    except Exception as e:
        return f" Error cambiando embedding: {e}"
# ── Main correction handler ───────────────────────────────────────────────────
def corregir(htr_text: str, top_k: int, mostrar_prompt: bool, model: str):
    """Run the RAG correction pipeline on one HTR text.

    Args:
        htr_text: raw HTR output to correct.
        top_k: number of similar corpus documents to retrieve.
        mostrar_prompt: when True, also render the prompts sent to the LLM.
        model: LLM identifier forwarded to the corrector.

    Returns:
        A 6-tuple matching the click handler's outputs, in order:
        (corrected text, retrieved-docs markdown, analysis markdown,
         diff markdown, status markdown, prompt markdown).

    Fix: the error paths previously returned only 5 values while the
    success path (and the Gradio ``outputs`` list) has 6 — every return
    now yields a 6-tuple so Gradio never sees a length mismatch.
    """
    if not htr_text.strip():
        return "", "", "", "", " Introduce un texto HTR para corregir.", ""
    if not os.getenv("OPENAI_API_KEY"):
        return "", "", "", "", " Falta OPENAI_API_KEY en el fichero .env", ""
    try:
        result = corrector.correct(htr_text, top_k=int(top_k), model=model)
    except Exception as e:
        return "", "", "", "", f" Error al llamar a la API: {e}", ""

    corrected = result["corrected"]
    retrieved = result["retrieved"]
    htr_errors = result["htr_errors"]
    grafia_w = result["grafia_warns"]

    # ── Retrieved documents panel ─────────────────────────────────────────
    docs_md = f"### Top-{len(retrieved)} documentos recuperados\n\n"
    for i, doc in enumerate(retrieved, 1):
        docs_md += (
            f"**{i}. [{doc['type']} Β· {doc['region']} Β· {doc['date']}]** "
            f"*similitud: {doc['score']}*\n\n"
            f"- **HTR:** `{doc['htr']}`\n"
            f"- **GT:** `{doc['gt']}`\n"
        )
        if doc["corrections"]:
            docs_md += f"- **Correcciones:** {', '.join(doc['corrections'])}\n"
        docs_md += "\n---\n"

    # ── Analysis panel ────────────────────────────────────────────────────
    analysis_md = "### AnΓ‘lisis del texto\n\n"
    if htr_errors:
        analysis_md += "**⚠ Posibles errores HTR detectados:**\n"
        for e in htr_errors:
            analysis_md += f"- `{e['htr']}` β†’ `{e['gt']}`: {e['context']} \n *Ej: {e['example']}*\n"
        analysis_md += "\n"
    if grafia_w:
        analysis_md += "**✦ Alertas de grafía (NO modernizar):**\n"
        for g in grafia_w:
            analysis_md += f"- `{g['modern']}` β†’ mantener `{g['ancient']}`: {g['rule']}\n"
        analysis_md += "\n"
    if not htr_errors and not grafia_w:
        analysis_md += "*No se detectaron patrones conocidos de error en el texto.*\n"

    # Word-by-word visual diff (naive positional alignment; words merely
    # shifted by an insertion/deletion will all show as changed).
    diff_md = "### Diferencias HTR β†’ Corregido\n\n"
    orig_words = htr_text.split()
    corr_words = corrected.split()
    diff_parts = []
    max_len = max(len(orig_words), len(corr_words))
    changed = 0
    for i in range(max_len):
        o = orig_words[i] if i < len(orig_words) else "β€”"
        c = corr_words[i] if i < len(corr_words) else "β€”"
        if o != c:
            diff_parts.append(f"~~{o}~~ β†’ **{c}**")
            changed += 1
        else:
            diff_parts.append(c)
    diff_md += " ".join(diff_parts)
    diff_md += f"\n\n*{changed} palabra(s) modificada(s) de {len(orig_words)} totales.*"

    # ── Status line + optional prompt dump ────────────────────────────────
    status = f" CorrecciΓ³n completada con **{result['model']}** Β· {vs.count()} docs en Γ­ndice"
    prompt_visible = ""
    if mostrar_prompt:
        prompt_visible = (
            "### System Prompt\n\n"
            f"```\n{result.get('_system', '(ver rag_corrector.py)')}\n```\n\n"
            "### User Prompt (dinΓ‘mico)\n\n"
            f"```\n{result['prompt']}\n```"
        )
    return corrected, docs_md, analysis_md, diff_md, status, prompt_visible
def evaluar_par(htr_text: str, gt_text: str) -> str:
    """Correct *htr_text* with the RAG pipeline and score it against *gt_text*.

    Returns a markdown report with CER/WER before/after correction and any
    modern spellings the corrector introduced, or an error message.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "⚠ Introduce tanto el texto HTR como el groundtruth."
    try:
        rag_result = corrector.correct(htr_text)
        m = evaluator.evaluate_pair(htr_text, rag_result["corrected"], gt_text)
        mod = m["modernism"]
        # Assemble the report as parts and join once at the end.
        parts = [
            "### MΓ©tricas de evaluaciΓ³n\n\n",
            "| MΓ©trica | Antes (HTR) | DespuΓ©s (RAG) | Mejora |\n",
            "|---------|------------|---------------|--------|\n",
            f"| **CER** | {m['cer_before']:.2%} | {m['cer_after']:.2%} | {m['cer_improvement']:+.2%} |\n",
            f"| **WER** | {m['wer_before']:.2%} | {m['wer_after']:.2%} | {m['wer_improvement']:+.2%} |\n\n",
            f"**Detector de modernismos:** score={mod['score']:.2f} ",
            f"({mod['count']} problema(s) detectado(s))\n",
        ]
        if mod["issues"]:
            parts.append("\nFormas modernas introducidas incorrectamente:\n")
            parts.extend(
                f"- `{iss['modern']}` (deberΓ­a ser `{iss['ancient']}`): {iss['rule']}\n"
                for iss in mod["issues"]
            )
        parts.append(f"\n**Texto corregido por RAG:**\n> {rag_result['corrected']}")
        return "".join(parts)
    except Exception as e:
        return f" Error: {e}"
def add_to_corpus(htr_text: str, gt_text: str, doc_type: str, region: str, date: str, caligrafia: str) -> str:
    """Index a user-supplied HTR/groundtruth pair into the vector store.

    Missing metadata fields fall back to "desconocido"/"desconocida"/"".
    Returns a markdown status message for the UI.

    Fix: the pair id previously came from the builtin ``hash()``, which is
    salted per process (PYTHONHASHSEED) since Python 3.3 — the same text got
    a different id on every app restart, so duplicate detection in the
    persistent store never worked across runs.  A content digest is stable.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "⚠ HTR y GT son obligatorios."
    try:
        digest = int(hashlib.sha1(htr_text.encode("utf-8")).hexdigest(), 16)
        pair_id = f"user_{digest % 100000:05d}"
        new_pair = {
            "id": pair_id,
            "htr": htr_text.strip(),
            "gt": gt_text.strip(),
            "type": doc_type or "desconocido",
            "region": region or "desconocida",
            "date": date or "",
            "caligrafia": caligrafia or "desconocida",
            "corrections": [],
            "source": "user_added",
        }
        # vs.index() reports whether the pair was actually added (dedup by id).
        added = vs.index([new_pair])
        if added:
            return f" Par aΓ±adido al corpus con id `{pair_id}`. Total: {vs.count()} docs."
        else:
            return f" Par ya existΓ­a en el corpus (id: `{pair_id}`)."
    except Exception as e:
        return f" Error: {e}"
# ── Gradio interface ──────────────────────────────────────────────────────────
# Declarative layout: four tabs — HTR correction, evaluation against
# groundtruth, corpus growth, and static system info.
with gr.Blocks(
    title="Scriptorium RAG",
    theme=gr.themes.Base(
        primary_hue="amber",
        secondary_hue="stone",
        neutral_hue="stone",
        font=gr.themes.GoogleFont("IM Fell English"),
    ),
    css="""
.header { text-align: center; padding: 20px 0 10px; }
.header h1 { font-size: 2.2em; color: #92400e; letter-spacing: 0.15em; }
.header p { color: #78716c; font-style: italic; }
.status-bar { font-size: 0.85em; padding: 6px 12px; border-radius: 6px; }
""",
) as demo:
    # ── Header banner ─────────────────────────────────────────────────────────
    gr.HTML("""
<div class="header">
<h1>RAG CODEX for Historical Spanish</h1>
<p>RAG system of Spanish correction from the 16th century</p>
</div>
""")
    with gr.Tabs():
        # ── Tab 1: HTR correction ─────────────────────────────────────────────
        with gr.TabItem(" HTR Correction"):
            with gr.Row():
                with gr.Column(scale=2):
                    htr_input = gr.Textbox(
                        label="HTR text (recognizer input)",
                        placeholder="Paste the HTR result here…",
                        lines=6,
                    )
                    with gr.Row():
                        top_k_slider = gr.Slider(
                            minimum=1, maximum=10, value=5, step=1,
                            label="Documents retrieved (k)",
                        )
                        model_selector = gr.Dropdown(
                            label="Modelo LLM",
                            choices=[
                                "llama-3.3-70b-versatile",
                                "openai/gpt-oss-120b",
                            ],
                            value="llama-3.3-70b-versatile",
                        )
                        embedding_selector = gr.Dropdown(
                            label="Modelo de Embedding",
                            choices=[
                                "openai",  # text-embedding-3-small
                                "mpnet",  # paraphrase-multilingual-mpnet-base-v2
                                "mt5-base fine-tuned",
                            ],
                            value="openai",
                        )
                    show_prompt = gr.Checkbox(label="Show RAG prompt", value=False)
                    btn_corregir = gr.Button("✦ Correct with RAG", variant="primary")
                    gr.Examples(
                        examples=DEMO_EXAMPLES,
                        inputs=htr_input,
                        label="Demonstration examples",
                    )
                with gr.Column(scale=2):
                    corrected_out = gr.Textbox(
                        label="Corrected text (RAG output)",
                        lines=6,
                        interactive=False,
                    )
                    status_out = gr.Markdown(elem_classes=["status-bar"])
            with gr.Row():
                with gr.Column():
                    docs_out = gr.Markdown(label="Documents recovered from the corpus")
                with gr.Column():
                    analysis_out = gr.Markdown(label="Pattern analysis")
            diff_out = gr.Markdown(label="Word-by-word differences")
            prompt_out = gr.Markdown(label="Prompt sent to the LLM", visible=True)
            # corregir must return one value per output component, in order.
            btn_corregir.click(
                fn=corregir,
                inputs=[htr_input, top_k_slider, show_prompt, model_selector],
                outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out, prompt_out],
            )
            embed_status = gr.Markdown()
            # Switching the embedding model rebuilds the vector store and
            # corrector (see cambiar_embedding above).
            embedding_selector.change(
                fn=cambiar_embedding,
                inputs=[embedding_selector],
                outputs=[embed_status],
            )
        # ── Tab 2: Evaluation against groundtruth ─────────────────────────────
        with gr.TabItem(" Evaluation with GT"):
            gr.Markdown("Compare the RAG correction against the actual groundtruth to measure CER/WER and detect modernisms.")
            with gr.Row():
                eval_htr = gr.Textbox(label="HTR text", lines=4)
                eval_gt = gr.Textbox(label="Groundtruth (reference)", lines=4)
            btn_eval = gr.Button("Evaluate", variant="primary")
            eval_out = gr.Markdown()
            btn_eval.click(fn=evaluar_par, inputs=[eval_htr, eval_gt], outputs=eval_out)
        # ── Tab 3: Add pairs to the corpus ────────────────────────────────────
        with gr.TabItem("βž• Add to corpus"):
            gr.Markdown("Add new pairs to the vector store to improve the RAG continuously.")
            with gr.Row():
                add_htr = gr.Textbox(label="Texto HTR", lines=4)
                add_gt = gr.Textbox(label="Groundtruth corregido", lines=4)
            with gr.Row():
                add_type = gr.Textbox(label="Document type", placeholder="notarial / judicial / eclesiastico")
                add_region = gr.Textbox(label="Region", placeholder="Castilla, AndalucΓ­a…")
                add_date = gr.Textbox(label="Date", placeholder="1542")
                add_caligrafia = gr.Dropdown(
                    label="CaligrafΓ­a",
                    choices=["desconocida", "procesal", "encadenada", "italica"],
                    value="desconocida",
                )
            btn_add = gr.Button("Add to corpus", variant="primary")
            add_out = gr.Markdown()
            btn_add.click(
                fn=add_to_corpus,
                inputs=[add_htr, add_gt, add_type, add_region, add_date, add_caligrafia],
                outputs=add_out,
            )
        # ── Tab 4: System info ────────────────────────────────────────────────
        with gr.TabItem("β„Ή System"):
            # NOTE(review): this f-string is evaluated once at import time, so
            # the document count shown here does not refresh as pairs are added.
            gr.Markdown(f"""
## System status
- **Modelo LLM:** {os.getenv('OPENAI_MODEL', 'gpt-4o')}
- **Vector store:** ChromaDB (persistente en `{os.getenv('CHROMA_PATH','./chroma_db')}`)
- **Documentos indexados:** {vs.count()}
- **Corpus cargado desde disco:** {len(disk_pairs)} pares
- **Pares de ejemplo embebidos:** {len(SAMPLE_PAIRS)}
## Arquitectura
```
Texto HTR
β”‚
β”œβ”€β–Ί Detector de patrones HTR (knowledge_base.py)
β”œβ”€β–Ί Detector de grafΓ­as modernas (knowledge_base.py)
β”‚
β”œβ”€β–Ί Embedding (text-embedding-3-small)
β”‚ β”‚
β”‚ └─► BΓΊsqueda top-k en ChromaDB ──► Few-shot dinΓ‘mico
β”‚
└─► Prompt constructor ──► GPT-4o ──► Texto corregido
```
## Formato del corpus
Para aΓ±adir tu corpus, crea `./corpus/` con ficheros JSON:
```json
[
{{"id": "doc001", "htr": "texto htr...", "gt": "groundtruth...",
"type": "notarial", "region": "Castilla", "date": "1542"}},
...
]
```
O CSV con columnas: `id, htr, gt, type, region, date`
""")
if __name__ == "__main__":
    # Security: the original hard-coded admin/admin credentials.  Allow
    # overriding user/password (and port) via environment variables; the
    # admin/admin fallback is kept for backward compatibility but is only
    # acceptable for local demos — set GRADIO_USER/GRADIO_PASS in production.
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("GRADIO_PORT", "7860")),
        auth=(
            os.getenv("GRADIO_USER", "admin"),
            os.getenv("GRADIO_PASS", "admin"),
        ),
        share=False,
        show_error=True,
    )