Spaces:
Sleeping
Sleeping
NSF RAG v1.0
Browse files- app.py +334 -0
- corpus_loader.py +103 -0
- evaluator.py +163 -0
- knowledge_base.py +124 -0
- rag_corrector.py +171 -0
- vector_store.py +153 -0
app.py
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app.py
|
| 3 |
+
──────
|
| 4 |
+
Interfaz web Gradio para el sistema RAG de corrección de castellano s.XVI.
|
| 5 |
+
|
| 6 |
+
Arranque:
|
| 7 |
+
python app.py
|
| 8 |
+
|
| 9 |
+
Requiere:
|
| 10 |
+
- .env con OPENAI_API_KEY
|
| 11 |
+
- (opcional) corpus en ./corpus/
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import json
|
| 16 |
+
import gradio as gr
|
| 17 |
+
from dotenv import load_dotenv
|
| 18 |
+
|
| 19 |
+
from knowledge_base import SAMPLE_PAIRS
|
| 20 |
+
from corpus_loader import CorpusLoader
|
| 21 |
+
from vector_store import VectorStore
|
| 22 |
+
from rag_corrector import RAGCorrector
|
| 23 |
+
from evaluator import Evaluator
|
| 24 |
+
|
| 25 |
+
load_dotenv()

# ── Initialisation ────────────────────────────────────────────────────────────
# Runs at import time: builds the vector store, loads the corpus and indexes
# everything before the Gradio UI below is constructed.

print("🚀 Inicializando Scriptorium RAG...")

vs = VectorStore()

# Load corpus pairs from disk (if the directory exists) and merge them with
# the sample pairs embedded in knowledge_base.py.
loader = CorpusLoader(os.getenv("CORPUS_PATH", "./corpus"))
disk_pairs = loader.load()
all_pairs = SAMPLE_PAIRS + disk_pairs

# Index every pair into ChromaDB (the store deduplicates by id — presumably;
# TODO confirm against vector_store.py).
vs.index(all_pairs)

corrector = RAGCorrector(vs)
evaluator = Evaluator()

print(f"✅ Sistema listo. Documentos en vector store: {vs.count()}")

# ── Demo examples ─────────────────────────────────────────────────────────────
# 16th-century Castilian snippets with typical HTR artefacts (notarial
# abbreviations like "dho", the q̃ tilde, ç, u/v alternation) that seed the
# gr.Examples widget in the first tab.

DEMO_EXAMPLES = [
    "q̃ fizo merçed al dho lugar de las alcaualas del anno de mill e quinientos",
    "el escriuano del cabildo faze fe y da testimouio verdadero de todo lo sobredho",
    "en la muy noble çibdad de burgos a veynte dias del mes de março anno dho",
    "yo juan de la torre vezino desta uilla de toledo otorgo e conosco",
    "sepan quantos esta carta de poder vieren como yo pero lopez vezino dela villa",
    "fizo pareçer ante si a los testigos q̃ dixeron ser mayores de veynte annos",
]
|
| 56 |
+
|
| 57 |
+
# ── Función principal ─────────────────────────────────────────────────────────
|
| 58 |
+
|
| 59 |
+
def corregir(htr_text: str, top_k: int, mostrar_prompt: bool):
    """Run the RAG correction pipeline and build the Markdown UI panels.

    Args:
        htr_text: raw HTR output to correct.
        top_k: number of documents to retrieve from the vector store.
        mostrar_prompt: when True, the RAG prompt sent to the LLM is appended
            to the retrieved-documents panel.

    Returns:
        Tuple of five strings: (corrected text, retrieved-docs markdown,
        analysis markdown, word-diff markdown, status line). On any error the
        first four are empty and the status line carries the message.
    """
    if not htr_text.strip():
        return "", "", "", "", "⚠ Introduce un texto HTR para corregir."

    if not os.getenv("OPENAI_API_KEY"):
        return "", "", "", "", "❌ Falta OPENAI_API_KEY en el fichero .env"

    try:
        result = corrector.correct(htr_text, top_k=int(top_k))
    except Exception as e:
        # Surface API failures in the status bar instead of crashing the UI.
        return "", "", "", "", f"❌ Error al llamar a la API: {e}"

    corrected = result["corrected"]
    retrieved = result["retrieved"]
    htr_errors = result["htr_errors"]
    grafia_w = result["grafia_warns"]

    # ── Retrieved-documents panel ─────────────────────────────────────────────
    docs_md = f"### Top-{len(retrieved)} documentos recuperados\n\n"
    for i, doc in enumerate(retrieved, 1):
        docs_md += (
            f"**{i}. [{doc['type']} · {doc['region']} · {doc['date']}]** "
            f"*similitud: {doc['score']}*\n\n"
            f"- **HTR:** `{doc['htr']}`\n"
            f"- **GT:** `{doc['gt']}`\n"
        )
        if doc["corrections"]:
            docs_md += f"- **Correcciones:** {', '.join(doc['corrections'])}\n"
        docs_md += "\n---\n"

    # BUGFIX: previously the prompt was formatted into a separate prompt_md
    # value that was never returned (the click handler only wires 5 outputs),
    # so the "Mostrar prompt RAG" checkbox had no visible effect.  Append the
    # prompt to the docs panel instead, keeping the 5-output interface intact.
    if mostrar_prompt:
        docs_md += (
            f"\n### Prompt RAG\n```\nSYSTEM:\n"
            f"{result.get('_system', '(ver rag_corrector.py)')}\n\n"
            f"USER:\n{result['prompt']}\n```"
        )

    # ── Analysis panel ────────────────────────────────────────────────────────
    analysis_md = "### Análisis del texto\n\n"

    if htr_errors:
        analysis_md += "**⚠ Posibles errores HTR detectados:**\n"
        for e in htr_errors:
            analysis_md += f"- `{e['htr']}` → `{e['gt']}`: {e['context']} \n  *Ej: {e['example']}*\n"
        analysis_md += "\n"

    if grafia_w:
        analysis_md += "**✦ Alertas de grafía (NO modernizar):**\n"
        for g in grafia_w:
            analysis_md += f"- `{g['modern']}` → mantener `{g['ancient']}`: {g['rule']}\n"
        analysis_md += "\n"

    if not htr_errors and not grafia_w:
        analysis_md += "*No se detectaron patrones conocidos de error en el texto.*\n"

    # ── Word-by-word diff ─────────────────────────────────────────────────────
    # Positional comparison only: an insertion/deletion shifts all following
    # words and marks them changed.  Kept as-is; it is a visual aid, not a
    # metric (CER/WER live in evaluator.py).
    diff_md = "### Diferencias HTR → Corregido\n\n"
    orig_words = htr_text.split()
    corr_words = corrected.split()
    diff_parts = []
    max_len = max(len(orig_words), len(corr_words))
    changed = 0
    for i in range(max_len):
        o = orig_words[i] if i < len(orig_words) else "—"
        c = corr_words[i] if i < len(corr_words) else "—"
        if o != c:
            diff_parts.append(f"~~{o}~~ → **{c}**")
            changed += 1
        else:
            diff_parts.append(c)
    diff_md += " ".join(diff_parts)
    diff_md += f"\n\n*{changed} palabra(s) modificada(s) de {len(orig_words)} totales.*"

    status = f"✅ Corrección completada con **{result['model']}** · {vs.count()} docs en índice"

    return corrected, docs_md, analysis_md, diff_md, status
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def evaluar_par(htr_text: str, gt_text: str):
    """Correct *htr_text* with the RAG pipeline and score it against *gt_text*.

    Returns a Markdown report with CER/WER before/after correction and any
    modernisms the LLM introduced, or an error/warning string.
    """
    if not htr_text.strip() or not gt_text.strip():
        return "⚠ Introduce tanto el texto HTR como el groundtruth."
    try:
        rag_out = corrector.correct(htr_text)
        m = evaluator.evaluate_pair(htr_text, rag_out["corrected"], gt_text)
        mod = m["modernism"]

        # Assemble the report as parts and join once at the end.
        parts = [
            "### Métricas de evaluación\n\n",
            "| Métrica | Antes (HTR) | Después (RAG) | Mejora |\n",
            "|---------|------------|---------------|--------|\n",
            f"| **CER** | {m['cer_before']:.2%} | {m['cer_after']:.2%} | {m['cer_improvement']:+.2%} |\n",
            f"| **WER** | {m['wer_before']:.2%} | {m['wer_after']:.2%} | {m['wer_improvement']:+.2%} |\n\n",
            f"**Detector de modernismos:** score={mod['score']:.2f} ",
            f"({mod['count']} problema(s) detectado(s))\n",
        ]
        if mod["issues"]:
            parts.append("\nFormas modernas introducidas incorrectamente:\n")
            parts.extend(
                f"- `{iss['modern']}` (debería ser `{iss['ancient']}`): {iss['rule']}\n"
                for iss in mod["issues"]
            )
        parts.append(f"\n**Texto corregido por RAG:**\n> {rag_out['corrected']}")
        return "".join(parts)
    except Exception as e:
        return f"❌ Error: {e}"
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def add_to_corpus(htr_text: str, gt_text: str, doc_type: str, region: str, date: str):
    """Add a user-supplied (HTR, groundtruth) pair to the vector store.

    Args:
        htr_text: raw HTR text (required).
        gt_text: corrected groundtruth (required).
        doc_type / region / date: optional metadata; defaults are filled in.

    Returns:
        A status string for the UI (success, duplicate, or error message).
    """
    import hashlib  # local import: only this handler needs it

    if not htr_text.strip() or not gt_text.strip():
        return "⚠ HTR y GT son obligatorios."
    try:
        # BUGFIX: the id previously used built-in hash(), which is salted per
        # process (PYTHONHASHSEED), so the same pair got a different id on
        # every restart and the "already existed" check never fired across
        # sessions.  Use a stable content digest instead.
        digest = int(hashlib.sha1(htr_text.encode("utf-8")).hexdigest(), 16)
        pair_id = f"user_{digest % 100000:05d}"
        new_pair = {
            "id": pair_id,
            "htr": htr_text.strip(),
            "gt": gt_text.strip(),
            "type": doc_type or "desconocido",
            "region": region or "desconocida",
            "date": date or "",
            "corrections": [],
            "source": "user_added",
        }
        added = vs.index([new_pair])
        if added:
            return f"✅ Par añadido al corpus con id `{pair_id}`. Total: {vs.count()} docs."
        else:
            return f"ℹ Par ya existía en el corpus (id: `{pair_id}`)."
    except Exception as e:
        return f"❌ Error: {e}"
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
# ── Interfaz Gradio ───────────────────────────────────────────────────────────
|
| 188 |
+
|
| 189 |
+
with gr.Blocks(
    title="Scriptorium RAG",
    theme=gr.themes.Base(
        primary_hue="amber",
        secondary_hue="stone",
        neutral_hue="stone",
        font=gr.themes.GoogleFont("IM Fell English"),
    ),
    css="""
    .header { text-align: center; padding: 20px 0 10px; }
    .header h1 { font-size: 2.2em; color: #92400e; letter-spacing: 0.15em; }
    .header p { color: #78716c; font-style: italic; }
    .status-bar { font-size: 0.85em; padding: 6px 12px; border-radius: 6px; }
    """,
) as demo:

    # ── Header banner (static HTML) ───────────────────────────────────────────
    gr.HTML("""
    <div class="header">
        <h1>RAG CODEX for NSF project</h1>
        <p>Sistema RAG de corrección de castellano del siglo XVI</p>
    </div>
    """)

    with gr.Tabs():

        # ── Tab 1: HTR correction ─────────────────────────────────────────────
        with gr.TabItem(" Corrección HTR"):
            with gr.Row():
                with gr.Column(scale=2):
                    htr_input = gr.Textbox(
                        label="Texto HTR (entrada del reconocedor)",
                        placeholder="Pega aquí el texto resultado del HTR…",
                        lines=6,
                    )
                    with gr.Row():
                        top_k_slider = gr.Slider(
                            minimum=1, maximum=10, value=5, step=1,
                            label="Documentos recuperados (k)",
                        )
                        show_prompt = gr.Checkbox(label="Mostrar prompt RAG", value=False)
                    btn_corregir = gr.Button("✦ Corregir con RAG", variant="primary")

                    # Pre-filled demo inputs defined at module level.
                    gr.Examples(
                        examples=DEMO_EXAMPLES,
                        inputs=htr_input,
                        label="Ejemplos de demostración",
                    )

                with gr.Column(scale=2):
                    corrected_out = gr.Textbox(
                        label="Texto corregido (salida RAG)",
                        lines=6,
                        interactive=False,
                    )
                    status_out = gr.Markdown(elem_classes=["status-bar"])

            with gr.Row():
                with gr.Column():
                    docs_out = gr.Markdown(label="Documentos recuperados del corpus")
                with gr.Column():
                    analysis_out = gr.Markdown(label="Análisis de patrones")

            diff_out = gr.Markdown(label="Diferencias palabra a palabra")
            # NOTE(review): prompt_out is created but never listed in the
            # click outputs below, so the "Mostrar prompt RAG" checkbox has
            # no visible effect — confirm the intended wiring.
            prompt_out = gr.Markdown(label="Prompt enviado al LLM", visible=False)

            btn_corregir.click(
                fn=corregir,
                inputs=[htr_input, top_k_slider, show_prompt],
                outputs=[corrected_out, docs_out, analysis_out, diff_out, status_out],
            )

        # ── Tab 2: Evaluation against groundtruth ─────────────────────────────
        with gr.TabItem(" Evaluación con GT"):
            gr.Markdown("Compara la corrección RAG contra el groundtruth real para medir CER/WER y detectar modernismos.")
            with gr.Row():
                eval_htr = gr.Textbox(label="Texto HTR", lines=4)
                eval_gt = gr.Textbox(label="Groundtruth (referencia)", lines=4)
            btn_eval = gr.Button("Evaluar", variant="primary")
            eval_out = gr.Markdown()
            btn_eval.click(fn=evaluar_par, inputs=[eval_htr, eval_gt], outputs=eval_out)

        # ── Tab 3: Add pairs to the corpus ────────────────────────────────────
        with gr.TabItem("➕ Añadir al corpus"):
            gr.Markdown("Añade nuevos pares al vector store para mejorar el RAG de forma continua.")
            with gr.Row():
                add_htr = gr.Textbox(label="Texto HTR", lines=4)
                add_gt = gr.Textbox(label="Groundtruth corregido", lines=4)
            with gr.Row():
                add_type = gr.Textbox(label="Tipo documental", placeholder="notarial / judicial / eclesiastico")
                add_region = gr.Textbox(label="Región", placeholder="Castilla, Andalucía…")
                add_date = gr.Textbox(label="Fecha", placeholder="1542")
            btn_add = gr.Button("Añadir al corpus", variant="primary")
            add_out = gr.Markdown()
            btn_add.click(
                fn=add_to_corpus,
                inputs=[add_htr, add_gt, add_type, add_region, add_date],
                outputs=add_out,
            )

        # ── Tab 4: System info ────────────────────────────────────────────────
        # Static snapshot: the f-string is evaluated once at UI build time, so
        # the document count does not refresh as pairs are added.
        with gr.TabItem("ℹ Sistema"):
            gr.Markdown(f"""
            ## Estado del sistema

            - **Modelo LLM:** {os.getenv('OPENAI_MODEL', 'gpt-4o')}
            - **Vector store:** ChromaDB (persistente en `{os.getenv('CHROMA_PATH','./chroma_db')}`)
            - **Documentos indexados:** {vs.count()}
            - **Corpus cargado desde disco:** {len(disk_pairs)} pares
            - **Pares de ejemplo embebidos:** {len(SAMPLE_PAIRS)}

            ## Arquitectura

            ```
            Texto HTR
                │
                ├─► Detector de patrones HTR  (knowledge_base.py)
                ├─► Detector de grafías modernas (knowledge_base.py)
                │
                ├─► Embedding (text-embedding-3-small)
                │        │
                │        └─► Búsqueda top-k en ChromaDB ──► Few-shot dinámico
                │
                └─► Prompt constructor ──► GPT-4o ──► Texto corregido
            ```

            ## Formato del corpus

            Para añadir tu corpus, crea `./corpus/` con ficheros JSON:
            ```json
            [
              {{"id": "doc001", "htr": "texto htr...", "gt": "groundtruth...",
                "type": "notarial", "region": "Castilla", "date": "1542"}},
              ...
            ]
            ```
            O CSV con columnas: `id, htr, gt, type, region, date`
            """)

if __name__ == "__main__":
    # 0.0.0.0 so the app is reachable inside a container (HF Spaces default).
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
|
corpus_loader.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
corpus_loader.py
|
| 3 |
+
────────────────
|
| 4 |
+
Carga pares (HTR, groundtruth) desde disco.
|
| 5 |
+
|
| 6 |
+
Formatos soportados:
|
| 7 |
+
1. JSON → lista de objetos {"id","htr","gt","type","region","date"}
|
| 8 |
+
2. CSV → columnas: id, htr, gt, type, region, date
|
| 9 |
+
3. TXT → carpeta con ficheros *.htr.txt y *.gt.txt (mismo nombre base)
|
| 10 |
+
|
| 11 |
+
Uso:
|
| 12 |
+
from corpus_loader import CorpusLoader
|
| 13 |
+
loader = CorpusLoader("./corpus")
|
| 14 |
+
pairs = loader.load() # lista de dicts
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
import csv
|
| 19 |
+
import os
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from typing import List, Dict
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class CorpusLoader:
    """Loads (HTR, groundtruth) pairs from a corpus directory.

    Supported formats:
        1. JSON — a list of pair objects, or a single pair object.
        2. CSV  — columns: id, htr, gt, type, region, date.
        3. TXT  — paired files ``<name>.htr.txt`` / ``<name>.gt.txt``.
    """

    def __init__(self, corpus_path: str):
        # Directory scanned by load(); may not exist.
        self.corpus_path = Path(corpus_path)

    def load(self) -> List[Dict]:
        """Detect format(s) and load every available pair.

        Per-file errors are logged and skipped; returns [] when the corpus
        directory is missing.
        """
        pairs = []

        if not self.corpus_path.exists():
            print(f"⚠ Corpus path '{self.corpus_path}' no existe. Usando pares de ejemplo.")
            return []

        # ── JSON ──────────────────────────────────────────────────────────────
        for f in sorted(self.corpus_path.glob("*.json")):
            try:
                with open(f, encoding="utf-8") as fh:
                    data = json.load(fh)
                if isinstance(data, list):
                    loaded = self._normalize(data, source=f.stem)
                elif isinstance(data, dict):  # a single document
                    loaded = [self._normalize_one(data, source=f.stem)]
                else:
                    # BUGFIX: other JSON structures previously fell through and
                    # still printed a success message.
                    raise ValueError(f"estructura JSON no soportada: {type(data).__name__}")
                pairs.extend(loaded)
                # BUGFIX: the old message reported len(data), which for a dict
                # is its key count and for a list ignores filtered rows.
                print(f"✅ JSON cargado: {f.name} ({len(loaded)} pares)")
            except Exception as e:
                print(f"❌ Error leyendo {f.name}: {e}")

        # ── CSV ───────────────────────────────────────────────────────────────
        for f in sorted(self.corpus_path.glob("*.csv")):
            try:
                with open(f, encoding="utf-8", newline="") as fh:
                    reader = csv.DictReader(fh)
                    rows = list(reader)
                pairs.extend(self._normalize(rows, source=f.stem))
                print(f"✅ CSV cargado: {f.name} ({len(rows)} pares)")
            except Exception as e:
                print(f"❌ Error leyendo {f.name}: {e}")

        # ── Paired TXT ────────────────────────────────────────────────────────
        htr_files = sorted(self.corpus_path.glob("*.htr.txt"))
        for htr_file in htr_files:
            # "name.htr.txt" → "name.htr" → "name.gt.txt"
            gt_file = htr_file.with_suffix("").with_suffix(".gt.txt")
            if not gt_file.exists():
                print(f"⚠ Sin GT para {htr_file.name}, omitido.")
                continue
            try:
                htr_text = htr_file.read_text(encoding="utf-8").strip()
                gt_text = gt_file.read_text(encoding="utf-8").strip()
                # BUGFIX: stem.replace(".htr", "") removed ".htr" anywhere in
                # the filename; strip only the trailing extension.
                base_id = htr_file.name[: -len(".htr.txt")]
                pairs.append({
                    "id": base_id,
                    "htr": htr_text,
                    "gt": gt_text,
                    "type": "desconocido",
                    "region": "desconocida",
                    "date": "",
                    "source": "txt",
                })
            except Exception as e:
                print(f"❌ Error leyendo {htr_file.name}: {e}")

        if htr_files:
            print(f"✅ TXT cargados: {len(htr_files)} pares")

        print(f"\n📚 Total pares cargados desde disco: {len(pairs)}")
        return pairs

    # ── helpers ───────────────────────────────────────────────────────────────

    def _normalize(self, rows: List[Dict], source: str) -> List[Dict]:
        """Normalize a list of raw rows, dropping rows without htr or gt."""
        return [self._normalize_one(r, source) for r in rows if r.get("htr") and r.get("gt")]

    def _normalize_one(self, row: Dict, source: str) -> Dict:
        """Coerce one raw row into the canonical pair dict, filling defaults."""
        return {
            "id": str(row.get("id", source)),
            "htr": str(row.get("htr", "")).strip(),
            "gt": str(row.get("gt", "")).strip(),
            "type": str(row.get("type", "desconocido")),
            "region": str(row.get("region", "desconocida")),
            "date": str(row.get("date", "")),
            "source": source,
            "corrections": row.get("corrections", []),
        }
|
evaluator.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
evaluator.py
|
| 3 |
+
────────────
|
| 4 |
+
Métricas de calidad para las correcciones del sistema RAG.
|
| 5 |
+
|
| 6 |
+
- CER (Character Error Rate) : nivel de carácter
|
| 7 |
+
- WER (Word Error Rate) : nivel de palabra
|
| 8 |
+
- Modernism score : penalización por formas modernas introducidas
|
| 9 |
+
- Batch evaluation : evalúa el sistema sobre un conjunto de pares con GT
|
| 10 |
+
|
| 11 |
+
Uso:
|
| 12 |
+
from evaluator import Evaluator
|
| 13 |
+
ev = Evaluator()
|
| 14 |
+
metrics = ev.evaluate_pair(htr="...", corrected="...", gt="...")
|
| 15 |
+
report = ev.batch_evaluate(corrector, pairs[:50])
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import re
|
| 19 |
+
from typing import List, Dict, Tuple
|
| 20 |
+
from knowledge_base import GRAFIA_PATTERNS
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class Evaluator:
    """Quality metrics for RAG corrections: CER, WER and a modernism penalty."""

    # ── Edit-distance metrics ────────────────────────────────────────────────

    @staticmethod
    def cer(reference: str, hypothesis: str) -> float:
        """Character Error Rate: char-level Levenshtein / max(len(reference), 1)."""
        r, h = list(reference), list(hypothesis)
        return Evaluator._levenshtein(r, h) / max(len(r), 1)

    @staticmethod
    def wer(reference: str, hypothesis: str) -> float:
        """Word Error Rate: word-level Levenshtein / max(word count of reference, 1)."""
        r = reference.split()
        h = hypothesis.split()
        return Evaluator._levenshtein(r, h) / max(len(r), 1)

    @staticmethod
    def _levenshtein(seq1: list, seq2: list) -> int:
        """Edit distance between two sequences (two-row DP, O(len(seq2)) memory)."""
        m, n = len(seq1), len(seq2)
        dp = list(range(n + 1))
        for i in range(1, m + 1):
            prev = dp[:]
            dp[0] = i
            for j in range(1, n + 1):
                if seq1[i - 1] == seq2[j - 1]:
                    dp[j] = prev[j - 1]
                else:
                    dp[j] = 1 + min(prev[j], dp[j - 1], prev[j - 1])
        return dp[n]

    # ── Modernism detector ───────────────────────────────────────────────────

    @staticmethod
    def modernism_penalty(original_htr: str, corrected: str) -> Dict:
        """Detect modern spellings the LLM introduced that were not in the HTR.

        Returns a dict with ``count``, ``issues`` (list of pattern dicts) and
        ``score`` in [0, 1] — each issue subtracts 0.1.
        """
        issues = []
        orig_lower = original_htr.lower()
        corr_lower = corrected.lower()

        for p in GRAFIA_PATTERNS:
            modern = p["modern"].lower()
            ancient_forms = [f.strip().lower() for f in p["ancient"].split("/")]

            # Flag only when the corrected text gained the modern form AND the
            # original had neither that form nor a valid ancient variant.
            if modern in corr_lower and modern not in orig_lower:
                if not any(af in orig_lower for af in ancient_forms):
                    issues.append({
                        "modern": p["modern"],
                        "ancient": p["ancient"],
                        "rule": p["rule"],
                    })

        return {
            "count": len(issues),
            "issues": issues,
            "score": max(0.0, 1.0 - len(issues) * 0.1),  # clamped to 0.0–1.0
        }

    # ── Single-pair evaluation ───────────────────────────────────────────────

    def evaluate_pair(
        self, htr: str, corrected: str, gt: str
    ) -> Dict:
        """Score one correction against its groundtruth.

        Returns CER/WER before (HTR vs GT) and after (corrected vs GT),
        the improvement deltas, and the modernism report.
        """
        cer_htr = self.cer(gt, htr)          # CER before correcting
        cer_corr = self.cer(gt, corrected)   # CER after correcting
        wer_htr = self.wer(gt, htr)
        wer_corr = self.wer(gt, corrected)

        modernism = self.modernism_penalty(htr, corrected)

        return {
            "cer_before": round(cer_htr, 4),
            "cer_after": round(cer_corr, 4),
            "cer_improvement": round(cer_htr - cer_corr, 4),
            "wer_before": round(wer_htr, 4),
            "wer_after": round(wer_corr, 4),
            "wer_improvement": round(wer_htr - wer_corr, 4),
            "modernism": modernism,
        }

    # ── Batch evaluation ─────────────────────────────────────────────────────

    def batch_evaluate(
        self, corrector, pairs: List[Dict], verbose: bool = True
    ) -> Dict:
        """Evaluate the system over a list of pairs with groundtruth.

        Per-pair errors are logged and skipped.  Returns aggregate averages
        plus the per-pair detail, or {"error": ...} if nothing succeeded.
        """
        results = []
        for i, pair in enumerate(pairs):
            if verbose:
                print(f"  Evaluando {i+1}/{len(pairs)}: {pair['id']}")
            try:
                out = corrector.correct(pair["htr"])
                metrics = self.evaluate_pair(
                    htr=pair["htr"],
                    corrected=out["corrected"],
                    gt=pair["gt"],
                )
                metrics["id"] = pair["id"]
                metrics["htr"] = pair["htr"]
                metrics["corrected"] = out["corrected"]
                metrics["gt"] = pair["gt"]
                results.append(metrics)
            except Exception as e:
                print(f"  ❌ Error en {pair['id']}: {e}")

        if not results:
            return {"error": "Sin resultados"}

        def avg(key: str) -> float:
            return round(sum(r[key] for r in results) / len(results), 4)

        summary = {
            "n_evaluated": len(results),
            "avg_cer_before": avg("cer_before"),
            "avg_cer_after": avg("cer_after"),
            "avg_cer_improvement": avg("cer_improvement"),
            "avg_wer_before": avg("wer_before"),
            "avg_wer_after": avg("wer_after"),
            "avg_wer_improvement": avg("wer_improvement"),
            # BUGFIX: the old code did avg("modernism"), but r["modernism"] is
            # a dict, so sum() raised TypeError.  Average the nested score.
            "avg_modernism_score": round(
                sum(r["modernism"]["score"] for r in results) / len(results), 4
            ),
            "detail": results,
        }

        if verbose:
            print(f"\n📊 RESUMEN EVALUACIÓN ({len(results)} pares)")
            print(f"  CER: {summary['avg_cer_before']:.2%} → {summary['avg_cer_after']:.2%} "
                  f"(mejora: {summary['avg_cer_improvement']:+.2%})")
            print(f"  WER: {summary['avg_wer_before']:.2%} → {summary['avg_wer_after']:.2%} "
                  f"(mejora: {summary['avg_wer_improvement']:+.2%})")

        return summary
|
knowledge_base.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
knowledge_base.py
|
| 3 |
+
─────────────────
|
| 4 |
+
Patrones de error HTR y grafías del castellano s.XVI.
|
| 5 |
+
Edita estas listas para ampliar la base de conocimiento estática.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# ── Common visual confusions in HTR over procesal / gothic scripts ────────────
# Each entry: "htr" = character(s) the recogniser produces, "gt" = the
# intended character(s), "context" = why the confusion occurs, "example" =
# an illustrative correction.  (Spanish field values are runtime data shown
# in the UI — do not translate.)
HTR_ERROR_PATTERNS = [
    {"htr": "u", "gt": "n", "context": "confusión visual u/n en minúscula gótica", "example": "uuestro → nuestro"},
    {"htr": "n", "gt": "u", "context": "confusión visual n/u en procesal", "example": "sno → suo (raro)"},
    {"htr": "c", "gt": "e", "context": "confusión c/e en cursiva procesal", "example": "merçcd → merçed"},
    {"htr": "rn", "gt": "m", "context": "rn interpretado como m por el modelo", "example": "cornprar → comprar"},
    {"htr": "ii", "gt": "u", "context": "doble i confundida con u", "example": "diios → duos"},
    {"htr": "cl", "gt": "d", "context": "cl confundido con d en procesal", "example": "clado → dado"},
    {"htr": "f", "gt": "s", "context": "s larga (ſ) leída como f por el modelo", "example": "feñor → señor"},
    {"htr": "1", "gt": "l", "context": "1 numérico confundido con l minúscula", "example": "1ugar → lugar"},
    {"htr": "i", "gt": "j", "context": "i/j sin distinción gráfica en s.XVI", "example": "iusticia → justicia"},
    {"htr": "ó", "gt": "a", "context": "a cerrada interpretada como o", "example": "cóso → caso"},
    {"htr": "ll", "gt": "h", "context": "ll confundida con h en algunas manos", "example": "llonra → honra"},
    {"htr": "vn", "gt": "un", "context": "v usada como u consonántica al inicio de sílaba", "example": "vno → uno"},
    {"htr": "e", "gt": "c", "context": "c abierta leída como e", "example": "etro → otro"},
    {"htr": "o", "gt": "a", "context": "a redondeada confundida con o", "example": "los → las"},
]

# ── 16th-century Castilian spellings that must NOT be modernised ──────────────
# "modern" = today's spelling, "ancient" = the period form(s), "/"-separated
# when several variants are acceptable; "rule" = the philological rationale.
# Consumed by Evaluator.modernism_penalty and the analysis panel.
GRAFIA_PATTERNS = [
    {"modern": "hizo", "ancient": "fizo", "rule": "f- inicial latina ante diptongo ie/ue"},
    {"modern": "hijo", "ancient": "fijo", "rule": "f- inicial conservada"},
    {"modern": "hacer", "ancient": "fazer / hazer", "rule": "variación f/h en infinitivos"},
    {"modern": "dicho", "ancient": "dho / dicho", "rule": "abreviatura notarial dho"},
    {"modern": "merced", "ancient": "merçed", "rule": "cedilla ante e/i"},
    {"modern": "que", "ancient": "q̃ / que", "rule": "abreviatura con tilde volada"},
    {"modern": "porque", "ancient": "porq̃ / porque", "rule": "abreviatura frecuente"},
    {"modern": "vecino", "ancient": "vezino", "rule": "z/c ante vocal anterior"},
    {"modern": "precio", "ancient": "presçio", "rule": "variación s/c+cedilla"},
    {"modern": "ciudad", "ancient": "çibdad / cibdad", "rule": "forma medieval con b/v"},
    {"modern": "mucho", "ancient": "muncho", "rule": "nasalización muncho/mucho"},
    {"modern": "mismo", "ancient": "mesmo", "rule": "mesmo forma habitual s.XVI"},
    {"modern": "también", "ancient": "tanbién", "rule": "asimilación nasal"},
    {"modern": "escribano", "ancient": "escriuano", "rule": "v/u gráfica"},
    {"modern": "nuestro", "ancient": "nuestro / nro", "rule": "abreviatura nro en notarial"},
    {"modern": "señor", "ancient": "señor / sr", "rule": "abreviatura sr"},
    {"modern": "dicho", "ancient": "dcho / dho", "rule": "doble abreviatura notarial"},
    {"modern": "tienen", "ancient": "tienen / tienẽ", "rule": "tilde abreviativa nasal final"},
    {"modern": "lugar", "ancient": "lugar / lug̃r", "rule": "abreviatura con tilde"},
    {"modern": "presente", "ancient": "prezente / presente", "rule": "alternancia s/z"},
]
|
| 49 |
+
|
| 50 |
+
# ── Embedded sample documents (extended with the real corpus) ─────────────────
# Small built-in set of HTR/ground-truth pairs used to seed the vector store.
# Each entry carries:
#   id          : unique identifier (used as the vector-store document id)
#   type        : document genre (notarial / judicial / eclesiastico)
#   region      : geographic origin of the document
#   date        : year, as a string
#   htr         : raw HTR output line (may contain recognition errors)
#   gt          : reference transcription for the same line
#   corrections : human-readable list of the fixes from htr to gt
# All text values are period Spanish and are runtime data — kept verbatim.
SAMPLE_PAIRS = [
    {
        "id": "sample_001",
        "type": "notarial",
        "region": "Castilla",
        "date": "1542",
        "htr": "q̃ fizo merçed al dho lugar de las alcaualas",
        "gt": "que fizo merçed al dicho lugar de las alcaualas",
        "corrections": ["dho → dicho (abreviatura expandida)"],
    },
    {
        "id": "sample_002",
        "type": "notarial",
        "region": "Andalucía",
        "date": "1561",
        "htr": "el escriuano del cabildo faze fe y da testimouio verdadero",
        "gt": "el escriuano del cabildo faze fe y da testimonio verdadero",
        "corrections": ["testimouio → testimonio (u/n confusión HTR)"],
    },
    {
        "id": "sample_003",
        "type": "judicial",
        "region": "Castilla",
        "date": "1534",
        "htr": "en la muy noble çibdad de burgos a veynte dias del mes de março",
        "gt": "en la muy noble çibdad de burgos a veynte días del mes de março",
        "corrections": ["días: acento restituido"],
    },
    {
        "id": "sample_004",
        "type": "eclesiastico",
        "region": "Toledo",
        "date": "1578",
        "htr": "el vezino dela villa de alcala prezento ante nos vna peticion",
        "gt": "el vezino de la villa de alcalá presentó ante nos una petición",
        "corrections": ["dela → de la", "alcala → alcalá", "prezento → presentó", "vna → una"],
    },
    {
        "id": "sample_005",
        "type": "notarial",
        "region": "Extremadura",
        "date": "1549",
        "htr": "yo el sobredho escriuano doy fe q̃ conosco al otorgante",
        "gt": "yo el sobredicho escriuano doy fe que conosco al otorgante",
        "corrections": ["sobredho → sobredicho", "q̃ → que"],
    },
    {
        "id": "sample_006",
        "type": "judicial",
        "region": "Castilla",
        "date": "1556",
        "htr": "fizo pareçer ante si a juan de la torre vezino desta villa",
        "gt": "fizo pareçer ante sí a juan de la torre vezino desta villa",
        "corrections": ["si → sí (acento diacrítico)"],
    },
    {
        "id": "sample_007",
        "type": "notarial",
        "region": "Aragón",
        "date": "1523",
        "htr": "sepan quantos esta carta de poder vieren como yo pero lopez",
        "gt": "sepan quantos esta carta de poder vieren como yo pero lópez",
        "corrections": ["lopez → lópez (acento)"],
    },
    {
        "id": "sample_008",
        "type": "eclesiastico",
        "region": "Sevilla",
        "date": "1587",
        "htr": "en el nonbre de dios e de la virgen maria su madre amen",
        "gt": "en el nonbre de dios e de la virgen maría su madre amén",
        "corrections": ["maría, amén: acentos restituidos"],
    },
]
|
rag_corrector.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
rag_corrector.py
|
| 3 |
+
────────────────
|
| 4 |
+
Núcleo del sistema RAG.
|
| 5 |
+
|
| 6 |
+
1. Detecta posibles errores HTR y grafías modernas en el texto de entrada.
|
| 7 |
+
2. Recupera ejemplos similares del vector store (few-shot dinámico).
|
| 8 |
+
3. Construye el prompt con reglas, ejemplos y alertas.
|
| 9 |
+
4. Llama a GPT-4o y devuelve el texto corregido + trazabilidad.
|
| 10 |
+
|
| 11 |
+
Uso:
|
| 12 |
+
from rag_corrector import RAGCorrector
|
| 13 |
+
corrector = RAGCorrector(vector_store)
|
| 14 |
+
result = corrector.correct("texto htr aqui")
|
| 15 |
+
print(result["corrected"])
|
| 16 |
+
print(result["prompt"]) # para depuración
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
from typing import List, Dict, Tuple
|
| 21 |
+
from openai import OpenAI
|
| 22 |
+
from dotenv import load_dotenv
|
| 23 |
+
|
| 24 |
+
from knowledge_base import HTR_ERROR_PATTERNS, GRAFIA_PATTERNS
|
| 25 |
+
|
| 26 |
+
load_dotenv()
|
| 27 |
+
|
| 28 |
+
# ── Configuration (overridable via environment variables) ─────────────────────
MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")   # chat model used for correction
TOP_K = int(os.getenv("TOP_K", "5"))          # default number of retrieved examples

# System prompt instructing the LLM to fix HTR errors only and never modernise
# 16th-century orthography.  Runtime string — kept verbatim, in Spanish.
SYSTEM_PROMPT = """Eres un corrector especializado en documentos notariales y judiciales \
españoles del siglo XVI (castellano antiguo).

Tu ÚNICA tarea es corregir los errores introducidos por el proceso automático de \
reconocimiento de texto manuscrito (HTR). NO debes modernizar el texto bajo ninguna \
circunstancia.

REGLAS ABSOLUTAS — incumplirlas invalida la corrección:
1. Conserva SIEMPRE las grafías propias del s.XVI:
   fizo, fazer, hazer, merçed, vezino, mesmo, çibdad, escriuano,
   dho (=dicho), q̃ (=que), nro (=nuestro), vn/vna, etc.
2. NO conviertas f→h inicial latina (fizo ≠ hizo, fazer ≠ hacer).
3. Conserva abreviaturas y tildes voladas (q̃, nro, dho, sr).
4. Corrige SOLO lo que claramente sea un error HTR (grafema confundido visualmente).
5. Si no estás seguro de si algo es error HTR o forma s.XVI válida → conserva el original.
6. Responde ÚNICAMENTE con el texto corregido. Sin explicaciones, sin comillas."""


class RAGCorrector:
    """Core of the RAG pipeline: detect patterns → retrieve examples → prompt → correct."""

    def __init__(self, vector_store):
        """
        Args:
            vector_store: object exposing ``retrieve(text, k)`` that returns a
                list of dicts with htr, gt, type, region, date, corrections
                and score keys (see vector_store.VectorStore).
        """
        self.vs = vector_store
        # OpenAI-compatible client.  The base URL defaults to the x.ai
        # endpoint; set OPENAI_BASE_URL to target another provider.
        self.client = OpenAI(
            api_key=os.getenv("OPENAI_API_KEY"),
            base_url=os.getenv("OPENAI_BASE_URL", "https://api.x.ai/v1"),
        )

    # ── Public API ───────────────────────────────────────────────────────────

    def correct(self, htr_text: str, top_k: int = TOP_K) -> Dict:
        """
        Correct an HTR text using RAG.

        Returns a dict with:
            corrected    : str  — corrected text
            prompt       : str  — full prompt sent to the LLM (for debugging)
            retrieved    : list — documents retrieved from the vector store
            htr_errors   : list — HTR error patterns detected in the input
            grafia_warns : list — modern spellings detected (warnings)
            model        : str  — model name used
        """
        retrieved = self.vs.retrieve(htr_text, k=top_k)
        htr_errors = self._detect_htr_errors(htr_text)
        grafia_warns = self._detect_grafias(htr_text)

        prompt = self._build_prompt(htr_text, retrieved, htr_errors, grafia_warns)
        corrected = self._call_llm(prompt)

        return {
            "corrected": corrected,
            "prompt": prompt,
            "retrieved": retrieved,
            "htr_errors": htr_errors,
            "grafia_warns": grafia_warns,
            "model": MODEL,
        }

    # ── Pattern detection ────────────────────────────────────────────────────

    def _detect_htr_errors(self, text: str) -> List[Dict]:
        """Return the known HTR confusion patterns whose 'htr' string occurs in *text*.

        NOTE(review): this is a plain substring match, so single-letter
        patterns (e.g. 'u', 'n', 'c') fire on almost any text — confirm
        whether word/context boundaries are intended.
        """
        return [p for p in HTR_ERROR_PATTERNS if p["htr"] in text]

    def _detect_grafias(self, text: str) -> List[Dict]:
        """Return modern forms present in *text* that must NOT be modernised
        (case-insensitive substring match against GRAFIA_PATTERNS)."""
        lower = text.lower()
        return [p for p in GRAFIA_PATTERNS if p["modern"].lower() in lower]

    # ── Prompt construction ──────────────────────────────────────────────────

    def _build_prompt(
        self,
        htr_text: str,
        retrieved: List[Dict],
        htr_errors: List[Dict],
        grafia_warns: List[Dict],
    ) -> str:
        """Assemble the user prompt: retrieved few-shot examples, HTR-error
        hints and no-modernisation warnings, followed by the text to correct.
        All user-facing section headers are in Spanish by design."""
        sections = []

        # Dynamic few-shot: examples retrieved from the corpus.
        if retrieved:
            examples = []
            for i, doc in enumerate(retrieved, 1):
                corr = "; ".join(doc["corrections"]) if doc["corrections"] else "—"
                examples.append(
                    f"Ejemplo {i} [{doc['type']}, {doc['region']}, {doc['date']}]"
                    f" (similitud={doc['score']}):\n"
                    f" HTR: \"{doc['htr']}\"\n"
                    f" GT: \"{doc['gt']}\"\n"
                    f" Correcciones aplicadas: {corr}"
                )
            sections.append(
                "EJEMPLOS DEL CORPUS (similares al texto a corregir):\n"
                + "\n\n".join(examples)
            )

        # Hints for HTR confusion patterns detected in the input.
        if htr_errors:
            hints = "\n".join(
                f" • '{p['htr']}' puede ser '{p['gt']}': {p['context']} (ej: {p['example']})"
                for p in htr_errors
            )
            sections.append(f"POSIBLES ERRORES HTR DETECTADOS EN ESTE TEXTO:\n{hints}")

        # Warnings for period spellings that must be preserved.
        if grafia_warns:
            warns = "\n".join(
                f" • '{p['modern']}' → mantener como '{p['ancient']}': {p['rule']}"
                for p in grafia_warns
            )
            sections.append(
                f"ALERTA — GRAFÍAS QUE NO DEBEN MODERNIZARSE:\n{warns}"
            )

        context_block = "\n\n".join(sections)

        return (
            f"{context_block}\n\n"
            f"TEXTO HTR A CORREGIR:\n\"{htr_text}\""
            if context_block
            else f"TEXTO HTR A CORREGIR:\n\"{htr_text}\""
        )

    # ── LLM call ─────────────────────────────────────────────────────────────

    def _call_llm(self, user_prompt: str) -> str:
        """Send the prompt to the chat model and return the stripped reply.

        Outer quotes are stripped defensively in case the model wraps its
        answer despite rule 6 of the system prompt.
        """
        response = self.client.chat.completions.create(
            model=MODEL,
            temperature=0.1,  # low temperature for reproducibility
            max_tokens=1024,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
        )
        return response.choices[0].message.content.strip().strip('"').strip("'")
|
vector_store.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
vector_store.py
|
| 3 |
+
───────────────
|
| 4 |
+
Indexa pares HTR/GT en ChromaDB usando embeddings de OpenAI.
|
| 5 |
+
Recupera los top-k documentos más similares a un texto de consulta.
|
| 6 |
+
|
| 7 |
+
Uso:
|
| 8 |
+
from vector_store import VectorStore
|
| 9 |
+
vs = VectorStore()
|
| 10 |
+
vs.index(pairs) # indexar corpus
|
| 11 |
+
results = vs.retrieve("texto htr...", k=5)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
import os
|
| 16 |
+
from typing import List, Dict
|
| 17 |
+
from tqdm import tqdm
|
| 18 |
+
|
| 19 |
+
import chromadb
|
| 20 |
+
from chromadb.utils import embedding_functions
|
| 21 |
+
from dotenv import load_dotenv
|
| 22 |
+
|
| 23 |
+
load_dotenv()
|
| 24 |
+
|
| 25 |
+
# ── Configuration (overridable via environment variables) ─────────────────────
CHROMA_PATH = os.getenv("CHROMA_PATH", "./chroma_db")  # on-disk ChromaDB location
OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")           # reserved for an OpenAI embedding backend
EMBED_MODEL = "text-embedding-3-small"                 # OpenAI embedding model (unused with local EF)
COLLECTION = "scriptorium_corpus"                      # ChromaDB collection name


class VectorStore:
    """ChromaDB-backed vector index of HTR/GT pairs.

    Each pair is embedded as "HTR: ... [SEP] GT: ..." so the vector captures
    both the recognition errors and the corrected forms; `retrieve` returns
    the top-k most similar pairs for a query text.
    """

    def __init__(self):
        self.client = chromadb.PersistentClient(path=CHROMA_PATH)

        # Embedding function: local multilingual model, no API key required.
        # (EMBED_MODEL / OPENAI_KEY are kept for an optional OpenAI-based
        # embedding backend.)
        self.ef = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="intfloat/multilingual-e5-small"
        )

        self.collection = self._open_collection()

    # ── Internal helpers ──────────────────────────────────────────────────────

    def _open_collection(self):
        """Open or create the corpus collection configured for cosine distance."""
        return self.client.get_or_create_collection(
            name=COLLECTION,
            embedding_function=self.ef,
            metadata={"hnsw:space": "cosine"},
        )

    # ── Indexing ──────────────────────────────────────────────────────────────

    def index(self, pairs: List[Dict], batch_size: int = 50) -> int:
        """
        Index the HTR/GT pairs.  Each stored document has:
            - document : embedded text (htr + ' [SEP] ' + gt)
            - metadata : type, region, date, original htr/gt, corrections
            - id       : unique pair identifier
        Pairs whose id is already in the store are skipped.
        Returns the number of new documents added.
        """
        existing_ids = set(self.collection.get(include=[])["ids"])
        to_add = [p for p in pairs if p["id"] not in existing_ids]

        if not to_add:
            print(f"ℹ Vector store ya actualizado ({len(existing_ids)} documentos).")
            return 0

        print(f"🔄 Indexando {len(to_add)} documentos nuevos...")

        for start in tqdm(range(0, len(to_add), batch_size), desc="Indexando"):
            batch = to_add[start : start + batch_size]

            # The embedded text combines HTR + GT to capture both the errors
            # and the correct forms.
            documents = [f"HTR: {p['htr']} [SEP] GT: {p['gt']}" for p in batch]
            metadatas = [
                {
                    "htr": p["htr"],
                    "gt": p["gt"],
                    "type": p.get("type", ""),
                    "region": p.get("region", ""),
                    "date": p.get("date", ""),
                    # Chroma metadata values must be scalar → serialise list as JSON.
                    "corrections": json.dumps(
                        p.get("corrections", []), ensure_ascii=False
                    ),
                }
                for p in batch
            ]
            self.collection.add(
                documents=documents,
                metadatas=metadatas,
                ids=[p["id"] for p in batch],
            )

        print(f"✅ Indexación completa. Total en store: {self.collection.count()}")
        return len(to_add)

    # ── Retrieval ─────────────────────────────────────────────────────────────

    def retrieve(self, query: str, k: int = 5) -> List[Dict]:
        """
        Return the k pairs most similar to the query HTR text.
        Each result dict carries htr, gt, type, region, date, corrections
        and score (cosine similarity).  Empty list if the store is empty.
        """
        if self.collection.count() == 0:
            return []

        results = self.collection.query(
            query_texts=[query],
            n_results=min(k, self.collection.count()),
            include=["metadatas", "distances"],
        )

        return [
            {
                "htr": meta["htr"],
                "gt": meta["gt"],
                "type": meta.get("type", ""),
                "region": meta.get("region", ""),
                "date": meta.get("date", ""),
                "corrections": json.loads(meta.get("corrections", "[]")),
                # Chroma returns cosine *distance*; convert to similarity.
                "score": round(1 - dist, 4),
            }
            for meta, dist in zip(results["metadatas"][0], results["distances"][0])
        ]

    # ── Utilities ─────────────────────────────────────────────────────────────

    def count(self) -> int:
        """Number of documents currently stored."""
        return self.collection.count()

    def reset(self):
        """Delete and recreate the collection (useful to re-index from scratch)."""
        self.client.delete_collection(COLLECTION)
        self.collection = self._open_collection()
        print("🗑 Vector store reseteado.")
|