# NSF-RAG-Codex / evaluator.py — NSF RAG v1.0 (commit 9abe007, author alezsd)
"""
evaluator.py
────────────
MΓ©tricas de calidad para las correcciones del sistema RAG.
- CER (Character Error Rate) : nivel de carΓ‘cter
- WER (Word Error Rate) : nivel de palabra
- Modernism score : penalizaciΓ³n por formas modernas introducidas
- Batch evaluation : evalΓΊa el sistema sobre un conjunto de pares con GT
Uso:
from evaluator import Evaluator
ev = Evaluator()
metrics = ev.evaluate_pair(htr="...", corrected="...", gt="...")
report = ev.batch_evaluate(corrector, pairs[:50])
"""
import re
from typing import List, Dict, Tuple
from knowledge_base import GRAFIA_PATTERNS
class Evaluator:
    """Quality metrics for RAG-system corrections of HTR transcriptions.

    Provides character/word error rates (CER/WER), a "modernism" penalty
    that flags modern spellings the LLM introduced, and batch evaluation
    over a set of pairs with ground truth (GT).
    """

    # ── Edit-distance metrics ────────────────────────────────────────────
    @staticmethod
    def cer(reference: str, hypothesis: str) -> float:
        """Character Error Rate: character-level Levenshtein distance
        divided by the reference length."""
        r, h = list(reference), list(hypothesis)
        # max(..., 1) guards against an empty reference (division by zero).
        return Evaluator._levenshtein(r, h) / max(len(r), 1)

    @staticmethod
    def wer(reference: str, hypothesis: str) -> float:
        """Word Error Rate: word-level Levenshtein distance divided by
        the reference word count (whitespace tokenization)."""
        r = reference.split()
        h = hypothesis.split()
        return Evaluator._levenshtein(r, h) / max(len(r), 1)

    @staticmethod
    def _levenshtein(seq1: list, seq2: list) -> int:
        """Edit distance between two sequences.

        Rolling single-row DP (Wagner–Fischer), O(len(seq2)) memory.
        """
        m, n = len(seq1), len(seq2)
        dp = list(range(n + 1))
        for i in range(1, m + 1):
            prev = dp[:]  # row i-1
            dp[0] = i     # cost of deleting the first i items of seq1
            for j in range(1, n + 1):
                if seq1[i - 1] == seq2[j - 1]:
                    dp[j] = prev[j - 1]
                else:
                    # insertion, deletion, substitution
                    dp[j] = 1 + min(prev[j], dp[j - 1], prev[j - 1])
        return dp[n]

    # ── Modernism detector ───────────────────────────────────────────────
    @staticmethod
    def modernism_penalty(original_htr: str, corrected: str) -> Dict:
        """Detect modern spellings introduced by the LLM that were not in
        the original HTR text.

        Returns a dict with the issue count, the list of detected issues
        and a score clamped to [0.0, 1.0] (each issue subtracts 0.1).
        """
        issues = []
        orig_lower = original_htr.lower()
        corr_lower = corrected.lower()
        for p in GRAFIA_PATTERNS:
            modern = p["modern"].lower()
            ancient_forms = [f.strip().lower() for f in p["ancient"].split("/")]
            # Flag only when the corrected text contains the modern form
            # AND the original did not ...
            if modern in corr_lower and modern not in orig_lower:
                # ... and the original did not carry a valid ancient form either.
                if not any(af in orig_lower for af in ancient_forms):
                    issues.append({
                        "modern": p["modern"],
                        "ancient": p["ancient"],
                        "rule": p["rule"],
                    })
        return {
            "count": len(issues),
            "issues": issues,
            "score": max(0.0, 1.0 - len(issues) * 0.1),  # clamped to 0.0–1.0
        }

    # ── Single-pair evaluation ───────────────────────────────────────────
    def evaluate_pair(
        self, htr: str, corrected: str, gt: str
    ) -> Dict:
        """Evaluate one correction against the ground truth.

        Returns rounded before/after CER and WER, their improvements
        (positive = the correction helped), and the modernism report.
        """
        cer_htr = self.cer(gt, htr)         # CER before correcting
        cer_corr = self.cer(gt, corrected)  # CER after correcting
        wer_htr = self.wer(gt, htr)
        wer_corr = self.wer(gt, corrected)
        modernism = self.modernism_penalty(htr, corrected)
        return {
            "cer_before": round(cer_htr, 4),
            "cer_after": round(cer_corr, 4),
            "cer_improvement": round(cer_htr - cer_corr, 4),
            "wer_before": round(wer_htr, 4),
            "wer_after": round(wer_corr, 4),
            "wer_improvement": round(wer_htr - wer_corr, 4),
            "modernism": modernism,
        }

    # ── Batch evaluation ─────────────────────────────────────────────────
    def batch_evaluate(
        self, corrector, pairs: List[Dict], verbose: bool = True
    ) -> Dict:
        """Evaluate the system over a list of ground-truth pairs.

        Parameters
        ----------
        corrector : object exposing ``correct(htr) -> {"corrected": str, ...}``.
        pairs     : dicts with keys ``"id"``, ``"htr"`` and ``"gt"``.
        verbose   : print per-pair progress and a final summary.

        Returns aggregated metrics plus per-pair detail, or
        ``{"error": "Sin resultados"}`` when no pair could be evaluated.
        """
        results = []
        for i, pair in enumerate(pairs):
            if verbose:
                print(f" Evaluando {i+1}/{len(pairs)}: {pair['id']}")
            try:
                out = corrector.correct(pair["htr"])
                metrics = self.evaluate_pair(
                    htr=pair["htr"],
                    corrected=out["corrected"],
                    gt=pair["gt"],
                )
                metrics["id"] = pair["id"]
                metrics["htr"] = pair["htr"]
                metrics["corrected"] = out["corrected"]
                metrics["gt"] = pair["gt"]
                results.append(metrics)
            except Exception as e:
                # Best-effort batch: report the failure and keep going.
                print(f" ❌ Error en {pair['id']}: {e}")
        if not results:
            return {"error": "Sin resultados"}

        def avg(key: str) -> float:
            """Mean of a top-level numeric metric across all results."""
            return round(sum(r[key] for r in results) / len(results), 4)

        summary = {
            "n_evaluated": len(results),
            "avg_cer_before": avg("cer_before"),
            "avg_cer_after": avg("cer_after"),
            "avg_cer_improvement": avg("cer_improvement"),
            "avg_wer_before": avg("wer_before"),
            "avg_wer_after": avg("wer_after"),
            "avg_wer_improvement": avg("wer_improvement"),
            # BUG FIX: "modernism" holds a nested dict; average its "score"
            # field — the old avg("modernism") summed dicts and raised TypeError.
            "avg_modernism_score": round(
                sum(r["modernism"]["score"] for r in results) / len(results), 4
            ),
            "detail": results,
        }
        if verbose:
            # Mojibake repaired in the summary strings ("📊", "EVALUACIÓN", "→").
            print(f"\n📊 RESUMEN EVALUACIÓN ({len(results)} pares)")
            print(f" CER: {summary['avg_cer_before']:.2%} → {summary['avg_cer_after']:.2%} "
                  f"(mejora: {summary['avg_cer_improvement']:+.2%})")
            print(f" WER: {summary['avg_wer_before']:.2%} → {summary['avg_wer_after']:.2%} "
                  f"(mejora: {summary['avg_wer_improvement']:+.2%})")
        return summary