""" evaluator.py ──────────── Métricas de calidad para las correcciones del sistema RAG. - CER (Character Error Rate) : nivel de carácter - WER (Word Error Rate) : nivel de palabra - Modernism score : penalización por formas modernas introducidas - Batch evaluation : evalúa el sistema sobre un conjunto de pares con GT Uso: from evaluator import Evaluator ev = Evaluator() metrics = ev.evaluate_pair(htr="...", corrected="...", gt="...") report = ev.batch_evaluate(corrector, pairs[:50]) """ import re from typing import List, Dict, Tuple from knowledge_base import GRAFIA_PATTERNS class Evaluator: # ── Métricas de edición ────────────────────────────────────────────────── @staticmethod def cer(reference: str, hypothesis: str) -> float: """Character Error Rate (Levenshtein a nivel carácter).""" r, h = list(reference), list(hypothesis) return Evaluator._levenshtein(r, h) / max(len(r), 1) @staticmethod def wer(reference: str, hypothesis: str) -> float: """Word Error Rate (Levenshtein a nivel palabra).""" r = reference.split() h = hypothesis.split() return Evaluator._levenshtein(r, h) / max(len(r), 1) @staticmethod def _levenshtein(seq1: list, seq2: list) -> int: m, n = len(seq1), len(seq2) dp = list(range(n + 1)) for i in range(1, m + 1): prev = dp[:] dp[0] = i for j in range(1, n + 1): if seq1[i - 1] == seq2[j - 1]: dp[j] = prev[j - 1] else: dp[j] = 1 + min(prev[j], dp[j - 1], prev[j - 1]) return dp[n] # ── Detector de modernismos ────────────────────────────────────────────── @staticmethod def modernism_penalty(original_htr: str, corrected: str) -> Dict: """ Detecta formas modernas introducidas por el LLM que no estaban en el HTR original. Retorna lista de problemas detectados. """ issues = [] orig_lower = original_htr.lower() corr_lower = corrected.lower() for p in GRAFIA_PATTERNS: modern = p["modern"].lower() ancient_forms = [f.strip().lower() for f in p["ancient"].split("/")] # Si el corrected contiene la forma moderna Y el original no la tenía if modern in corr_lower and modern not in orig_lower: # Verificar que tampoco era una forma antigua válida if not any(af in orig_lower for af in ancient_forms): issues.append({ "modern": p["modern"], "ancient": p["ancient"], "rule": p["rule"], }) return { "count": len(issues), "issues": issues, "score": max(0.0, 1.0 - len(issues) * 0.1), # 0.0–1.0 } # ── Evaluación de un par ───────────────────────────────────────────────── def evaluate_pair( self, htr: str, corrected: str, gt: str ) -> Dict: """ Evalúa una sola corrección comparando con el groundtruth. """ cer_htr = self.cer(gt, htr) # CER antes de corregir cer_corr = self.cer(gt, corrected) # CER después de corregir wer_htr = self.wer(gt, htr) wer_corr = self.wer(gt, corrected) modernism = self.modernism_penalty(htr, corrected) return { "cer_before": round(cer_htr, 4), "cer_after": round(cer_corr, 4), "cer_improvement": round(cer_htr - cer_corr, 4), "wer_before": round(wer_htr, 4), "wer_after": round(wer_corr, 4), "wer_improvement": round(wer_htr - wer_corr, 4), "modernism": modernism, } # ── Evaluación en batch ────────────────────────────────────────────────── def batch_evaluate( self, corrector, pairs: List[Dict], verbose: bool = True ) -> Dict: """ Evalúa el sistema sobre una lista de pares con groundtruth. Retorna métricas agregadas + detalle por par. """ results = [] for i, pair in enumerate(pairs): if verbose: print(f" Evaluando {i+1}/{len(pairs)}: {pair['id']}") try: out = corrector.correct(pair["htr"]) metrics = self.evaluate_pair( htr=pair["htr"], corrected=out["corrected"], gt=pair["gt"], ) metrics["id"] = pair["id"] metrics["htr"] = pair["htr"] metrics["corrected"] = out["corrected"] metrics["gt"] = pair["gt"] results.append(metrics) except Exception as e: print(f" ❌ Error en {pair['id']}: {e}") if not results: return {"error": "Sin resultados"} avg = lambda key: round(sum(r[key] for r in results) / len(results), 4) summary = { "n_evaluated": len(results), "avg_cer_before": avg("cer_before"), "avg_cer_after": avg("cer_after"), "avg_cer_improvement": avg("cer_improvement"), "avg_wer_before": avg("wer_before"), "avg_wer_after": avg("wer_after"), "avg_wer_improvement": avg("wer_improvement"), "avg_modernism_score": avg("modernism"), # via nested "detail": results, } if verbose: print(f"\n📊 RESUMEN EVALUACIÓN ({len(results)} pares)") print(f" CER: {summary['avg_cer_before']:.2%} → {summary['avg_cer_after']:.2%} " f"(mejora: {summary['avg_cer_improvement']:+.2%})") print(f" WER: {summary['avg_wer_before']:.2%} → {summary['avg_wer_after']:.2%} " f"(mejora: {summary['avg_wer_improvement']:+.2%})") return summary