Spaces:
Sleeping
Sleeping
"""
evaluator.py
────────────
Métricas de calidad para las correcciones del sistema RAG.

- CER (Character Error Rate) : nivel de carácter
- WER (Word Error Rate)      : nivel de palabra
- Modernism score            : penalización por formas modernas introducidas
- Batch evaluation           : evalúa el sistema sobre un conjunto de pares con GT

Uso:
    from evaluator import Evaluator
    ev = Evaluator()
    metrics = ev.evaluate_pair(htr="...", corrected="...", gt="...")
    report = ev.batch_evaluate(corrector, pairs[:50])
"""
| import re | |
| from typing import List, Dict, Tuple | |
| from knowledge_base import GRAFIA_PATTERNS | |
class Evaluator:
    """Quality metrics for the RAG correction system.

    Provides:
      - ``cer`` / ``wer``        : character- and word-level error rates
      - ``modernism_penalty``    : penalty for modern spellings introduced by the LLM
      - ``evaluate_pair``        : metrics for one HTR / corrected / ground-truth triple
      - ``batch_evaluate``       : aggregate metrics over a list of pairs with ground truth
    """

    # ── Edit-distance metrics ────────────────────────────────────────────────
    # BUGFIX: these helpers take no instance state but were declared as plain
    # methods, so instance calls like `self.cer(gt, htr)` bound `self` to
    # `reference` and raised TypeError. Declared as @staticmethod they work
    # through both `self.cer(...)` and `Evaluator.cer(...)`.
    @staticmethod
    def cer(reference: str, hypothesis: str) -> float:
        """Character Error Rate: char-level Levenshtein distance / len(reference).

        An empty reference yields 0.0 thanks to the max(len, 1) guard.
        """
        r, h = list(reference), list(hypothesis)
        return Evaluator._levenshtein(r, h) / max(len(r), 1)

    @staticmethod
    def wer(reference: str, hypothesis: str) -> float:
        """Word Error Rate: word-level Levenshtein distance / reference word count."""
        r = reference.split()
        h = hypothesis.split()
        return Evaluator._levenshtein(r, h) / max(len(r), 1)

    @staticmethod
    def _levenshtein(seq1: list, seq2: list) -> int:
        """Edit distance between two sequences (rolling DP row, O(len(seq2)) memory)."""
        m, n = len(seq1), len(seq2)
        dp = list(range(n + 1))
        for i in range(1, m + 1):
            prev = dp[:]
            dp[0] = i
            for j in range(1, n + 1):
                if seq1[i - 1] == seq2[j - 1]:
                    dp[j] = prev[j - 1]  # match: carry cost over unchanged
                else:
                    # 1 + min(deletion, insertion, substitution)
                    dp[j] = 1 + min(prev[j], dp[j - 1], prev[j - 1])
        return dp[n]

    # ── Modernism detector ───────────────────────────────────────────────────
    @staticmethod
    def modernism_penalty(original_htr: str, corrected: str) -> Dict:
        """Detect modern spellings introduced by the LLM that were absent
        from the original HTR.

        Returns a dict with the issue count, the list of detected issues and
        a score in [0.0, 1.0] where each issue subtracts 0.1.
        """
        issues = []
        orig_lower = original_htr.lower()
        corr_lower = corrected.lower()
        for p in GRAFIA_PATTERNS:
            modern = p["modern"].lower()
            ancient_forms = [f.strip().lower() for f in p["ancient"].split("/")]
            # Flag only if the corrected text contains the modern form
            # while the original did not…
            if modern in corr_lower and modern not in orig_lower:
                # …and the original did not carry a valid ancient variant either.
                if not any(af in orig_lower for af in ancient_forms):
                    issues.append({
                        "modern": p["modern"],
                        "ancient": p["ancient"],
                        "rule": p["rule"],
                    })
        return {
            "count": len(issues),
            "issues": issues,
            "score": max(0.0, 1.0 - len(issues) * 0.1),  # clamped to 0.0–1.0
        }

    # ── Single-pair evaluation ───────────────────────────────────────────────
    def evaluate_pair(
        self, htr: str, corrected: str, gt: str
    ) -> Dict:
        """Evaluate one correction against the ground truth `gt`.

        Returns CER/WER before and after correction, their improvements
        (positive = correction helped), and the modernism report.
        """
        cer_htr = self.cer(gt, htr)         # CER before correcting
        cer_corr = self.cer(gt, corrected)  # CER after correcting
        wer_htr = self.wer(gt, htr)
        wer_corr = self.wer(gt, corrected)
        modernism = self.modernism_penalty(htr, corrected)
        return {
            "cer_before": round(cer_htr, 4),
            "cer_after": round(cer_corr, 4),
            "cer_improvement": round(cer_htr - cer_corr, 4),
            "wer_before": round(wer_htr, 4),
            "wer_after": round(wer_corr, 4),
            "wer_improvement": round(wer_htr - wer_corr, 4),
            "modernism": modernism,
        }

    # ── Batch evaluation ─────────────────────────────────────────────────────
    def batch_evaluate(
        self, corrector, pairs: List[Dict], verbose: bool = True
    ) -> Dict:
        """Evaluate the system over a list of ground-truth pairs.

        Each pair must provide "id", "htr" and "gt" keys; `corrector.correct`
        must return a dict with a "corrected" key. Pairs that raise are
        reported and skipped. Returns aggregate metrics plus per-pair detail.
        """
        results = []
        for i, pair in enumerate(pairs):
            if verbose:
                print(f" Evaluando {i+1}/{len(pairs)}: {pair['id']}")
            try:
                out = corrector.correct(pair["htr"])
                metrics = self.evaluate_pair(
                    htr=pair["htr"],
                    corrected=out["corrected"],
                    gt=pair["gt"],
                )
                metrics["id"] = pair["id"]
                metrics["htr"] = pair["htr"]
                metrics["corrected"] = out["corrected"]
                metrics["gt"] = pair["gt"]
                results.append(metrics)
            except Exception as e:
                # Best-effort batch: report the failing pair and continue.
                print(f" β Error en {pair['id']}: {e}")
        if not results:
            return {"error": "Sin resultados"}

        def avg(key: str) -> float:
            """Mean of a top-level numeric metric across all results."""
            return round(sum(r[key] for r in results) / len(results), 4)

        summary = {
            "n_evaluated": len(results),
            "avg_cer_before": avg("cer_before"),
            "avg_cer_after": avg("cer_after"),
            "avg_cer_improvement": avg("cer_improvement"),
            "avg_wer_before": avg("wer_before"),
            "avg_wer_after": avg("wer_after"),
            "avg_wer_improvement": avg("wer_improvement"),
            # BUGFIX: "modernism" is a nested dict — summing it raised
            # TypeError. Average the nested numeric "score" instead.
            "avg_modernism_score": round(
                sum(r["modernism"]["score"] for r in results) / len(results), 4
            ),
            "detail": results,
        }
        if verbose:
            print(f"\nπ RESUMEN EVALUACIΓN ({len(results)} pares)")
            print(f" CER: {summary['avg_cer_before']:.2%} β {summary['avg_cer_after']:.2%} "
                  f"(mejora: {summary['avg_cer_improvement']:+.2%})")
            print(f" WER: {summary['avg_wer_before']:.2%} β {summary['avg_wer_after']:.2%} "
                  f"(mejora: {summary['avg_wer_improvement']:+.2%})")
        return summary