"""
Pipeline evaluation script.
===========================
Runs the pipeline over a dataset of (question, reference_answer) pairs and
computes metrics (L3 coherence, BLEU, ROUGE-L, and optionally semantic similarity).
"""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
# Ensure the project root is on sys.path so local modules resolve
PROJECT_ROOT = Path(__file__).resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))


def load_eval_dataset(path: str | Path) -> list:
    """Load the eval dataset: a list of dicts with prompt, reference_answer, etc."""
    path = Path(path)
    if not path.exists():
        return []
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data if isinstance(data, list) else []
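

# A minimal example of the expected eval dataset JSON, inferred from the fields
# read in run_eval (id, prompt, reference_answer); the values are illustrative:
#
# [
#   {"id": "q-001",
#    "prompt": "What is the capital of France?",
#    "reference_answer": "Paris is the capital of France."}
# ]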


def run_eval(
    dataset_path: str | Path | None = None,
    config_path: str | Path | None = None,
    verbose: bool = False,
) -> dict:
    """
    Run the pipeline on each dataset item and aggregate metrics.

    Returns a dict with per-example scores and average scores.
    """
    from pipeline import HybridLLMPipeline
    from knowledge_base import get_knowledge_base
    from config_loader import load_config, PROJECT_ROOT
    from metrics import evaluate_response

    config = load_config(config_path)
    dataset_path = dataset_path or PROJECT_ROOT / "data" / "eval" / "sample.json"
    dataset = load_eval_dataset(dataset_path)
    if not dataset:
        return {"error": "Dataset empty or not found", "path": str(dataset_path)}

    # Pipeline with a config-loaded KB (RAG disabled during eval for reproducibility)
    kb = get_knowledge_base(config=config, config_path=config_path, query_for_rag=None)
    pipeline = HybridLLMPipeline(knowledge_base=kb, config=config, verbose=verbose)

    results = []
    all_bleu = []
    all_rouge = []
    all_coherence = []
    for item in dataset:
        prompt = item.get("prompt", "")
        reference = item.get("reference_answer", "")
        if not prompt:
            continue
        try:
            result = pipeline.process(prompt)
        except Exception as e:
            results.append({"id": item.get("id"), "error": str(e)})
            continue
        metrics = evaluate_response(result, reference_answer=reference if reference else None)
        results.append({
            "id": item.get("id"),
            "prompt": prompt[:80],
            "truth_value": result.truth_value,
            "coherence": metrics.get("coherence", {}),
            "bleu": metrics.get("bleu"),
            "rouge_l": metrics.get("rouge_l"),
            "semantic_similarity": metrics.get("semantic_similarity"),
        })
        if "coherence" in metrics and "coherence_score" in metrics["coherence"]:
            all_coherence.append(metrics["coherence"]["coherence_score"])
        if metrics.get("bleu") is not None:
            all_bleu.append(metrics["bleu"])
        if metrics.get("rouge_l") is not None:
            all_rouge.append(metrics["rouge_l"])

    out = {
        "n_examples": len(dataset),
        "n_processed": len(results),
        "results": results,
        "averages": {
            "coherence": sum(all_coherence) / len(all_coherence) if all_coherence else 0,
            "bleu": sum(all_bleu) / len(all_bleu) if all_bleu else 0,
            "rouge_l": sum(all_rouge) / len(all_rouge) if all_rouge else 0,
        },
    }
    return out
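

# Programmatic use, a minimal sketch (the dataset path matches the default above):
#
#   summary = run_eval(dataset_path="data/eval/sample.json", verbose=True)
#   print(summary["averages"]["coherence"])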


def main():
    import argparse

    parser = argparse.ArgumentParser(description="Evaluation of the L1–L4 pipeline")
    parser.add_argument("--dataset", default=None, help="Path to the eval JSON file")
    parser.add_argument("--config", default=None, help="Path to config.yaml")
    parser.add_argument("--verbose", action="store_true", help="Enable pipeline logging")
    parser.add_argument("--output", default=None, help="Save the result to a JSON file")
    args = parser.parse_args()

    out = run_eval(
        dataset_path=args.dataset,
        config_path=args.config,
        verbose=args.verbose,
    )
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(out, f, ensure_ascii=False, indent=2)
        print(f"Result saved to {args.output}")
    else:
        print(json.dumps(out, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()