| """Functional eval: chat questions over the full pipeline. |
| |
| Uploads all test_data/ samples and runs the chat-graph through every question. |
| Per question: |
| * pass: at least one ``expected_substrings`` token is in the answer (diacritic-tolerant) |
| * tool match: every ``expected_tools`` entry is in the tool messages |
| * latency_ms |
| |
| CLI: python eval/run_eval.py --llm dummy [--quick] |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import asyncio |
| import json |
| import os |
| import re |
| import statistics |
| import sys |
| import time |
| import unicodedata |
| from pathlib import Path |
|
|
# Make the repository root importable so the `graph`, `providers`, `store`
# and `tools` packages below resolve when this script is run directly.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
| from langchain_core.messages import HumanMessage, ToolMessage |
|
|
| from graph.chat_graph import build_chat_graph |
| from graph.pipeline_graph import build_pipeline_graph |
| from providers import get_chat_model, get_dummy_handle |
| from store import HybridStore |
| from tools import ChatToolContext |
|
|
|
|
# Eval assets are resolved relative to this file's directory.
EVAL_DIR = Path(__file__).resolve().parent
QUESTIONS_PATH = EVAL_DIR / "questions.json"
RESULTS_MD = EVAL_DIR / "results.md"
# Sample PDF folders live under the repository's test_data/ directory.
SAMPLE_DIRS = [
    EVAL_DIR.parent / "test_data" / subdir
    for subdir in ("invoices", "contracts", "multi_doc")
]
|
|
|
|
| def _normalize(text: str) -> str: |
| nfkd = unicodedata.normalize("NFKD", text) |
| return "".join(c for c in nfkd if not unicodedata.combining(c)).lower() |
|
|
|
|
def _setup() -> tuple:
    """Run the ingestion pipeline and return (context, filenames, final state).

    Reads every sample PDF under SAMPLE_DIRS, pushes them through the
    pipeline graph, and fills a ChatToolContext with the parsed documents.
    """
    store = HybridStore()

    # Collect (filename, bytes) pairs from every existing sample folder.
    files = [
        (pdf.name, pdf.read_bytes())
        for directory in SAMPLE_DIRS
        if directory.exists()
        for pdf in sorted(directory.glob("*.pdf"))
    ]
    if not files:
        raise RuntimeError(
            "No sample PDFs found. Run: python test_data/generate_samples.py"
        )

    # The dummy provider needs the document list up front to fake answers.
    if os.getenv("LLM_PROFILE", "dummy") == "dummy":
        get_dummy_handle().set_docs_hint([name for name, _ in files])

    pipeline = build_pipeline_graph(store)
    state = asyncio.run(pipeline.ainvoke({"files": files}))

    context = ChatToolContext(store=store)
    for parsed_doc in state.get("documents") or []:
        context.add_document(parsed_doc)

    return context, [name for name, _ in files], state
|
|
|
|
def _run_one(context: ChatToolContext, llm, question: dict) -> dict:
    """Run one eval question through the chat graph and score the result.

    Returns a result dict with the (truncated) answer, the tools that were
    called, pass/tool-match verdicts and the latency in milliseconds.  Any
    exception raised by the graph is captured into the answer text so a
    single failing question cannot abort the whole eval run.
    """
    chat_graph = build_chat_graph(llm, context)
    # perf_counter() is monotonic; time.time() is wall-clock and can jump
    # (NTP sync, manual clock changes), which would corrupt latency stats.
    start = time.perf_counter()
    try:
        state = asyncio.run(chat_graph.ainvoke({
            "messages": [HumanMessage(content=question["question"])],
        }))
        latency_ms = (time.perf_counter() - start) * 1000
        answer = state.get("final_answer", "")
        tool_calls = [
            m.name for m in state.get("messages") or []
            if isinstance(m, ToolMessage) and m.name
        ]
    except Exception as e:  # deliberately broad: record the failure, keep going
        latency_ms = (time.perf_counter() - start) * 1000
        answer = f"ERROR: {e}"
        tool_calls = []

    # Diacritic-tolerant substring check: any one expected token suffices.
    answer_norm = _normalize(answer)
    pass_subst = any(
        _normalize(s) in answer_norm
        for s in question.get("expected_substrings", [])
    )

    # Every expected tool must appear among the actually-called tools
    # (extra calls are tolerated); no expectation means automatic match.
    expected_tools = set(question.get("expected_tools", []))
    actual_tools = set(tool_calls)
    tools_match = expected_tools.issubset(actual_tools) if expected_tools else True

    return {
        "id": question["id"],
        "category": question["category"],
        "question": question["question"],
        # Truncate long answers so the results table stays readable.
        "answer": answer[:200] + ("..." if len(answer) > 200 else ""),
        "tools_called": tool_calls,
        "expected_tools": list(expected_tools),
        "tools_match": tools_match,
        "pass": pass_subst,
        "latency_ms": round(latency_ms, 1),
    }
|
|
|
|
def main() -> None:
    """CLI entry point: run the full functional eval and emit a markdown report.

    Raises:
        RuntimeError: if questions.json yields an empty question list — the
            stats section below would otherwise crash (ZeroDivisionError in
            the pass rate, statistics.median raising on empty data).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--llm", default=os.getenv("LLM_PROFILE", "dummy"),
                        choices=["claude", "ollama", "dummy"])
    parser.add_argument("--quick", action="store_true",
                        help="csak 5 kérdés (gyors smoke teszt)")
    parser.add_argument("--no-write", action="store_true")
    args = parser.parse_args()

    # Propagate the chosen provider to everything that reads LLM_PROFILE.
    os.environ["LLM_PROFILE"] = args.llm

    print(f"Eval init: llm={args.llm}...")
    context, filenames, _ = _setup()
    print(f" Setup: {len(filenames)} doksi feltöltve.")

    llm = get_chat_model(args.llm)
    questions = json.loads(QUESTIONS_PATH.read_text(encoding="utf-8"))
    if args.quick:
        # Smoke mode: keep only the first question of each category.
        seen_cat = set()
        out = []
        for q in questions:
            if q["category"] not in seen_cat:
                seen_cat.add(q["category"])
                out.append(q)
        questions = out

    if not questions:
        # Guard: an empty set would crash the aggregate stats below.
        raise RuntimeError(f"No questions loaded from {QUESTIONS_PATH}")

    print(f"\nFutás: {len(questions)} kérdés...")
    results = []
    for q in questions:
        r = _run_one(context, llm, q)
        status = "✓ PASS" if r["pass"] else "✗ FAIL"
        print(f" {status} [{r['category']:8}] {r['id']}: {r['answer'][:60]}...")
        results.append(r)

    # --- Aggregate metrics ------------------------------------------------
    passed = sum(1 for r in results if r["pass"])
    tools_match = sum(1 for r in results if r["tools_match"])
    latencies = [r["latency_ms"] for r in results]

    # Per-category pass/total tallies for the second report table.
    by_cat: dict[str, dict] = {}
    for r in results:
        c = r["category"]
        by_cat.setdefault(c, {"pass": 0, "total": 0})
        by_cat[c]["total"] += 1
        if r["pass"]:
            by_cat[c]["pass"] += 1

    # --- Markdown report --------------------------------------------------
    md = ["# Funkcionális ertekeles eredmenye", ""]
    md.append(f"- LLM provider: **{args.llm}**")
    md.append(f"- Osszesen: {len(results)} kerdes")
    md.append(f"- Pass rate: **{passed}/{len(results)} ({100*passed/len(results):.0f}%)**")
    md.append(f"- Tool-sorrend egyezes: {tools_match}/{len(results)}")
    # p95 via index into the sorted latencies; int() floors, so the index
    # is always < len(latencies) for non-empty input.
    md.append(f"- Latency p50: {statistics.median(latencies):.0f} ms, p95: "
              f"{sorted(latencies)[int(len(latencies)*0.95)]:.0f} ms, "
              f"max: {max(latencies):.0f} ms")
    md.append("")
    md.append("## Per-kerdes eredmenyek")
    md.append("")
    md.append("| ID | Kat. | Pass | Tools | Latency (ms) |")
    md.append("|---|---|---|---|---|")
    for r in results:
        tool_match_str = "[+]" if r["tools_match"] else "[-]"
        pass_str = "OK" if r["pass"] else "FAIL"
        tools_str = ", ".join(r["tools_called"]) or "(none)"
        md.append(f"| {r['id']} | {r['category']} | {pass_str} | {tools_str} {tool_match_str} | {r['latency_ms']:.0f} |")
    md.append("")
    md.append("## Per-kategoria")
    md.append("")
    md.append("| Kategoria | Pass | Total |")
    md.append("|---|---|---|")
    for cat, d in by_cat.items():
        md.append(f"| {cat} | {d['pass']} | {d['total']} |")

    md_text = "\n".join(md) + "\n"
    print()
    print(md_text)

    if not args.no_write:
        RESULTS_MD.write_text(md_text, encoding="utf-8")
        print(f"\nMentve: {RESULTS_MD}")
|
|
|
|
# Script entry point: run the eval only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|