| """Programmatic Mellea probe — hit the agent stream N times and dump |
| per-requirement pass/fail to a CSV so we can see which invariant keeps |
| failing and decide how to fix. |
| |
| Requires the local server running on http://127.0.0.1:7860. |
| |
| Usage: |
| uv run python scripts/probe_mellea.py --query Hollis --runs 5 |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import csv |
| import json |
| import time |
| from pathlib import Path |
| from urllib.parse import quote |
|
|
| import httpx |
|
|
|
|
| def stream_one(query: str, base: str, timeout_s: float) -> dict: |
| url = f"{base}/api/agent/stream?q={quote(query)}" |
| t0 = time.time() |
| final = None |
| intent = None |
| attempts = [] |
| with httpx.stream("GET", url, timeout=timeout_s) as r: |
| r.raise_for_status() |
| ev = None |
| buf = [] |
| for line in r.iter_lines(): |
| if line.startswith("event:"): |
| ev = line.split(":", 1)[1].strip() |
| elif line.startswith("data:"): |
| buf.append(line[5:].lstrip()) |
| elif line == "": |
| if ev and buf: |
| data = "\n".join(buf) |
| buf = [] |
| if ev == "plan": |
| try: |
| intent = json.loads(data).get("intent") |
| except json.JSONDecodeError: |
| pass |
| elif ev == "mellea_attempt": |
| try: |
| attempts.append(json.loads(data)) |
| except json.JSONDecodeError: |
| pass |
| elif ev == "final": |
| try: |
| final = json.loads(data) |
| except json.JSONDecodeError: |
| final = {"_raw": data} |
| ev = None |
| dt = round(time.time() - t0, 2) |
| return {"final": final or {}, "elapsed_s": dt, "intent": intent, |
| "attempts": attempts} |
|
|
|
|
| def main(): |
| ap = argparse.ArgumentParser() |
| ap.add_argument("--query", required=True) |
| ap.add_argument("--runs", type=int, default=5) |
| ap.add_argument("--base", default="http://127.0.0.1:7860") |
| ap.add_argument("--timeout", type=float, default=600.0) |
| ap.add_argument("--out", default="outputs/mellea_probe.csv") |
| args = ap.parse_args() |
|
|
| out = Path(args.out) |
| out.parent.mkdir(parents=True, exist_ok=True) |
|
|
| rows = [] |
| for i in range(args.runs): |
| try: |
| r = stream_one(args.query, args.base, args.timeout) |
| except Exception as e: |
| print(f"[{i+1}/{args.runs}] ERROR: {e!r}") |
| continue |
| f = r["final"] |
| m = f.get("mellea") or {} |
| passed = m.get("requirements_passed", []) |
| failed = m.get("requirements_failed", []) |
| para = f.get("paragraph", "") |
| row = { |
| "run": i + 1, |
| "intent": r.get("intent"), |
| "elapsed_s": r["elapsed_s"], |
| "rerolls": m.get("rerolls"), |
| "n_attempts": m.get("n_attempts"), |
| "passed_count": len(passed), |
| "failed_count": len(failed), |
| "failed": ",".join(failed), |
| "passed": ",".join(passed), |
| "para_chars": len(para), |
| "paragraph": para.replace("\n", " "), |
| } |
| |
| for a in r.get("attempts", []): |
| row[f"attempt{a.get('attempt')}_failed"] = ",".join(a.get("failed", [])) |
| rows.append(row) |
| atts = r.get("attempts", []) |
| att_summary = " | ".join( |
| f"#{a.get('attempt')}={'✓' if not a.get('failed') else 'fail:'+','.join(a.get('failed', []))}" |
| for a in atts |
| ) or "no attempts" |
| print(f"[{i+1}/{args.runs}] {r['elapsed_s']:6.1f}s final={len(passed)}/4 attempts: {att_summary}") |
|
|
| if rows: |
| with out.open("w", newline="") as f: |
| w = csv.DictWriter(f, fieldnames=list(rows[0].keys())) |
| w.writeheader() |
| w.writerows(rows) |
| print(f"\nWrote {out}") |
| print("Pass-rate distribution: " + |
| json.dumps({n: sum(1 for r in rows if r['passed_count'] == n) |
| for n in range(5)})) |
| |
| for r in rows: |
| if r['failed_count']: |
| print(f"\n--- run {r['run']} failed [{r['failed']}] ---") |
| print(r['paragraph'][:600]) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|