""" real_model_eval.py — Genuine before/after component evaluation using the actual models. BEFORE: base Qwen/Qwen2.5-0.5B-Instruct (no fine-tuning) AFTER: AniketAsla/debatefloor-grpo-qwen2.5-0.5b-instruct (GRPO fine-tuned) Rewards come from the live environment via POST /reset + /step (MR-2 compliant). This replaces the scripted agent eval with real model outputs. """ import json import os import re import sys import time from datetime import datetime, timezone from pathlib import Path from statistics import mean import requests import torch sys.path.insert(0, ".") from server.calibration_grader import CALIBRATION_MATRIX ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7861") BASE_MODEL = "Qwen/Qwen2.5-0.5B-Instruct" TRAINED_MODEL = "AniketAsla/debatefloor-grpo-qwen2.5-0.5b-instruct" EVAL_TASKS = ["clean_claim", "contradictory_claim", "distribution_shift_claim"] SEEDS = [7, 42] # 2 seeds per task = 6 episodes each pass (fast but real) SYSTEM = ( "You are an expert insurance fraud investigator.\n" "Analyze the claim and respond EXACTLY in this format:\n" "DECISION: \n" "CONFIDENCE: \n" "REASON: \n\n" "HIGH = certain. MED = likely but some doubt. LOW = ambiguous, expert needed.\n" "WARNING: HIGH confidence on a wrong answer is the worst possible outcome." 
) DECISION_RE = re.compile(r"DECISION:\s*(approve_claim|deny_claim|escalate_to_human)", re.I) CONFIDENCE_RE = re.compile(r"CONFIDENCE:\s*(HIGH|MED|LOW)", re.I) REASON_RE = re.compile(r"REASON:\s*(.*)", re.I | re.S) def _parse(text): dm = DECISION_RE.search(text or "") cm = CONFIDENCE_RE.search(text or "") rm = REASON_RE.search(text or "") return ( dm.group(1).lower() if dm else None, cm.group(1).upper() if cm else None, (rm.group(1).strip()[:200] if rm else ""), ) def load_model(model_id, label): print(f"\nLoading {label}: {model_id} ...") t0 = time.time() try: from unsloth import FastLanguageModel model, tok = FastLanguageModel.from_pretrained( model_name=model_id, max_seq_length=512, dtype=None, load_in_4bit=True, ) FastLanguageModel.for_inference(model) print(f" Loaded via Unsloth (4-bit) in {time.time()-t0:.1f}s") except Exception as e: print(f" Unsloth not available ({e}), using standard transformers ...") from transformers import AutoModelForCausalLM, AutoTokenizer tok = AutoTokenizer.from_pretrained(model_id) if tok.pad_token is None: tok.pad_token = tok.eos_token model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float32, # CPU-safe, no accelerate needed ) model.eval() print(f" Loaded via transformers (fp32 CPU) in {time.time()-t0:.1f}s") return model, tok def generate(model, tok, prompt, max_new_tokens=100): device = next(model.parameters()).device inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=512) inputs = {k: v.to(device) for k, v in inputs.items()} with torch.inference_mode(): out = model.generate( **inputs, max_new_tokens=max_new_tokens, do_sample=False, temperature=1.0, pad_token_id=tok.eos_token_id, ) plen = inputs["input_ids"].shape[-1] return tok.decode(out[0][plen:], skip_special_tokens=True) def build_prompt(tok, obs_text, task_id): msgs = [ {"role": "system", "content": SYSTEM}, {"role": "user", "content": obs_text}, ] return tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True) 
def run_episode_real(model, tok, task_id, seed):
    """
    Full episode:
      1. POST /reset -> get observation text
      2. Model generates completion from the observation
      3. Parse DECISION/CONFIDENCE/REASON
      4. POST /step -> get real reward_breakdown

    Returns a flat dict of the episode's decision, timing, and reward
    components (scores default to 0.0 when missing from the breakdown).
    Raises requests.HTTPError on a non-2xx env response.
    """
    reset_r = requests.post(
        f"{ENV_BASE_URL}/reset",
        json={"task_id": task_id, "seed": seed},
        timeout=15,
    )
    reset_r.raise_for_status()
    reset_data = reset_r.json()
    session_id = reset_data["session_id"]

    # Build text description from observation for the model
    obs = reset_data.get("observation", {})
    docs = obs.get("documents", [])
    doc_text = "\n".join(
        f" [{d.get('doc_type','doc')}] {d.get('content','')}" for d in docs
    )
    incident = obs.get("incident", {})
    obs_text = (
        f"Task: {task_id} | Claim ID: {obs.get('claim_id','')}\n"
        f"Claimant: {obs.get('claimant',{}).get('name','')}\n"
        f"Incident: {incident.get('type','')} — {incident.get('description','')[:150]}\n"
        f"Documents:\n{doc_text}\n"
        f"Linked claims: {len(obs.get('linked_claims', []))}"
    )

    prompt = build_prompt(tok, obs_text, task_id)
    t0 = time.time()
    completion = generate(model, tok, prompt)
    gen_time = time.time() - t0

    decision, confidence, reason = _parse(completion)
    if decision is None or confidence is None:
        # Format failure — submit a default escalation
        decision, confidence, reason = "escalate_to_human", "LOW", "Could not parse decision from model output."

    action = {
        "action_type": decision,
        "confidence": confidence,
        "parameters": {"reason": reason},
        "reasoning": reason,
    }
    step_r = requests.post(
        f"{ENV_BASE_URL}/step",
        json={"action": action, "session_id": session_id},
        timeout=15,
    )
    step_r.raise_for_status()
    step_data = step_r.json()
    breakdown = step_data.get("observation", {}).get("reward_breakdown", {})
    return {
        "task_id": task_id,
        "seed": seed,
        "decision": decision,
        "confidence": confidence,
        "reason": reason[:100],
        "completion": completion[:200],
        "gen_time_s": round(gen_time, 1),
        "reward": round(float(step_data.get("reward", 0.0)), 4),
        "fraud_detection_score": round(float(breakdown.get("fraud_detection_score", 0.0)), 4),
        "decision_accuracy": round(float(breakdown.get("decision_accuracy", 0.0)), 4),
        "evidence_quality_score": round(float(breakdown.get("evidence_quality_score", 0.0)), 4),
        "calibration_score": round(float(breakdown.get("calibration_score", 0.0)), 4),
    }


def eval_model(model, tok, label):
    """Run every (task, seed) episode for one model.

    Returns:
        (rows, component_means) — per-episode dicts and the mean of each
        reward component across all episodes (failed episodes count as 0.0).
    """
    print(f"\n{'='*60}")
    print(f"EVAL: {label}")
    print(f"{'='*60}")
    rows = []
    for task_id in EVAL_TASKS:
        for seed in SEEDS:
            try:
                row = run_episode_real(model, tok, task_id, seed)
                rows.append(row)
                print(
                    f" {task_id:30s} seed={seed:2d} "
                    f"decision={row['decision']:20s} conf={row['confidence']} "
                    f"reward={row['reward']:.3f} "
                    f"da={row['decision_accuracy']:.2f} "
                    f"fd={row['fraud_detection_score']:.2f} "
                    f"cal={row['calibration_score']:.2f} "
                    f"[{row['gen_time_s']}s]"
                )
            except Exception as exc:
                # Boundary handler: keep the eval running; a failed episode
                # scores zero on every component so the means stay comparable.
                print(f" ERROR {task_id} seed={seed}: {exc}")
                rows.append({
                    "task_id": task_id,
                    "seed": seed,
                    "reward": 0.0,
                    "fraud_detection_score": 0.0,
                    "decision_accuracy": 0.0,
                    "evidence_quality_score": 0.0,
                    "calibration_score": 0.0,
                    "decision": "error",
                })
    component_means = {
        "Fraud detection": round(mean(r["fraud_detection_score"] for r in rows), 4),
        "Decision accuracy": round(mean(r["decision_accuracy"] for r in rows), 4),
        "Evidence quality": round(mean(r["evidence_quality_score"] for r in rows), 4),
        "Calibration": round(mean(r["calibration_score"] for r in rows), 4),
        "Mean reward": round(mean(r["reward"] for r in rows), 4),
    }
    print(f"\n Component means: {json.dumps(component_means)}")
    return rows, component_means


def save_and_plot(before_means, after_means, before_rows, after_rows, summary_path, log_history):
    """Patch training_summary.json with the before/after results and
    regenerate docs/component_shift.svg and docs/reward_curve.svg.

    SVG generation failures are reported but never fatal (best-effort).
    """
    delta = {k: round(after_means.get(k, 0.0) - before_means.get(k, 0.0), 4) for k in before_means}

    # Patch training_summary.json
    summary = json.loads(Path(summary_path).read_text(encoding="utf-8"))
    # These assignments also overwrite any stale "__pending_real_model_inference__"
    # placeholder strings a previous stage may have left, so no separate
    # cleanup pass is needed (the old post-assignment marker check was dead code).
    summary["eval_reward_before"] = before_means
    summary["eval_reward_after"] = after_means
    summary["component_shift"] = {
        "note": (
            "Real model inference: before=Qwen/Qwen2.5-0.5B-Instruct (base), "
            "after=AniketAsla/debatefloor-grpo-qwen2.5-0.5b-instruct (GRPO fine-tuned). "
            "Rewards from live env HTTP API (MR-2 compliant)."
        ),
        "before": {k: v for k, v in before_means.items() if k != "Mean reward"},
        "after": {k: v for k, v in after_means.items() if k != "Mean reward"},
    }
    summary["component_shift_delta"] = {k: v for k, v in delta.items() if k != "Mean reward"}
    summary["eval_methodology"] = (
        "Real model inference: base Qwen2.5-0.5B (before) vs GRPO fine-tuned checkpoint (after). "
        f"Eval tasks: {EVAL_TASKS}. Seeds per task: {SEEDS}. "
        "Env reward from POST /step (not keyword matching)."
    )
    summary["eval_generated_at"] = datetime.now(timezone.utc).isoformat()
    summary["eval_rows"] = {"before": before_rows, "after": after_rows}
    Path(summary_path).write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print(f"\nUpdated {summary_path}")

    # Regenerate SVGs
    try:
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        import numpy as np

        # component_shift.svg
        labels = ["Fraud detection", "Decision accuracy", "Evidence quality", "Calibration"]
        bv = [before_means.get(lbl, 0.0) for lbl in labels]
        av = [after_means.get(lbl, 0.0) for lbl in labels]
        x, w = np.arange(len(labels)), 0.35
        fig, ax = plt.subplots(figsize=(10, 5.5))
        ax.set_facecolor("#f9f9f9")
        fig.patch.set_facecolor("#ffffff")
        bars_b = ax.bar(x - w/2, bv, w, label="Before (base Qwen2.5-0.5B)",
                        color="#7a869a", alpha=0.85, edgecolor="white")
        bars_a = ax.bar(x + w/2, av, w, label="After (GRPO fine-tuned)",
                        color="#06a77d", alpha=0.85, edgecolor="white")
        for bar in bars_b:
            h = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2, h + 0.02 if h >= 0 else h - 0.07,
                    f"{h:.2f}", ha="center", fontsize=9, color="#333")
        for bar in bars_a:
            h = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2, h + 0.02 if h >= 0 else h - 0.07,
                    f"{h:.2f}", ha="center", fontsize=9, color="#1a6b58")
        ax.set_xticks(x)
        ax.set_xticklabels(labels, fontsize=11)
        ax.axhline(0, color="#666", linewidth=0.8, alpha=0.5)
        ax.set_ylim(-1.1, 1.3)
        ax.set_ylabel("Component score (clamped [0,1]; calibration unbounded)", fontsize=10)
        ax.set_xlabel("Reward component", fontsize=11)
        ax.set_title("DebateFloor: Real Model Before vs After GRPO Training",
                     fontsize=13, fontweight="bold")
        ax.grid(True, axis="y", alpha=0.2, linestyle="--")
        ax.legend(framealpha=0.85, fontsize=10)
        for i, (b_v, a_v) in enumerate(zip(bv, av)):
            d = a_v - b_v
            color = "#06a77d" if d > 0 else ("#e63946" if d < 0 else "#999")
            sign = "+" if d >= 0 else ""
            # NOTE(review): the literal "D" prefix looks like a mangled "Δ"
            # (delta) — confirm the intended label before changing it.
            ax.text(x[i], max(a_v, b_v) + 0.10, f"D{sign}{d:.2f}",
                    ha="center", fontsize=9, color=color, fontweight="bold")
        delta_str = " | ".join(
            f"{k}: {'+' if v>=0 else ''}{v:.2f}" for k, v in delta.items() if k != "Mean reward"
        )
        ax.annotate(
            f"Deltas: {delta_str}\nTraining reward: 0.130 → 0.469 (+0.339, 3.6x via live env HTTP, 2,500 steps)\n"
            "Source: real model inference (not scripted)",
            xy=(0.01, 0.01), xycoords="axes fraction", fontsize=8, color="#555",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="#f0f8f0", edgecolor="#06a77d", alpha=0.85),
        )
        fig.tight_layout()
        Path("docs").mkdir(exist_ok=True)
        fig.savefig("docs/component_shift.svg", dpi=180, format="svg")
        plt.close(fig)
        print("docs/component_shift.svg updated")

        # reward_curve.svg (from training log_history)
        reward_steps, rewards, loss_steps, losses = [], [], [], []
        for row in log_history:
            step = row.get("step")
            if step is None:
                continue
            if "loss" in row and "train_runtime" not in row:
                loss_steps.append(step)
                losses.append(row["loss"])
            # BUG FIX: the old `row.get("reward") or row.get(...)` treated a
            # genuine 0.0 reward as missing (falsy) and fell through to the
            # wrong key or dropped the point; use explicit None checks.
            rv = row.get("reward")
            if rv is None:
                rv = row.get("rewards/reward_fn/mean")
            if rv is not None:
                reward_steps.append(step)
                rewards.append(rv)
        if rewards:
            def smooth(vals, w=7):
                # Trailing moving average over a window of up to `w` points.
                return [sum(vals[max(0, i-w+1):i+1]) / (i - max(0, i-w+1) + 1)
                        for i in range(len(vals))]
            fig2, ax1 = plt.subplots(figsize=(10, 5.5))
            ax1.set_facecolor("#f9f9f9")
            fig2.patch.set_facecolor("#ffffff")
            if losses:
                ax1.plot(loss_steps, losses, color="#26547c", linewidth=1.2,
                         alpha=0.45, label="Training loss")
                ax1.set_ylabel("Training loss", color="#26547c", fontsize=11)
                ax1.tick_params(axis="y", labelcolor="#26547c")
            ax1.set_xlabel("Training step", fontsize=11)
            ax1.grid(True, alpha=0.2, linestyle="--")
            ax2 = ax1.twinx()
            ax2.plot(reward_steps, rewards, color="#06a77d", linewidth=1.0, alpha=0.3)
            ax2.plot(reward_steps, smooth(rewards), color="#06a77d", linewidth=2.2,
                     label="Mean reward (smoothed)")
            ax2.axhline(rewards[0], color="#e63946", linewidth=1.0, linestyle="--",
                        alpha=0.6, label=f"Start: {rewards[0]:.3f}")
            ax2.axhline(rewards[-1], color="#2a9d8f", linewidth=1.0, linestyle="--",
                        alpha=0.6, label=f"End: {rewards[-1]:.3f}")
            ax2.set_ylabel("Mean reward — live env HTTP scalar (unbounded)",
                           color="#06a77d", fontsize=11)
            ax2.tick_params(axis="y", labelcolor="#06a77d")
            ax2.annotate("Reward from live env (POST /step)\nNot comparable to clamped [0,1] eval score.",
                         xy=(0.02, 0.05), xycoords="axes fraction", fontsize=8.5, color="gray")
            lines1, lab1 = ax1.get_legend_handles_labels()
            lines2, lab2 = ax2.get_legend_handles_labels()
            ax2.legend(lines1+lines2, lab1+lab2, loc="upper left", framealpha=0.85, fontsize=9)
            fig2.suptitle("DebateFloor GRPO Training — Live Env Reward (HTTP, MR-2 Compliant)",
                          fontsize=13, fontweight="bold")
            fig2.tight_layout()
            fig2.savefig("docs/reward_curve.svg", dpi=180, format="svg")
            plt.close(fig2)
            print("docs/reward_curve.svg updated")
    except Exception as exc:
        # Best-effort: the summary JSON was already written above.
        print(f"SVG generation failed: {exc}")


def main():
    """Run the full before/after evaluation and persist results + plots."""
    # Verify env is up. Use a real exception rather than `assert`: asserts
    # are stripped under `python -O`, which would skip this health check.
    r = requests.get(f"{ENV_BASE_URL}/health", timeout=5)
    if r.json().get("status") != "healthy":
        raise RuntimeError(f"Env not healthy: {r.text}")
    print(f"Env healthy at {ENV_BASE_URL}")

    summary_path = "reports/training_summary.json"
    summary = json.loads(Path(summary_path).read_text(encoding="utf-8"))
    log_history = summary.get("log_history", [])

    # ── BEFORE: base model ─────────────────────────────────────────────────
    base_model, base_tok = load_model(BASE_MODEL, "BASE (before training)")
    before_rows, before_means = eval_model(base_model, base_tok, f"BEFORE — {BASE_MODEL}")

    del base_model  # free memory before loading trained model
    import gc
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # ── AFTER: fine-tuned model ────────────────────────────────────────────
    trained_model, trained_tok = load_model(TRAINED_MODEL, "TRAINED (after GRPO)")
    after_rows, after_means = eval_model(trained_model, trained_tok, f"AFTER — {TRAINED_MODEL}")

    # ── Save everything ────────────────────────────────────────────────────
    save_and_plot(before_means, after_means, before_rows, after_rows, summary_path, log_history)

    print("\n" + "="*60)
    print("REAL INFERENCE RESULTS")
    print("="*60)
    delta = {k: round(after_means.get(k, 0.0) - before_means.get(k, 0.0), 4)
             for k in before_means if k != "Mean reward"}
    print(f"Before: {json.dumps({k:v for k,v in before_means.items() if k!='Mean reward'})}")
    print(f"After: {json.dumps({k:v for k,v in after_means.items() if k!='Mean reward'})}")
    print(f"Delta: {json.dumps(delta)}")
    print("\nAll results from real model inference. Not scripted.")


if __name__ == "__main__":
    main()