debatefloor / train /real_model_eval.py
AniketAsla's picture
deploy: update train/real_model_eval.py
bd77971 verified
"""
real_model_eval.py — Genuine before/after component evaluation using the actual models.
BEFORE: base Qwen/Qwen2.5-0.5B-Instruct (no fine-tuning)
AFTER: AniketAsla/debatefloor-grpo-qwen2.5-0.5b-instruct (GRPO fine-tuned)
Rewards come from the live environment via POST /reset + /step (MR-2 compliant).
This replaces the scripted agent eval with real model outputs.
"""
import json
import os
import re
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from statistics import mean
import requests
import torch
sys.path.insert(0, ".")
from server.calibration_grader import CALIBRATION_MATRIX
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7861")
BASE_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
TRAINED_MODEL = "AniketAsla/debatefloor-grpo-qwen2.5-0.5b-instruct"
EVAL_TASKS = ["clean_claim", "contradictory_claim", "distribution_shift_claim"]
SEEDS = [7, 42] # 2 seeds per task = 6 episodes each pass (fast but real)
SYSTEM = (
"You are an expert insurance fraud investigator.\n"
"Analyze the claim and respond EXACTLY in this format:\n"
"DECISION: <approve_claim|deny_claim|escalate_to_human>\n"
"CONFIDENCE: <HIGH|MED|LOW>\n"
"REASON: <one sentence citing specific evidence>\n\n"
"HIGH = certain. MED = likely but some doubt. LOW = ambiguous, expert needed.\n"
"WARNING: HIGH confidence on a wrong answer is the worst possible outcome."
)
DECISION_RE = re.compile(r"DECISION:\s*(approve_claim|deny_claim|escalate_to_human)", re.I)
CONFIDENCE_RE = re.compile(r"CONFIDENCE:\s*(HIGH|MED|LOW)", re.I)
REASON_RE = re.compile(r"REASON:\s*(.*)", re.I | re.S)
def _parse(text):
dm = DECISION_RE.search(text or "")
cm = CONFIDENCE_RE.search(text or "")
rm = REASON_RE.search(text or "")
return (
dm.group(1).lower() if dm else None,
cm.group(1).upper() if cm else None,
(rm.group(1).strip()[:200] if rm else ""),
)
def load_model(model_id, label):
print(f"\nLoading {label}: {model_id} ...")
t0 = time.time()
try:
from unsloth import FastLanguageModel
model, tok = FastLanguageModel.from_pretrained(
model_name=model_id,
max_seq_length=512,
dtype=None,
load_in_4bit=True,
)
FastLanguageModel.for_inference(model)
print(f" Loaded via Unsloth (4-bit) in {time.time()-t0:.1f}s")
except Exception as e:
print(f" Unsloth not available ({e}), using standard transformers ...")
from transformers import AutoModelForCausalLM, AutoTokenizer
tok = AutoTokenizer.from_pretrained(model_id)
if tok.pad_token is None:
tok.pad_token = tok.eos_token
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.float32, # CPU-safe, no accelerate needed
)
model.eval()
print(f" Loaded via transformers (fp32 CPU) in {time.time()-t0:.1f}s")
return model, tok
def generate(model, tok, prompt, max_new_tokens=100):
device = next(model.parameters()).device
inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=512)
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.inference_mode():
out = model.generate(
**inputs,
max_new_tokens=max_new_tokens,
do_sample=False,
temperature=1.0,
pad_token_id=tok.eos_token_id,
)
plen = inputs["input_ids"].shape[-1]
return tok.decode(out[0][plen:], skip_special_tokens=True)
def build_prompt(tok, obs_text, task_id):
msgs = [
{"role": "system", "content": SYSTEM},
{"role": "user", "content": obs_text},
]
return tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
def run_episode_real(model, tok, task_id, seed):
"""
Full episode:
1. POST /reset → get observation text
2. Model generates completion from the observation
3. Parse DECISION/CONFIDENCE/REASON
4. POST /step → get real reward_breakdown
"""
reset_r = requests.post(
f"{ENV_BASE_URL}/reset",
json={"task_id": task_id, "seed": seed},
timeout=15,
)
reset_r.raise_for_status()
reset_data = reset_r.json()
session_id = reset_data["session_id"]
# Build text description from observation for the model
obs = reset_data.get("observation", {})
docs = obs.get("documents", [])
doc_text = "\n".join(
f" [{d.get('doc_type','doc')}] {d.get('content','')}" for d in docs
)
incident = obs.get("incident", {})
obs_text = (
f"Task: {task_id} | Claim ID: {obs.get('claim_id','')}\n"
f"Claimant: {obs.get('claimant',{}).get('name','')}\n"
f"Incident: {incident.get('type','')}{incident.get('description','')[:150]}\n"
f"Documents:\n{doc_text}\n"
f"Linked claims: {len(obs.get('linked_claims', []))}"
)
prompt = build_prompt(tok, obs_text, task_id)
t0 = time.time()
completion = generate(model, tok, prompt)
gen_time = time.time() - t0
decision, confidence, reason = _parse(completion)
if decision is None or confidence is None:
# Format failure — submit a default escalation
decision, confidence, reason = "escalate_to_human", "LOW", "Could not parse decision from model output."
action = {
"action_type": decision,
"confidence": confidence,
"parameters": {"reason": reason},
"reasoning": reason,
}
step_r = requests.post(
f"{ENV_BASE_URL}/step",
json={"action": action, "session_id": session_id},
timeout=15,
)
step_r.raise_for_status()
step_data = step_r.json()
breakdown = step_data.get("observation", {}).get("reward_breakdown", {})
return {
"task_id": task_id,
"seed": seed,
"decision": decision,
"confidence": confidence,
"reason": reason[:100],
"completion": completion[:200],
"gen_time_s": round(gen_time, 1),
"reward": round(float(step_data.get("reward", 0.0)), 4),
"fraud_detection_score": round(float(breakdown.get("fraud_detection_score", 0.0)), 4),
"decision_accuracy": round(float(breakdown.get("decision_accuracy", 0.0)), 4),
"evidence_quality_score": round(float(breakdown.get("evidence_quality_score", 0.0)), 4),
"calibration_score": round(float(breakdown.get("calibration_score", 0.0)), 4),
}
def eval_model(model, tok, label):
print(f"\n{'='*60}")
print(f"EVAL: {label}")
print(f"{'='*60}")
rows = []
for task_id in EVAL_TASKS:
for seed in SEEDS:
try:
row = run_episode_real(model, tok, task_id, seed)
rows.append(row)
print(
f" {task_id:30s} seed={seed:2d} "
f"decision={row['decision']:20s} conf={row['confidence']} "
f"reward={row['reward']:.3f} "
f"da={row['decision_accuracy']:.2f} "
f"fd={row['fraud_detection_score']:.2f} "
f"cal={row['calibration_score']:.2f} "
f"[{row['gen_time_s']}s]"
)
except Exception as exc:
print(f" ERROR {task_id} seed={seed}: {exc}")
rows.append({
"task_id": task_id, "seed": seed,
"reward": 0.0, "fraud_detection_score": 0.0,
"decision_accuracy": 0.0, "evidence_quality_score": 0.0,
"calibration_score": 0.0, "decision": "error",
})
component_means = {
"Fraud detection": round(mean(r["fraud_detection_score"] for r in rows), 4),
"Decision accuracy": round(mean(r["decision_accuracy"] for r in rows), 4),
"Evidence quality": round(mean(r["evidence_quality_score"] for r in rows), 4),
"Calibration": round(mean(r["calibration_score"] for r in rows), 4),
"Mean reward": round(mean(r["reward"] for r in rows), 4),
}
print(f"\n Component means: {json.dumps(component_means)}")
return rows, component_means
def save_and_plot(before_means, after_means, before_rows, after_rows, summary_path, log_history):
delta = {k: round(after_means.get(k, 0.0) - before_means.get(k, 0.0), 4) for k in before_means}
# Patch training_summary.json
summary = json.loads(Path(summary_path).read_text(encoding="utf-8"))
summary["eval_reward_before"] = before_means
summary["eval_reward_after"] = after_means
summary["component_shift"] = {
"note": (
"Real model inference: before=Qwen/Qwen2.5-0.5B-Instruct (base), "
"after=AniketAsla/debatefloor-grpo-qwen2.5-0.5b-instruct (GRPO fine-tuned). "
"Rewards from live env HTTP API (MR-2 compliant)."
),
"before": {k: v for k, v in before_means.items() if k != "Mean reward"},
"after": {k: v for k, v in after_means.items() if k != "Mean reward"},
}
summary["component_shift_delta"] = {k: v for k, v in delta.items() if k != "Mean reward"}
summary["eval_methodology"] = (
"Real model inference: base Qwen2.5-0.5B (before) vs GRPO fine-tuned checkpoint (after). "
f"Eval tasks: {EVAL_TASKS}. Seeds per task: {SEEDS}. "
"Env reward from POST /step (not keyword matching)."
)
summary["eval_generated_at"] = datetime.now(timezone.utc).isoformat()
summary["eval_rows"] = {"before": before_rows, "after": after_rows}
# Remove stale pending markers
for k in ("eval_reward_before", "eval_reward_after"):
if summary.get(k) == "__pending_real_model_inference__":
del summary[k]
Path(summary_path).write_text(json.dumps(summary, indent=2), encoding="utf-8")
print(f"\nUpdated {summary_path}")
# Regenerate SVGs
try:
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
# component_shift.svg
labels = ["Fraud detection", "Decision accuracy", "Evidence quality", "Calibration"]
bv = [before_means.get(l, 0.0) for l in labels]
av = [after_means.get(l, 0.0) for l in labels]
x, w = np.arange(len(labels)), 0.35
fig, ax = plt.subplots(figsize=(10, 5.5))
ax.set_facecolor("#f9f9f9"); fig.patch.set_facecolor("#ffffff")
bars_b = ax.bar(x - w/2, bv, w, label="Before (base Qwen2.5-0.5B)", color="#7a869a", alpha=0.85, edgecolor="white")
bars_a = ax.bar(x + w/2, av, w, label="After (GRPO fine-tuned)", color="#06a77d", alpha=0.85, edgecolor="white")
for bar in bars_b:
h = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2, h + 0.02 if h >= 0 else h - 0.07,
f"{h:.2f}", ha="center", fontsize=9, color="#333")
for bar in bars_a:
h = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2, h + 0.02 if h >= 0 else h - 0.07,
f"{h:.2f}", ha="center", fontsize=9, color="#1a6b58")
ax.set_xticks(x); ax.set_xticklabels(labels, fontsize=11)
ax.axhline(0, color="#666", linewidth=0.8, alpha=0.5)
ax.set_ylim(-1.1, 1.3)
ax.set_ylabel("Component score (clamped [0,1]; calibration unbounded)", fontsize=10)
ax.set_xlabel("Reward component", fontsize=11)
ax.set_title("DebateFloor: Real Model Before vs After GRPO Training", fontsize=13, fontweight="bold")
ax.grid(True, axis="y", alpha=0.2, linestyle="--"); ax.legend(framealpha=0.85, fontsize=10)
for i, (b_v, a_v) in enumerate(zip(bv, av)):
d = a_v - b_v
color = "#06a77d" if d > 0 else ("#e63946" if d < 0 else "#999")
sign = "+" if d >= 0 else ""
ax.text(x[i], max(a_v, b_v) + 0.10, f"D{sign}{d:.2f}",
ha="center", fontsize=9, color=color, fontweight="bold")
delta_str = " | ".join(
f"{k}: {'+' if v>=0 else ''}{v:.2f}" for k, v in delta.items() if k != "Mean reward"
)
ax.annotate(
f"Deltas: {delta_str}\nTraining reward: 0.130 → 0.469 (+0.339, 3.6x via live env HTTP, 2,500 steps)\n"
"Source: real model inference (not scripted)",
xy=(0.01, 0.01), xycoords="axes fraction", fontsize=8, color="#555",
bbox=dict(boxstyle="round,pad=0.3", facecolor="#f0f8f0", edgecolor="#06a77d", alpha=0.85),
)
fig.tight_layout()
Path("docs").mkdir(exist_ok=True)
fig.savefig("docs/component_shift.svg", dpi=180, format="svg")
plt.close(fig)
print("docs/component_shift.svg updated")
# reward_curve.svg (from training log_history)
reward_steps, rewards, loss_steps, losses = [], [], [], []
for row in log_history:
step = row.get("step")
if step is None: continue
if "loss" in row and "train_runtime" not in row:
loss_steps.append(step); losses.append(row["loss"])
rv = row.get("reward") or row.get("rewards/reward_fn/mean")
if rv is not None:
reward_steps.append(step); rewards.append(rv)
if rewards:
def smooth(vals, w=7):
return [sum(vals[max(0,i-w+1):i+1])/(i-max(0,i-w+1)+1) for i in range(len(vals))]
fig2, ax1 = plt.subplots(figsize=(10, 5.5))
ax1.set_facecolor("#f9f9f9"); fig2.patch.set_facecolor("#ffffff")
if losses:
ax1.plot(loss_steps, losses, color="#26547c", linewidth=1.2, alpha=0.45, label="Training loss")
ax1.set_ylabel("Training loss", color="#26547c", fontsize=11)
ax1.tick_params(axis="y", labelcolor="#26547c")
ax1.set_xlabel("Training step", fontsize=11)
ax1.grid(True, alpha=0.2, linestyle="--")
ax2 = ax1.twinx()
ax2.plot(reward_steps, rewards, color="#06a77d", linewidth=1.0, alpha=0.3)
ax2.plot(reward_steps, smooth(rewards), color="#06a77d", linewidth=2.2, label="Mean reward (smoothed)")
ax2.axhline(rewards[0], color="#e63946", linewidth=1.0, linestyle="--", alpha=0.6, label=f"Start: {rewards[0]:.3f}")
ax2.axhline(rewards[-1], color="#2a9d8f", linewidth=1.0, linestyle="--", alpha=0.6, label=f"End: {rewards[-1]:.3f}")
ax2.set_ylabel("Mean reward — live env HTTP scalar (unbounded)", color="#06a77d", fontsize=11)
ax2.tick_params(axis="y", labelcolor="#06a77d")
ax2.annotate("Reward from live env (POST /step)\nNot comparable to clamped [0,1] eval score.",
xy=(0.02, 0.05), xycoords="axes fraction", fontsize=8.5, color="gray")
lines1, lab1 = ax1.get_legend_handles_labels()
lines2, lab2 = ax2.get_legend_handles_labels()
ax2.legend(lines1+lines2, lab1+lab2, loc="upper left", framealpha=0.85, fontsize=9)
fig2.suptitle("DebateFloor GRPO Training — Live Env Reward (HTTP, MR-2 Compliant)", fontsize=13, fontweight="bold")
fig2.tight_layout()
fig2.savefig("docs/reward_curve.svg", dpi=180, format="svg")
plt.close(fig2)
print("docs/reward_curve.svg updated")
except Exception as exc:
print(f"SVG generation failed: {exc}")
def main():
# Verify env is up
r = requests.get(f"{ENV_BASE_URL}/health", timeout=5)
assert r.json().get("status") == "healthy", f"Env not healthy: {r.text}"
print(f"Env healthy at {ENV_BASE_URL}")
summary_path = "reports/training_summary.json"
summary = json.loads(Path(summary_path).read_text(encoding="utf-8"))
log_history = summary.get("log_history", [])
# ── BEFORE: base model ─────────────────────────────────────────────────
base_model, base_tok = load_model(BASE_MODEL, "BASE (before training)")
before_rows, before_means = eval_model(base_model, base_tok, f"BEFORE — {BASE_MODEL}")
del base_model # free memory before loading trained model
import gc; gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
# ── AFTER: fine-tuned model ────────────────────────────────────────────
trained_model, trained_tok = load_model(TRAINED_MODEL, "TRAINED (after GRPO)")
after_rows, after_means = eval_model(trained_model, trained_tok, f"AFTER — {TRAINED_MODEL}")
# ── Save everything ────────────────────────────────────────────────────
save_and_plot(before_means, after_means, before_rows, after_rows, summary_path, log_history)
print("\n" + "="*60)
print("REAL INFERENCE RESULTS")
print("="*60)
delta = {k: round(after_means.get(k, 0.0) - before_means.get(k, 0.0), 4) for k in before_means if k != "Mean reward"}
print(f"Before: {json.dumps({k:v for k,v in before_means.items() if k!='Mean reward'})}")
print(f"After: {json.dumps({k:v for k,v in after_means.items() if k!='Mean reward'})}")
print(f"Delta: {json.dumps(delta)}")
print("\nAll results from real model inference. Not scripted.")
if __name__ == "__main__":
main()