Spaces:

AniketAsla
/

debatefloor

Running

App Files Files Community

debatefloor / train /real_model_eval.py

AniketAsla

deploy: update train/real_model_eval.py

bd77971 verified 12 days ago

raw

history blame contribute delete

18 kB

	"""
	real_model_eval.py — Genuine before/after component evaluation using the actual models.

	BEFORE: base Qwen/Qwen2.5-0.5B-Instruct (no fine-tuning)
	AFTER: AniketAsla/debatefloor-grpo-qwen2.5-0.5b-instruct (GRPO fine-tuned)

	Rewards come from the live environment via POST /reset + /step (MR-2 compliant).
	This replaces the scripted agent eval with real model outputs.
	"""

	import json
	import os
	import re
	import sys
	import time
	from datetime import datetime, timezone
	from pathlib import Path
	from statistics import mean

	import requests
	import torch

	sys.path.insert(0, ".")
	from server.calibration_grader import CALIBRATION_MATRIX

	ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:7861")
	BASE_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
	TRAINED_MODEL = "AniketAsla/debatefloor-grpo-qwen2.5-0.5b-instruct"

	EVAL_TASKS = ["clean_claim", "contradictory_claim", "distribution_shift_claim"]
	SEEDS = [7, 42] # 2 seeds per task = 6 episodes each pass (fast but real)

	SYSTEM = (
	"You are an expert insurance fraud investigator.\n"
	"Analyze the claim and respond EXACTLY in this format:\n"
	"DECISION: <approve_claim\|deny_claim\|escalate_to_human>\n"
	"CONFIDENCE: <HIGH\|MED\|LOW>\n"
	"REASON: <one sentence citing specific evidence>\n\n"
	"HIGH = certain. MED = likely but some doubt. LOW = ambiguous, expert needed.\n"
	"WARNING: HIGH confidence on a wrong answer is the worst possible outcome."
	)

	DECISION_RE = re.compile(r"DECISION:\s*(approve_claim\|deny_claim\|escalate_to_human)", re.I)
	CONFIDENCE_RE = re.compile(r"CONFIDENCE:\s*(HIGH\|MED\|LOW)", re.I)
	REASON_RE = re.compile(r"REASON:\s(.)", re.I \| re.S)


	def _parse(text):
	dm = DECISION_RE.search(text or "")
	cm = CONFIDENCE_RE.search(text or "")
	rm = REASON_RE.search(text or "")
	return (
	dm.group(1).lower() if dm else None,
	cm.group(1).upper() if cm else None,
	(rm.group(1).strip()[:200] if rm else ""),
	)


	def load_model(model_id, label):
	print(f"\nLoading {label}: {model_id} ...")
	t0 = time.time()
	try:
	from unsloth import FastLanguageModel
	model, tok = FastLanguageModel.from_pretrained(
	model_name=model_id,
	max_seq_length=512,
	dtype=None,
	load_in_4bit=True,
	)
	FastLanguageModel.for_inference(model)
	print(f" Loaded via Unsloth (4-bit) in {time.time()-t0:.1f}s")
	except Exception as e:
	print(f" Unsloth not available ({e}), using standard transformers ...")
	from transformers import AutoModelForCausalLM, AutoTokenizer
	tok = AutoTokenizer.from_pretrained(model_id)
	if tok.pad_token is None:
	tok.pad_token = tok.eos_token
	model = AutoModelForCausalLM.from_pretrained(
	model_id,
	torch_dtype=torch.float32, # CPU-safe, no accelerate needed
	)
	model.eval()
	print(f" Loaded via transformers (fp32 CPU) in {time.time()-t0:.1f}s")
	return model, tok


	def generate(model, tok, prompt, max_new_tokens=100):
	device = next(model.parameters()).device
	inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=512)
	inputs = {k: v.to(device) for k, v in inputs.items()}
	with torch.inference_mode():
	out = model.generate(
	**inputs,
	max_new_tokens=max_new_tokens,
	do_sample=False,
	temperature=1.0,
	pad_token_id=tok.eos_token_id,
	)
	plen = inputs["input_ids"].shape[-1]
	return tok.decode(out[0][plen:], skip_special_tokens=True)


	def build_prompt(tok, obs_text, task_id):
	msgs = [
	{"role": "system", "content": SYSTEM},
	{"role": "user", "content": obs_text},
	]
	return tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)


	def run_episode_real(model, tok, task_id, seed):
	"""
	Full episode:
	1. POST /reset → get observation text
	2. Model generates completion from the observation
	3. Parse DECISION/CONFIDENCE/REASON
	4. POST /step → get real reward_breakdown
	"""
	reset_r = requests.post(
	f"{ENV_BASE_URL}/reset",
	json={"task_id": task_id, "seed": seed},
	timeout=15,
	)
	reset_r.raise_for_status()
	reset_data = reset_r.json()
	session_id = reset_data["session_id"]

	# Build text description from observation for the model
	obs = reset_data.get("observation", {})
	docs = obs.get("documents", [])
	doc_text = "\n".join(
	f" [{d.get('doc_type','doc')}] {d.get('content','')}" for d in docs
	)
	incident = obs.get("incident", {})
	obs_text = (
	f"Task: {task_id} \| Claim ID: {obs.get('claim_id','')}\n"
	f"Claimant: {obs.get('claimant',{}).get('name','')}\n"
	f"Incident: {incident.get('type','')} — {incident.get('description','')[:150]}\n"
	f"Documents:\n{doc_text}\n"
	f"Linked claims: {len(obs.get('linked_claims', []))}"
	)

	prompt = build_prompt(tok, obs_text, task_id)

	t0 = time.time()
	completion = generate(model, tok, prompt)
	gen_time = time.time() - t0
	decision, confidence, reason = _parse(completion)

	if decision is None or confidence is None:
	# Format failure — submit a default escalation
	decision, confidence, reason = "escalate_to_human", "LOW", "Could not parse decision from model output."

	action = {
	"action_type": decision,
	"confidence": confidence,
	"parameters": {"reason": reason},
	"reasoning": reason,
	}
	step_r = requests.post(
	f"{ENV_BASE_URL}/step",
	json={"action": action, "session_id": session_id},
	timeout=15,
	)
	step_r.raise_for_status()
	step_data = step_r.json()
	breakdown = step_data.get("observation", {}).get("reward_breakdown", {})

	return {
	"task_id": task_id,
	"seed": seed,
	"decision": decision,
	"confidence": confidence,
	"reason": reason[:100],
	"completion": completion[:200],
	"gen_time_s": round(gen_time, 1),
	"reward": round(float(step_data.get("reward", 0.0)), 4),
	"fraud_detection_score": round(float(breakdown.get("fraud_detection_score", 0.0)), 4),
	"decision_accuracy": round(float(breakdown.get("decision_accuracy", 0.0)), 4),
	"evidence_quality_score": round(float(breakdown.get("evidence_quality_score", 0.0)), 4),
	"calibration_score": round(float(breakdown.get("calibration_score", 0.0)), 4),
	}


	def eval_model(model, tok, label):
	print(f"\n{'='*60}")
	print(f"EVAL: {label}")
	print(f"{'='*60}")
	rows = []
	for task_id in EVAL_TASKS:
	for seed in SEEDS:
	try:
	row = run_episode_real(model, tok, task_id, seed)
	rows.append(row)
	print(
	f" {task_id:30s} seed={seed:2d} "
	f"decision={row['decision']:20s} conf={row['confidence']} "
	f"reward={row['reward']:.3f} "
	f"da={row['decision_accuracy']:.2f} "
	f"fd={row['fraud_detection_score']:.2f} "
	f"cal={row['calibration_score']:.2f} "
	f"[{row['gen_time_s']}s]"
	)
	except Exception as exc:
	print(f" ERROR {task_id} seed={seed}: {exc}")
	rows.append({
	"task_id": task_id, "seed": seed,
	"reward": 0.0, "fraud_detection_score": 0.0,
	"decision_accuracy": 0.0, "evidence_quality_score": 0.0,
	"calibration_score": 0.0, "decision": "error",
	})

	component_means = {
	"Fraud detection": round(mean(r["fraud_detection_score"] for r in rows), 4),
	"Decision accuracy": round(mean(r["decision_accuracy"] for r in rows), 4),
	"Evidence quality": round(mean(r["evidence_quality_score"] for r in rows), 4),
	"Calibration": round(mean(r["calibration_score"] for r in rows), 4),
	"Mean reward": round(mean(r["reward"] for r in rows), 4),
	}
	print(f"\n Component means: {json.dumps(component_means)}")
	return rows, component_means


	def save_and_plot(before_means, after_means, before_rows, after_rows, summary_path, log_history):
	delta = {k: round(after_means.get(k, 0.0) - before_means.get(k, 0.0), 4) for k in before_means}

	# Patch training_summary.json
	summary = json.loads(Path(summary_path).read_text(encoding="utf-8"))
	summary["eval_reward_before"] = before_means
	summary["eval_reward_after"] = after_means
	summary["component_shift"] = {
	"note": (
	"Real model inference: before=Qwen/Qwen2.5-0.5B-Instruct (base), "
	"after=AniketAsla/debatefloor-grpo-qwen2.5-0.5b-instruct (GRPO fine-tuned). "
	"Rewards from live env HTTP API (MR-2 compliant)."
	),
	"before": {k: v for k, v in before_means.items() if k != "Mean reward"},
	"after": {k: v for k, v in after_means.items() if k != "Mean reward"},
	}
	summary["component_shift_delta"] = {k: v for k, v in delta.items() if k != "Mean reward"}
	summary["eval_methodology"] = (
	"Real model inference: base Qwen2.5-0.5B (before) vs GRPO fine-tuned checkpoint (after). "
	f"Eval tasks: {EVAL_TASKS}. Seeds per task: {SEEDS}. "
	"Env reward from POST /step (not keyword matching)."
	)
	summary["eval_generated_at"] = datetime.now(timezone.utc).isoformat()
	summary["eval_rows"] = {"before": before_rows, "after": after_rows}

	# Remove stale pending markers
	for k in ("eval_reward_before", "eval_reward_after"):
	if summary.get(k) == "__pending_real_model_inference__":
	del summary[k]

	Path(summary_path).write_text(json.dumps(summary, indent=2), encoding="utf-8")
	print(f"\nUpdated {summary_path}")

	# Regenerate SVGs
	try:
	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	import numpy as np

	# component_shift.svg
	labels = ["Fraud detection", "Decision accuracy", "Evidence quality", "Calibration"]
	bv = [before_means.get(l, 0.0) for l in labels]
	av = [after_means.get(l, 0.0) for l in labels]
	x, w = np.arange(len(labels)), 0.35

	fig, ax = plt.subplots(figsize=(10, 5.5))
	ax.set_facecolor("#f9f9f9"); fig.patch.set_facecolor("#ffffff")
	bars_b = ax.bar(x - w/2, bv, w, label="Before (base Qwen2.5-0.5B)", color="#7a869a", alpha=0.85, edgecolor="white")
	bars_a = ax.bar(x + w/2, av, w, label="After (GRPO fine-tuned)", color="#06a77d", alpha=0.85, edgecolor="white")

	for bar in bars_b:
	h = bar.get_height()
	ax.text(bar.get_x() + bar.get_width()/2, h + 0.02 if h >= 0 else h - 0.07,
	f"{h:.2f}", ha="center", fontsize=9, color="#333")
	for bar in bars_a:
	h = bar.get_height()
	ax.text(bar.get_x() + bar.get_width()/2, h + 0.02 if h >= 0 else h - 0.07,
	f"{h:.2f}", ha="center", fontsize=9, color="#1a6b58")

	ax.set_xticks(x); ax.set_xticklabels(labels, fontsize=11)
	ax.axhline(0, color="#666", linewidth=0.8, alpha=0.5)
	ax.set_ylim(-1.1, 1.3)
	ax.set_ylabel("Component score (clamped [0,1]; calibration unbounded)", fontsize=10)
	ax.set_xlabel("Reward component", fontsize=11)
	ax.set_title("DebateFloor: Real Model Before vs After GRPO Training", fontsize=13, fontweight="bold")
	ax.grid(True, axis="y", alpha=0.2, linestyle="--"); ax.legend(framealpha=0.85, fontsize=10)

	for i, (b_v, a_v) in enumerate(zip(bv, av)):
	d = a_v - b_v
	color = "#06a77d" if d > 0 else ("#e63946" if d < 0 else "#999")
	sign = "+" if d >= 0 else ""
	ax.text(x[i], max(a_v, b_v) + 0.10, f"D{sign}{d:.2f}",
	ha="center", fontsize=9, color=color, fontweight="bold")

	delta_str = " \| ".join(
	f"{k}: {'+' if v>=0 else ''}{v:.2f}" for k, v in delta.items() if k != "Mean reward"
	)
	ax.annotate(
	f"Deltas: {delta_str}\nTraining reward: 0.130 → 0.469 (+0.339, 3.6x via live env HTTP, 2,500 steps)\n"
	"Source: real model inference (not scripted)",
	xy=(0.01, 0.01), xycoords="axes fraction", fontsize=8, color="#555",
	bbox=dict(boxstyle="round,pad=0.3", facecolor="#f0f8f0", edgecolor="#06a77d", alpha=0.85),
	)
	fig.tight_layout()
	Path("docs").mkdir(exist_ok=True)
	fig.savefig("docs/component_shift.svg", dpi=180, format="svg")
	plt.close(fig)
	print("docs/component_shift.svg updated")

	# reward_curve.svg (from training log_history)
	reward_steps, rewards, loss_steps, losses = [], [], [], []
	for row in log_history:
	step = row.get("step")
	if step is None: continue
	if "loss" in row and "train_runtime" not in row:
	loss_steps.append(step); losses.append(row["loss"])
	rv = row.get("reward") or row.get("rewards/reward_fn/mean")
	if rv is not None:
	reward_steps.append(step); rewards.append(rv)

	if rewards:
	def smooth(vals, w=7):
	return [sum(vals[max(0,i-w+1):i+1])/(i-max(0,i-w+1)+1) for i in range(len(vals))]
	fig2, ax1 = plt.subplots(figsize=(10, 5.5))
	ax1.set_facecolor("#f9f9f9"); fig2.patch.set_facecolor("#ffffff")
	if losses:
	ax1.plot(loss_steps, losses, color="#26547c", linewidth=1.2, alpha=0.45, label="Training loss")
	ax1.set_ylabel("Training loss", color="#26547c", fontsize=11)
	ax1.tick_params(axis="y", labelcolor="#26547c")
	ax1.set_xlabel("Training step", fontsize=11)
	ax1.grid(True, alpha=0.2, linestyle="--")
	ax2 = ax1.twinx()
	ax2.plot(reward_steps, rewards, color="#06a77d", linewidth=1.0, alpha=0.3)
	ax2.plot(reward_steps, smooth(rewards), color="#06a77d", linewidth=2.2, label="Mean reward (smoothed)")
	ax2.axhline(rewards[0], color="#e63946", linewidth=1.0, linestyle="--", alpha=0.6, label=f"Start: {rewards[0]:.3f}")
	ax2.axhline(rewards[-1], color="#2a9d8f", linewidth=1.0, linestyle="--", alpha=0.6, label=f"End: {rewards[-1]:.3f}")
	ax2.set_ylabel("Mean reward — live env HTTP scalar (unbounded)", color="#06a77d", fontsize=11)
	ax2.tick_params(axis="y", labelcolor="#06a77d")
	ax2.annotate("Reward from live env (POST /step)\nNot comparable to clamped [0,1] eval score.",
	xy=(0.02, 0.05), xycoords="axes fraction", fontsize=8.5, color="gray")
	lines1, lab1 = ax1.get_legend_handles_labels()
	lines2, lab2 = ax2.get_legend_handles_labels()
	ax2.legend(lines1+lines2, lab1+lab2, loc="upper left", framealpha=0.85, fontsize=9)
	fig2.suptitle("DebateFloor GRPO Training — Live Env Reward (HTTP, MR-2 Compliant)", fontsize=13, fontweight="bold")
	fig2.tight_layout()
	fig2.savefig("docs/reward_curve.svg", dpi=180, format="svg")
	plt.close(fig2)
	print("docs/reward_curve.svg updated")
	except Exception as exc:
	print(f"SVG generation failed: {exc}")


	def main():
	# Verify env is up
	r = requests.get(f"{ENV_BASE_URL}/health", timeout=5)
	assert r.json().get("status") == "healthy", f"Env not healthy: {r.text}"
	print(f"Env healthy at {ENV_BASE_URL}")

	summary_path = "reports/training_summary.json"
	summary = json.loads(Path(summary_path).read_text(encoding="utf-8"))
	log_history = summary.get("log_history", [])

	# ── BEFORE: base model ─────────────────────────────────────────────────
	base_model, base_tok = load_model(BASE_MODEL, "BASE (before training)")
	before_rows, before_means = eval_model(base_model, base_tok, f"BEFORE — {BASE_MODEL}")
	del base_model # free memory before loading trained model
	import gc; gc.collect()
	if torch.cuda.is_available():
	torch.cuda.empty_cache()

	# ── AFTER: fine-tuned model ────────────────────────────────────────────
	trained_model, trained_tok = load_model(TRAINED_MODEL, "TRAINED (after GRPO)")
	after_rows, after_means = eval_model(trained_model, trained_tok, f"AFTER — {TRAINED_MODEL}")

	# ── Save everything ────────────────────────────────────────────────────
	save_and_plot(before_means, after_means, before_rows, after_rows, summary_path, log_history)

	print("\n" + "="*60)
	print("REAL INFERENCE RESULTS")
	print("="*60)
	delta = {k: round(after_means.get(k, 0.0) - before_means.get(k, 0.0), 4) for k in before_means if k != "Mean reward"}
	print(f"Before: {json.dumps({k:v for k,v in before_means.items() if k!='Mean reward'})}")
	print(f"After: {json.dumps({k:v for k,v in after_means.items() if k!='Mean reward'})}")
	print(f"Delta: {json.dumps(delta)}")
	print("\nAll results from real model inference. Not scripted.")


	if __name__ == "__main__":
	main()