"""
Speculative Tool Actions — Eval Runner v3
==========================================
Evaluates the speculative-action configurations on the same eval set.

Config A: 8B strong model (fine-tuned on SFT)
Config B: 1.7B cheap proposer (fine-tuned on SFT)
Config C: 1.7B proposes → 8B verifier ACCEPT/REJECT; fallback to 8B on REJECT
          (currently reported as skipped: no 8B verifier is loaded here)
Config D: 1.7B proposes → 4B verifier ACCEPT/REJECT; fallback to 8B on REJECT
Config E: 1.7B generates N=3 diverse proposals → 4B verifier picks best

All models fine-tuned on same SFT data in chat-template "messages" format.
"""

import json
import re
from collections import Counter

import torch
from datasets import load_dataset
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

HUB = "narcolepticchicken"

ACTIONS = [
    "tool_call",
    "retrieval",
    "file_read",
    "file_write",
    "repair",
    "verifier",
    "ask_clarification",
    "final_answer",
    "BLOCKED",
]

# Relative per-call cost units used for the cost/quality comparison.
COST = {"strong": 1.0, "cheap": 0.15, "verify": 0.05}

# Label returned by parse_action when no known action name is found.
DEFAULT_ACTION = "tool_call"

# Number of trailing conversation messages shown to proposer and verifier.
CONTEXT_TURNS = 6

# Number of sampled proposals per example in Config E.
N_PROPOSALS = 3

SYSTEM_PROMPT = (
    "You are an agent action predictor. Given the conversation so far, "
    "predict the type of the next action the assistant should take. "
    "Choose exactly one from: " + ", ".join(ACTIONS) + ". "
    "Output only the action type name, nothing else."
)

VERIFIER_SYSTEM = (
    "You are an action verifier. Given conversation context and a proposed next action, "
    "determine if the proposal is correct. Respond with exactly ACCEPT or REJECT."
)


def _load_causal_lm(model_name, adapter_id=None):
    """Load tokenizer + bf16 causal LM, optionally wrapped with a PEFT adapter.

    Returns ``(model, tokenizer)`` with the model switched to eval mode. The
    tokenizer's pad token falls back to its EOS token when none is defined.
    """
    # NOTE(review): trust_remote_code=True lets the hub repo run its own code
    # at load time — confirm it is still required for these checkpoints.
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    if adapter_id:
        model = PeftModel.from_pretrained(model, adapter_id)
    model.eval()
    return model, tok


def load_proposer(model_name, adapter_id=None):
    """Load an SFT-trained proposer model.

    Args:
        model_name: Base model identifier passed to ``from_pretrained``.
        adapter_id: Optional PEFT adapter identifier; the bare base model is
            returned when it is falsy.

    Returns:
        ``(model, tokenizer)``.
    """
    print(f" Loading {model_name} + {adapter_id or 'none'}")
    return _load_causal_lm(model_name, adapter_id)


def load_verifier(adapter_id):
    """Load the verifier (SFT-trained on ACCEPT/REJECT).

    The base model is always ``Qwen/Qwen3-4B``; ``adapter_id`` names the PEFT
    adapter applied on top of it.

    Raises:
        ValueError: If ``adapter_id`` is empty — the verifier is only
            meaningful with its adapter applied.
    """
    if not adapter_id:
        raise ValueError("load_verifier requires a verifier adapter id")
    print(f" Loading verifier: {adapter_id}")
    return _load_causal_lm("Qwen/Qwen3-4B", adapter_id)


def _clip_context(context, max_chars):
    """Return the last CONTEXT_TURNS messages with content cut to max_chars."""
    return [
        {"role": m["role"], "content": str(m["content"])[:max_chars]}
        for m in context[-CONTEXT_TURNS:]
    ]


def build_proposer_messages(context):
    """Build chat messages for the proposer: system + context + query.

    ``context`` is a list of ``{"role", "content"}`` dicts; only the last six
    are kept and each content is stringified and cut to 500 characters.
    """
    msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
    msgs.extend(_clip_context(context, 500))
    msgs.append({"role": "user", "content": "What should be the next action type?"})
    return msgs


def build_verifier_messages(context, proposal):
    """Build chat messages for the verifier: system + context + proposal query.

    Uses the same last-six-messages window as the proposer, but cuts each
    content to 400 characters.
    """
    msgs = [{"role": "system", "content": VERIFIER_SYSTEM}]
    msgs.extend(_clip_context(context, 400))
    msgs.append({
        "role": "user",
        "content": f"Proposed next action: {proposal}\n\nIs this the correct next action? ACCEPT or REJECT?",
    })
    return msgs


def parse_action(text):
    """Extract action type from model output.

    Matching is case-insensitive and returns the action whose name appears
    earliest in ``text``, so an output that mentions several labels resolves
    to the one the model said first rather than to whichever label happens to
    come first in ``ACTIONS``. Falls back to ``DEFAULT_ACTION`` when no
    action name is present.
    """
    lowered = text.strip().lower()
    best_pos = None
    best_action = DEFAULT_ACTION
    for action in ACTIONS:
        pos = lowered.find(action.lower())
        if pos != -1 and (best_pos is None or pos < best_pos):
            best_pos, best_action = pos, action
    return best_action


@torch.no_grad()
def predict_action(model, tok, messages, device, do_sample=False, temperature=0.8):
    """Generate a prediction and parse the action.

    Args:
        model: Causal LM exposing ``generate``.
        tok: Tokenizer matching ``model`` (must support chat templates).
        messages: Chat messages, e.g. from ``build_proposer_messages``.
        device: Device the tokenized inputs are moved to.
        do_sample: Sample instead of decoding greedily.
        temperature: Sampling temperature; only used when ``do_sample``.

    Returns:
        One of ``ACTIONS`` (see ``parse_action`` for the fallback).
    """
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=2048).to(device)
    gen_kwargs = {
        "max_new_tokens": 20,
        "do_sample": do_sample,
        "pad_token_id": tok.pad_token_id,
    }
    if do_sample:
        # Sampling knobs are only passed when they take effect; handing them
        # to a greedy decode makes generate() warn about unused flags.
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = 0.95
    out = model.generate(**inputs, **gen_kwargs)
    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = tok.decode(out[0][prompt_len:], skip_special_tokens=True)
    return parse_action(decoded)


@torch.no_grad()
def verify_action(model, tok, messages, device):
    """Ask the verifier: ACCEPT or REJECT?

    Returns ``True`` only when the generated text contains "accept" and does
    not contain "reject" (case-insensitive); anything else counts as REJECT.
    """
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
    out = model.generate(
        **inputs,
        max_new_tokens=5,
        do_sample=False,
        pad_token_id=tok.pad_token_id,
    )
    prompt_len = inputs["input_ids"].shape[1]
    decoded = tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip().lower()
    return "accept" in decoded and "reject" not in decoded


def _accuracy(records):
    """Fraction of records whose "pred" equals their "true" label."""
    return sum(1 for r in records if r["pred"] == r["true"]) / len(records)


def _log_progress(i, total):
    """Print a progress line every 20 examples."""
    if i % 20 == 0:
        print(f" {i}/{total}")


def _run_single_model(model, tok, data, device):
    """Greedy-decode one prediction per example with a single model."""
    records = []
    for i, ex in enumerate(data):
        _log_progress(i, len(data))
        msgs = build_proposer_messages(ex["messages"])
        pred = predict_action(model, tok, msgs, device)
        records.append({"pred": pred, "true": ex["action_type"]})
    return records


def evaluate():
    """Run configs A, B, D and E on the eval set, print a summary, upload results.

    Loads up to 200 examples from the hub eval dataset, evaluates each
    configuration, writes the metrics to ``/tmp/eval_v3.json`` and uploads
    that file to the ``speculative-tool-actions`` repo.

    Raises:
        ValueError: If the eval dataset yields no examples.
    """
    if torch.cuda.is_available():
        device = "cuda"
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        if torch.cuda.device_count() > 1:
            print(f" 2nd GPU: {torch.cuda.get_device_name(1)}")
    else:
        # Keeps the script runnable (slowly) on machines without a GPU.
        device = "cpu"

    # Load eval data
    eval_ds = load_dataset(f"{HUB}/speculative-eval-v3-main", split="train")
    data = list(eval_ds.select(range(min(200, len(eval_ds)))))
    if not data:
        # Fail before loading three models rather than with a
        # ZeroDivisionError after the first config.
        raise ValueError("Eval dataset is empty; nothing to evaluate")
    n = len(data)
    print(f"\nEvaluating {n} examples")
    dist = Counter(ex["action_type"] for ex in data)
    print("Distribution:", dict(dist))

    # Load models
    print("\nLoading models...")
    # Proposer (1.7B)
    cm, ctok = load_proposer("Qwen/Qwen3-1.7B", f"{HUB}/speculative-proposer-v3-1.7b")
    # Strong model (8B)
    sm, stok = load_proposer("Qwen/Qwen3-8B", f"{HUB}/speculative-proposer-v3-8b")
    # Verifier (4B)
    vm, vtok = load_verifier(f"{HUB}/speculative-verifier-v3-4b")

    results = {}

    # ── Config A: Strong only ───────────────────────────────
    print("\nConfig A: 8B strong only")
    acc_a = _accuracy(_run_single_model(sm, stok, data, device))
    results["A"] = {"accuracy": round(acc_a, 4), "cost": COST["strong"]}
    print(f" Acc: {acc_a:.3f} Cost: {COST['strong']:.3f}")

    # ── Config B: Cheap only ────────────────────────────────
    print("\nConfig B: 1.7B cheap only")
    acc_b = _accuracy(_run_single_model(cm, ctok, data, device))
    results["B"] = {"accuracy": round(acc_b, 4), "cost": COST["cheap"]}
    print(f" Acc: {acc_b:.3f} Cost: {COST['cheap']:.3f}")

    # ── Config C: Cheap + 8B verifier ───────────────────────
    print("\nConfig C: cheap + 8B verifier (not implemented — skipping, same as old)")
    # The 8B verifier was never properly trained. We'll skip this and focus on D.
    results["C"] = {"accuracy": None, "cost": None, "note": "skipped — 8B verifier not trained"}

    # ── Config D: Cheap + 4B verifier ───────────────────────
    print("\nConfig D: cheap + 4B verifier")
    rd = []
    n_accept = 0
    n_fallback = 0
    for i, ex in enumerate(data):
        _log_progress(i, n)
        msgs = build_proposer_messages(ex["messages"])
        cheap_pred = predict_action(cm, ctok, msgs, device)
        # Verify
        vmsgs = build_verifier_messages(ex["messages"], cheap_pred)
        accepted = verify_action(vm, vtok, vmsgs, device)
        if accepted:
            n_accept += 1
            rd.append({"pred": cheap_pred, "true": ex["action_type"], "accepted": True})
        else:
            n_fallback += 1
            # Fall back to strong model
            strong_pred = predict_action(sm, stok, msgs, device)
            rd.append({"pred": strong_pred, "true": ex["action_type"], "accepted": False})
    acc_d = _accuracy(rd)
    # Every example pays proposer + verifier; the strong model is charged
    # only for the fraction of examples that fell back to it.
    cost_d = COST["cheap"] + COST["verify"] + COST["strong"] * (n_fallback / n)
    results["D"] = {
        "accuracy": round(acc_d, 4),
        "cost": round(cost_d, 4),
        "accept_rate": round(n_accept / n, 4),
    }
    print(f" Acc: {acc_d:.3f} Cost: {cost_d:.3f} Accept: {n_accept}/{len(data)} ({n_accept/len(data):.1%})")

    # ── Config E: Multi-proposal reranking ──────────────────
    print("\nConfig E: multi-proposal (n=3) + 4B verifier")
    re_results = []
    n_verify_calls = 0
    for i, ex in enumerate(data):
        _log_progress(i, n)
        msgs = build_proposer_messages(ex["messages"])
        # Generate 3 diverse proposals. dict.fromkeys de-duplicates while
        # keeping generation order; a set would make "first" depend on string
        # hashing, which differs from run to run.
        proposals = list(dict.fromkeys(
            predict_action(cm, ctok, msgs, device, do_sample=True, temperature=0.8)
            for _ in range(N_PROPOSALS)
        ))
        # Pick the first ACCEPT, or fall back to first proposal. Verification
        # stops at the first ACCEPT since later verdicts cannot change the pick.
        best = proposals[0]
        for p in proposals:
            vmsgs = build_verifier_messages(ex["messages"], p)
            n_verify_calls += 1
            if verify_action(vm, vtok, vmsgs, device):
                best = p
                break
        re_results.append({"pred": best, "true": ex["action_type"]})
    acc_e = _accuracy(re_results)
    # 3 proposals per example; verifier cost reflects the calls actually made
    # (duplicates are verified once), mirroring how Config D charges fallback.
    avg_verify_calls = n_verify_calls / n
    cost_e = COST["cheap"] * N_PROPOSALS + COST["verify"] * avg_verify_calls
    results["E"] = {
        "accuracy": round(acc_e, 4),
        "cost": round(cost_e, 4),
        "avg_verifier_calls": round(avg_verify_calls, 4),
    }
    print(f" Acc: {acc_e:.3f} Cost: {cost_e:.3f}")

    # ── Baselines & Summary ─────────────────────────────────
    rand_acc = 1.0 / len(ACTIONS)
    maj_class = dist.most_common(1)[0][0]
    maj_acc = dist[maj_class] / n
    print(f"\n{'='*65}")
    print(f"Baselines: random={rand_acc:.3f}, majority({maj_class})={maj_acc:.3f}")
    print(f"\n{'Config':<8} {'Acc':>8} {'Cost':>8} {'xRand':>8} {'xMaj':>8}")
    print("-" * 55)
    # Config C is omitted from the tables because it carries no metrics.
    evaluated = [(c, results[c]) for c in ["A", "B", "D", "E"] if results[c]["accuracy"] is not None]
    for c, m in evaluated:
        print(f"{c:<8} {m['accuracy']:>8.3f} {m['cost']:>8.3f} {m['accuracy']/rand_acc:>8.1f} {m['accuracy']/maj_acc:>8.1f}")

    # ── Cost-quality frontier ────────────────────────────────
    print("\nCOST-QUALITY FRONTIER")
    for c, m in sorted(evaluated, key=lambda x: x[1]["cost"]):
        print(f" {c}: cost={m['cost']:.3f} acc={m['accuracy']:.3f}")

    # ── Save ─────────────────────────────────────────────────
    output = {
        "results": results,
        "baselines": {"random": rand_acc, "majority": maj_acc, "majority_class": maj_class},
        "n": n,
        "distribution": dict(dist),
    }
    with open("/tmp/eval_v3.json", "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    from huggingface_hub import HfApi

    api = HfApi()
    api.upload_file(
        path_or_fileobj="/tmp/eval_v3.json",
        path_in_repo="eval_results_v3.json",
        repo_id=f"{HUB}/speculative-tool-actions",
        repo_type="model",
        commit_message="Eval v3 results",
    )
    print("\n✓ Results uploaded.")


if __name__ == "__main__":
    evaluate()