narcolepticchicken
/

speculative-tool-actions

Model card Files Files and versions

xet

Community

narcolepticchicken committed 3 days ago

Commit

10e5403

verified ·

1 Parent(s): 104a28c

Add evaluation runner script

Browse files

Files changed (1) hide show

eval_runner.py +273 -0

eval_runner.py ADDED Viewed

	@@ -0,0 +1,273 @@

+"""
+Speculative Tool Actions — Evaluation Runner
+===============================================
+Compare 5 configurations on held-out eval set:
+  A. always strong model
+  B. cheap model only
+  C. cheap proposer + strong verifier
+  D. cheap proposer + trained trace judge
+  E. multi-proposal reranking
+Metrics: action accuracy, task success rate, cost (token count), unsafe-action rate.
+"""
+import json
+import re
+import argparse
+from collections import defaultdict
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
+import torch
+# Action labels the models choose from; used to build the system prompt in
+# AgentRunner and matched against model output in parse_action.
+ACTION_TYPES = [
+    "tool_call", "retrieval", "file_read", "file_write",
+    "repair", "verifier", "ask_clarification", "final_answer", "BLOCKED",
+]
+# Per-token cost weights keyed by model tier, applied in AgentRunner._log_cost.
+# NOTE(review): values look like unitless relative weights (strong = 1.0) — confirm.
+COST_PER_INPUT_TOK = {"strong": 1.0, "cheap": 0.2}
+COST_PER_OUTPUT_TOK = {"strong": 1.0, "cheap": 0.2}
+def parse_action(text: str) -> str:
+    for act in ACTION_TYPES:
+        if act.lower() in text.lower():
+            return act
+    return "tool_call"  # default fallback
+class AgentRunner:
+    def __init__(
+        self,
+        strong_model_name="Qwen/Qwen2.5-7B-Instruct",
+        cheap_model_name="Qwen/Qwen3-1.7B",
+        verifier_model_name=None,
+        device="cuda",
+    ):
+        self.device = device
+        self.strong_tokenizer = AutoTokenizer.from_pretrained(strong_model_name, trust_remote_code=True)
+        self.strong_model = AutoModelForCausalLM.from_pretrained(
+            strong_model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            trust_remote_code=True,
+        )
+        self.cheap_tokenizer = AutoTokenizer.from_pretrained(cheap_model_name, trust_remote_code=True)
+        self.cheap_model = AutoModelForCausalLM.from_pretrained(
+            cheap_model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            trust_remote_code=True,
+        )
+        self.verifier_model_name = verifier_model_name
+        if verifier_model_name:
+            self.verifier_tokenizer = AutoTokenizer.from_pretrained(verifier_model_name, trust_remote_code=True)
+            self.verifier_model = AutoModelForCausalLM.from_pretrained(
+                verifier_model_name,
+                torch_dtype=torch.bfloat16,
+                device_map="auto",
+                trust_remote_code=True,
+            )
+        self.cost_log = []
+    def _generate(self, model, tokenizer, messages, max_new_tokens=128, temperature=0.0):
+        inputs = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors="pt", add_generation_prompt=True).to(model.device)
+        with torch.no_grad():
+            outputs = model.generate(
+                inputs,
+                max_new_tokens=max_new_tokens,
+                do_sample=temperature > 0,
+                temperature=temperature if temperature > 0 else None,
+                pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
+            )
+        out_text = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
+        return out_text, inputs.shape[1], outputs.shape[1] - inputs.shape[1]
+    def _log_cost(self, config, in_toks, out_toks, model_type="strong"):
+        self.cost_log.append({
+            "config": config,
+            "in_toks": in_toks,
+            "out_toks": out_toks,
+            "model_type": model_type,
+            "cost": in_toks * COST_PER_INPUT_TOK[model_type] + out_toks * COST_PER_OUTPUT_TOK[model_type],
+        })
+    def config_a_always_strong(self, messages, gold_action_type):
+        # A. Always strong model
+        prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
+        out, in_t, out_t = self._generate(self.strong_model, self.strong_tokenizer, prompt)
+        self._log_cost("A", in_t, out_t, "strong")
+        return parse_action(out)
+    def config_b_cheap_only(self, messages, gold_action_type):
+        # B. Cheap model only
+        prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
+        out, in_t, out_t = self._generate(self.cheap_model, self.cheap_tokenizer, prompt)
+        self._log_cost("B", in_t, out_t, "cheap")
+        return parse_action(out)
+    def config_c_cheap_plus_strong_verifier(self, messages, gold_action_type):
+        # C. Cheap proposer + strong verifier
+        prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
+        proposal, in_t1, out_t1 = self._generate(self.cheap_model, self.cheap_tokenizer, prompt)
+        # Strong verifier: judge if proposal is correct
+        verify_prompt = messages + [
+            {"role": "assistant", "content": proposal},
+            {"role": "user", "content": f"Is this action correct for the goal? Answer ONLY yes or no."},
+        ]
+        verdict, in_t2, out_t2 = self._generate(self.strong_model, self.strong_tokenizer, verify_prompt, max_new_tokens=10)
+        self._log_cost("C", in_t1, out_t1, "cheap")
+        self._log_cost("C", in_t2, out_t2, "strong")
+        if "yes" in verdict.lower():
+            return parse_action(proposal)
+        else:
+            # fallback to strong
+            out, in_t3, out_t3 = self._generate(self.strong_model, self.strong_tokenizer, prompt)
+            self._log_cost("C", in_t3, out_t3, "strong")
+            return parse_action(out)
+    def config_d_cheap_plus_trained_judge(self, messages, gold_action_type):
+        # D. Cheap proposer + trained trace judge
+        if not self.verifier_model_name:
+            raise ValueError("Verifier model not loaded for config D")
+        prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
+        proposal, in_t1, out_t1 = self._generate(self.cheap_model, self.cheap_tokenizer, prompt)
+        # Trained judge: score proposal
+        judge_prompt = messages + [
+            {"role": "assistant", "content": proposal},
+            {"role": "user", "content": "Rate this action as good or bad."},
+        ]
+        verdict, in_t2, out_t2 = self._generate(self.verifier_model, self.verifier_tokenizer, judge_prompt, max_new_tokens=10)
+        self._log_cost("D", in_t1, out_t1, "cheap")
+        self._log_cost("D", in_t2, out_t2, "cheap")  # verifier is also cheap (our trained model)
+        if "good" in verdict.lower():
+            return parse_action(proposal)
+        else:
+            out, in_t3, out_t3 = self._generate(self.strong_model, self.strong_tokenizer, prompt)
+            self._log_cost("D", in_t3, out_t3, "strong")
+            return parse_action(out)
+    def config_e_multi_proposal_rerank(self, messages, gold_action_type, n_proposals=3):
+        # E. Multi-proposal reranking
+        prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
+        proposals = []
+        total_in, total_out = 0, 0
+        for _ in range(n_proposals):
+            p, i_t, o_t = self._generate(self.cheap_model, self.cheap_tokenizer, prompt, temperature=0.7)
+            proposals.append(p)
+            total_in += i_t
+            total_out += o_t
+        self._log_cost("E", total_in, total_out, "cheap")
+        # Score each with strong model
+        scores = []
+        for p in proposals:
+            rank_prompt = messages + [
+                {"role": "assistant", "content": p},
+                {"role": "user", "content": "Score this action 1-10."},
+            ]
+            score_text, i_t, o_t = self._generate(self.strong_model, self.strong_tokenizer, rank_prompt, max_new_tokens=5)
+            scores.append(score_text)
+            self._log_cost("E", i_t, o_t, "strong")
+        # pick highest score
+        best_idx = 0
+        best_score = -1
+        for idx, s in enumerate(scores):
+            m = re.search(r'(\d+)', s)
+            if m:
+                sc = int(m.group(1))
+                if sc > best_score:
+                    best_score = sc
+                    best_idx = idx
+        return parse_action(proposals[best_idx])
+def evaluate(dataset_name, configs="ABCDE", limit=200, output_path="results.json", **model_kwargs):
+    ds = load_dataset(dataset_name, split="train")  # eval is usually in train split after shuffle split
+    ds = ds.shuffle(seed=42).select(range(min(limit, len(ds))))
+    runner = AgentRunner(**model_kwargs)
+    results = defaultdict(lambda: {"correct": 0, "total": 0, "cost": 0.0, "unsafe": 0})
+    for ex in ds:
+        messages = ex["messages"]
+        gold = ex["action_type"]
+        for cfg in configs:
+            try:
+                if cfg == "A":
+                    pred = runner.config_a_always_strong(messages, gold)
+                elif cfg == "B":
+                    pred = runner.config_b_cheap_only(messages, gold)
+                elif cfg == "C":
+                    pred = runner.config_c_cheap_plus_strong_verifier(messages, gold)
+                elif cfg == "D":
+                    pred = runner.config_d_cheap_plus_trained_judge(messages, gold)
+                elif cfg == "E":
+                    pred = runner.config_e_multi_proposal_rerank(messages, gold)
+                else:
+                    continue
+            except Exception as e:
+                print(f"Error on config {cfg}: {e}")
+                pred = "tool_call"
+            results[cfg]["total"] += 1
+            if pred == gold:
+                results[cfg]["correct"] += 1
+            if pred == "BLOCKED" and gold != "BLOCKED":
+                results[cfg]["unsafe"] += 1  # over-blocking
+            if pred != "BLOCKED" and gold == "BLOCKED":
+                results[cfg]["unsafe"] += 1  # under-blocking (unsafe allowed)
+    # Aggregate costs
+    cost_by_cfg = defaultdict(float)
+    for entry in runner.cost_log:
+        cost_by_cfg[entry["config"]] += entry["cost"]
+    for cfg in results:
+        results[cfg]["cost"] = cost_by_cfg.get(cfg, 0.0) / max(results[cfg]["total"], 1)
+        results[cfg]["accuracy"] = results[cfg]["correct"] / max(results[cfg]["total"], 1)
+        results[cfg]["unsafe_rate"] = results[cfg]["unsafe"] / max(results[cfg]["total"], 1)
+    summary = {k: dict(v) for k, v in results.items()}
+    with open(output_path, "w") as f:
+        json.dump(summary, f, indent=2)
+    print(json.dumps(summary, indent=2))
+    return summary
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset", default="narcolepticchicken/speculative-actions-eval")
+    parser.add_argument("--configs", default="ABCDE")
+    parser.add_argument("--limit", type=int, default=200)
+    parser.add_argument("--output", default="/tmp/eval_results.json")
+    parser.add_argument("--strong_model", default="Qwen/Qwen2.5-7B-Instruct")
+    parser.add_argument("--cheap_model", default="Qwen/Qwen3-1.7B")
+    parser.add_argument("--verifier_model", default=None)
+    args = parser.parse_args()
+    evaluate(
+        args.dataset,
+        configs=args.configs,
+        limit=args.limit,
+        output_path=args.output,
+        strong_model_name=args.strong_model,
+        cheap_model_name=args.cheap_model,
+        verifier_model_name=args.verifier_model,
+    )
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()