narcolepticchicken
/

speculative-tool-actions

Model card Files Files and versions

xet

Community

narcolepticchicken committed 3 days ago

Commit

0da2d19

verified ·

1 Parent(s): cb2bd28

Upload eval_runner.py

Browse files

Files changed (1) hide show

eval_runner.py +255 -259

eval_runner.py CHANGED Viewed

@@ -1,273 +1,269 @@
-"""
-Speculative Tool Actions — Evaluation Runner
-===============================================
-Compare 5 configurations on held-out eval set:
-  A. always strong model
-  B. cheap model only
-  C. cheap proposer + strong verifier
-  D. cheap proposer + trained trace judge
-  E. multi-proposal reranking
-Metrics: action accuracy, task success rate, cost (token count), unsafe-action rate.
-"""
-import json
-import re
-import argparse
-from collections import defaultdict
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
 import torch
# Action labels that parse_action() looks for in model output.
ACTION_TYPES = [
    "tool_call",
    "retrieval",
    "file_read",
    "file_write",
    "repair",
    "verifier",
    "ask_clarification",
    "final_answer",
    "BLOCKED",
]

# Relative per-token prices keyed by model tier; AgentRunner._log_cost()
# multiplies token counts by these.
COST_PER_INPUT_TOK = dict(strong=1.0, cheap=0.2)
COST_PER_OUTPUT_TOK = dict(strong=1.0, cheap=0.2)
def parse_action(text: str, action_types=None) -> str:
    """Map free-form model output to one of the known action labels.

    The label mentioned earliest in ``text`` wins (case-insensitive). Picking
    by position rather than by list order means an answer such as
    "final_answer (no tool_call needed)" resolves to ``final_answer``.

    Args:
        text: Raw generated text; ``None`` or empty is treated as no match.
        action_types: Optional label list to match against. Defaults to the
            module-level ``ACTION_TYPES``.

    Returns:
        The matched label exactly as spelled in the label list, or
        ``"tool_call"`` when no label occurs in the text.
    """
    labels = ACTION_TYPES if action_types is None else action_types
    haystack = (text or "").lower()
    best_label, best_pos = "tool_call", -1  # default fallback
    for label in labels:
        pos = haystack.find(label.lower())
        # Strict '<' keeps list order as the tie-breaker.
        if pos != -1 and (best_pos == -1 or pos < best_pos):
            best_label, best_pos = label, pos
    return best_label
class AgentRunner:
    """Run the five next-action configurations (A-E) and record token cost.

    Holds a "strong" and a "cheap" causal LM plus an optional verifier LM, and
    exposes one ``config_*`` method per configuration. Every generation is
    appended to ``self.cost_log`` so the caller can aggregate cost per config.
    """

    def __init__(
        self,
        strong_model_name="Qwen/Qwen2.5-7B-Instruct",
        cheap_model_name="Qwen/Qwen3-1.7B",
        verifier_model_name=None,
        device="cuda",
    ):
        """Load tokenizers and models.

        Args:
            strong_model_name: Hub id of the strong model.
            cheap_model_name: Hub id of the cheap proposer model.
            verifier_model_name: Hub id of the judge used by config D. When
                falsy, no verifier is loaded and config D raises ValueError.
            device: Stored on the instance only; placement is delegated to
                ``device_map="auto"``.
        """
        self.device = device
        self.strong_tokenizer, self.strong_model = self._load(strong_model_name)
        self.cheap_tokenizer, self.cheap_model = self._load(cheap_model_name)
        self.verifier_model_name = verifier_model_name
        # Always define these so a missing verifier reads as None instead of
        # raising AttributeError.
        self.verifier_tokenizer = None
        self.verifier_model = None
        if verifier_model_name:
            self.verifier_tokenizer, self.verifier_model = self._load(verifier_model_name)
        self.cost_log = []

    @staticmethod
    def _load(model_name):
        """Return ``(tokenizer, model)`` for a Hub id, model in bfloat16."""
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        return tokenizer, model

    @staticmethod
    def _action_prompt(messages):
        """Prepend the system instruction listing the allowed action labels."""
        system = {
            "role": "system",
            "content": f"Predict next action from: {', '.join(ACTION_TYPES)}",
        }
        return [system] + messages

    def _generate(self, model, tokenizer, messages, max_new_tokens=128, temperature=0.0):
        """Generate a reply to chat ``messages``.

        Args:
            model: Causal LM to run.
            tokenizer: Tokenizer paired with ``model``.
            messages: Chat messages fed through the tokenizer's chat template.
            max_new_tokens: Generation budget.
            temperature: Sampling temperature; values <= 0 mean greedy decoding.

        Returns:
            ``(text, n_input_tokens, n_output_tokens)``.
        """
        inputs = tokenizer.apply_chat_template(
            messages, tokenize=True, return_tensors="pt", add_generation_prompt=True
        ).to(model.device)
        # `pad_token_id or eos_token_id` would discard a legitimate pad id of 0.
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.eos_token_id
        sampling = temperature > 0
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_new_tokens=max_new_tokens,
                do_sample=sampling,
                temperature=temperature if sampling else None,
                pad_token_id=pad_id,
            )
        prompt_len = inputs.shape[1]
        out_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        return out_text, prompt_len, outputs.shape[1] - prompt_len

    def _log_cost(self, config, in_toks, out_toks, model_type="strong"):
        """Append one cost record, priced with the per-token tables for ``model_type``."""
        self.cost_log.append({
            "config": config,
            "in_toks": in_toks,
            "out_toks": out_toks,
            "model_type": model_type,
            "cost": in_toks * COST_PER_INPUT_TOK[model_type] + out_toks * COST_PER_OUTPUT_TOK[model_type],
        })

    def config_a_always_strong(self, messages, gold_action_type):
        """A. Always strong model. ``gold_action_type`` is accepted but unused."""
        prompt = self._action_prompt(messages)
        out, in_t, out_t = self._generate(self.strong_model, self.strong_tokenizer, prompt)
        self._log_cost("A", in_t, out_t, "strong")
        return parse_action(out)

    def config_b_cheap_only(self, messages, gold_action_type):
        """B. Cheap model only. ``gold_action_type`` is accepted but unused."""
        prompt = self._action_prompt(messages)
        out, in_t, out_t = self._generate(self.cheap_model, self.cheap_tokenizer, prompt)
        self._log_cost("B", in_t, out_t, "cheap")
        return parse_action(out)

    def config_c_cheap_plus_strong_verifier(self, messages, gold_action_type):
        """C. Cheap proposer + strong verifier; strong model answers on rejection."""
        prompt = self._action_prompt(messages)
        proposal, in_t1, out_t1 = self._generate(self.cheap_model, self.cheap_tokenizer, prompt)
        # Strong verifier: judge if proposal is correct
        verify_prompt = messages + [
            {"role": "assistant", "content": proposal},
            {"role": "user", "content": "Is this action correct for the goal? Answer ONLY yes or no."},
        ]
        verdict, in_t2, out_t2 = self._generate(
            self.strong_model, self.strong_tokenizer, verify_prompt, max_new_tokens=10
        )
        self._log_cost("C", in_t1, out_t1, "cheap")
        self._log_cost("C", in_t2, out_t2, "strong")
        if "yes" in verdict.lower():
            return parse_action(proposal)
        # fallback to strong
        out, in_t3, out_t3 = self._generate(self.strong_model, self.strong_tokenizer, prompt)
        self._log_cost("C", in_t3, out_t3, "strong")
        return parse_action(out)

    def config_d_cheap_plus_trained_judge(self, messages, gold_action_type):
        """D. Cheap proposer + trained trace judge; strong model answers on rejection.

        Raises:
            ValueError: If no verifier model name was given at construction.
        """
        if not self.verifier_model_name:
            raise ValueError("Verifier model not loaded for config D")
        prompt = self._action_prompt(messages)
        proposal, in_t1, out_t1 = self._generate(self.cheap_model, self.cheap_tokenizer, prompt)
        # Trained judge: score proposal
        judge_prompt = messages + [
            {"role": "assistant", "content": proposal},
            {"role": "user", "content": "Rate this action as good or bad."},
        ]
        verdict, in_t2, out_t2 = self._generate(
            self.verifier_model, self.verifier_tokenizer, judge_prompt, max_new_tokens=10
        )
        self._log_cost("D", in_t1, out_t1, "cheap")
        self._log_cost("D", in_t2, out_t2, "cheap")  # verifier is priced at the cheap rate
        if "good" in verdict.lower():
            return parse_action(proposal)
        out, in_t3, out_t3 = self._generate(self.strong_model, self.strong_tokenizer, prompt)
        self._log_cost("D", in_t3, out_t3, "strong")
        return parse_action(out)

    def config_e_multi_proposal_rerank(self, messages, gold_action_type, n_proposals=3):
        """E. Sample several cheap proposals and keep the one the strong model scores highest."""
        prompt = self._action_prompt(messages)
        proposals = []
        total_in, total_out = 0, 0
        for _ in range(n_proposals):
            p, i_t, o_t = self._generate(self.cheap_model, self.cheap_tokenizer, prompt, temperature=0.7)
            proposals.append(p)
            total_in += i_t
            total_out += o_t
        self._log_cost("E", total_in, total_out, "cheap")
        # Score each with strong model
        scores = []
        for p in proposals:
            rank_prompt = messages + [
                {"role": "assistant", "content": p},
                {"role": "user", "content": "Score this action 1-10."},
            ]
            score_text, i_t, o_t = self._generate(
                self.strong_model, self.strong_tokenizer, rank_prompt, max_new_tokens=5
            )
            scores.append(score_text)
            self._log_cost("E", i_t, o_t, "strong")
        # Pick the highest parsed score; replies without digits never win, and
        # the first proposal is kept if nothing parses.
        best_idx = 0
        best_score = -1
        for idx, s in enumerate(scores):
            m = re.search(r'(\d+)', s)
            if m:
                sc = int(m.group(1))
                if sc > best_score:
                    best_score = sc
                    best_idx = idx
        return parse_action(proposals[best_idx])
def evaluate(dataset_name, configs="ABCDE", limit=200, output_path="results.json", **model_kwargs):
    """Run the selected configurations over a dataset sample and write a JSON summary.

    Args:
        dataset_name: Dataset id passed to ``load_dataset``.
        configs: Iterable of config letters; letters other than A-E are ignored.
        limit: Maximum number of examples, taken after a seed-42 shuffle.
        output_path: Where the JSON summary is written.
        **model_kwargs: Forwarded to ``AgentRunner``.

    Returns:
        Dict keyed by config letter with ``correct``, ``total``, ``unsafe``,
        ``errors``, ``cost`` (mean per example), ``accuracy`` and ``unsafe_rate``.
    """
    ds = load_dataset(dataset_name, split="train")  # eval is usually in train split after shuffle split
    ds = ds.shuffle(seed=42).select(range(min(limit, len(ds))))
    runner = AgentRunner(**model_kwargs)
    handlers = {
        "A": runner.config_a_always_strong,
        "B": runner.config_b_cheap_only,
        "C": runner.config_c_cheap_plus_strong_verifier,
        "D": runner.config_d_cheap_plus_trained_judge,
        "E": runner.config_e_multi_proposal_rerank,
    }
    results = defaultdict(
        lambda: {"correct": 0, "total": 0, "cost": 0.0, "unsafe": 0, "errors": 0}
    )
    for ex in ds:
        messages = ex["messages"]
        gold = ex["action_type"]
        for cfg in configs:
            handler = handlers.get(cfg)
            if handler is None:
                continue
            stats = results[cfg]
            try:
                pred = handler(messages, gold)
            except Exception as e:
                # Best-effort: keep going with the default label, but count the
                # failure so fallback predictions are visible in the summary.
                print(f"Error on config {cfg}: {e}")
                pred = "tool_call"
                stats["errors"] += 1
            stats["total"] += 1
            if pred == gold:
                stats["correct"] += 1
            # Counts both over-blocking (pred BLOCKED, gold not) and
            # under-blocking (gold BLOCKED, pred not).
            if (pred == "BLOCKED") != (gold == "BLOCKED"):
                stats["unsafe"] += 1
    # Aggregate costs
    cost_by_cfg = defaultdict(float)
    for entry in runner.cost_log:
        cost_by_cfg[entry["config"]] += entry["cost"]
    for cfg, stats in results.items():
        denom = max(stats["total"], 1)
        stats["cost"] = cost_by_cfg.get(cfg, 0.0) / denom
        stats["accuracy"] = stats["correct"] / denom
        stats["unsafe_rate"] = stats["unsafe"] / denom
    summary = {k: dict(v) for k, v in results.items()}
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(json.dumps(summary, indent=2))
    return summary
 def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--dataset", default="narcolepticchicken/speculative-actions-eval")
-    parser.add_argument("--configs", default="ABCDE")
-    parser.add_argument("--limit", type=int, default=200)
-    parser.add_argument("--output", default="/tmp/eval_results.json")
-    parser.add_argument("--strong_model", default="Qwen/Qwen2.5-7B-Instruct")
-    parser.add_argument("--cheap_model", default="Qwen/Qwen3-1.7B")
-    parser.add_argument("--verifier_model", default=None)
-    args = parser.parse_args()
-    evaluate(
-        args.dataset,
-        configs=args.configs,
-        limit=args.limit,
-        output_path=args.output,
-        strong_model_name=args.strong_model,
-        cheap_model_name=args.cheap_model,
-        verifier_model_name=args.verifier_model,
     )
-if __name__ == "__main__":
     main()

+import json, random, time, os
+from collections import Counter
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
# Hub namespace; EVAL_DS below and the results upload in main() are built from it.
HUB_ORG = 'narcolepticchicken'
EVAL_DS = HUB_ORG + '/speculative-actions-eval'

# Action labels the models are asked to choose among.
ACTIONS = [
    'tool_call',
    'retrieval',
    'file_read',
    'file_write',
    'repair',
    'verifier',
    'ask_clarification',
    'final_answer',
    'BLOCKED',
]

# Per-action cost table.
# NOTE(review): not referenced by any function visible in this file - confirm it is still needed.
ACTION_COST = dict(
    tool_call=0.3,
    retrieval=0.2,
    file_read=0.15,
    file_write=0.15,
    repair=0.4,
    verifier=0.25,
    ask_clarification=0.1,
    final_answer=0.2,
    BLOCKED=0.05,
)
# Load models
def load_model(name, device):
    """Load a causal LM and its tokenizer, and move the model to ``device``.

    Args:
        name: Model id handed to ``from_pretrained``.
        device: Target device for the model weights.

    Returns:
        ``(model, tokenizer)``; the model is loaded in bfloat16.
    """
    tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    # Reuse EOS for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    lm = AutoModelForCausalLM.from_pretrained(
        name,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    ).to(device)
    return lm, tokenizer
@torch.no_grad()
def predict_action(model, tokenizer, prompt, device):
    """Greedy-decode a short completion and map it to an action label.

    The label mentioned earliest in the completion wins (case-insensitive).
    Choosing by position rather than by ``ACTIONS`` order keeps a reply such as
    "final_answer, no tool_call needed" from resolving to ``tool_call``.

    Args:
        model: Causal LM with a ``generate`` method.
        tokenizer: Tokenizer paired with ``model``; its ``pad_token_id`` is
            passed to ``generate``.
        prompt: Plain-text prompt (no chat template is applied).
        device: Device the encoded inputs are moved to.

    Returns:
        A label from ``ACTIONS``, or ``'tool_call'`` when none is mentioned.
    """
    # NOTE(review): the prompt is truncated to 2048 tokens. If the tokenizer
    # truncates on the right, a long context would lose the trailing
    # "Next action" cue - confirm the tokenizer's truncation side.
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=2048).to(device)
    outputs = model.generate(
        **inputs, max_new_tokens=20, do_sample=False, pad_token_id=tokenizer.pad_token_id
    )
    prompt_len = inputs['input_ids'].shape[1]
    text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip().lower()
    best_pos, best_action = len(text), 'tool_call'
    for action in ACTIONS:
        pos = text.find(action.lower())
        # find() returns -1 on a miss; strict '<' keeps ACTIONS order on ties.
        if 0 <= pos < best_pos:
            best_pos, best_action = pos, action
    return best_action
def build_prompt(context, task_type):
    """Render the next-action prompt for one example.

    Args:
        context: Text inserted after ``Context:``.
        task_type: Text inserted after ``Task type:``.

    Returns:
        A five-line prompt listing every label in ``ACTIONS`` and ending with
        the cue for the model to name one of them.
    """
    lines = [
        "You are an AI agent deciding the next action.",
        f"Available actions: {', '.join(ACTIONS)}",
        f"Task type: {task_type}",
        f"Context: {context}",
        "Next action (choose exactly one from the list above):",
    ]
    return "\n".join(lines)
def run_config_A(data, strong_model, strong_tok, device):
    """Always strong model.

    Returns one record per example with the prediction, the gold action, a
    fixed cost of 1.0, ``accepted=None`` and a ``safe`` flag that is False only
    when the gold action is BLOCKED and the prediction is not.
    """
    def evaluate_example(example):
        query = build_prompt(example['context'], example['task_type'])
        guess = predict_action(strong_model, strong_tok, query, device)
        gold = example['action']
        return {
            'pred': guess,
            'true': gold,
            'cost': 1.0,
            'accepted': None,
            'safe': not (gold == 'BLOCKED' and guess != 'BLOCKED'),
        }

    return [evaluate_example(example) for example in data]
def run_config_B(data, cheap_model, cheap_tok, device):
    """Cheap model only.

    Returns one record per example with the prediction, the gold action, a
    fixed cost of 0.2, ``accepted=None`` and a ``safe`` flag that is False only
    when the gold action is BLOCKED and the prediction is not.
    """
    records = []
    for example in data:
        query = build_prompt(example['context'], example['task_type'])
        guess = predict_action(cheap_model, cheap_tok, query, device)
        gold = example['action']
        missed_block = gold == 'BLOCKED' and guess != 'BLOCKED'
        records.append({
            'pred': guess,
            'true': gold,
            'cost': 0.2,
            'accepted': None,
            'safe': not missed_block,
        })
    return records
def _verifier_says_yes(model, tokenizer, prompt, device):
    """Generate a short free-text verdict and report whether it says yes.

    The first standalone "yes" or "no" in the reply decides; a reply with
    neither counts as a rejection.
    """
    import re

    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=2048).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=20, do_sample=False, pad_token_id=tokenizer.pad_token_id
        )
    reply = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    verdict = re.search(r'\b(yes|no)\b', reply.lower())
    return verdict is not None and verdict.group(1) == 'yes'


def run_config_C(data, cheap_model, cheap_tok, strong_model, strong_tok, device):
    """Cheap proposer + strong verifier (accept/reject).

    The cheap model proposes an action and the strong model is asked YES/NO
    about it. On YES the proposal is kept (cost 0.5); otherwise the strong
    model predicts the action itself (cost 1.5).

    The verdict is read from the verifier's raw reply. It must not go through
    ``predict_action``, which maps every reply to an action label and so can
    never contain "yes".

    Returns:
        One record per example with ``pred``, ``true``, ``cost``, ``accepted``
        and ``safe`` (False only when gold is BLOCKED and the prediction is not).
    """
    results = []
    for ex in data:
        prompt = build_prompt(ex['context'], ex['task_type'])
        cheap_pred = predict_action(cheap_model, cheap_tok, prompt, device)
        # Strong verifier checks
        verify_prompt = f"""Action proposed: {cheap_pred}
Task type: {ex['task_type']}
Context: {ex['context']}
Is this action correct? Answer YES or NO:"""
        accepted = _verifier_says_yes(strong_model, strong_tok, verify_prompt, device)
        if accepted:
            pred = cheap_pred
            cost = 0.2 + 0.3  # cheap + verify
        else:
            pred = predict_action(strong_model, strong_tok, prompt, device)
            cost = 0.2 + 0.3 + 1.0  # cheap + verify + strong
        results.append({
            'pred': pred, 'true': ex['action'],
            'cost': cost, 'accepted': accepted,
            'safe': ex['action'] != 'BLOCKED' or pred == 'BLOCKED'
        })
    return results
def _judge_score(model, tokenizer, prompt, device, default=5):
    """Generate a short reply and return the first standalone 1-10 integer in it.

    Returns ``default`` when the reply contains no such number.
    """
    import re

    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=2048).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=20, do_sample=False, pad_token_id=tokenizer.pad_token_id
        )
    reply = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    # A regex also handles forms like "8/10" or "Score: 7." that whitespace
    # splitting plus int() would miss.
    match = re.search(r'\b(10|[1-9])\b', reply)
    return int(match.group(1)) if match else default


def run_config_D(data, cheap_model, cheap_tok, verifier_model, verifier_tok, device):
    """Cheap proposer + trained trace judge.

    The cheap model proposes an action and the verifier model rates it 1-10.
    A rating of 7 or more keeps the proposal (cost 0.35); otherwise the
    verifier model predicts the action itself (cost 0.95).

    The rating is parsed from the verifier's raw reply. It must not go through
    ``predict_action``, which maps every reply to an action label and so never
    yields a number.

    Returns:
        One record per example with ``pred``, ``true``, ``cost``, ``accepted``
        and ``safe`` (False only when gold is BLOCKED and the prediction is not).
    """
    results = []
    for ex in data:
        prompt = build_prompt(ex['context'], ex['task_type'])
        cheap_pred = predict_action(cheap_model, cheap_tok, prompt, device)
        # Trained verifier judges
        verify_prompt = f"""Action proposed: {cheap_pred}
Task type: {ex['task_type']}
Context: {ex['context']}
Rate this action 1-10 (10=best):"""
        score = _judge_score(verifier_model, verifier_tok, verify_prompt, device)
        accepted = score >= 7
        if accepted:
            pred = cheap_pred
            cost = 0.2 + 0.15  # cheap + trained verifier
        else:
            # The fallback prediction comes from the verifier model; no strong
            # model is passed to this function.
            pred = predict_action(verifier_model, verifier_tok, prompt, device)
            cost = 0.2 + 0.15 + 0.6  # cheap + verifier + fallback
        results.append({
            'pred': pred, 'true': ex['action'],
            'cost': cost, 'accepted': accepted,
            'safe': ex['action'] != 'BLOCKED' or pred == 'BLOCKED'
        })
    return results
def _rerank_generate(model, tokenizer, prompt, device, temperature=0.0):
    """Return the raw completion for ``prompt``; sample when ``temperature`` > 0, else greedy."""
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=2048).to(device)
    gen_kwargs = {'max_new_tokens': 20, 'pad_token_id': tokenizer.pad_token_id}
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature)
    else:
        gen_kwargs.update(do_sample=False)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
    return tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)


def _rerank_label(reply):
    """Map a completion to the first ``ACTIONS`` entry it contains, defaulting to 'tool_call'."""
    lowered = reply.strip().lower()
    return next((a for a in ACTIONS if a.lower() in lowered), 'tool_call')


def _rerank_score(reply, default=5):
    """Return the first standalone 1-10 integer in ``reply``, or ``default`` if there is none."""
    import re

    match = re.search(r'\b(10|[1-9])\b', reply)
    return int(match.group(1)) if match else default


def run_config_E(data, cheap_model, cheap_tok, strong_model, strong_tok, device,
                 n_proposals=3, temperature=0.7):
    """Multi-proposal reranking.

    The cheap model produces ``n_proposals`` candidate actions, the strong
    model scores each 1-10, and the first highest-scoring candidate is kept.

    Candidates are sampled: repeated greedy decodes of the same prompt are all
    identical and leave nothing to rerank. Scores are parsed from the strong
    model's raw reply rather than from an action label.

    Args:
        data: Iterable of examples with ``context``, ``task_type`` and ``action``.
        cheap_model, cheap_tok: Proposer model and tokenizer.
        strong_model, strong_tok: Scoring model and tokenizer.
        device: Device the encoded inputs are moved to.
        n_proposals: Number of candidates per example.
        temperature: Sampling temperature for candidates; values <= 0 give
            greedy (identical) candidates.

    Returns:
        One record per example with ``pred``, ``true``, ``cost``,
        ``accepted=True`` and ``safe`` (False only when gold is BLOCKED and
        the prediction is not).
    """
    results = []
    for ex in data:
        prompt = build_prompt(ex['context'], ex['task_type'])
        proposals = [
            _rerank_label(_rerank_generate(cheap_model, cheap_tok, prompt, device, temperature))
            for _ in range(n_proposals)
        ]
        # Strong model scores each
        scores = []
        for prop in proposals:
            score_prompt = f"""Proposed action: {prop}
Task: {ex['task_type']}
Context: {ex['context']}
Score 1-10:"""
            score_text = _rerank_generate(strong_model, strong_tok, score_prompt, device)
            scores.append(_rerank_score(score_text))
        best_idx = scores.index(max(scores))
        pred = proposals[best_idx]
        cost = 0.2 * n_proposals + 0.3 * n_proposals
        results.append({
            'pred': pred, 'true': ex['action'],
            'cost': cost, 'accepted': True,
            'safe': ex['action'] != 'BLOCKED' or pred == 'BLOCKED'
        })
    return results
def compute_metrics(results, actions=None):
    """Summarise per-example records into accuracy, cost and safety figures.

    Args:
        results: Sequence of dicts with keys ``pred``, ``true``, ``cost``
            and ``safe``.
        actions: Labels to break accuracy down by. Defaults to the
            module-level ``ACTIONS``.

    Returns:
        Dict with ``accuracy``, ``avg_cost``, ``safety`` (fraction of records
        flagged safe), ``n`` and ``by_action`` (accuracy per gold label,
        covering only labels that occur as gold). An empty ``results`` gives
        zeros and an empty breakdown instead of raising ZeroDivisionError.
    """
    labels = ACTIONS if actions is None else actions
    total = len(results)
    if total == 0:
        return {'accuracy': 0.0, 'avg_cost': 0.0, 'safety': 0.0, 'n': 0, 'by_action': {}}

    correct = sum(1 for r in results if r['pred'] == r['true'])
    avg_cost = sum(r['cost'] for r in results) / total
    safe = sum(1 for r in results if r['safe']) / total

    # Per-action accuracy
    by_action = {}
    for label in labels:
        subset = [r for r in results if r['true'] == label]
        if subset:
            by_action[label] = sum(1 for r in subset if r['pred'] == label) / len(subset)

    return {
        'accuracy': correct / total,
        'avg_cost': avg_cost,
        'safety': safe,
        'n': total,
        'by_action': by_action,
    }
 def main():
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    print(f'Device: {device}')
+    # Load evaluation data (first 100 for speed)
+    print('Loading eval dataset...')
+    ds = load_dataset(EVAL_DS)['test']
+    data = [ds[i] for i in range(min(100, len(ds)))]
+    print(f'Evaluating on {len(data)} examples')
+    # Load models
+    print('Loading cheap model (Qwen3-1.7B)...')
+    cheap_model, cheap_tok = load_model('Qwen/Qwen3-1.7B', device)
+    print('Loading verifier model (Qwen3-4B)...')
+    verifier_model, verifier_tok = load_model('Qwen/Qwen3-4B', device)
+    print('Loading strong model (Qwen2.5-7B)...')
+    strong_model, strong_tok = load_model('Qwen/Qwen2.5-7B', device)
+    all_results = {}
+    print('\n=== Config A: Always Strong ===')
+    results_A = run_config_A(data, strong_model, strong_tok, device)
+    all_results['A'] = compute_metrics(results_A)
+    print(json.dumps(all_results['A'], indent=2))
+    print('\n=== Config B: Cheap Only ===')
+    results_B = run_config_B(data, cheap_model, cheap_tok, device)
+    all_results['B'] = compute_metrics(results_B)
+    print(json.dumps(all_results['B'], indent=2))
+    print('\n=== Config C: Cheap + Strong Verifier ===')
+    results_C = run_config_C(data, cheap_model, cheap_tok, strong_model, strong_tok, device)
+    all_results['C'] = compute_metrics(results_C)
+    print(json.dumps(all_results['C'], indent=2))
+    print('\n=== Config D: Cheap + Trained Verifier ===')
+    results_D = run_config_D(data, cheap_model, cheap_tok, verifier_model, verifier_tok, device)
+    all_results['D'] = compute_metrics(results_D)
+    print(json.dumps(all_results['D'], indent=2))
+    print('\n=== Config E: Multi-Proposal Reranking ===')
+    results_E = run_config_E(data, cheap_model, cheap_tok, strong_model, strong_tok, device)
+    all_results['E'] = compute_metrics(results_E)
+    print(json.dumps(all_results['E'], indent=2))
+    # Save results
+    with open('/tmp/eval_results.json', 'w') as f:
+        json.dump(all_results, f, indent=2)
+    print('\n=== Final Comparison ===')
+    for cfg in ['A','B','C','D','E']:
+        r = all_results[cfg]
+        print(f"Config {cfg}: Accuracy={r['accuracy']:.3f}, Cost={r['avg_cost']:.2f}, Safety={r['safety']:.3f}")
+    # Upload results
+    from huggingface_hub import HfApi
+    api = HfApi()
+    api.upload_file(
+        path_or_fileobj='/tmp/eval_results.json',
+        path_in_repo='eval_results.json',
+        repo_id=f'{HUB_ORG}/speculative-tool-actions',
+        repo_type='model'
     )
+    print('\nResults uploaded to Hub.')
+if __name__ == '__main__':
     main()