""" Sequential evaluation using base models. Loads one model at a time to avoid OOM. Evaluates on 30 examples for speed. """ import json, time import torch from transformers import AutoModelForCausalLM, AutoTokenizer from datasets import load_dataset HUB_ORG = 'narcolepticchicken' EVAL_DS = f'{HUB_ORG}/speculative-actions-eval' ACTIONS = ['tool_call','retrieval','file_read','file_write','repair','verifier','ask_clarification','final_answer','blocked'] def load_model(name, device='cpu'): print(f'Loading {name}...', flush=True) tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True) if tok.pad_token is None: tok.pad_token = tok.eos_token model = AutoModelForCausalLM.from_pretrained( name, torch_dtype=torch.float32, trust_remote_code=True, low_cpu_mem_usage=True, ) model = model.to(device) model.eval() return model, tok def predict_action(model, tokenizer, prompt, device='cpu', max_new_tokens=15): with torch.no_grad(): inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512) if device != 'cpu': inputs = {k: v.to(device) for k, v in inputs.items()} outputs = model.generate( **inputs, max_new_tokens=max_new_tokens, do_sample=False, pad_token_id=tokenizer.pad_token_id, ) text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip() return text def parse_action(text): text_lower = text.lower() for a in ACTIONS: if a in text_lower: return a return 'tool_call' def build_proposer_prompt(context, task_type): return f"""Task: {task_type} Context: {context} Choose ONE action: tool_call, retrieval, file_read, file_write, repair, verifier, ask_clarification, final_answer, blocked Action:""" def build_verifier_prompt(context, task_type, proposed): return f"""Task: {task_type} Context: {context} Proposed action: {proposed} Is this correct? Answer YES or NO. 
Answer:""" def evaluate_config(data, proposer_name, verifier_name, strong_name, config, device='cpu'): results = [] if config == 'A': # Always strong model, tok = load_model(strong_name, device) for ex in data: prompt = build_proposer_prompt(ex['context'], ex['task_type']) pred = parse_action(predict_action(model, tok, prompt, device)) results.append({'pred': pred, 'true': ex['action'], 'cost': 1.0}) del model elif config == 'B': # Cheap only model, tok = load_model(proposer_name, device) for ex in data: prompt = build_proposer_prompt(ex['context'], ex['task_type']) pred = parse_action(predict_action(model, tok, prompt, device)) results.append({'pred': pred, 'true': ex['action'], 'cost': 0.2}) del model elif config == 'C': # Cheap + strong verifier cheap, cheap_tok = load_model(proposer_name, device) for ex in data: prompt = build_proposer_prompt(ex['context'], ex['task_type']) cheap_pred = parse_action(predict_action(cheap, cheap_tok, prompt, device)) results.append({'pred': cheap_pred, 'true': ex['action'], 'cost': 0.2, 'cheap_pred': cheap_pred}) del cheap strong, strong_tok = load_model(strong_name, device) for i, ex in enumerate(data): verify_prompt = build_verifier_prompt(ex['context'], ex['task_type'], results[i]['cheap_pred']) verify_text = predict_action(strong, strong_tok, verify_prompt, device, max_new_tokens=5) accepted = 'yes' in verify_text.lower() if accepted: results[i]['cost'] = 0.2 + 0.3 else: prompt = build_proposer_prompt(ex['context'], ex['task_type']) pred = parse_action(predict_action(strong, strong_tok, prompt, device)) results[i]['pred'] = pred results[i]['cost'] = 0.2 + 0.3 + 1.0 del strong elif config == 'D': # Cheap + trained verifier (base model as proxy) cheap, cheap_tok = load_model(proposer_name, device) for ex in data: prompt = build_proposer_prompt(ex['context'], ex['task_type']) cheap_pred = parse_action(predict_action(cheap, cheap_tok, prompt, device)) results.append({'pred': cheap_pred, 'true': ex['action'], 'cost': 0.2, 'cheap_pred': cheap_pred}) del cheap verifier, verifier_tok = load_model(verifier_name, device) for i, ex in enumerate(data): verify_prompt = build_verifier_prompt(ex['context'], ex['task_type'], results[i]['cheap_pred']) verify_text = predict_action(verifier, verifier_tok, verify_prompt, device, max_new_tokens=5) accepted = 'yes' in verify_text.lower() if accepted: results[i]['cost'] = 0.2 + 0.15 else: prompt = build_proposer_prompt(ex['context'], ex['task_type']) pred = parse_action(predict_action(verifier, verifier_tok, prompt, device)) results[i]['pred'] = pred results[i]['cost'] = 0.2 + 0.15 + 0.6 del verifier elif config == 'E': # Multi-proposal reranking cheap, cheap_tok = load_model(proposer_name, device) proposals_list = [] for ex in data: proposals = [] for _ in range(3): prompt = build_proposer_prompt(ex['context'], ex['task_type']) proposals.append(parse_action(predict_action(cheap, cheap_tok, prompt, device))) proposals_list.append(proposals) results.append({'pred': proposals[0], 'true': ex['action'], 'cost': 0.2 * 3}) del cheap strong, strong_tok = load_model(strong_name, device) for i, ex in enumerate(data): scores = [] for prop in proposals_list[i]: score_prompt = f"""Task: {ex['task_type']} Context: {ex['context']} Action: {prop} Rate 1-10: Score:""" score_text = predict_action(strong, strong_tok, score_prompt, device, max_new_tokens=5) score = 5 for word in score_text.split(): try: score = int(word.strip('.,!?')) break except: pass scores.append(score) best_idx = scores.index(max(scores)) results[i]['pred'] = 
proposals_list[i][best_idx] results[i]['cost'] = 0.2 * 3 + 0.3 * 3 del strong if device == 'cuda': torch.cuda.empty_cache() return results def compute_metrics(results_list): correct = sum(1 for r in results_list if r['pred'] == r['true']) total = len(results_list) accuracy = correct / total avg_cost = sum(r['cost'] for r in results_list) / total return {'accuracy': accuracy, 'avg_cost': avg_cost, 'n': total} def main(): device = 'cuda' if torch.cuda.is_available() else 'cpu' print(f'Device: {device}', flush=True) print('Loading eval dataset...', flush=True) ds = load_dataset(EVAL_DS)['test'] data = [ds[i] for i in range(min(30, len(ds)))] print(f'Evaluating on {len(data)} examples', flush=True) proposer = 'Qwen/Qwen3-1.7B' verifier = 'Qwen/Qwen3-4B' strong = 'Qwen/Qwen2.5-7B' all_results = {} for cfg in ['A', 'B', 'C', 'D', 'E']: print(f'\n=== Config {cfg} ===', flush=True) start = time.time() results = evaluate_config(data, proposer, verifier, strong, cfg, device) elapsed = time.time() - start metrics = compute_metrics(results) all_results[cfg] = metrics print(f"Config {cfg}: Accuracy={metrics['accuracy']:.3f}, Cost={metrics['avg_cost']:.2f}, Time={elapsed:.1f}s", flush=True) print('\n=== Final Results ===', flush=True) for cfg in ['A','B','C','D','E']: r = all_results[cfg] print(f"Config {cfg}: Accuracy={r['accuracy']:.3f}, Cost={r['avg_cost']:.2f}", flush=True) with open('/tmp/eval_results_empirical.json', 'w') as f: json.dump(all_results, f, indent=2) print('\nSaved to /tmp/eval_results_empirical.json', flush=True) # Upload to Hub from huggingface_hub import HfApi api = HfApi() api.upload_file( path_or_fileobj='/tmp/eval_results_empirical.json', path_in_repo='eval_results_empirical.json', repo_id=f'{HUB_ORG}/speculative-tool-actions', repo_type='model' ) print('Uploaded results to Hub', flush=True) if __name__ == '__main__': main()