| """ |
| Sequential evaluation using base models. |
| Loads one model at a time to avoid OOM. |
| Evaluates on 30 examples for speed. |
| """ |
| import json, time |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| from datasets import load_dataset |
|
|
| HUB_ORG = 'narcolepticchicken' |
| EVAL_DS = f'{HUB_ORG}/speculative-actions-eval' |
| ACTIONS = ['tool_call','retrieval','file_read','file_write','repair','verifier','ask_clarification','final_answer','blocked'] |
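# ACTIONS is the closed action vocabulary that parse_action() matches model output against.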


def load_model(name, device='cpu'):
    print(f'Loading {name}...', flush=True)
    tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        name,
        torch_dtype=torch.float32,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    model = model.to(device)
    model.eval()
    return model, tok


def predict_action(model, tokenizer, prompt, device='cpu', max_new_tokens=15, do_sample=False):
    with torch.no_grad():
        inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512)
        if device != 'cpu':
            inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            pad_token_id=tokenizer.pad_token_id,
        )
        text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
        return text


def parse_action(text):
    text_lower = text.lower()
    for a in ACTIONS:
        if a in text_lower:
            return a
    # fall back to the most common action when no known label appears in the output
    return 'tool_call'


def build_proposer_prompt(context, task_type):
    return f"""Task: {task_type}
Context: {context}
Choose ONE action: tool_call, retrieval, file_read, file_write, repair, verifier, ask_clarification, final_answer, blocked

Action:"""


def build_verifier_prompt(context, task_type, proposed):
    return f"""Task: {task_type}
Context: {context}
Proposed action: {proposed}
Is this correct? Answer YES or NO.

Answer:"""


def evaluate_config(data, proposer_name, verifier_name, strong_name, config, device='cpu'):
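    """Run one routing configuration (A-E) over the eval examples.

    Returns one result dict per example with the predicted action, the gold action,
    and the relative cost charged for it (cheap proposer call = 0.2, mid verifier
    check = 0.15, mid verifier re-proposal = 0.6, strong check/score = 0.3,
    strong proposal = 1.0).
    """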
    results = []

    if config == 'A':
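        # Config A: the strong model proposes every action directly.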
        model, tok = load_model(strong_name, device)
        for ex in data:
            prompt = build_proposer_prompt(ex['context'], ex['task_type'])
            pred = parse_action(predict_action(model, tok, prompt, device))
            results.append({'pred': pred, 'true': ex['action'], 'cost': 1.0})
        del model

    elif config == 'B':
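        # Config B: the cheap proposer answers alone.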
        model, tok = load_model(proposer_name, device)
        for ex in data:
            prompt = build_proposer_prompt(ex['context'], ex['task_type'])
            pred = parse_action(predict_action(model, tok, prompt, device))
            results.append({'pred': pred, 'true': ex['action'], 'cost': 0.2})
        del model

    elif config == 'C':
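        # Config C: cheap proposal, then the strong model verifies and re-proposes on rejection.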
        cheap, cheap_tok = load_model(proposer_name, device)
        for ex in data:
            prompt = build_proposer_prompt(ex['context'], ex['task_type'])
            cheap_pred = parse_action(predict_action(cheap, cheap_tok, prompt, device))
            results.append({'pred': cheap_pred, 'true': ex['action'], 'cost': 0.2, 'cheap_pred': cheap_pred})
        del cheap
        if device == 'cuda':
            torch.cuda.empty_cache()  # release the proposer before loading the strong model

        strong, strong_tok = load_model(strong_name, device)
        for i, ex in enumerate(data):
            verify_prompt = build_verifier_prompt(ex['context'], ex['task_type'], results[i]['cheap_pred'])
            verify_text = predict_action(strong, strong_tok, verify_prompt, device, max_new_tokens=5)
            accepted = 'yes' in verify_text.lower()
            if accepted:
                results[i]['cost'] = 0.2 + 0.3
            else:
                prompt = build_proposer_prompt(ex['context'], ex['task_type'])
                pred = parse_action(predict_action(strong, strong_tok, prompt, device))
                results[i]['pred'] = pred
                results[i]['cost'] = 0.2 + 0.3 + 1.0
        del strong

    elif config == 'D':
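        # Config D: cheap proposal, then the mid-size verifier verifies and re-proposes on rejection.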
        cheap, cheap_tok = load_model(proposer_name, device)
        for ex in data:
            prompt = build_proposer_prompt(ex['context'], ex['task_type'])
            cheap_pred = parse_action(predict_action(cheap, cheap_tok, prompt, device))
            results.append({'pred': cheap_pred, 'true': ex['action'], 'cost': 0.2, 'cheap_pred': cheap_pred})
        del cheap
        if device == 'cuda':
            torch.cuda.empty_cache()  # release the proposer before loading the verifier

        verifier, verifier_tok = load_model(verifier_name, device)
        for i, ex in enumerate(data):
            verify_prompt = build_verifier_prompt(ex['context'], ex['task_type'], results[i]['cheap_pred'])
            verify_text = predict_action(verifier, verifier_tok, verify_prompt, device, max_new_tokens=5)
            accepted = 'yes' in verify_text.lower()
            if accepted:
                results[i]['cost'] = 0.2 + 0.15
            else:
                prompt = build_proposer_prompt(ex['context'], ex['task_type'])
                pred = parse_action(predict_action(verifier, verifier_tok, prompt, device))
                results[i]['pred'] = pred
                results[i]['cost'] = 0.2 + 0.15 + 0.6
        del verifier

    elif config == 'E':
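        # Config E: three sampled cheap proposals, scored by the strong model (best-of-3).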
        cheap, cheap_tok = load_model(proposer_name, device)
        proposals_list = []
        for ex in data:
            proposals = []
            # sample rather than greedy-decode so the three proposals can differ
            for _ in range(3):
                prompt = build_proposer_prompt(ex['context'], ex['task_type'])
                proposals.append(parse_action(predict_action(cheap, cheap_tok, prompt, device, do_sample=True)))
            proposals_list.append(proposals)
            results.append({'pred': proposals[0], 'true': ex['action'], 'cost': 0.2 * 3})
        del cheap
        if device == 'cuda':
            torch.cuda.empty_cache()  # release the proposer before loading the strong model

        strong, strong_tok = load_model(strong_name, device)
        for i, ex in enumerate(data):
            scores = []
            for prop in proposals_list[i]:
                score_prompt = f"""Task: {ex['task_type']}
Context: {ex['context']}
Action: {prop}
Rate 1-10:

Score:"""
                score_text = predict_action(strong, strong_tok, score_prompt, device, max_new_tokens=5)
                score = 5  # default when no numeric score can be parsed from the reply
                for word in score_text.split():
                    try:
                        score = int(word.strip('.,!?'))
                        break
                    except ValueError:
                        pass
                scores.append(score)
            best_idx = scores.index(max(scores))
            results[i]['pred'] = proposals_list[i][best_idx]
            results[i]['cost'] = 0.2 * 3 + 0.3 * 3
        del strong

    if device == 'cuda':
        torch.cuda.empty_cache()

    return results


def compute_metrics(results_list):
    correct = sum(1 for r in results_list if r['pred'] == r['true'])
    total = len(results_list)
    accuracy = correct / total
    avg_cost = sum(r['cost'] for r in results_list) / total
    return {'accuracy': accuracy, 'avg_cost': avg_cost, 'n': total}


def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Device: {device}', flush=True)

    print('Loading eval dataset...', flush=True)
    ds = load_dataset(EVAL_DS)['test']
    data = [ds[i] for i in range(min(30, len(ds)))]
    print(f'Evaluating on {len(data)} examples', flush=True)

    # cheap proposer, mid-size verifier, and strong model used by the configs
    proposer = 'Qwen/Qwen3-1.7B'
    verifier = 'Qwen/Qwen3-4B'
    strong = 'Qwen/Qwen2.5-7B'

    all_results = {}

    for cfg in ['A', 'B', 'C', 'D', 'E']:
        print(f'\n=== Config {cfg} ===', flush=True)
        start = time.time()
        results = evaluate_config(data, proposer, verifier, strong, cfg, device)
        elapsed = time.time() - start
        metrics = compute_metrics(results)
        all_results[cfg] = metrics
        print(f"Config {cfg}: Accuracy={metrics['accuracy']:.3f}, Cost={metrics['avg_cost']:.2f}, Time={elapsed:.1f}s", flush=True)

    print('\n=== Final Results ===', flush=True)
    for cfg in ['A', 'B', 'C', 'D', 'E']:
        r = all_results[cfg]
        print(f"Config {cfg}: Accuracy={r['accuracy']:.3f}, Cost={r['avg_cost']:.2f}", flush=True)

    with open('/tmp/eval_results_empirical.json', 'w') as f:
        json.dump(all_results, f, indent=2)

    print('\nSaved to /tmp/eval_results_empirical.json', flush=True)

    from huggingface_hub import HfApi
    api = HfApi()
    api.upload_file(
        path_or_fileobj='/tmp/eval_results_empirical.json',
        path_in_repo='eval_results_empirical.json',
        repo_id=f'{HUB_ORG}/speculative-tool-actions',
        repo_type='model',
    )
    print('Uploaded results to Hub', flush=True)


if __name__ == '__main__':
    main()