# speculative-tool-actions / eval_sequential.py
"""
Sequential evaluation using base models.
Loads one model at a time to avoid OOM.
Evaluates on 30 examples for speed.
"""
import json, time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
HUB_ORG = 'narcolepticchicken'
EVAL_DS = f'{HUB_ORG}/speculative-actions-eval'
ACTIONS = [
    'tool_call', 'retrieval', 'file_read', 'file_write', 'repair',
    'verifier', 'ask_clarification', 'final_answer', 'blocked',
]
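
# Models are loaded one at a time via load_model() and freed with `del`
# between stages to avoid OOM (see evaluate_config below); on CUDA the
# cache is also emptied after each config.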
def load_model(name, device='cpu'):
    print(f'Loading {name}...', flush=True)
    tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    if tok.pad_token is None:
        # Fall back to EOS so generate() always has a valid pad token id.
        tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        name,
        torch_dtype=torch.float32,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    model = model.to(device)
    model.eval()
    return model, tok

def predict_action(model, tokenizer, prompt, device='cpu', max_new_tokens=15, do_sample=False):
    with torch.no_grad():
        inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512)
        if device != 'cpu':
            inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Decode only the newly generated tokens, skipping the prompt.
    text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
    return text

def parse_action(text):
    # Return the first action name found in the output (order follows ACTIONS);
    # fall back to 'tool_call' if nothing matches.
    text_lower = text.lower()
    for a in ACTIONS:
        if a in text_lower:
            return a
    return 'tool_call'
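
# Prompt templates: the proposer is asked to choose one of the nine actions;
# the verifier is asked for a YES/NO judgment on a proposed action.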
def build_proposer_prompt(context, task_type):
    return f"""Task: {task_type}
Context: {context}
Choose ONE action: tool_call, retrieval, file_read, file_write, repair, verifier, ask_clarification, final_answer, blocked
Action:"""

def build_verifier_prompt(context, task_type, proposed):
    return f"""Task: {task_type}
Context: {context}
Proposed action: {proposed}
Is this correct? Answer YES or NO.
Answer:"""
def evaluate_config(data, proposer_name, verifier_name, strong_name, config, device='cpu'):
    results = []
    if config == 'A':
        # Always strong
        model, tok = load_model(strong_name, device)
        for ex in data:
            prompt = build_proposer_prompt(ex['context'], ex['task_type'])
            pred = parse_action(predict_action(model, tok, prompt, device))
            results.append({'pred': pred, 'true': ex['action'], 'cost': 1.0})
        del model
    elif config == 'B':
        # Cheap only
        model, tok = load_model(proposer_name, device)
        for ex in data:
            prompt = build_proposer_prompt(ex['context'], ex['task_type'])
            pred = parse_action(predict_action(model, tok, prompt, device))
            results.append({'pred': pred, 'true': ex['action'], 'cost': 0.2})
        del model
    elif config == 'C':
        # Cheap + strong verifier
        cheap, cheap_tok = load_model(proposer_name, device)
        for ex in data:
            prompt = build_proposer_prompt(ex['context'], ex['task_type'])
            cheap_pred = parse_action(predict_action(cheap, cheap_tok, prompt, device))
            results.append({'pred': cheap_pred, 'true': ex['action'], 'cost': 0.2, 'cheap_pred': cheap_pred})
        del cheap
        strong, strong_tok = load_model(strong_name, device)
        for i, ex in enumerate(data):
            verify_prompt = build_verifier_prompt(ex['context'], ex['task_type'], results[i]['cheap_pred'])
            verify_text = predict_action(strong, strong_tok, verify_prompt, device, max_new_tokens=5)
            accepted = 'yes' in verify_text.lower()
            if accepted:
                results[i]['cost'] = 0.2 + 0.3
            else:
                # Rejected: fall back to a full strong-model prediction.
                prompt = build_proposer_prompt(ex['context'], ex['task_type'])
                pred = parse_action(predict_action(strong, strong_tok, prompt, device))
                results[i]['pred'] = pred
                results[i]['cost'] = 0.2 + 0.3 + 1.0
        del strong
    elif config == 'D':
        # Cheap + trained verifier (base model as proxy)
        cheap, cheap_tok = load_model(proposer_name, device)
        for ex in data:
            prompt = build_proposer_prompt(ex['context'], ex['task_type'])
            cheap_pred = parse_action(predict_action(cheap, cheap_tok, prompt, device))
            results.append({'pred': cheap_pred, 'true': ex['action'], 'cost': 0.2, 'cheap_pred': cheap_pred})
        del cheap
        verifier, verifier_tok = load_model(verifier_name, device)
        for i, ex in enumerate(data):
            verify_prompt = build_verifier_prompt(ex['context'], ex['task_type'], results[i]['cheap_pred'])
            verify_text = predict_action(verifier, verifier_tok, verify_prompt, device, max_new_tokens=5)
            accepted = 'yes' in verify_text.lower()
            if accepted:
                results[i]['cost'] = 0.2 + 0.15
            else:
                # Rejected: the mid-size verifier re-predicts the action itself.
                prompt = build_proposer_prompt(ex['context'], ex['task_type'])
                pred = parse_action(predict_action(verifier, verifier_tok, prompt, device))
                results[i]['pred'] = pred
                results[i]['cost'] = 0.2 + 0.15 + 0.6
        del verifier
    elif config == 'E':
        # Multi-proposal reranking
        cheap, cheap_tok = load_model(proposer_name, device)
        proposals_list = []
        for ex in data:
            proposals = []
            for _ in range(3):
                prompt = build_proposer_prompt(ex['context'], ex['task_type'])
                # Sample so the three proposals can differ; greedy decoding
                # would return the same action three times.
                proposals.append(parse_action(predict_action(cheap, cheap_tok, prompt, device, do_sample=True)))
            proposals_list.append(proposals)
            results.append({'pred': proposals[0], 'true': ex['action'], 'cost': 0.2 * 3})
        del cheap
        strong, strong_tok = load_model(strong_name, device)
        for i, ex in enumerate(data):
            scores = []
            for prop in proposals_list[i]:
                score_prompt = f"""Task: {ex['task_type']}
Context: {ex['context']}
Action: {prop}
Rate 1-10:
Score:"""
                score_text = predict_action(strong, strong_tok, score_prompt, device, max_new_tokens=5)
                score = 5  # neutral default if no integer can be parsed
                for word in score_text.split():
                    try:
                        score = int(word.strip('.,!?'))
                        break
                    except ValueError:
                        continue
                scores.append(score)
            best_idx = scores.index(max(scores))
            results[i]['pred'] = proposals_list[i][best_idx]
            results[i]['cost'] = 0.2 * 3 + 0.3 * 3
        del strong
    if device == 'cuda':
        torch.cuda.empty_cache()
    return results

def compute_metrics(results_list):
    correct = sum(1 for r in results_list if r['pred'] == r['true'])
    total = len(results_list)
    accuracy = correct / total
    avg_cost = sum(r['cost'] for r in results_list) / total
    return {'accuracy': accuracy, 'avg_cost': avg_cost, 'n': total}
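
# Model roles as configured below: Qwen3-1.7B is the cheap proposer,
# Qwen3-4B the mid-size verifier, and Qwen2.5-7B the strong model.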
def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Device: {device}', flush=True)
    print('Loading eval dataset...', flush=True)
    ds = load_dataset(EVAL_DS)['test']
    data = [ds[i] for i in range(min(30, len(ds)))]
    print(f'Evaluating on {len(data)} examples', flush=True)
    proposer = 'Qwen/Qwen3-1.7B'
    verifier = 'Qwen/Qwen3-4B'
    strong = 'Qwen/Qwen2.5-7B'
    all_results = {}
    for cfg in ['A', 'B', 'C', 'D', 'E']:
        print(f'\n=== Config {cfg} ===', flush=True)
        start = time.time()
        results = evaluate_config(data, proposer, verifier, strong, cfg, device)
        elapsed = time.time() - start
        metrics = compute_metrics(results)
        all_results[cfg] = metrics
        print(f"Config {cfg}: Accuracy={metrics['accuracy']:.3f}, Cost={metrics['avg_cost']:.2f}, Time={elapsed:.1f}s", flush=True)
    print('\n=== Final Results ===', flush=True)
    for cfg in ['A', 'B', 'C', 'D', 'E']:
        r = all_results[cfg]
        print(f"Config {cfg}: Accuracy={r['accuracy']:.3f}, Cost={r['avg_cost']:.2f}", flush=True)
    with open('/tmp/eval_results_empirical.json', 'w') as f:
        json.dump(all_results, f, indent=2)
    print('\nSaved to /tmp/eval_results_empirical.json', flush=True)
    # Upload to Hub
    from huggingface_hub import HfApi
    api = HfApi()
    api.upload_file(
        path_or_fileobj='/tmp/eval_results_empirical.json',
        path_in_repo='eval_results_empirical.json',
        repo_id=f'{HUB_ORG}/speculative-tool-actions',
        repo_type='model',
    )
    print('Uploaded results to Hub', flush=True)


if __name__ == '__main__':
    main()
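
# Usage: python eval_sequential.py
# The final upload step needs Hugging Face credentials with write access to
# the target repo (e.g. via the HF_TOKEN environment variable).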