narcolepticchicken committed on
Commit 2789831 · verified · 1 Parent(s): c079f25

Upload eval_sequential.py

Files changed (1)
  1. eval_sequential.py +228 -0
eval_sequential.py ADDED
@@ -0,0 +1,228 @@
+ """
+ Sequential evaluation using base models.
+ Loads one model at a time to avoid OOM.
+ Evaluates on 30 examples for speed.
+ """
+ import json, time
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from datasets import load_dataset
+
+ HUB_ORG = 'narcolepticchicken'
+ EVAL_DS = f'{HUB_ORG}/speculative-actions-eval'
+ ACTIONS = ['tool_call','retrieval','file_read','file_write','repair','verifier','ask_clarification','final_answer','blocked']
+
+ def load_model(name, device='cpu'):
+     print(f'Loading {name}...', flush=True)
+     tok = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
+     if tok.pad_token is None:
+         tok.pad_token = tok.eos_token
+     model = AutoModelForCausalLM.from_pretrained(
+         name,
+         torch_dtype=torch.float32,
+         trust_remote_code=True,
+         low_cpu_mem_usage=True,
+     )
+     model = model.to(device)
+     model.eval()
+     return model, tok
+
+ def predict_action(model, tokenizer, prompt, device='cpu', max_new_tokens=15):
+     with torch.no_grad():
+         inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512)
+         if device != 'cpu':
+             inputs = {k: v.to(device) for k, v in inputs.items()}
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_new_tokens,
+             do_sample=False,
+             pad_token_id=tokenizer.pad_token_id,
+         )
+         # Decode only the newly generated tokens, not the prompt
+         text = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
+         return text
+
+ def parse_action(text):
+     text_lower = text.lower()
+     for a in ACTIONS:
+         if a in text_lower:
+             return a
+     return 'tool_call'  # default when no known action name appears in the output
+
+ def build_proposer_prompt(context, task_type):
+     return f"""Task: {task_type}
+ Context: {context}
+ Choose ONE action: tool_call, retrieval, file_read, file_write, repair, verifier, ask_clarification, final_answer, blocked
+
+ Action:"""
+
+ def build_verifier_prompt(context, task_type, proposed):
+     return f"""Task: {task_type}
+ Context: {context}
+ Proposed action: {proposed}
+ Is this correct? Answer YES or NO.
+
+ Answer:"""
+
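+ # Relative cost units hardcoded in evaluate_config below: cheap proposer call = 0.2,
+ # trained-verifier verify = 0.15 and fallback = 0.6, strong-model verify = 0.3 and full call = 1.0.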
+ def evaluate_config(data, proposer_name, verifier_name, strong_name, config, device='cpu'):
+     results = []
+
+     if config == 'A':
+         # Always strong
+         model, tok = load_model(strong_name, device)
+         for ex in data:
+             prompt = build_proposer_prompt(ex['context'], ex['task_type'])
+             pred = parse_action(predict_action(model, tok, prompt, device))
+             results.append({'pred': pred, 'true': ex['action'], 'cost': 1.0})
+         del model
+
+     elif config == 'B':
+         # Cheap only
+         model, tok = load_model(proposer_name, device)
+         for ex in data:
+             prompt = build_proposer_prompt(ex['context'], ex['task_type'])
+             pred = parse_action(predict_action(model, tok, prompt, device))
+             results.append({'pred': pred, 'true': ex['action'], 'cost': 0.2})
+         del model
+
+     elif config == 'C':
+         # Cheap + strong verifier
+         # Pass 1: cheap proposer only; the strong model is loaded afterwards so
+         # that only one model is resident in memory at a time.
+         cheap, cheap_tok = load_model(proposer_name, device)
+         for ex in data:
+             prompt = build_proposer_prompt(ex['context'], ex['task_type'])
+             cheap_pred = parse_action(predict_action(cheap, cheap_tok, prompt, device))
+             results.append({'pred': cheap_pred, 'true': ex['action'], 'cost': 0.2, 'cheap_pred': cheap_pred})
+         del cheap
+
+         # Pass 2: strong model verifies each cheap proposal and regenerates on rejection.
+         strong, strong_tok = load_model(strong_name, device)
+         for i, ex in enumerate(data):
+             verify_prompt = build_verifier_prompt(ex['context'], ex['task_type'], results[i]['cheap_pred'])
+             verify_text = predict_action(strong, strong_tok, verify_prompt, device, max_new_tokens=5)
+             accepted = 'yes' in verify_text.lower()
+             if accepted:
+                 results[i]['cost'] = 0.2 + 0.3
+             else:
+                 prompt = build_proposer_prompt(ex['context'], ex['task_type'])
+                 pred = parse_action(predict_action(strong, strong_tok, prompt, device))
+                 results[i]['pred'] = pred
+                 results[i]['cost'] = 0.2 + 0.3 + 1.0
+         del strong
+
+     elif config == 'D':
+         # Cheap + trained verifier (base model as proxy)
+         cheap, cheap_tok = load_model(proposer_name, device)
+         for ex in data:
+             prompt = build_proposer_prompt(ex['context'], ex['task_type'])
+             cheap_pred = parse_action(predict_action(cheap, cheap_tok, prompt, device))
+             results.append({'pred': cheap_pred, 'true': ex['action'], 'cost': 0.2, 'cheap_pred': cheap_pred})
+         del cheap
+
+         verifier, verifier_tok = load_model(verifier_name, device)
+         for i, ex in enumerate(data):
+             verify_prompt = build_verifier_prompt(ex['context'], ex['task_type'], results[i]['cheap_pred'])
+             verify_text = predict_action(verifier, verifier_tok, verify_prompt, device, max_new_tokens=5)
+             accepted = 'yes' in verify_text.lower()
+             if accepted:
+                 results[i]['cost'] = 0.2 + 0.15
+             else:
+                 prompt = build_proposer_prompt(ex['context'], ex['task_type'])
+                 pred = parse_action(predict_action(verifier, verifier_tok, prompt, device))
+                 results[i]['pred'] = pred
+                 results[i]['cost'] = 0.2 + 0.15 + 0.6
+         del verifier
+
+     elif config == 'E':
+         # Multi-proposal reranking
+         cheap, cheap_tok = load_model(proposer_name, device)
+         proposals_list = []
+         for ex in data:
+             proposals = []
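+             # Note: predict_action decodes greedily (do_sample=False), so the three
+             # proposals per example come out identical; sampling would be needed for diversity.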
+             for _ in range(3):
+                 prompt = build_proposer_prompt(ex['context'], ex['task_type'])
+                 proposals.append(parse_action(predict_action(cheap, cheap_tok, prompt, device)))
+             proposals_list.append(proposals)
+             results.append({'pred': proposals[0], 'true': ex['action'], 'cost': 0.2 * 3})
+         del cheap
+
+         strong, strong_tok = load_model(strong_name, device)
+         for i, ex in enumerate(data):
+             scores = []
+             for prop in proposals_list[i]:
+                 score_prompt = f"""Task: {ex['task_type']}
+ Context: {ex['context']}
+ Action: {prop}
+ Rate 1-10:
+
+ Score:"""
+                 score_text = predict_action(strong, strong_tok, score_prompt, device, max_new_tokens=5)
+                 score = 5  # default if no integer can be parsed from the reply
+                 for word in score_text.split():
+                     try:
+                         score = int(word.strip('.,!?'))
+                         break
+                     except ValueError:
+                         pass
+                 scores.append(score)
+             best_idx = scores.index(max(scores))
+             results[i]['pred'] = proposals_list[i][best_idx]
+             results[i]['cost'] = 0.2 * 3 + 0.3 * 3
+         del strong
+
+     if device == 'cuda':
+         torch.cuda.empty_cache()
+
+     return results
+
+ def compute_metrics(results_list):
+     correct = sum(1 for r in results_list if r['pred'] == r['true'])
+     total = len(results_list)
+     accuracy = correct / total
+     avg_cost = sum(r['cost'] for r in results_list) / total
+     return {'accuracy': accuracy, 'avg_cost': avg_cost, 'n': total}
+
+ def main():
+     device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     print(f'Device: {device}', flush=True)
+
+     print('Loading eval dataset...', flush=True)
+     ds = load_dataset(EVAL_DS)['test']
+     data = [ds[i] for i in range(min(30, len(ds)))]
+     print(f'Evaluating on {len(data)} examples', flush=True)
+
+     proposer = 'Qwen/Qwen3-1.7B'
+     verifier = 'Qwen/Qwen3-4B'
+     strong = 'Qwen/Qwen2.5-7B'
+
+     all_results = {}
+
+     for cfg in ['A', 'B', 'C', 'D', 'E']:
+         print(f'\n=== Config {cfg} ===', flush=True)
+         start = time.time()
+         results = evaluate_config(data, proposer, verifier, strong, cfg, device)
+         elapsed = time.time() - start
+         metrics = compute_metrics(results)
+         all_results[cfg] = metrics
+         print(f"Config {cfg}: Accuracy={metrics['accuracy']:.3f}, Cost={metrics['avg_cost']:.2f}, Time={elapsed:.1f}s", flush=True)
+
+     print('\n=== Final Results ===', flush=True)
+     for cfg in ['A','B','C','D','E']:
+         r = all_results[cfg]
+         print(f"Config {cfg}: Accuracy={r['accuracy']:.3f}, Cost={r['avg_cost']:.2f}", flush=True)
+
+     with open('/tmp/eval_results_empirical.json', 'w') as f:
+         json.dump(all_results, f, indent=2)
+
+     print('\nSaved to /tmp/eval_results_empirical.json', flush=True)
+
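+     # Uploading requires write access to the target repo (a token from `huggingface-cli login` or the HF_TOKEN env var).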
+     # Upload to Hub
+     from huggingface_hub import HfApi
+     api = HfApi()
+     api.upload_file(
+         path_or_fileobj='/tmp/eval_results_empirical.json',
+         path_in_repo='eval_results_empirical.json',
+         repo_id=f'{HUB_ORG}/speculative-tool-actions',
+         repo_type='model'
+     )
+     print('Uploaded results to Hub', flush=True)
+
+ if __name__ == '__main__':
+     main()