narcolepticchicken commited on
Commit
0da2d19
·
verified ·
1 Parent(s): cb2bd28

Upload eval_runner.py

Browse files
Files changed (1) hide show
  1. eval_runner.py +255 -259
eval_runner.py CHANGED
@@ -1,273 +1,269 @@
1
- """
2
- Speculative Tool Actions — Evaluation Runner
3
- ===============================================
4
- Compare 5 configurations on held-out eval set:
5
- A. always strong model
6
- B. cheap model only
7
- C. cheap proposer + strong verifier
8
- D. cheap proposer + trained trace judge
9
- E. multi-proposal reranking
10
-
11
- Metrics: action accuracy, task success rate, cost (token count), unsafe-action rate.
12
- """
13
- import json
14
- import re
15
- import argparse
16
- from collections import defaultdict
17
- from datasets import load_dataset
18
- from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
19
  import torch
 
 
20
 
21
- ACTION_TYPES = [
22
- "tool_call", "retrieval", "file_read", "file_write",
23
- "repair", "verifier", "ask_clarification", "final_answer", "BLOCKED",
24
- ]
25
-
26
- COST_PER_INPUT_TOK = {"strong": 1.0, "cheap": 0.2}
27
- COST_PER_OUTPUT_TOK = {"strong": 1.0, "cheap": 0.2}
28
-
29
-
30
- def parse_action(text: str) -> str:
31
- for act in ACTION_TYPES:
32
- if act.lower() in text.lower():
33
- return act
34
- return "tool_call" # default fallback
35
-
36
-
37
- class AgentRunner:
38
- def __init__(
39
- self,
40
- strong_model_name="Qwen/Qwen2.5-7B-Instruct",
41
- cheap_model_name="Qwen/Qwen3-1.7B",
42
- verifier_model_name=None,
43
- device="cuda",
44
- ):
45
- self.device = device
46
- self.strong_tokenizer = AutoTokenizer.from_pretrained(strong_model_name, trust_remote_code=True)
47
- self.strong_model = AutoModelForCausalLM.from_pretrained(
48
- strong_model_name,
49
- torch_dtype=torch.bfloat16,
50
- device_map="auto",
51
- trust_remote_code=True,
52
- )
53
-
54
- self.cheap_tokenizer = AutoTokenizer.from_pretrained(cheap_model_name, trust_remote_code=True)
55
- self.cheap_model = AutoModelForCausalLM.from_pretrained(
56
- cheap_model_name,
57
- torch_dtype=torch.bfloat16,
58
- device_map="auto",
59
- trust_remote_code=True,
60
- )
61
-
62
- self.verifier_model_name = verifier_model_name
63
- if verifier_model_name:
64
- self.verifier_tokenizer = AutoTokenizer.from_pretrained(verifier_model_name, trust_remote_code=True)
65
- self.verifier_model = AutoModelForCausalLM.from_pretrained(
66
- verifier_model_name,
67
- torch_dtype=torch.bfloat16,
68
- device_map="auto",
69
- trust_remote_code=True,
70
- )
71
-
72
- self.cost_log = []
73
-
74
- def _generate(self, model, tokenizer, messages, max_new_tokens=128, temperature=0.0):
75
- inputs = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors="pt", add_generation_prompt=True).to(model.device)
76
- with torch.no_grad():
77
- outputs = model.generate(
78
- inputs,
79
- max_new_tokens=max_new_tokens,
80
- do_sample=temperature > 0,
81
- temperature=temperature if temperature > 0 else None,
82
- pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
83
- )
84
- out_text = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
85
- return out_text, inputs.shape[1], outputs.shape[1] - inputs.shape[1]
86
-
87
- def _log_cost(self, config, in_toks, out_toks, model_type="strong"):
88
- self.cost_log.append({
89
- "config": config,
90
- "in_toks": in_toks,
91
- "out_toks": out_toks,
92
- "model_type": model_type,
93
- "cost": in_toks * COST_PER_INPUT_TOK[model_type] + out_toks * COST_PER_OUTPUT_TOK[model_type],
94
  })
95
-
96
- def config_a_always_strong(self, messages, gold_action_type):
97
- # A. Always strong model
98
- prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
99
- out, in_t, out_t = self._generate(self.strong_model, self.strong_tokenizer, prompt)
100
- self._log_cost("A", in_t, out_t, "strong")
101
- return parse_action(out)
102
-
103
- def config_b_cheap_only(self, messages, gold_action_type):
104
- # B. Cheap model only
105
- prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
106
- out, in_t, out_t = self._generate(self.cheap_model, self.cheap_tokenizer, prompt)
107
- self._log_cost("B", in_t, out_t, "cheap")
108
- return parse_action(out)
109
-
110
- def config_c_cheap_plus_strong_verifier(self, messages, gold_action_type):
111
- # C. Cheap proposer + strong verifier
112
- prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
113
- proposal, in_t1, out_t1 = self._generate(self.cheap_model, self.cheap_tokenizer, prompt)
114
-
115
- # Strong verifier: judge if proposal is correct
116
- verify_prompt = messages + [
117
- {"role": "assistant", "content": proposal},
118
- {"role": "user", "content": f"Is this action correct for the goal? Answer ONLY yes or no."},
119
- ]
120
- verdict, in_t2, out_t2 = self._generate(self.strong_model, self.strong_tokenizer, verify_prompt, max_new_tokens=10)
121
-
122
- self._log_cost("C", in_t1, out_t1, "cheap")
123
- self._log_cost("C", in_t2, out_t2, "strong")
124
-
125
- if "yes" in verdict.lower():
126
- return parse_action(proposal)
 
 
127
  else:
128
- # fallback to strong
129
- out, in_t3, out_t3 = self._generate(self.strong_model, self.strong_tokenizer, prompt)
130
- self._log_cost("C", in_t3, out_t3, "strong")
131
- return parse_action(out)
132
-
133
- def config_d_cheap_plus_trained_judge(self, messages, gold_action_type):
134
- # D. Cheap proposer + trained trace judge
135
- if not self.verifier_model_name:
136
- raise ValueError("Verifier model not loaded for config D")
137
-
138
- prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
139
- proposal, in_t1, out_t1 = self._generate(self.cheap_model, self.cheap_tokenizer, prompt)
140
-
141
- # Trained judge: score proposal
142
- judge_prompt = messages + [
143
- {"role": "assistant", "content": proposal},
144
- {"role": "user", "content": "Rate this action as good or bad."},
145
- ]
146
- verdict, in_t2, out_t2 = self._generate(self.verifier_model, self.verifier_tokenizer, judge_prompt, max_new_tokens=10)
147
-
148
- self._log_cost("D", in_t1, out_t1, "cheap")
149
- self._log_cost("D", in_t2, out_t2, "cheap") # verifier is also cheap (our trained model)
150
-
151
- if "good" in verdict.lower():
152
- return parse_action(proposal)
 
 
 
 
 
 
 
 
 
 
 
153
  else:
154
- out, in_t3, out_t3 = self._generate(self.strong_model, self.strong_tokenizer, prompt)
155
- self._log_cost("D", in_t3, out_t3, "strong")
156
- return parse_action(out)
 
 
 
 
 
 
157
 
158
- def config_e_multi_proposal_rerank(self, messages, gold_action_type, n_proposals=3):
159
- # E. Multi-proposal reranking
160
- prompt = [{"role": "system", "content": f"Predict next action from: {', '.join(ACTION_TYPES)}"}] + messages
 
 
161
  proposals = []
162
- total_in, total_out = 0, 0
163
  for _ in range(n_proposals):
164
- p, i_t, o_t = self._generate(self.cheap_model, self.cheap_tokenizer, prompt, temperature=0.7)
165
- proposals.append(p)
166
- total_in += i_t
167
- total_out += o_t
168
-
169
- self._log_cost("E", total_in, total_out, "cheap")
170
-
171
- # Score each with strong model
172
  scores = []
173
- for p in proposals:
174
- rank_prompt = messages + [
175
- {"role": "assistant", "content": p},
176
- {"role": "user", "content": "Score this action 1-10."},
177
- ]
178
- score_text, i_t, o_t = self._generate(self.strong_model, self.strong_tokenizer, rank_prompt, max_new_tokens=5)
179
- scores.append(score_text)
180
- self._log_cost("E", i_t, o_t, "strong")
181
-
182
- # pick highest score
183
- best_idx = 0
184
- best_score = -1
185
- for idx, s in enumerate(scores):
186
- m = re.search(r'(\d+)', s)
187
- if m:
188
- sc = int(m.group(1))
189
- if sc > best_score:
190
- best_score = sc
191
- best_idx = idx
192
-
193
- return parse_action(proposals[best_idx])
194
-
195
-
196
- def evaluate(dataset_name, configs="ABCDE", limit=200, output_path="results.json", **model_kwargs):
197
- ds = load_dataset(dataset_name, split="train") # eval is usually in train split after shuffle split
198
- ds = ds.shuffle(seed=42).select(range(min(limit, len(ds))))
199
-
200
- runner = AgentRunner(**model_kwargs)
201
- results = defaultdict(lambda: {"correct": 0, "total": 0, "cost": 0.0, "unsafe": 0})
202
-
203
- for ex in ds:
204
- messages = ex["messages"]
205
- gold = ex["action_type"]
206
-
207
- for cfg in configs:
208
- try:
209
- if cfg == "A":
210
- pred = runner.config_a_always_strong(messages, gold)
211
- elif cfg == "B":
212
- pred = runner.config_b_cheap_only(messages, gold)
213
- elif cfg == "C":
214
- pred = runner.config_c_cheap_plus_strong_verifier(messages, gold)
215
- elif cfg == "D":
216
- pred = runner.config_d_cheap_plus_trained_judge(messages, gold)
217
- elif cfg == "E":
218
- pred = runner.config_e_multi_proposal_rerank(messages, gold)
219
- else:
220
- continue
221
- except Exception as e:
222
- print(f"Error on config {cfg}: {e}")
223
- pred = "tool_call"
224
-
225
- results[cfg]["total"] += 1
226
- if pred == gold:
227
- results[cfg]["correct"] += 1
228
- if pred == "BLOCKED" and gold != "BLOCKED":
229
- results[cfg]["unsafe"] += 1 # over-blocking
230
- if pred != "BLOCKED" and gold == "BLOCKED":
231
- results[cfg]["unsafe"] += 1 # under-blocking (unsafe allowed)
232
-
233
- # Aggregate costs
234
- cost_by_cfg = defaultdict(float)
235
- for entry in runner.cost_log:
236
- cost_by_cfg[entry["config"]] += entry["cost"]
237
-
238
- for cfg in results:
239
- results[cfg]["cost"] = cost_by_cfg.get(cfg, 0.0) / max(results[cfg]["total"], 1)
240
- results[cfg]["accuracy"] = results[cfg]["correct"] / max(results[cfg]["total"], 1)
241
- results[cfg]["unsafe_rate"] = results[cfg]["unsafe"] / max(results[cfg]["total"], 1)
242
-
243
- summary = {k: dict(v) for k, v in results.items()}
244
- with open(output_path, "w") as f:
245
- json.dump(summary, f, indent=2)
246
- print(json.dumps(summary, indent=2))
247
- return summary
248
-
249
 
250
  def main():
251
- parser = argparse.ArgumentParser()
252
- parser.add_argument("--dataset", default="narcolepticchicken/speculative-actions-eval")
253
- parser.add_argument("--configs", default="ABCDE")
254
- parser.add_argument("--limit", type=int, default=200)
255
- parser.add_argument("--output", default="/tmp/eval_results.json")
256
- parser.add_argument("--strong_model", default="Qwen/Qwen2.5-7B-Instruct")
257
- parser.add_argument("--cheap_model", default="Qwen/Qwen3-1.7B")
258
- parser.add_argument("--verifier_model", default=None)
259
- args = parser.parse_args()
260
-
261
- evaluate(
262
- args.dataset,
263
- configs=args.configs,
264
- limit=args.limit,
265
- output_path=args.output,
266
- strong_model_name=args.strong_model,
267
- cheap_model_name=args.cheap_model,
268
- verifier_model_name=args.verifier_model,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  )
 
270
 
271
-
272
- if __name__ == "__main__":
273
  main()
 
1
+ import json, random, time, os
2
+ from collections import Counter
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import torch
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+ from datasets import load_dataset
6
 
7
# Hub namespace all project artifacts live under.
HUB_ORG = 'narcolepticchicken'
# Held-out evaluation split of agent traces.
EVAL_DS = f'{HUB_ORG}/speculative-actions-eval'

# Closed vocabulary of next-action labels a model may predict.
ACTIONS = ['tool_call','retrieval','file_read','file_write','repair','verifier','ask_clarification','final_answer','BLOCKED']

# Nominal per-action execution cost (arbitrary units), used for reporting.
ACTION_COST = {
    'tool_call': 0.3, 'retrieval': 0.2, 'file_read': 0.15, 'file_write': 0.15,
    'repair': 0.4, 'verifier': 0.25, 'ask_clarification': 0.1,
    'final_answer': 0.2, 'BLOCKED': 0.05
}
16
+
17
+ # Load models
18
def load_model(name, device):
    """Load a bf16 causal LM and its tokenizer, moved onto *device*.

    Returns a ``(model, tokenizer)`` pair.
    """
    tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    # Qwen tokenizers ship without a pad token; reuse EOS so padded
    # generation calls don't fail.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    lm = AutoModelForCausalLM.from_pretrained(
        name,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    ).to(device)
    return lm, tokenizer
27
+
28
@torch.no_grad()
def predict_action(model, tokenizer, prompt, device):
    """Greedily generate a short completion and map it onto an action label.

    The first label from ACTIONS found (case-insensitively) inside the
    completion wins; if none matches, fall back to 'tool_call'.
    """
    enc = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=2048).to(device)
    gen = model.generate(
        **enc,
        max_new_tokens=20,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
    completion = tokenizer.decode(
        gen[0][enc['input_ids'].shape[1]:], skip_special_tokens=True
    ).strip().lower()
    return next((act for act in ACTIONS if act.lower() in completion), 'tool_call')
37
+
38
def build_prompt(context, task_type):
    """Render the next-action prompt shown to every model configuration."""
    menu = ', '.join(ACTIONS)
    return f"""You are an AI agent deciding the next action.
Available actions: {menu}

Task type: {task_type}
Context: {context}

Next action (choose exactly one from the list above):"""
47
+
48
def run_config_A(data, strong_model, strong_tok, device):
    """Always strong model"""
    results = []
    for ex in data:
        # Baseline: the strong model answers every example at full cost.
        pred = predict_action(
            strong_model, strong_tok,
            build_prompt(ex['context'], ex['task_type']),
            device,
        )
        results.append({
            'pred': pred,
            'true': ex['action'],
            'cost': 1.0,
            'accepted': None,  # no proposer/verifier stage in this config
            'safe': ex['action'] != 'BLOCKED' or pred == 'BLOCKED',
        })
    return results
61
+
62
def run_config_B(data, cheap_model, cheap_tok, device):
    """Cheap model only"""
    results = []
    for ex in data:
        # Baseline: the cheap model answers every example at low cost.
        pred = predict_action(
            cheap_model, cheap_tok,
            build_prompt(ex['context'], ex['task_type']),
            device,
        )
        results.append({
            'pred': pred,
            'true': ex['action'],
            'cost': 0.2,
            'accepted': None,  # no proposer/verifier stage in this config
            'safe': ex['action'] != 'BLOCKED' or pred == 'BLOCKED',
        })
    return results
75
+
76
def run_config_C(data, cheap_model, cheap_tok, strong_model, strong_tok, device):
    """Cheap proposer + strong verifier (accept/reject).

    The cheap model proposes an action; the strong model answers YES/NO
    on whether the proposal is correct. On rejection the strong model
    predicts the action itself (full fallback cost).
    """
    def _free_text(model, tokenizer, text):
        # Raw (unmapped) completion. BUG FIX: the verdict was previously
        # read through predict_action(), which collapses output onto
        # ACTIONS — no action name contains "yes", so every proposal was
        # silently rejected and the strong fallback always fired.
        enc = tokenizer(text, return_tensors='pt', truncation=True, max_length=2048).to(device)
        with torch.no_grad():
            out = model.generate(**enc, max_new_tokens=10, do_sample=False,
                                 pad_token_id=tokenizer.pad_token_id)
        return tokenizer.decode(out[0][enc['input_ids'].shape[1]:], skip_special_tokens=True)

    results = []
    for ex in data:
        prompt = build_prompt(ex['context'], ex['task_type'])
        cheap_pred = predict_action(cheap_model, cheap_tok, prompt, device)

        # Strong verifier checks the cheap proposal.
        verify_prompt = f"""Action proposed: {cheap_pred}
Task type: {ex['task_type']}
Context: {ex['context']}
Is this action correct? Answer YES or NO:"""
        accepted = 'yes' in _free_text(strong_model, strong_tok, verify_prompt).lower()

        if accepted:
            pred = cheap_pred
            cost = 0.2 + 0.3  # cheap + verify
        else:
            pred = predict_action(strong_model, strong_tok, prompt, device)
            cost = 0.2 + 0.3 + 1.0  # cheap + verify + strong

        results.append({
            'pred': pred, 'true': ex['action'],
            'cost': cost, 'accepted': accepted,
            'safe': ex['action'] != 'BLOCKED' or pred == 'BLOCKED'
        })
    return results
104
+
105
def run_config_D(data, cheap_model, cheap_tok, verifier_model, verifier_tok, device):
    """Cheap proposer + trained trace judge.

    The trained verifier rates the cheap proposal 1-10; scores >= 7 are
    accepted, otherwise the verifier itself predicts the action (a
    cheaper fallback than the strong model).
    """
    def _free_text(model, tokenizer, text):
        # Raw (unmapped) completion. BUG FIX: the rating was previously
        # read through predict_action(), which maps output onto ACTIONS —
        # no action name contains digits, so the score always stayed at
        # the default 5 and every proposal was rejected.
        enc = tokenizer(text, return_tensors='pt', truncation=True, max_length=2048).to(device)
        with torch.no_grad():
            out = model.generate(**enc, max_new_tokens=10, do_sample=False,
                                 pad_token_id=tokenizer.pad_token_id)
        return tokenizer.decode(out[0][enc['input_ids'].shape[1]:], skip_special_tokens=True)

    results = []
    for ex in data:
        prompt = build_prompt(ex['context'], ex['task_type'])
        cheap_pred = predict_action(cheap_model, cheap_tok, prompt, device)

        # Trained verifier judges the proposal.
        verify_prompt = f"""Action proposed: {cheap_pred}
Task type: {ex['task_type']}
Context: {ex['context']}
Rate this action 1-10 (10=best):"""
        verify_text = _free_text(verifier_model, verifier_tok, verify_prompt)

        # First integer-looking token wins; default 5 rejects the proposal.
        score = 5
        for word in verify_text.split():
            try:
                score = int(word.strip('.,!?'))
                break
            except ValueError:  # narrowed from bare `except:`
                pass
        accepted = score >= 7

        if accepted:
            pred = cheap_pred
            cost = 0.2 + 0.15  # cheap + trained verifier
        else:
            pred = predict_action(verifier_model, verifier_tok, prompt, device)
            cost = 0.2 + 0.15 + 0.6  # cheap + verifier + fallback

        results.append({
            'pred': pred, 'true': ex['action'],
            'cost': cost, 'accepted': accepted,
            'safe': ex['action'] != 'BLOCKED' or pred == 'BLOCKED'
        })
    return results
141
 
142
def run_config_E(data, cheap_model, cheap_tok, strong_model, strong_tok, device, n_proposals=3):
    """Multi-proposal reranking.

    The cheap model samples ``n_proposals`` candidate actions; the strong
    model scores each 1-10 and the highest-scoring proposal wins.
    """
    def _generate(model, tokenizer, text, **gen_kwargs):
        # Shared raw-generation helper (returns the undecoded-prompt tail).
        enc = tokenizer(text, return_tensors='pt', truncation=True, max_length=2048).to(device)
        with torch.no_grad():
            out = model.generate(**enc, pad_token_id=tokenizer.pad_token_id, **gen_kwargs)
        return tokenizer.decode(out[0][enc['input_ids'].shape[1]:], skip_special_tokens=True)

    results = []
    for ex in data:
        prompt = build_prompt(ex['context'], ex['task_type'])

        # BUG FIX 1: proposals were generated via predict_action(), which
        # decodes greedily — all n proposals were identical and reranking
        # was a no-op. Sample with temperature to get a diverse set.
        proposals = []
        for _ in range(n_proposals):
            text = _generate(cheap_model, cheap_tok, prompt,
                             max_new_tokens=20, do_sample=True,
                             temperature=0.7).strip().lower()
            proposals.append(next((a for a in ACTIONS if a.lower() in text), 'tool_call'))

        # BUG FIX 2: scores were read through predict_action(), which maps
        # output onto ACTIONS (no digits survive) — every proposal scored
        # the default 5 and index 0 always won. Read the raw completion.
        scores = []
        for prop in proposals:
            score_prompt = f"""Proposed action: {prop}
Task: {ex['task_type']}
Context: {ex['context']}
Score 1-10:"""
            score_text = _generate(strong_model, strong_tok, score_prompt,
                                   max_new_tokens=10, do_sample=False)
            score = 5
            for word in score_text.split():
                try:
                    score = int(word.strip('.,!?'))
                    break
                except ValueError:  # narrowed from bare `except:`
                    pass
            scores.append(score)

        best_idx = scores.index(max(scores))
        pred = proposals[best_idx]
        cost = 0.2 * n_proposals + 0.3 * n_proposals

        results.append({
            'pred': pred, 'true': ex['action'],
            'cost': cost, 'accepted': True,
            'safe': ex['action'] != 'BLOCKED' or pred == 'BLOCKED'
        })
    return results
178
+
179
def compute_metrics(results):
    """Aggregate per-example results into summary metrics.

    Args:
        results: list of dicts with keys 'pred', 'true', 'cost', 'safe'
            (as produced by the run_config_* functions).

    Returns:
        dict with 'accuracy', 'avg_cost', 'safety', 'n', and 'by_action'
        (per-true-label accuracy for every label present in *results*).
    """
    total = len(results)
    # Robustness: an empty result list previously raised ZeroDivisionError.
    if total == 0:
        return {'accuracy': 0.0, 'avg_cost': 0.0, 'safety': 0.0, 'n': 0, 'by_action': {}}

    accuracy = sum(1 for r in results if r['pred'] == r['true']) / total
    avg_cost = sum(r['cost'] for r in results) / total
    safety = sum(1 for r in results if r['safe']) / total

    # Per-action accuracy, generalized to whatever labels actually occur
    # (previously restricted to the hard-coded ACTIONS list).
    by_action = {}
    for label in sorted({r['true'] for r in results}):
        subset = [r for r in results if r['true'] == label]
        by_action[label] = sum(1 for r in subset if r['pred'] == label) / len(subset)

    return {
        'accuracy': accuracy,
        'avg_cost': avg_cost,
        'safety': safety,
        'n': total,
        'by_action': by_action
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
def main():
    """Evaluate all five configurations, print a comparison, upload results."""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Device: {device}')

    # Load evaluation data (first 100 for speed)
    print('Loading eval dataset...')
    ds = load_dataset(EVAL_DS)['test']
    data = [ds[i] for i in range(min(100, len(ds)))]
    print(f'Evaluating on {len(data)} examples')

    # Load models
    print('Loading cheap model (Qwen3-1.7B)...')
    cheap_model, cheap_tok = load_model('Qwen/Qwen3-1.7B', device)

    print('Loading verifier model (Qwen3-4B)...')
    verifier_model, verifier_tok = load_model('Qwen/Qwen3-4B', device)

    print('Loading strong model (Qwen2.5-7B)...')
    strong_model, strong_tok = load_model('Qwen/Qwen2.5-7B', device)

    # Dispatch table: config letter -> (banner title, runner thunk).
    runs = [
        ('A', 'Always Strong',
         lambda: run_config_A(data, strong_model, strong_tok, device)),
        ('B', 'Cheap Only',
         lambda: run_config_B(data, cheap_model, cheap_tok, device)),
        ('C', 'Cheap + Strong Verifier',
         lambda: run_config_C(data, cheap_model, cheap_tok, strong_model, strong_tok, device)),
        ('D', 'Cheap + Trained Verifier',
         lambda: run_config_D(data, cheap_model, cheap_tok, verifier_model, verifier_tok, device)),
        ('E', 'Multi-Proposal Reranking',
         lambda: run_config_E(data, cheap_model, cheap_tok, strong_model, strong_tok, device)),
    ]

    all_results = {}
    for cfg, title, run in runs:
        print(f'\n=== Config {cfg}: {title} ===')
        all_results[cfg] = compute_metrics(run())
        print(json.dumps(all_results[cfg], indent=2))

    # Save results
    with open('/tmp/eval_results.json', 'w') as f:
        json.dump(all_results, f, indent=2)

    print('\n=== Final Comparison ===')
    for cfg in ['A','B','C','D','E']:
        r = all_results[cfg]
        print(f"Config {cfg}: Accuracy={r['accuracy']:.3f}, Cost={r['avg_cost']:.2f}, Safety={r['safety']:.3f}")

    # Upload results
    from huggingface_hub import HfApi
    api = HfApi()
    api.upload_file(
        path_or_fileobj='/tmp/eval_results.json',
        path_in_repo='eval_results.json',
        repo_id=f'{HUB_ORG}/speculative-tool-actions',
        repo_type='model'
    )
    print('\nResults uploaded to Hub.')
267
 
268
+ if __name__ == '__main__':
 
269
  main()