akseljoonas (HF Staff) committed
Commit be350cb · 1 Parent(s): 1a8f5b2

leaderboard and results
agent/config_mcp_example copy.json ADDED
@@ -0,0 +1,21 @@
+{
+  "model_name": "anthropic/claude-sonnet-4-5-20250929",
+  "tools": [],
+  "system_prompt_path": "",
+  "mcpServers": {
+    "hf-mcp-server": {
+      "transport": "http",
+      "url": "https://huggingface.co/mcp?login",
+      "headers": {
+        "Authorization": "Bearer ${HF_TOKEN}"
+      }
+    },
+    "playwright": {
+      "transport": "stdio",
+      "command": "npx",
+      "args": [
+        "@playwright/mcp@latest"
+      ]
+    }
+  }
+}
agent/config_mcp_example.json CHANGED
@@ -9,13 +9,6 @@
       "headers": {
         "Authorization": "Bearer ${HF_TOKEN}"
       }
-    },
-    "playwright": {
-      "transport": "stdio",
-      "command": "npx",
-      "args": [
-        "@playwright/mcp@latest"
-      ]
     }
   }
 }
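Both configs reference `${HF_TOKEN}` in the `Authorization` header, which is presumably expanded from the environment when the agent loads the file. A minimal sketch of that substitution follows; the agent's actual config loader is not part of this commit, so the `${VAR}` expansion rule here is an assumption:

```python
# Hypothetical ${VAR} expansion for the MCP config above; the agent's
# real loader is not shown in this commit, so this only mimics the likely rule.
import json
import os
import re
from pathlib import Path

raw = Path("agent/config_mcp_example.json").read_text(encoding="utf-8")
expanded = re.sub(
    r"\$\{(\w+)\}",
    lambda m: os.environ.get(m.group(1), m.group(0)),  # leave unknown vars untouched
    raw,
)
config = json.loads(expanded)
# Print only a prefix so the token itself never lands in the terminal.
print(config["mcpServers"]["hf-mcp-server"]["headers"]["Authorization"][:12], "...")
```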
eval/README.md CHANGED
@@ -63,6 +63,21 @@ uv run inspect eval eval/task.py@hf-benchmark-with-rubrics \
   -T solver_kwargs='{"allowed_tools":"Bash,Read","output_format":"json"}'
 ```
 
+### Leaderboard
+
+Scores can be pushed to a Hugging Face dataset automatically by wrapping the run
+with `eval/run_eval_with_leaderboard.py`. It executes `inspect eval ...` under the
+hood, only appends results when the command succeeds, and reads `HF_TOKEN` from
+your environment (a `.env` file also works, via `python-dotenv`):
+
+```bash
+uv run python eval/run_eval_with_leaderboard.py \
+  --hf-dataset akseljoonas/hf-agent-leaderboard \
+  --solver-name hf_agent_solver \
+  --solver-kwargs '{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
+  --dataset akseljoonas/hf-agent-rubrics@train \
+  --limit 25
+```
 
 ## Scoring (implemented in `eval/rubric_eval.py`)
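For orientation, a row appended to the leaderboard dataset has roughly the shape below. The field names follow `build_record` in `eval/leaderboard.py` (added later in this commit); the values are illustrative only:

```python
# Illustrative leaderboard row (one line of records.jsonl); field names
# match build_record() in eval/leaderboard.py, values are made up.
example_record = {
    "timestamp": "2025-01-01T12:00:00+00:00",  # UTC ISO timestamp
    "solver": "hf_agent_solver",
    "solver_kwargs": {"config_path": "agent/config_mcp_example.json"},
    "dataset_name": "akseljoonas/hf-agent-rubrics",
    "dataset_split": "train",
    "limit": 25,
    "score": 0.82,  # first numeric metric found in the Inspect log
    "command": ["uv", "run", "inspect", "eval", "eval/task.py@hf-benchmark-with-rubrics"],
    "solver_version": "agent/config_mcp_example/abc1234",  # <dir>/<config stem>/<git sha>
    "log_artifact": "logs/leaderboard/example.json",
    "criterion_checks": [],
}
```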
 
eval/leaderboard.py ADDED
@@ -0,0 +1,172 @@
+"""
+Utilities for logging solver scores to a Hugging Face dataset.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from huggingface_hub import HfApi, hf_hub_download
+
+AVERAGE_RE = re.compile(r"Average normalized score:\s*([0-9.]+)")
+DEFAULT_FILENAME = "records.jsonl"
+
+
+def _hydra_join(*parts: str | None) -> str:
+    tokens = [str(part).strip().replace(" ", "_") for part in parts if part]
+    return "/".join(tokens) if tokens else "default"
+
+
+def detect_agent_version(config_path: str = "agent/config_mcp_example.json") -> str:
+    """
+    Returns a short string identifying the current agent version:
+    <config dir>/<config name>/<git short sha>.
+    """
+
+    try:
+        commit = (
+            subprocess.check_output(["git", "rev-parse", "--short", "HEAD"])
+            .decode()
+            .strip()
+        )
+    except Exception:
+        commit = "unknown"
+
+    config_file = Path(config_path)
+    config_stem = config_file.stem or "config"
+    parent_name = config_file.parent.name if config_file.parent.name else None
+    return _hydra_join(parent_name, config_stem, commit)
+
+
+def parse_average_score(text: str) -> float | None:
+    """Extracts the 'Average normalized score' value from Inspect logs."""
+
+    match = AVERAGE_RE.search(text)
+    if match:
+        try:
+            return float(match.group(1))
+        except ValueError:
+            return None
+    return None
+
+
+def latest_log_file(
+    log_dir: Path, extensions: tuple[str, ...] = (".eval", ".json")
+) -> Path | None:
+    """Returns the most recent log file in log_dir matching the provided extensions."""
+
+    if not log_dir.exists():
+        return None
+
+    files: list[Path] = []
+    for ext in extensions:
+        files.extend(log_dir.glob(f"*{ext}"))
+
+    if not files:
+        return None
+
+    files.sort(key=lambda path: path.stat().st_mtime)
+    return files[-1]
+
+
+@dataclass
+class LeaderboardClient:
+    """Simple helper to append JSONL rows to an HF dataset."""
+
+    repo_id: str
+    token: str
+    filename: str = DEFAULT_FILENAME
+
+    def append_record(self, record: dict[str, Any]) -> None:
+        tmp_dir = Path(tempfile.mkdtemp(prefix="leaderboard_"))
+        local_file = tmp_dir / self.filename
+
+        self._download_existing(local_file)
+        if not local_file.exists():
+            local_file.write_text("", encoding="utf-8")
+
+        with local_file.open("a", encoding="utf-8") as fh:
+            fh.write(json.dumps(record) + "\n")
+
+        HfApi(token=self.token).upload_file(
+            path_or_fileobj=str(local_file),
+            path_in_repo=self.filename,
+            repo_id=self.repo_id,
+            repo_type="dataset",
+        )
+
+        try:
+            local_file.unlink()
+            tmp_dir.rmdir()
+        except OSError:
+            pass
+
+    def _download_existing(self, destination: Path) -> None:
+        destination.parent.mkdir(parents=True, exist_ok=True)
+
+        try:
+            downloaded = hf_hub_download(
+                repo_id=self.repo_id,
+                filename=self.filename,
+                repo_type="dataset",
+                token=self.token,
+            )
+            shutil.copy(Path(downloaded), destination)
+        except Exception:
+            destination.write_text("", encoding="utf-8")
+
+
+def build_record(
+    solver_name: str,
+    solver_kwargs: dict[str, Any],
+    dataset_name: str,
+    dataset_split: str,
+    limit: int | None,
+    score: float,
+    command: list[str],
+    log_path: Path | None,
+    criterion_checks: list[dict[str, Any]] | None = None,
+) -> dict[str, Any]:
+    """Assembles a JSON-serialisable record for the leaderboard dataset."""
+
+    record = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "solver": solver_name,
+        "solver_kwargs": solver_kwargs,
+        "dataset_name": dataset_name,
+        "dataset_split": dataset_split,
+        "limit": limit,
+        "score": score,
+        "command": command,
+    }
+
+    if solver_name == "hf_agent_solver":
+        record["solver_version"] = detect_agent_version(
+            solver_kwargs.get("config_path", "agent/config_mcp_example.json")
+        )
+    else:
+        version_spec = solver_kwargs.get("version")
+        if isinstance(version_spec, (list, tuple)):
+            record["solver_version"] = _hydra_join(*version_spec)
+        elif isinstance(version_spec, dict):
+            record["solver_version"] = _hydra_join(
+                *[f"{k}={v}" for k, v in version_spec.items()]
+            )
+        elif isinstance(version_spec, str):
+            record["solver_version"] = version_spec
+        else:
+            record["solver_version"] = _hydra_join(solver_name, "default")
+
+    if log_path:
+        record["log_artifact"] = str(log_path)
+    record["criterion_checks"] = criterion_checks or []
+
+    return record
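As a usage sketch, appending one record by hand with the helpers above would look like this (assuming `leaderboard` is importable, e.g. when run from `eval/`; the repo id, token, and values are illustrative):

```python
# Minimal sketch: log one score manually with the helpers above.
from pathlib import Path

from leaderboard import LeaderboardClient, build_record

client = LeaderboardClient(repo_id="user/leaderboard", token="hf_xxx")  # illustrative
record = build_record(
    solver_name="hf_agent_solver",
    solver_kwargs={"config_path": "agent/config_mcp_example.json"},
    dataset_name="akseljoonas/hf-agent-rubrics",
    dataset_split="train",
    limit=25,
    score=0.82,
    command=["uv", "run", "inspect", "eval"],
    log_path=Path("logs/leaderboard/example.json"),
)
client.append_record(record)  # downloads records.jsonl, appends a line, re-uploads
```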
eval/run_eval_with_leaderboard.py ADDED
@@ -0,0 +1,215 @@
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any
+
+from dotenv import load_dotenv
+from leaderboard import LeaderboardClient, build_record, latest_log_file
+
+load_dotenv()
+
+
+def run_command(cmd: list[str]) -> subprocess.CompletedProcess[str]:
+    print(f"[leaderboard] running: {' '.join(cmd)}")
+    return subprocess.run(cmd, capture_output=True, text=True)
+
+
+def build_inspect_command(args: argparse.Namespace) -> list[str]:
+    cmd = []
+    cmd.extend(args.inspect_launch)
+    cmd.append(args.inspect_task)
+
+    def add_task_arg(key: str, value: Any) -> None:
+        if value is None:
+            return
+        cmd.extend(["-T", f"{key}={value}"])
+
+    add_task_arg("solver_name", args.solver_name)
+    add_task_arg("solver_kwargs", json.dumps(args.solver_kwargs))
+    add_task_arg("dataset_name", args.dataset)
+    if args.limit is not None:
+        add_task_arg("limit", args.limit)
+
+    cmd.extend(["--log-dir", args.log_dir])
+    if args.log_format:
+        cmd.extend(["--log-format", args.log_format])
+
+    if args.extra_inspect_args:
+        cmd.extend(args.extra_inspect_args)
+
+    return cmd
+
+
+def parse_score_from_outputs(log_dir: Path) -> tuple[float, Path, list[dict[str, Any]]]:
+    log_path = latest_log_file(log_dir)
+    if not log_path:
+        raise RuntimeError("Inspect log file not found.")
+
+    # Redact HF tokens before the log is parsed or uploaded anywhere.
+    content = log_path.read_text(encoding="utf-8")
+    # Matches "hf_" followed by 34 alphanumeric characters (HF token format).
+    sanitized_content = re.sub(r"hf_[a-zA-Z0-9]{34}", "<REDACTED_TOKEN>", content)
+
+    if content != sanitized_content:
+        log_path.write_text(sanitized_content, encoding="utf-8")
+        print(f"[leaderboard] Redacted HF tokens in {log_path}")
+        content = sanitized_content
+
+    data = json.loads(content)
+    results = data.get("results", {})
+    scores = results.get("scores", [])
+    score_value = None
+    criterion_checks: list[dict[str, Any]] = []
+
+    for score_entry in scores:
+        metrics = score_entry.get("metrics", {})
+        for metric in metrics.values():
+            value = metric.get("value")
+            if isinstance(value, (int, float)):
+                score_value = float(value)
+                break
+        if score_value is not None:
+            break
+
+    if score_value is None:
+        raise RuntimeError("Could not find a numeric metric value in the Inspect log.")
+
+    for sample in data.get("samples", []):
+        # Grab the question from metadata (falling back to the raw input).
+        question = "Unknown Question"
+        if "metadata" in sample and "question" in sample["metadata"]:
+            question = sample["metadata"]["question"]
+        elif "input" in sample:
+            question = sample["input"]
+
+        # Collect criterion_checks from any scorer that produced them.
+        for scorer in sample.get("scores", {}).values():
+            metadata = scorer.get("metadata") or {}
+            checks = metadata.get("criterion_checks")
+
+            if isinstance(checks, list) and checks:
+                # Group the checks under this sample's question.
+                grouped_entry = {"question": question, "checks": []}
+                for check in checks:
+                    if isinstance(check, dict):
+                        grouped_entry["checks"].append(check)
+
+                if grouped_entry["checks"]:
+                    criterion_checks.append(grouped_entry)
+
+    return score_value, log_path, criterion_checks
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Run Inspect eval and append the resulting score to an HF dataset."
+    )
+    parser.add_argument(
+        "--hf-dataset",
+        required=True,
+        help="HF dataset repo id for the leaderboard (e.g. user/leaderboard).",
+    )
+
+    parser.add_argument(
+        "--solver-name",
+        required=True,
+        help="Solver name used in the Inspect task (e.g. hf_agent_solver).",
+    )
+    parser.add_argument(
+        "--solver-kwargs",
+        type=json.loads,
+        default="{}",
+        help="JSON string with solver kwargs passed to the Inspect task.",
+    )
+    parser.add_argument(
+        "--dataset",
+        default="akseljoonas/hf-agent-rubrics@train",
+        help="Dataset spec in the form author/dataset@split.",
+    )
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Optional sample limit passed to Inspect.",
+    )
+    parser.add_argument(
+        "--inspect-task",
+        default="eval/task.py@hf-benchmark-with-rubrics",
+        help="Inspect task reference.",
+    )
+    parser.add_argument(
+        "--inspect-launch",
+        nargs="+",
+        default=["uv", "run", "inspect", "eval"],
+        help="Command used to invoke Inspect (default: uv run inspect eval).",
+    )
+    parser.add_argument(
+        "--log-dir",
+        default="logs/leaderboard",
+        help="Directory where Inspect writes its log files.",
+    )
+    parser.add_argument(
+        "--extra-inspect-args",
+        nargs="*",
+        help="Additional args forwarded to Inspect after the standard task arguments.",
+    )
+    parser.add_argument(
+        "--log-format",
+        default="json",
+        help="Log format passed to Inspect (default: json).",
+    )
+
+    args = parser.parse_args()
+
+    if isinstance(args.solver_kwargs, str):
+        args.solver_kwargs = json.loads(args.solver_kwargs or "{}")
+
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        print("ERROR: set HF_TOKEN in your environment.", file=sys.stderr)
+        sys.exit(1)
+
+    if "@" not in args.dataset:
+        raise ValueError("Dataset must be in the format 'author/dataset@split'.")
+    dataset_name, dataset_split = args.dataset.split("@", 1)
+
+    log_dir = Path(args.log_dir)
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    inspect_cmd = build_inspect_command(args)
+    result = run_command(inspect_cmd)
+
+    if result.returncode != 0:
+        print(result.stdout)
+        print(result.stderr, file=sys.stderr)
+        raise SystemExit(result.returncode)
+
+    score, log_path, criterion_checks = parse_score_from_outputs(log_dir)
+
+    client = LeaderboardClient(repo_id=args.hf_dataset, token=hf_token)
+    record = build_record(
+        solver_name=args.solver_name,
+        solver_kwargs=args.solver_kwargs,
+        dataset_name=dataset_name,
+        dataset_split=dataset_split,
+        limit=args.limit,
+        score=score,
+        command=inspect_cmd,
+        log_path=log_path,
+        criterion_checks=criterion_checks,
+    )
+    client.append_record(record)
+
+    print(
+        f"[leaderboard] recorded score {score:.3f} for solver '{args.solver_name}' to {args.hf_dataset}"
+    )
+
+
+if __name__ == "__main__":
+    main()
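To read the accumulated rows back (for example, to render a leaderboard table), a minimal sketch using the same storage layout as `LeaderboardClient` (the repo id is taken from the README example; the filename matches `DEFAULT_FILENAME` in `eval/leaderboard.py`):

```python
# Minimal sketch: load and rank the stored leaderboard rows.
import json

from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="akseljoonas/hf-agent-leaderboard",  # repo id from the README example
    filename="records.jsonl",                    # DEFAULT_FILENAME in leaderboard.py
    repo_type="dataset",
)
with open(path, encoding="utf-8") as fh:
    rows = [json.loads(line) for line in fh if line.strip()]

# Best-first by normalized score; both keys are set by build_record().
for row in sorted(rows, key=lambda r: r["score"], reverse=True):
    print(f"{row['solver_version']:<50} {row['score']:.3f}")
```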