akseljoonas (HF Staff) committed
Commit 235ace7 · 1 Parent(s): de136d0

updated eval

eval/README.md CHANGED
@@ -5,10 +5,10 @@ Rubric-based evaluation pipeline implementing [Rubrics as Rewards](https://arxiv
  ## Pipeline
 
  ```
- QA pairs → generate_rubrics.py → evaluate.py → scores
  ```
 
- ### 1. Generate Rubrics
 
  Creates instance-specific evaluation criteria from question + reference answer.
 
@@ -27,50 +27,47 @@ python eval/generate_rubrics.py \
 
  **Output:** 7-20 weighted criteria per question (Essential: +5, Important: +3-4, Optional: +1-2, Pitfall: -1 to -2)
 
- ### 2. Evaluate Responses
 
- Scores responses using generated rubrics via LLM-as-judge.
 
- ```python
- from evaluate import evaluate_dataset_with_rubrics
-
- evaluate_dataset_with_rubrics(
-     input_file="responses.jsonl",
-     rubric_file="qa_rubrics.jsonl",
-     ground_truth_file="qa_pairs.jsonl",
-     output_file="results.jsonl",
-     model="gpt-4o-mini",
-     push_to_hub="akseljoonas/hf-agent-benchmark@evaluations"
- )
  ```
 
- **Output:** Normalized score [0, 1] + per-criterion satisfaction + reasoning
-
- ## HuggingFace Integration
-
- Both scripts upload DataFrames before saving JSONL:
 
- ```python
- from hf_dataset_io import df_to_hub, hub_to_df
 
- # Upload
- df_to_hub(df, "username/dataset@config", split="train")
 
- # Download
- df = hub_to_df("username/dataset@config", split="train")
  ```
 
- Use `@config` notation to organize: `@rubrics`, `@evaluations`, `@ground-truth`
-
- ## Key Parameters
-
- - **--max-concurrent**: Parallel workers (default: 30 for rubrics, 10 for eval)
- - **--push-to-hub**: Auto-upload to HF Hub (e.g., `user/dataset@rubrics`)
- - **--model**: LiteLLM model string
- - **split**: `train` for rubrics, `test` for evaluations
 
- ## Scoring
 
- RaR-Explicit: `score = Σ(weight × satisfied) / Σ(positive_weights)`
 
- Normalized to [0, 1], clipped if pitfalls make it negative.
 
  ## Pipeline
 
  ```
+ QA pairs → generate_rubrics.py → `eval/task.py@hf-benchmark-with-rubrics` → scores
  ```
 
+ ### 1. Generate Rubrics (if not already generated)
 
  Creates instance-specific evaluation criteria from question + reference answer.
 
 
  **Output:** 7-20 weighted criteria per question (Essential: +5, Important: +3-4, Optional: +1-2, Pitfall: -1 to -2)
 
+ ### 2. Evaluate Responses (Inspect)
 
+ Load your rubric dataset, run a solver, and score with `rubric_scorer` using `inspect-ai`.
 
+ Files:
+ - `eval/hf_agent_connector.py` contains a lightweight bridge that spins up
+   the existing hf-agent stack in `agent/` (tools, MCP, LiteLLM loop) and returns the assistant reply.
+ - `eval/solvers.py` keeps the solver implementations (e.g. `hf_agent_solver`,
+   `claude_code`). If additional solvers are needed, register them there and pass
+   `-T solver_name=<name>` to swap them in without touching the task.
+ - `eval/task.py` registers `hf-benchmark-with-rubrics`, which wires
+   the dataset, solver, and rubric scorer into a single Inspect task and does the eval.
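The list above is the extension point for new solvers: register a factory in `eval/solvers.py`, add it to `SOLVER_REGISTRY`, and select it with `-T solver_name=<name>`. A minimal sketch of such a registration, assuming a hypothetical `echo_solver` that is not part of this commit:

```python
# Hypothetical addition to eval/solvers.py: a trivial solver that echoes the prompt,
# built the same way as the hf_agent_solver and claude_code factories in that module.
from inspect_ai.model import ChatMessageAssistant, ModelOutput
from inspect_ai.solver import Solver, solver
from inspect_ai.solver._task_state import TaskState


@solver(name="echo_solver")
def echo_solver(prefix: str = "ECHO: ") -> Solver:
    async def solve(state: TaskState, generate) -> TaskState:
        # Produce the assistant turn without calling any model.
        message = ChatMessageAssistant(content=prefix + state.input_text, source="generate")
        state.messages.append(message)
        state.output = ModelOutput.from_message(message)
        state.completed = True
        return state

    return solve


# Add it to SOLVER_REGISTRY so -T solver_name=echo_solver resolves to this factory.
SOLVER_REGISTRY["echo_solver"] = echo_solver
```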
+ ### Running the hf-agent (implemented in `agent/`; the `-T` args are optional)
+ ```bash
+ uv run inspect eval eval/task.py@hf-benchmark-with-rubrics \
+   -T dataset_name=akseljoonas/hf-agent-rubrics@train \
+   -T limit=25 \
+   -T solver_name=hf_agent_solver \
+   -T solver_kwargs='{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
+   --log-dir logs/inspect
  ```
 
+ Different benchmarks can be used by adding and running a new task in `eval/task.py`.
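Such a task would reuse the loader, solver registry, and scorer already defined in `eval/task.py`; a minimal sketch, assuming a hypothetical dataset and task name that are not part of this commit:

```python
# Hypothetical extra task added to eval/task.py, reusing _load_dataset, get_solver
# and rubric_scorer from that module.
@task(name="my-other-benchmark")
def my_other_benchmark(
    solver_name: str = "hf_agent_solver",
    judge_model: str = "gpt-4o-mini",
) -> Task:
    dataset = _load_dataset("username/other-rubrics", "train", limit=None)  # placeholder dataset
    return Task(
        dataset=dataset,
        solver=get_solver(solver_name),
        scorer=rubric_scorer(judge_model=judge_model),
    )
```

It would then run with `uv run inspect eval eval/task.py@my-other-benchmark`.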
+ ### Running Claude Code headlessly
 
+ The `claude_code` solver shells out to the `claude` CLI (`claude -p ... --output-format json`)
+ so you can benchmark Claude Code without any interactive UI. Example (the kwargs are optional):
 
+ ```bash
+ uv run inspect eval eval/task.py@hf-benchmark-with-rubrics \
+   -T solver_name=claude_code \
+   -T solver_kwargs='{"output_format":"json"}'
  ```
+ ## Scoring (implemented in `eval/rubric_eval.py`)
 
+ Scoring follows the RaR-Explicit formula: `score = Σ(weight × satisfied) / Σ(positive_weights)`.
 
+ The score is normalized to [0, 1] and clipped at 0 if pitfall penalties would make it negative.
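For illustration, a minimal sketch of that computation (the real implementation is `evaluate_with_rubrics` in `eval/rubric_eval.py`; the names here are illustrative):

```python
# Illustrative RaR-Explicit scoring: weighted sum of satisfied criteria,
# normalized by the total positive weight and clipped at 0.
def rar_explicit_score(checks: list[tuple[int, bool]]) -> float:
    """checks: (weight, satisfied) pairs; negative weights are pitfalls."""
    raw = sum(weight for weight, satisfied in checks if satisfied)
    positive_total = sum(weight for weight, _ in checks if weight > 0)
    if positive_total == 0:
        return 0.0
    return max(0.0, min(1.0, raw / positive_total))


# Example: two essentials satisfied (+5, +5), one optional missed (+2), one pitfall hit (-1)
# -> raw = 5 + 5 - 1 = 9, positive_total = 12, score = 0.75
print(rar_explicit_score([(5, True), (5, True), (2, False), (-1, True)]))
```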
eval/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from eval.task import hf_benchmark_with_rubrics
+
+ __all__ = ["hf_benchmark_with_rubrics"]
eval/generate_rubrics.py CHANGED
@@ -17,9 +17,10 @@ from typing import Any, Dict, List
  import litellm
  import pandas as pd
  from dotenv import load_dotenv
- from hf_dataset_io import df_to_hub
  from pydantic import BaseModel
 
 
  class Rubric(BaseModel):
      title: str
 
  import litellm
  import pandas as pd
  from dotenv import load_dotenv
  from pydantic import BaseModel
 
+ from eval.hf_io import df_to_hub
+
 
  class Rubric(BaseModel):
      title: str
eval/hf_agent_connector.py ADDED
@@ -0,0 +1,88 @@
+ from __future__ import annotations
+
+ import asyncio
+ import sys
+ from pathlib import Path
+ from typing import Any
+
+ from agent.config import Config, load_config
+ from agent.core.agent_loop import Handlers
+ from agent.core.session import Session
+ from agent.core.tools import ToolRouter
+
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
+ if str(PROJECT_ROOT) not in sys.path:
+     sys.path.insert(0, str(PROJECT_ROOT))
+
+
+ def _resolve_project_path(path: str | Path) -> Path:
+     candidate = Path(path)
+     if candidate.is_absolute():
+         return candidate
+     return (PROJECT_ROOT / candidate).resolve()
+
+
+ class AgentResponseGenerator:
+     """
+     Thin async wrapper that executes the existing agent loop once and
+     returns the assistant's final message.
+     """
+
+     def __init__(self, config_path: str | Path, max_iterations: int = 10) -> None:
+         self.config_path = _resolve_project_path(config_path)
+         self.config: Config = load_config(str(self.config_path))
+         self.max_iterations = max_iterations
+
+     @property
+     def model_name(self) -> str:
+         """Expose the agent model name for downstream logging."""
+         return self.config.model_name
+
+     async def run(self, prompt: str) -> str:
+         """
+         Execute the agent loop for a single prompt and return the assistant reply.
+         """
+         tool_router = ToolRouter(self.config.mcpServers)
+
+         async with tool_router:
+             session = Session(asyncio.Queue(), config=self.config)
+             session.tool_router = tool_router
+             await Handlers.run_agent(
+                 session,
+                 prompt,
+                 max_iterations=self.max_iterations,
+             )
+             return self._latest_assistant_response(session)
+
+     def _latest_assistant_response(self, session: Session) -> str:
+         """
+         Extract the final assistant response from the session history.
+         """
+         for message in reversed(session.context_manager.items):
+             if getattr(message, "role", None) == "assistant":
+                 return _content_to_text(getattr(message, "content", ""))
+
+         raise RuntimeError("Agent did not produce an assistant message.")
+
+
+ def _content_to_text(content: Any) -> str:
+     """
+     Convert LiteLLM content payloads (str or list[dict]) into plain text.
+     """
+     if isinstance(content, str):
+         return content
+
+     if isinstance(content, list):
+         parts: list[str] = []
+         for block in content:
+             if isinstance(block, dict):
+                 text = block.get("text")
+                 if text:
+                     parts.append(str(text))
+             else:
+                 text = getattr(block, "text", None)
+                 if text:
+                     parts.append(str(text))
+         return "\n".join(parts)
+
+     return str(content)
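For orientation, a minimal sketch of driving the connector directly, outside Inspect (the prompt and config path are illustrative; in the pipeline `hf_agent_solver` makes this call for every sample):

```python
import asyncio

from eval.hf_agent_connector import AgentResponseGenerator

# One-off run of the agent loop against the example config; hf_agent_solver in
# eval/solvers.py does the same thing per benchmark sample.
runner = AgentResponseGenerator("agent/config_mcp_example.json", max_iterations=10)
reply = asyncio.run(runner.run("How do I push a dataset to the Hub?"))
print(runner.model_name, reply)
```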
eval/{hf_dataset_io.py → hf_io.py} RENAMED
@@ -5,245 +5,12 @@ Reusable functions for uploading and downloading JSONL data to/from HuggingFace
  Supports the dataset_name@config_name notation for managing multiple configurations.
  """
 
- import json
- from pathlib import Path
- from typing import Dict, List, Optional, Union
 
  import pandas as pd
  from datasets import Dataset, load_dataset
 
 
- def upload_jsonl_to_hf(
-     jsonl_file: Union[str, Path],
-     dataset_spec: str,
-     split: str = "train",
-     private: bool = False,
- ) -> bool:
-     """
-     Upload a JSONL file to HuggingFace Hub as a dataset.
-
-     This function reads a JSONL file where each line is a complete JSON object,
-     converts it to a HuggingFace Dataset, and uploads it to the Hub.
-
-     Args:
-         jsonl_file: Path to the JSONL file to upload. Each line should be a valid
-             JSON object. Example format:
-             ```
-             {"question": "How to...", "solution": "...", "rubric": "[...]"}
-             {"question": "What is...", "solution": "...", "rubric": "[...]"}
-             ```
-
-         dataset_spec: Dataset specification in the format "dataset_name" or
-             "dataset_name@config_name". Examples:
-             - "username/my-dataset" (uses "default" config)
-             - "username/my-dataset@rubrics" (uses "rubrics" config)
-             - "username/my-dataset@evaluations" (uses "evaluations" config)
-
-             Multiple configs allow you to store different data types in the same
-             dataset repository (e.g., raw data, rubrics, evaluation results).
-
-         split: The dataset split name. Defaults to "train". Common values:
-             - "train": Training or main data
-             - "validation": Validation data
-             - "test": Test data
-
-         private: Whether to create a private dataset. Defaults to False (public).
-
-     Returns:
-         bool: True if upload succeeded, False otherwise
-
-     Raises:
-         FileNotFoundError: If the JSONL file doesn't exist
-         ValueError: If the JSONL file is empty or contains invalid JSON
-         Exception: For HuggingFace Hub upload errors
-
-     Example:
-         >>> # Upload rubrics with custom config
-         >>> upload_jsonl_to_hf(
-         ...     "qa_rubrics.jsonl",
-         ...     "username/hf-agent-benchmark@rubrics",
-         ...     split="train"
-         ... )
-
-         >>> # Upload evaluation results with different config
-         >>> upload_jsonl_to_hf(
-         ...     "evaluation_results.jsonl",
-         ...     "username/hf-agent-benchmark@evaluations",
-         ...     split="test"
-         ... )
-
-     Notes:
-         - Requires authentication via `huggingface-cli login` or HF_TOKEN env var
-         - If the dataset doesn't exist, it will be created automatically
-         - If it exists, the specified config/split will be updated
-         - Empty files will raise ValueError to prevent uploading invalid data
-     """
-     jsonl_path = Path(jsonl_file)
-
-     # Validate file exists
-     if not jsonl_path.exists():
-         raise FileNotFoundError(f"JSONL file not found: {jsonl_file}")
-
-     # Parse dataset specification
-     if "@" in dataset_spec:
-         dataset_name, config_name = dataset_spec.split("@", 1)
-     else:
-         dataset_name = dataset_spec
-         config_name = "default"
-
-     try:
-         print(f"\nUploading {jsonl_path.name} to HuggingFace Hub...")
-         print(f" Dataset: {dataset_name}")
-         print(f" Config: {config_name}")
-         print(f" Split: {split}")
-
-         # Load JSONL file
-         records = []
-         with open(jsonl_path, "r") as f:
-             for line_num, line in enumerate(f, start=1):
-                 line = line.strip()
-                 if line:  # Skip empty lines
-                     try:
-                         records.append(json.loads(line))
-                     except json.JSONDecodeError as e:
-                         raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e
-
-         if not records:
-             raise ValueError("JSONL file is empty or contains no valid records")
-
-         print(f" Loaded {len(records)} records from JSONL")
-
-         # Create HuggingFace Dataset
-         dataset = Dataset.from_list(records)
-
-         # Upload to HuggingFace Hub
-         dataset.push_to_hub(
-             dataset_name,
-             config_name=config_name,
-             split=split,
-             private=private,
-         )
-
-         print(
-             f"✓ Successfully uploaded to {dataset_name}@{config_name} (split: {split})"
-         )
-         return True
-
-     except Exception as e:
-         print(f"✗ Failed to upload to HuggingFace: {type(e).__name__}: {str(e)}")
-         print(f" JSONL file preserved at: {jsonl_path}")
-         return False
-
-
- def download_hf_to_jsonl(
-     dataset_spec: str,
-     output_file: Union[str, Path],
-     split: str = "train",
-     overwrite: bool = False,
- ) -> bool:
-     """
-     Download a dataset from HuggingFace Hub and save as JSONL.
-
-     This function downloads a dataset from the HuggingFace Hub and saves it as a
-     JSONL file where each line is a complete JSON object.
-
-     Args:
-         dataset_spec: Dataset specification in the format "dataset_name" or
-             "dataset_name@config_name". Examples:
-             - "username/my-dataset" (uses "default" config)
-             - "username/my-dataset@rubrics" (uses "rubrics" config)
-             - "username/my-dataset@evaluations" (uses "evaluations" config)
-
-         output_file: Path where the JSONL file will be saved. Will create parent
-             directories if they don't exist. Example: "data/downloaded_rubrics.jsonl"
-
-         split: The dataset split to download. Defaults to "train". Common values:
-             - "train": Training or main data
-             - "validation": Validation data
-             - "test": Test data
-             - "all": Download all splits (creates one JSONL with all data)
-
-         overwrite: Whether to overwrite existing file. Defaults to False.
-
-     Returns:
-         bool: True if download succeeded, False otherwise
-
-     Raises:
-         FileExistsError: If output file exists and overwrite=False
-         ValueError: If the dataset/config/split doesn't exist
-         Exception: For HuggingFace Hub download errors
-
-     Example:
-         >>> # Download rubrics from specific config
-         >>> download_hf_to_jsonl(
-         ...     "username/hf-agent-benchmark@rubrics",
-         ...     "local_rubrics.jsonl",
-         ...     split="train"
-         ... )
-
-         >>> # Download evaluation results
-         >>> download_hf_to_jsonl(
-         ...     "username/hf-agent-benchmark@evaluations",
-         ...     "local_evaluations.jsonl",
-         ...     split="test",
-         ...     overwrite=True
-         ... )
-
-     Notes:
-         - Requires authentication for private datasets via `huggingface-cli login`
-         - Downloaded data will be in the same format as uploaded (preserves structure)
-         - Each line in the output JSONL is a complete, valid JSON object
-         - Large datasets may take time to download
-     """
-     output_path = Path(output_file)
-
-     # Check if file exists
-     if output_path.exists() and not overwrite:
-         raise FileExistsError(
-             f"Output file already exists: {output_file}. "
-             "Use overwrite=True to replace it."
-         )
-
-     # Parse dataset specification
-     if "@" in dataset_spec:
-         dataset_name, config_name = dataset_spec.split("@", 1)
-     else:
-         dataset_name = dataset_spec
-         config_name = "default"
-
-     try:
-         print("\nDownloading from HuggingFace Hub...")
-         print(f" Dataset: {dataset_name}")
-         print(f" Config: {config_name}")
-         print(f" Split: {split}")
-
-         # Download dataset from HuggingFace Hub
-         dataset = load_dataset(
-             dataset_name,
-             name=config_name,
-             split=split,
-         )
-
-         print(f" Downloaded {len(dataset)} records")
-
-         # Create parent directories if needed
-         output_path.parent.mkdir(parents=True, exist_ok=True)
-
-         # Write to JSONL
-         with open(output_path, "w") as f:
-             for record in dataset:
-                 # Convert record to JSON and write as line
-                 f.write(json.dumps(record) + "\n")
-
-         print(f"✓ Successfully saved to {output_path}")
-         print(f" Total records: {len(dataset)}")
-         return True
-
-     except Exception as e:
-         print(f"✗ Failed to download from HuggingFace: {type(e).__name__}: {str(e)}")
-         return False
-
-
  def list_dataset_configs(dataset_name: str) -> Optional[List[str]]:
      """
      List all available configs for a dataset on HuggingFace Hub.
@@ -269,60 +36,6 @@ def list_dataset_configs(dataset_name: str) -> Optional[List[str]]:
      return None
 
 
- def get_dataset_info(dataset_spec: str, split: str = "train") -> Optional[Dict]:
-     """
-     Get information about a dataset on HuggingFace Hub.
-
-     Args:
-         dataset_spec: Dataset specification ("dataset_name" or "dataset_name@config")
-         split: The split to get info for (default: "train")
-
-     Returns:
-         Dictionary with dataset info, or None if unable to retrieve
-
-     Example:
-         >>> info = get_dataset_info("username/hf-agent-benchmark@rubrics")
-         >>> print(f"Records: {info['num_rows']}")
-         >>> print(f"Columns: {info['column_names']}")
-     """
-     # Parse dataset specification
-     if "@" in dataset_spec:
-         dataset_name, config_name = dataset_spec.split("@", 1)
-     else:
-         dataset_name = dataset_spec
-         config_name = "default"
-
-     try:
-         # Load just to get info (streaming mode for efficiency)
-         dataset = load_dataset(
-             dataset_name,
-             name=config_name,
-             split=split,
-             streaming=True,
-         )
-
-         # Get basic info
-         info = {
-             "dataset_name": dataset_name,
-             "config_name": config_name,
-             "split": split,
-             "features": str(dataset.features),
-             "column_names": dataset.column_names
-             if hasattr(dataset, "column_names")
-             else None,
-         }
-
-         # Try to get row count (only works for non-streaming)
-         dataset_full = load_dataset(dataset_name, name=config_name, split=split)
-         info["num_rows"] = len(dataset_full)
-
-         return info
-
-     except Exception as e:
-         print(f"✗ Failed to get dataset info: {type(e).__name__}: {str(e)}")
-         return None
-
-
  def df_to_hub(
      df: pd.DataFrame,
      dataset_spec: str,
@@ -500,18 +213,3 @@ def hub_to_df(
      except Exception as e:
          print(f"✗ Failed to download from HuggingFace: {type(e).__name__}: {str(e)}")
          return None
-
-
- if __name__ == "__main__":
-     # Example usage
-     print("HuggingFace Dataset I/O Utilities")
-     print("=" * 60)
-     print("\nExample: Upload rubrics")
-     print(' upload_jsonl_to_hf("qa_rubrics.jsonl", "username/dataset@rubrics")')
-     print("\nExample: Download evaluations")
-     print(' download_hf_to_jsonl("username/dataset@evaluations", "local.jsonl")')
-     print("\nExample: List configs")
-     print(' list_dataset_configs("username/dataset")')
-     print("\nExample: Get dataset info")
-     print(' get_dataset_info("username/dataset@rubrics")')
-     print("=" * 60)
 
  Supports the dataset_name@config_name notation for managing multiple configurations.
  """
 
+ from typing import List, Optional
 
  import pandas as pd
  from datasets import Dataset, load_dataset
 
 
  def list_dataset_configs(dataset_name: str) -> Optional[List[str]]:
      """
      List all available configs for a dataset on HuggingFace Hub.
 
      return None
 
 
  def df_to_hub(
      df: pd.DataFrame,
      dataset_spec: str,
 
      except Exception as e:
          print(f"✗ Failed to download from HuggingFace: {type(e).__name__}: {str(e)}")
          return None
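The helpers that survive the rename, `df_to_hub` and `hub_to_df`, keep the `dataset@config` spec notation; a minimal usage sketch (the dataset spec is illustrative), matching the example the old README carried:

```python
import pandas as pd

from eval.hf_io import df_to_hub, hub_to_df

# Upload a DataFrame to one config of a dataset repo, then read it back.
df = pd.DataFrame([{"question": "How to...", "solution": "..."}])
df_to_hub(df, "username/dataset@rubrics", split="train")
rubrics_df = hub_to_df("username/dataset@rubrics", split="train")
```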
eval/{evaluate.py → rubric_eval.py} RENAMED
@@ -4,13 +4,9 @@ Rubric-based evaluation following the "Rubrics as Rewards" paper.
  Implements RaR-Explicit: Weighted sum of individual criterion scores (Equation 1)
  """
 
- import json
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from typing import Dict, List, Optional
 
  import litellm
- import pandas as pd
- from hf_dataset_io import df_to_hub
  from pydantic import BaseModel
 
 
@@ -32,17 +28,6 @@ class RubricEvaluation(BaseModel):
      normalized_score: float  # Score normalized to [0, 1]
 
 
- class EvaluatedResponse(BaseModel):
-     """Complete evaluated response with rubric scores."""
-
-     discussion_title: str
-     discussion_url: str
-     question: str
-     response: str
-     reference_answer: str
-     evaluation: RubricEvaluation
-
-
  CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.
 
  Question: {question}
@@ -69,32 +54,6 @@ class RubricData(BaseModel):
      weight: int
 
 
- def load_rubrics_from_file(rubric_file: str) -> Dict[str, List[RubricData]]:
-     """
-     Load rubrics from JSONL file and index by question.
-
-     Args:
-         rubric_file: Path to rubric JSONL file
-
-     Returns:
-         Dictionary mapping questions to their rubrics
-     """
-     rubrics_by_question = {}
-
-     with open(rubric_file, "r") as f:
-         for line in f:
-             entry = json.loads(line)
-             question = entry["question"]
-
-             # Parse rubric JSON string
-             rubric_data = json.loads(entry["rubric"])
-             rubrics = [RubricData(**r) for r in rubric_data["rubrics"]]
-
-             rubrics_by_question[question] = rubrics
-
-     return rubrics_by_question
-
-
  def check_criterion(
      question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
  ) -> CriterionCheck:
@@ -137,7 +96,6 @@ def check_criterion(
  def evaluate_with_rubrics(
      question: str,
      response: str,
-     reference_answer: str,
      rubrics: List[RubricData],
      model: str = "gpt-4o-mini",
  ) -> RubricEvaluation:
@@ -182,176 +140,3 @@ def evaluate_with_rubrics(
          normalized_score=normalized_score,
          criterion_checks=checks,
      )
-
-
- def evaluate_dataset_with_rubrics(
-     input_file: str,
-     rubric_file: str,
-     ground_truth_file: str,
-     output_file: str = "rubric_evaluation_results.jsonl",
-     model: str = "gpt-4o-mini",
-     max_concurrent: int = 10,
-     limit: Optional[int] = None,
-     push_to_hub: Optional[str] = None,
- ) -> None:
-     """
-     Evaluate all responses using rubric-based assessment.
-
-     Args:
-         input_file: Path to JSONL with responses to evaluate
-         rubric_file: Path to JSONL with rubrics (output from generate_rubrics.py)
-         ground_truth_file: Path to JSONL with ground truth answers
-         output_file: Path to output JSONL file
-         model: LLM model for judging
-         max_concurrent: Maximum concurrent evaluations
-         limit: Optional limit on number of examples
-         push_to_hub: Optional HuggingFace dataset spec (e.g., username/dataset@evaluations)
-     """
-     # Load data
-     print(f"Loading responses from {input_file}...")
-     with open(input_file, "r") as f:
-         responses = [json.loads(line) for line in f]
-
-     print(f"Loading rubrics from {rubric_file}...")
-     rubrics_by_question = load_rubrics_from_file(rubric_file)
-
-     print(f"Loading ground truth from {ground_truth_file}...")
-     with open(ground_truth_file, "r") as f:
-         ground_truths = [json.loads(line) for line in f]
-
-     if limit:
-         responses = responses[:limit]
-         ground_truths = ground_truths[:limit]
-
-     print(f"Loaded {len(responses)} responses to evaluate")
-     print(f"Judge model: {model}")
-
-     # Match responses with rubrics and ground truth
-     evaluation_tasks = []
-     for response_data, gt_data in zip(responses, ground_truths):
-         question = gt_data["question"]
-
-         # Find rubrics for this question
-         rubrics = rubrics_by_question.get(question)
-         if not rubrics:
-             print(f"Warning: No rubrics found for question: {question[:50]}...")
-             continue
-
-         evaluation_tasks.append(
-             {
-                 "question": question,
-                 "response": response_data["solution"],
-                 "reference_answer": gt_data["solution"],
-                 "rubrics": rubrics,
-                 "metadata": {
-                     "discussion_title": response_data.get("discussion_title", ""),
-                     "discussion_url": response_data.get("discussion_url", ""),
-                 },
-             }
-         )
-
-     print(
-         f"Running {len(evaluation_tasks)} evaluations with {max_concurrent} parallel workers..."
-     )
-
-     # Run evaluations in parallel
-     results = []
-     with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
-         # Submit all tasks
-         future_to_idx = {}
-         for idx, task in enumerate(evaluation_tasks):
-             future = executor.submit(
-                 evaluate_with_rubrics,
-                 question=task["question"],
-                 response=task["response"],
-                 reference_answer=task["reference_answer"],
-                 rubrics=task["rubrics"],
-                 model=model,
-             )
-             future_to_idx[future] = idx
-
-         # Collect results in order
-         results = [None] * len(evaluation_tasks)
-         completed = 0
-         for future in as_completed(future_to_idx):
-             idx = future_to_idx[future]
-             results[idx] = future.result()
-             completed += 1
-             print(f"Completed: {completed}/{len(evaluation_tasks)}", end="\r")
-
-     print()  # New line after progress
-
-     # Combine results with metadata
-     output_data = []
-     total_score = 0.0
-
-     for task, evaluation in zip(evaluation_tasks, results):
-         evaluated_response = EvaluatedResponse(
-             discussion_title=task["metadata"]["discussion_title"],
-             discussion_url=task["metadata"]["discussion_url"],
-             question=task["question"],
-             response=task["response"],
-             reference_answer=task["reference_answer"],
-             evaluation=evaluation,
-         )
-         output_data.append(evaluated_response)
-         total_score += evaluation.normalized_score
-
-     # Convert to DataFrame for HuggingFace upload
-     results_df = pd.DataFrame([entry.model_dump() for entry in output_data])
-
-     # Upload to HuggingFace if specified (before saving JSONL)
-     if push_to_hub:
-         print(f"\nUploading to HuggingFace: {push_to_hub}")
-         upload_success = df_to_hub(
-             df=results_df,
-             dataset_spec=push_to_hub,
-             split="test",
-             private=False,
-         )
-         if not upload_success:
-             print("Warning: HuggingFace upload failed, but continuing to save JSONL...")
-
-     # Write results to JSONL file
-     print(f"\nWriting results to {output_file}...")
-     with open(output_file, "w") as f:
-         for entry in output_data:
-             f.write(entry.model_dump_json() + "\n")
-
-     # Print summary
-     avg_score = total_score / len(output_data) if output_data else 0.0
-
-     print("\n" + "=" * 60)
-     print("RUBRIC-BASED EVALUATION SUMMARY")
-     print("=" * 60)
-     print(f"Total examples: {len(output_data)}")
-     print(f"Judge model: {model}")
-     print(f"Average normalized score: {avg_score:.3f}")
-     print(f"Average percentage: {avg_score * 100:.1f}%")
-
-     # Per-criterion statistics
-     total_satisfied = sum(
-         sum(1 for check in eval.evaluation.criterion_checks if check.satisfied)
-         for eval in output_data
-     )
-     total_criteria = sum(len(eval.evaluation.criterion_checks) for eval in output_data)
-     satisfaction_rate = total_satisfied / total_criteria if total_criteria > 0 else 0.0
-     print(f"Criteria satisfaction rate: {satisfaction_rate * 100:.1f}%")
-
-     if push_to_hub and upload_success:
-         print(f"Pushed to: {push_to_hub}")
-
-     print("=" * 60)
-
-
- if __name__ == "__main__":
-     evaluate_dataset_with_rubrics(
-         input_file="eval/qa_pairs_accepted.jsonl",
-         rubric_file="eval/qa_rubrics.jsonl",
-         ground_truth_file="eval/qa_pairs_accepted.jsonl",
-         output_file="rubric_evaluation.jsonl",
-         model="gpt-4o-mini",
-         max_concurrent=10,
-         limit=30,  # Set to None to evaluate all
-         push_to_hub="akseljoonas/hf-agent-benchmark@ground-truth",  # Set to "username/dataset@evaluations" to upload
-     )
4
  Implements RaR-Explicit: Weighted sum of individual criterion scores (Equation 1)
5
  """
6
 
7
+ from typing import List, Optional
 
 
8
 
9
  import litellm
 
 
10
  from pydantic import BaseModel
11
 
12
 
 
28
  normalized_score: float # Score normalized to [0, 1]
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
31
  CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.
32
 
33
  Question: {question}
 
54
  weight: int
55
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def check_criterion(
58
  question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
59
  ) -> CriterionCheck:
 
96
  def evaluate_with_rubrics(
97
  question: str,
98
  response: str,
 
99
  rubrics: List[RubricData],
100
  model: str = "gpt-4o-mini",
101
  ) -> RubricEvaluation:
 
140
  normalized_score=normalized_score,
141
  criterion_checks=checks,
142
  )
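After the rename, `evaluate_with_rubrics` no longer takes a `reference_answer`; a minimal sketch of calling it directly, with the rubric JSON parsed the same way `eval/task.py` does (the record content is illustrative):

```python
import json

from eval.rubric_eval import RubricData, evaluate_with_rubrics

# One row of the rubric dataset: the "rubric" field is a JSON string with a
# "rubrics" list of weighted criteria (shown empty here for brevity).
record = {"question": "How do I ...?", "rubric": '{"rubrics": []}'}
rubrics = [RubricData(**r) for r in json.loads(record["rubric"])["rubrics"]]

evaluation = evaluate_with_rubrics(
    question=record["question"],
    response="<model response to grade>",
    rubrics=rubrics,
    model="gpt-4o-mini",
)
print(evaluation.normalized_score)
```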
eval/solvers.py ADDED
@@ -0,0 +1,116 @@
+ """
+ Collection of Inspect AI solvers used by the rubric task.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ from typing import Callable, Dict, List, Sequence
+
+ from inspect_ai.model import ChatMessageAssistant, ModelOutput
+ from inspect_ai.solver import Solver, solver
+ from inspect_ai.solver._task_state import TaskState
+
+ from eval.hf_agent_connector import AgentResponseGenerator
+
+
+ async def _run_subprocess(command: Sequence[str]) -> str:
+     process = await asyncio.create_subprocess_exec(
+         *command,
+         stdout=asyncio.subprocess.PIPE,
+         stderr=asyncio.subprocess.PIPE,
+     )
+     stdout, stderr = await process.communicate()
+     if process.returncode != 0:
+         raise RuntimeError(
+             f"Command {' '.join(command)} failed with code {process.returncode}:\n"
+             f"{stderr.decode().strip()}"
+         )
+     return stdout.decode().strip()
+
+
+ @solver(name="hf_agent_solver")
+ def hf_agent_solver(
+     config_path: str = "agent/config_mcp_example.json",
+     max_iterations: int = 10,
+ ) -> Solver:
+     runner = AgentResponseGenerator(
+         config_path=config_path,
+         max_iterations=max_iterations,
+     )
+
+     async def solve(state: TaskState, generate) -> TaskState:
+         response = await runner.run(state.input_text)
+         assistant_message = ChatMessageAssistant(
+             content=response,
+             model=runner.model_name,
+             source="generate",
+         )
+         state.messages.append(assistant_message)
+         state.output = ModelOutput.from_message(assistant_message)
+         state.completed = True
+         return state
+
+     return solve
+
+
+ @solver(name="claude_code")
+ def claude_code(
+     output_format: str = "json",
+     mcp_config: str | None = None,
+ ) -> Solver:
+     if output_format not in {"text", "json", "stream-json"}:
+         raise ValueError("output_format must be one of: text, json, stream-json")
+
+     async def solve(state: TaskState, generate) -> TaskState:
+         prompt = state.input_text
+
+         cmd: List[str] = ["claude", "-p", prompt, "--output-format", output_format]
+         if mcp_config:
+             cmd += ["--mcp-config", mcp_config]
+
+         stdout = await _run_subprocess(cmd)
+         response_text = stdout
+         session_id = None
+
+         if output_format in {"json", "stream-json"}:
+             # stream-json may emit multiple JSON objects; take the last complete line
+             candidate_line = stdout.strip().splitlines()[-1]
+             try:
+                 payload = json.loads(candidate_line)
+                 response_text = (
+                     payload.get("result") or payload.get("message", "") or stdout
+                 )
+                 session_id = payload.get("session_id")
+             except (json.JSONDecodeError, AttributeError):
+                 response_text = stdout
+
+         assistant_message = ChatMessageAssistant(
+             content=response_text,
+             model="claude-code",
+             source="generate",
+             metadata={"session_id": session_id} if session_id else None,
+         )
+         state.messages.append(assistant_message)
+         state.output = ModelOutput.from_message(assistant_message)
+         state.completed = True
+         return state
+
+     return solve
+
+
+ SOLVER_REGISTRY: Dict[str, Callable[..., Solver]] = {
+     "hf_agent_solver": hf_agent_solver,
+     "claude_code": claude_code,
+ }
+
+
+ def get_solver(name: str, **kwargs) -> Solver:
+     try:
+         factory = SOLVER_REGISTRY[name]
+     except KeyError as exc:
+         available = ", ".join(sorted(SOLVER_REGISTRY))
+         raise ValueError(f"Unknown solver '{name}'. Available: {available}") from exc
+
+     return factory(**kwargs)
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inspect AI task definition that runs the existing agent and reuses the rubric scorer.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import json
9
+ import sys
10
+ from pathlib import Path
11
+ from typing import Any, Sequence
12
+
13
+ from inspect_ai import Task, task
14
+ from inspect_ai.dataset import Sample, hf_dataset
15
+ from inspect_ai.scorer import Score, Target, mean, scorer
16
+ from inspect_ai.solver._task_state import TaskState
17
+
18
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
19
+ if str(PROJECT_ROOT) not in sys.path:
20
+ sys.path.insert(0, str(PROJECT_ROOT))
21
+
22
+ from eval.rubric_eval import RubricData, evaluate_with_rubrics # noqa: E402
23
+ from eval.solvers import get_solver # noqa: E402
24
+
25
+
26
+ def _record_to_sample(record: dict[str, Any]) -> Sample:
27
+ rubric_payload = json.loads(record["rubric"])
28
+ rubrics = rubric_payload.get("rubrics", [])
29
+
30
+ metadata = {
31
+ "question": record["question"],
32
+ "discussion_title": record.get("discussion_title"),
33
+ "discussion_url": record.get("discussion_url"),
34
+ "rubric_title": rubric_payload.get("title"),
35
+ "rubric_description": rubric_payload.get("description"),
36
+ "rubrics": rubrics,
37
+ }
38
+
39
+ return Sample(
40
+ input=record["question"],
41
+ target=record["solution"],
42
+ id=record.get("discussion_topic_id"),
43
+ metadata=metadata,
44
+ )
45
+
46
+
47
+ def _load_dataset(dataset_name: str, split: str, limit: int | None) -> Sequence[Sample]:
48
+ return hf_dataset(
49
+ dataset_name, sample_fields=_record_to_sample, split=split, limit=limit
50
+ )
51
+
52
+
53
+ def _metadata_to_rubrics(metadata: dict[str, Any]) -> list[RubricData]:
54
+ raw_rubrics = metadata.get("rubrics", [])
55
+ return [RubricData(**rubric) for rubric in raw_rubrics]
56
+
57
+
58
+ @scorer(metrics=[mean()], name="rubric_scorer")
59
+ def rubric_scorer(judge_model: str = "gpt-4o-mini"):
60
+ async def score(state: TaskState, target: Target) -> Score:
61
+ response_text = state.output.completion or state.output.message.text
62
+ question = state.metadata.get("question", state.input_text)
63
+ rubrics = _metadata_to_rubrics(state.metadata)
64
+
65
+ evaluation = await asyncio.to_thread(
66
+ evaluate_with_rubrics,
67
+ question,
68
+ response_text,
69
+ rubrics,
70
+ judge_model,
71
+ )
72
+
73
+ score_metadata = {
74
+ "raw_score": evaluation.raw_score,
75
+ "criterion_checks": [
76
+ check.model_dump() for check in evaluation.criterion_checks
77
+ ],
78
+ "discussion_title": state.metadata.get("discussion_title"),
79
+ "discussion_url": state.metadata.get("discussion_url"),
80
+ "reference_answer": target.text,
81
+ }
82
+
83
+ return Score(
84
+ value=evaluation.normalized_score,
85
+ answer=response_text,
86
+ explanation=f"Normalized score {evaluation.normalized_score:.3f}",
87
+ metadata=score_metadata,
88
+ )
89
+
90
+ return score
91
+
92
+
93
+ @task(name="hf-benchmark-with-rubrics")
94
+ def hf_benchmark_with_rubrics(
95
+ solver_name: str = "hf_agent_solver",
96
+ solver_kwargs: dict[str, Any] = {
97
+ "max_iterations": 10,
98
+ "config_path": "agent/config_mcp_example.json",
99
+ },
100
+ dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
101
+ limit: int | None = None,
102
+ judge_model: str = "gpt-4o-mini",
103
+ ) -> Task:
104
+ if "@" not in dataset_name:
105
+ raise ValueError("Dataset name must be in the format 'author/dataset@split'")
106
+ dataset_name, dataset_split = dataset_name.split("@")
107
+ dataset = _load_dataset(dataset_name, dataset_split, limit=limit)
108
+
109
+ return Task(
110
+ dataset=dataset,
111
+ solver=get_solver(solver_name, **solver_kwargs),
112
+ scorer=rubric_scorer(judge_model=judge_model),
113
+ metadata={
114
+ "dataset_name": dataset_name,
115
+ "dataset_split": dataset_split,
116
+ "solver_name": solver_name,
117
+ "judge_model": judge_model,
118
+ },
119
+ )