Spaces: Running on CPU Upgrade

Commit 035d186 · Parent(s): ecacd30 · updated eval

Browse files
- eval/README.md +33 -36
- eval/__init__.py +3 -0
- eval/generate_rubrics.py +2 -1
- eval/hf_agent_connector.py +88 -0
- eval/{hf_dataset_io.py → hf_io.py} +1 -303
- eval/{evaluate.py → rubric_eval.py} +1 -216
- eval/solvers.py +116 -0
- eval/task.py +119 -0
eval/README.md CHANGED

@@ -5,10 +5,10 @@ Rubric-based evaluation pipeline implementing [Rubrics as Rewards](https://arxiv

 ## Pipeline

 ```
-QA pairs → generate_rubrics.py →
+QA pairs → generate_rubrics.py → `eval/task.py@hf-benchmark-with-rubrics` → scores
 ```

-### 1. Generate Rubrics
+### 1. Generate Rubrics (if not already generated)

 Creates instance-specific evaluation criteria from question + reference answer.

@@ -27,50 +27,47 @@ python eval/generate_rubrics.py \

 **Output:** 7-20 weighted criteria per question (Essential: +5, Important: +3-4, Optional: +1-2, Pitfall: -1 to -2)

-### 2. Evaluate Responses
+### 2. Evaluate Responses (Inspect)
+
+Load your rubric dataset, run a solver, and score with `rubric_scorer` using `inspect-ai`.
+
+Files:
+- `eval/hf_agent_connector.py` contains a lightweight bridge that spins up
+  the existing hf-agent stack in `agent/` (tools, MCP, LiteLLM loop) and returns the assistant reply.
+- `eval/solvers.py` keeps the solver implementations (e.g. `hf_agent_solver`,
+  `claude_code`). If additional solvers are needed, register them there and pass
+  `-T solver_name=<name>` to swap them in without touching the task.
+- `eval/task.py` registers `hf-benchmark-with-rubrics`, which wires
+  the dataset, solver, and rubric scorer into a single Inspect task and runs the eval.
+
+### Running the hf-agent (implemented in `agent/`) (args are optional)
+```bash
+uv run inspect eval eval/task.py@hf-benchmark-with-rubrics \
+  -T dataset_name=akseljoonas/hf-agent-rubrics \
+  -T dataset_split=train \
+  -T limit=25 \
+  -T solver_name=hf_agent_solver \
+  -T solver_kwargs='{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
+  --log-dir logs/inspect
 ```

-## HuggingFace Integration
-
-Both scripts upload DataFrames before saving JSONL:
-
-from hf_dataset_io import df_to_hub, hub_to_df
+Different benchmarks can be used by adding and running a new task in `eval/task.py`.
+
+### Running Claude Code headlessly
+
+The `claude_code` solver shells out to the `claude` CLI (`claude -p ... --output-format json`),
+so you can benchmark Claude Code without any interactive UI.
+
+Claude Code command example (kwargs are optional):
+```bash
+uv run inspect eval eval/task.py@hf-benchmark-with-rubrics \
+  -T solver_name=claude_code \
+  -T solver_kwargs='{"allowed_tools":"Bash,Read","output_format":"json"}'
 ```

-Use `@config` notation to organize: `@rubrics`, `@evaluations`, `@ground-truth`
-
-## Key Parameters
-
-- **--max-concurrent**: Parallel workers (default: 30 for rubrics, 10 for eval)
-- **--push-to-hub**: Auto-upload to HF Hub (e.g., `user/dataset@rubrics`)
-- **--model**: LiteLLM model string
-- **split**: `train` for rubrics, `test` for evaluations

-## Scoring
+## Scoring (implemented in `eval/rubric_eval.py`)

-RaR-Explicit: `score = Σ(weight × satisfied) / Σ(positive_weights)`
+Scoring follows the RaR-Explicit formula: `score = Σ(weight × satisfied) / Σ(positive_weights)`.
+
+The score is normalized to [0, 1] and clipped if pitfalls make it negative.
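
To make the formula above concrete, here is a minimal sketch of the RaR-Explicit arithmetic. It is not the repo's implementation — in `eval/rubric_eval.py` the `satisfied` flags come from an LLM judge — it only illustrates the weighted sum, normalization, and clipping the README describes:

```python
# Minimal sketch of RaR-Explicit scoring; weights follow the README's ranges
# (Essential +5, Important +3-4, Optional +1-2, Pitfall -1 to -2).
def rar_explicit_score(checks: list[tuple[int, bool]]) -> float:
    """checks = [(weight, satisfied), ...]; pitfalls carry negative weights."""
    raw = sum(weight for weight, satisfied in checks if satisfied)
    positive_total = sum(weight for weight, _ in checks if weight > 0)
    normalized = raw / positive_total if positive_total else 0.0
    # Clip to [0, 1]: triggered pitfalls can push the raw score below zero.
    return max(0.0, min(1.0, normalized))


# Essential (+5) satisfied, Important (+3) missed, Pitfall (-1) triggered -> 4/8 = 0.5
print(rar_explicit_score([(5, True), (3, False), (-1, True)]))
```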
eval/__init__.py ADDED

@@ -0,0 +1,3 @@
+from eval.task import hf_benchmark_with_rubrics
+
+__all__ = ["hf_benchmark_with_rubrics"]
eval/generate_rubrics.py CHANGED

@@ -17,9 +17,10 @@ from typing import Any, Dict, List

 import litellm
 import pandas as pd
 from dotenv import load_dotenv
-from hf_dataset_io import df_to_hub
 from pydantic import BaseModel

+from eval.hf_io import df_to_hub
+

 class Rubric(BaseModel):
     title: str
eval/hf_agent_connector.py ADDED

@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+import asyncio
+import sys
+from pathlib import Path
+from typing import Any
+
+from agent.config import Config, load_config
+from agent.core.agent_loop import Handlers
+from agent.core.session import Session
+from agent.core.tools import ToolRouter
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+
+def _resolve_project_path(path: str | Path) -> Path:
+    candidate = Path(path)
+    if candidate.is_absolute():
+        return candidate
+    return (PROJECT_ROOT / candidate).resolve()
+
+
+class AgentResponseGenerator:
+    """
+    Thin async wrapper that executes the existing agent loop once and
+    returns the assistant's final message.
+    """
+
+    def __init__(self, config_path: str | Path, max_iterations: int = 10) -> None:
+        self.config_path = _resolve_project_path(config_path)
+        self.config: Config = load_config(str(self.config_path))
+        self.max_iterations = max_iterations
+
+    @property
+    def model_name(self) -> str:
+        """Expose the agent model name for downstream logging."""
+        return self.config.model_name
+
+    async def run(self, prompt: str) -> str:
+        """
+        Execute the agent loop for a single prompt and return the assistant reply.
+        """
+        tool_router = ToolRouter(self.config.mcpServers)
+
+        async with tool_router:
+            session = Session(asyncio.Queue(), config=self.config)
+            session.tool_router = tool_router
+            await Handlers.run_agent(
+                session,
+                prompt,
+                max_iterations=self.max_iterations,
+            )
+            return self._latest_assistant_response(session)
+
+    def _latest_assistant_response(self, session: Session) -> str:
+        """
+        Extract the final assistant response from the session history.
+        """
+        for message in reversed(session.context_manager.items):
+            if getattr(message, "role", None) == "assistant":
+                return _content_to_text(getattr(message, "content", ""))
+
+        raise RuntimeError("Agent did not produce an assistant message.")
+
+
+def _content_to_text(content: Any) -> str:
+    """
+    Convert LiteLLM content payloads (str or list[dict]) into plain text.
+    """
+    if isinstance(content, str):
+        return content
+
+    if isinstance(content, list):
+        parts: list[str] = []
+        for block in content:
+            if isinstance(block, dict):
+                text = block.get("text")
+                if text:
+                    parts.append(str(text))
+            else:
+                text = getattr(block, "text", None)
+                if text:
+                    parts.append(str(text))
+        return "\n".join(parts)
+
+    return str(content)
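
For orientation, a hedged usage sketch of the bridge above, assuming it is run from the project root with the `agent/` package importable; the config path is taken from the README example and the prompt is purely illustrative:

```python
# Illustrative only: drive the AgentResponseGenerator bridge outside of Inspect.
import asyncio

from eval.hf_agent_connector import AgentResponseGenerator


async def main() -> None:
    runner = AgentResponseGenerator(
        config_path="agent/config_mcp_example.json",  # path from the README example
        max_iterations=10,
    )
    reply = await runner.run("How do I push a dataset to the Hub?")  # hypothetical prompt
    print(f"[{runner.model_name}] {reply}")


asyncio.run(main())
```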
eval/{hf_dataset_io.py → hf_io.py} RENAMED

@@ -5,245 +5,12 @@ Reusable functions for uploading and downloading JSONL data to/from HuggingFace

 Supports the dataset_name@config_name notation for managing multiple configurations.
 """

-import json
-from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import List, Optional

 import pandas as pd
 from datasets import Dataset, load_dataset


-def upload_jsonl_to_hf(
-    jsonl_file: Union[str, Path],
-    dataset_spec: str,
-    split: str = "train",
-    private: bool = False,
-) -> bool:
-    """
-    Upload a JSONL file to HuggingFace Hub as a dataset.
-
-    This function reads a JSONL file where each line is a complete JSON object,
-    converts it to a HuggingFace Dataset, and uploads it to the Hub.
-
-    Args:
-        jsonl_file: Path to the JSONL file to upload. Each line should be a valid
-            JSON object. Example format:
-            ```
-            {"question": "How to...", "solution": "...", "rubric": "[...]"}
-            {"question": "What is...", "solution": "...", "rubric": "[...]"}
-            ```
-
-        dataset_spec: Dataset specification in the format "dataset_name" or
-            "dataset_name@config_name". Examples:
-            - "username/my-dataset" (uses "default" config)
-            - "username/my-dataset@rubrics" (uses "rubrics" config)
-            - "username/my-dataset@evaluations" (uses "evaluations" config)
-
-            Multiple configs allow you to store different data types in the same
-            dataset repository (e.g., raw data, rubrics, evaluation results).
-
-        split: The dataset split name. Defaults to "train". Common values:
-            - "train": Training or main data
-            - "validation": Validation data
-            - "test": Test data
-
-        private: Whether to create a private dataset. Defaults to False (public).
-
-    Returns:
-        bool: True if upload succeeded, False otherwise
-
-    Raises:
-        FileNotFoundError: If the JSONL file doesn't exist
-        ValueError: If the JSONL file is empty or contains invalid JSON
-        Exception: For HuggingFace Hub upload errors
-
-    Example:
-        >>> # Upload rubrics with custom config
-        >>> upload_jsonl_to_hf(
-        ...     "qa_rubrics.jsonl",
-        ...     "username/hf-agent-benchmark@rubrics",
-        ...     split="train"
-        ... )
-
-        >>> # Upload evaluation results with different config
-        >>> upload_jsonl_to_hf(
-        ...     "evaluation_results.jsonl",
-        ...     "username/hf-agent-benchmark@evaluations",
-        ...     split="test"
-        ... )
-
-    Notes:
-        - Requires authentication via `huggingface-cli login` or HF_TOKEN env var
-        - If the dataset doesn't exist, it will be created automatically
-        - If it exists, the specified config/split will be updated
-        - Empty files will raise ValueError to prevent uploading invalid data
-    """
-    jsonl_path = Path(jsonl_file)
-
-    # Validate file exists
-    if not jsonl_path.exists():
-        raise FileNotFoundError(f"JSONL file not found: {jsonl_file}")
-
-    # Parse dataset specification
-    if "@" in dataset_spec:
-        dataset_name, config_name = dataset_spec.split("@", 1)
-    else:
-        dataset_name = dataset_spec
-        config_name = "default"
-
-    try:
-        print(f"\nUploading {jsonl_path.name} to HuggingFace Hub...")
-        print(f"  Dataset: {dataset_name}")
-        print(f"  Config: {config_name}")
-        print(f"  Split: {split}")
-
-        # Load JSONL file
-        records = []
-        with open(jsonl_path, "r") as f:
-            for line_num, line in enumerate(f, start=1):
-                line = line.strip()
-                if line:  # Skip empty lines
-                    try:
-                        records.append(json.loads(line))
-                    except json.JSONDecodeError as e:
-                        raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e
-
-        if not records:
-            raise ValueError("JSONL file is empty or contains no valid records")
-
-        print(f"  Loaded {len(records)} records from JSONL")
-
-        # Create HuggingFace Dataset
-        dataset = Dataset.from_list(records)
-
-        # Upload to HuggingFace Hub
-        dataset.push_to_hub(
-            dataset_name,
-            config_name=config_name,
-            split=split,
-            private=private,
-        )
-
-        print(
-            f"✓ Successfully uploaded to {dataset_name}@{config_name} (split: {split})"
-        )
-        return True
-
-    except Exception as e:
-        print(f"✗ Failed to upload to HuggingFace: {type(e).__name__}: {str(e)}")
-        print(f"  JSONL file preserved at: {jsonl_path}")
-        return False
-
-
-def download_hf_to_jsonl(
-    dataset_spec: str,
-    output_file: Union[str, Path],
-    split: str = "train",
-    overwrite: bool = False,
-) -> bool:
-    """
-    Download a dataset from HuggingFace Hub and save as JSONL.
-
-    This function downloads a dataset from the HuggingFace Hub and saves it as a
-    JSONL file where each line is a complete JSON object.
-
-    Args:
-        dataset_spec: Dataset specification in the format "dataset_name" or
-            "dataset_name@config_name". Examples:
-            - "username/my-dataset" (uses "default" config)
-            - "username/my-dataset@rubrics" (uses "rubrics" config)
-            - "username/my-dataset@evaluations" (uses "evaluations" config)
-
-        output_file: Path where the JSONL file will be saved. Will create parent
-            directories if they don't exist. Example: "data/downloaded_rubrics.jsonl"
-
-        split: The dataset split to download. Defaults to "train". Common values:
-            - "train": Training or main data
-            - "validation": Validation data
-            - "test": Test data
-            - "all": Download all splits (creates one JSONL with all data)
-
-        overwrite: Whether to overwrite existing file. Defaults to False.
-
-    Returns:
-        bool: True if download succeeded, False otherwise
-
-    Raises:
-        FileExistsError: If output file exists and overwrite=False
-        ValueError: If the dataset/config/split doesn't exist
-        Exception: For HuggingFace Hub download errors
-
-    Example:
-        >>> # Download rubrics from specific config
-        >>> download_hf_to_jsonl(
-        ...     "username/hf-agent-benchmark@rubrics",
-        ...     "local_rubrics.jsonl",
-        ...     split="train"
-        ... )
-
-        >>> # Download evaluation results
-        >>> download_hf_to_jsonl(
-        ...     "username/hf-agent-benchmark@evaluations",
-        ...     "local_evaluations.jsonl",
-        ...     split="test",
-        ...     overwrite=True
-        ... )
-
-    Notes:
-        - Requires authentication for private datasets via `huggingface-cli login`
-        - Downloaded data will be in the same format as uploaded (preserves structure)
-        - Each line in the output JSONL is a complete, valid JSON object
-        - Large datasets may take time to download
-    """
-    output_path = Path(output_file)
-
-    # Check if file exists
-    if output_path.exists() and not overwrite:
-        raise FileExistsError(
-            f"Output file already exists: {output_file}. "
-            "Use overwrite=True to replace it."
-        )
-
-    # Parse dataset specification
-    if "@" in dataset_spec:
-        dataset_name, config_name = dataset_spec.split("@", 1)
-    else:
-        dataset_name = dataset_spec
-        config_name = "default"
-
-    try:
-        print("\nDownloading from HuggingFace Hub...")
-        print(f"  Dataset: {dataset_name}")
-        print(f"  Config: {config_name}")
-        print(f"  Split: {split}")
-
-        # Download dataset from HuggingFace Hub
-        dataset = load_dataset(
-            dataset_name,
-            name=config_name,
-            split=split,
-        )
-
-        print(f"  Downloaded {len(dataset)} records")
-
-        # Create parent directories if needed
-        output_path.parent.mkdir(parents=True, exist_ok=True)
-
-        # Write to JSONL
-        with open(output_path, "w") as f:
-            for record in dataset:
-                # Convert record to JSON and write as line
-                f.write(json.dumps(record) + "\n")
-
-        print(f"✓ Successfully saved to {output_path}")
-        print(f"  Total records: {len(dataset)}")
-        return True
-
-    except Exception as e:
-        print(f"✗ Failed to download from HuggingFace: {type(e).__name__}: {str(e)}")
-        return False
-
-
 def list_dataset_configs(dataset_name: str) -> Optional[List[str]]:
     """
     List all available configs for a dataset on HuggingFace Hub.

@@ -269,60 +36,6 @@ def list_dataset_configs(dataset_name: str) -> Optional[List[str]]:

     return None


-def get_dataset_info(dataset_spec: str, split: str = "train") -> Optional[Dict]:
-    """
-    Get information about a dataset on HuggingFace Hub.
-
-    Args:
-        dataset_spec: Dataset specification ("dataset_name" or "dataset_name@config")
-        split: The split to get info for (default: "train")
-
-    Returns:
-        Dictionary with dataset info, or None if unable to retrieve
-
-    Example:
-        >>> info = get_dataset_info("username/hf-agent-benchmark@rubrics")
-        >>> print(f"Records: {info['num_rows']}")
-        >>> print(f"Columns: {info['column_names']}")
-    """
-    # Parse dataset specification
-    if "@" in dataset_spec:
-        dataset_name, config_name = dataset_spec.split("@", 1)
-    else:
-        dataset_name = dataset_spec
-        config_name = "default"
-
-    try:
-        # Load just to get info (streaming mode for efficiency)
-        dataset = load_dataset(
-            dataset_name,
-            name=config_name,
-            split=split,
-            streaming=True,
-        )
-
-        # Get basic info
-        info = {
-            "dataset_name": dataset_name,
-            "config_name": config_name,
-            "split": split,
-            "features": str(dataset.features),
-            "column_names": dataset.column_names
-            if hasattr(dataset, "column_names")
-            else None,
-        }
-
-        # Try to get row count (only works for non-streaming)
-        dataset_full = load_dataset(dataset_name, name=config_name, split=split)
-        info["num_rows"] = len(dataset_full)
-
-        return info
-
-    except Exception as e:
-        print(f"✗ Failed to get dataset info: {type(e).__name__}: {str(e)}")
-        return None
-
-
 def df_to_hub(
     df: pd.DataFrame,
     dataset_spec: str,

@@ -500,18 +213,3 @@ def hub_to_df(

     except Exception as e:
         print(f"✗ Failed to download from HuggingFace: {type(e).__name__}: {str(e)}")
         return None
-
-
-if __name__ == "__main__":
-    # Example usage
-    print("HuggingFace Dataset I/O Utilities")
-    print("=" * 60)
-    print("\nExample: Upload rubrics")
-    print('  upload_jsonl_to_hf("qa_rubrics.jsonl", "username/dataset@rubrics")')
-    print("\nExample: Download evaluations")
-    print('  download_hf_to_jsonl("username/dataset@evaluations", "local.jsonl")')
-    print("\nExample: List configs")
-    print('  list_dataset_configs("username/dataset")')
-    print("\nExample: Get dataset info")
-    print('  get_dataset_info("username/dataset@rubrics")')
-    print("=" * 60)
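
A hedged usage sketch of the retained upload helper, mirroring the `df_to_hub` call pattern visible in the removed `evaluate.py` code; the DataFrame contents and dataset name are placeholders:

```python
# Illustrative only: upload a DataFrame using the dataset@config notation from hf_io.py.
import pandas as pd

from eval.hf_io import df_to_hub

results_df = pd.DataFrame(
    [{"question": "How to...", "solution": "...", "rubric": "[...]"}]  # placeholder rows
)

# "@evaluations" selects a config within the same dataset repo (cf. @rubrics, @ground-truth).
df_to_hub(
    df=results_df,
    dataset_spec="username/hf-agent-benchmark@evaluations",  # placeholder repo
    split="test",
    private=False,
)
```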
eval/{evaluate.py → rubric_eval.py} RENAMED

@@ -4,13 +4,9 @@ Rubric-based evaluation following the "Rubrics as Rewards" paper.

 Implements RaR-Explicit: Weighted sum of individual criterion scores (Equation 1)
 """

-import json
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, List, Optional
+from typing import List, Optional

 import litellm
-import pandas as pd
-from hf_dataset_io import df_to_hub
 from pydantic import BaseModel

@@ -32,17 +28,6 @@ class RubricEvaluation(BaseModel):

     normalized_score: float  # Score normalized to [0, 1]


-class EvaluatedResponse(BaseModel):
-    """Complete evaluated response with rubric scores."""
-
-    discussion_title: str
-    discussion_url: str
-    question: str
-    response: str
-    reference_answer: str
-    evaluation: RubricEvaluation
-
-
 CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.

 Question: {question}

@@ -69,32 +54,6 @@ class RubricData(BaseModel):

     weight: int


-def load_rubrics_from_file(rubric_file: str) -> Dict[str, List[RubricData]]:
-    """
-    Load rubrics from JSONL file and index by question.
-
-    Args:
-        rubric_file: Path to rubric JSONL file
-
-    Returns:
-        Dictionary mapping questions to their rubrics
-    """
-    rubrics_by_question = {}
-
-    with open(rubric_file, "r") as f:
-        for line in f:
-            entry = json.loads(line)
-            question = entry["question"]
-
-            # Parse rubric JSON string
-            rubric_data = json.loads(entry["rubric"])
-            rubrics = [RubricData(**r) for r in rubric_data["rubrics"]]
-
-            rubrics_by_question[question] = rubrics
-
-    return rubrics_by_question
-
-
 def check_criterion(
     question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
 ) -> CriterionCheck:

@@ -137,7 +96,6 @@ def check_criterion(

 def evaluate_with_rubrics(
     question: str,
     response: str,
-    reference_answer: str,
     rubrics: List[RubricData],
     model: str = "gpt-4o-mini",
 ) -> RubricEvaluation:

@@ -182,176 +140,3 @@

         normalized_score=normalized_score,
         criterion_checks=checks,
     )
-
-
-def evaluate_dataset_with_rubrics(
-    input_file: str,
-    rubric_file: str,
-    ground_truth_file: str,
-    output_file: str = "rubric_evaluation_results.jsonl",
-    model: str = "gpt-4o-mini",
-    max_concurrent: int = 10,
-    limit: Optional[int] = None,
-    push_to_hub: Optional[str] = None,
-) -> None:
-    """
-    Evaluate all responses using rubric-based assessment.
-
-    Args:
-        input_file: Path to JSONL with responses to evaluate
-        rubric_file: Path to JSONL with rubrics (output from generate_rubrics.py)
-        ground_truth_file: Path to JSONL with ground truth answers
-        output_file: Path to output JSONL file
-        model: LLM model for judging
-        max_concurrent: Maximum concurrent evaluations
-        limit: Optional limit on number of examples
-        push_to_hub: Optional HuggingFace dataset spec (e.g., username/dataset@evaluations)
-    """
-    # Load data
-    print(f"Loading responses from {input_file}...")
-    with open(input_file, "r") as f:
-        responses = [json.loads(line) for line in f]
-
-    print(f"Loading rubrics from {rubric_file}...")
-    rubrics_by_question = load_rubrics_from_file(rubric_file)
-
-    print(f"Loading ground truth from {ground_truth_file}...")
-    with open(ground_truth_file, "r") as f:
-        ground_truths = [json.loads(line) for line in f]
-
-    if limit:
-        responses = responses[:limit]
-        ground_truths = ground_truths[:limit]
-
-    print(f"Loaded {len(responses)} responses to evaluate")
-    print(f"Judge model: {model}")
-
-    # Match responses with rubrics and ground truth
-    evaluation_tasks = []
-    for response_data, gt_data in zip(responses, ground_truths):
-        question = gt_data["question"]
-
-        # Find rubrics for this question
-        rubrics = rubrics_by_question.get(question)
-        if not rubrics:
-            print(f"Warning: No rubrics found for question: {question[:50]}...")
-            continue
-
-        evaluation_tasks.append(
-            {
-                "question": question,
-                "response": response_data["solution"],
-                "reference_answer": gt_data["solution"],
-                "rubrics": rubrics,
-                "metadata": {
-                    "discussion_title": response_data.get("discussion_title", ""),
-                    "discussion_url": response_data.get("discussion_url", ""),
-                },
-            }
-        )
-
-    print(
-        f"Running {len(evaluation_tasks)} evaluations with {max_concurrent} parallel workers..."
-    )
-
-    # Run evaluations in parallel
-    results = []
-    with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
-        # Submit all tasks
-        future_to_idx = {}
-        for idx, task in enumerate(evaluation_tasks):
-            future = executor.submit(
-                evaluate_with_rubrics,
-                question=task["question"],
-                response=task["response"],
-                reference_answer=task["reference_answer"],
-                rubrics=task["rubrics"],
-                model=model,
-            )
-            future_to_idx[future] = idx
-
-        # Collect results in order
-        results = [None] * len(evaluation_tasks)
-        completed = 0
-        for future in as_completed(future_to_idx):
-            idx = future_to_idx[future]
-            results[idx] = future.result()
-            completed += 1
-            print(f"Completed: {completed}/{len(evaluation_tasks)}", end="\r")
-
-    print()  # New line after progress
-
-    # Combine results with metadata
-    output_data = []
-    total_score = 0.0
-
-    for task, evaluation in zip(evaluation_tasks, results):
-        evaluated_response = EvaluatedResponse(
-            discussion_title=task["metadata"]["discussion_title"],
-            discussion_url=task["metadata"]["discussion_url"],
-            question=task["question"],
-            response=task["response"],
-            reference_answer=task["reference_answer"],
-            evaluation=evaluation,
-        )
-        output_data.append(evaluated_response)
-        total_score += evaluation.normalized_score
-
-    # Convert to DataFrame for HuggingFace upload
-    results_df = pd.DataFrame([entry.model_dump() for entry in output_data])
-
-    # Upload to HuggingFace if specified (before saving JSONL)
-    if push_to_hub:
-        print(f"\nUploading to HuggingFace: {push_to_hub}")
-        upload_success = df_to_hub(
-            df=results_df,
-            dataset_spec=push_to_hub,
-            split="test",
-            private=False,
-        )
-        if not upload_success:
-            print("Warning: HuggingFace upload failed, but continuing to save JSONL...")
-
-    # Write results to JSONL file
-    print(f"\nWriting results to {output_file}...")
-    with open(output_file, "w") as f:
-        for entry in output_data:
-            f.write(entry.model_dump_json() + "\n")
-
-    # Print summary
-    avg_score = total_score / len(output_data) if output_data else 0.0
-
-    print("\n" + "=" * 60)
-    print("RUBRIC-BASED EVALUATION SUMMARY")
-    print("=" * 60)
-    print(f"Total examples: {len(output_data)}")
-    print(f"Judge model: {model}")
-    print(f"Average normalized score: {avg_score:.3f}")
-    print(f"Average percentage: {avg_score * 100:.1f}%")
-
-    # Per-criterion statistics
-    total_satisfied = sum(
-        sum(1 for check in eval.evaluation.criterion_checks if check.satisfied)
-        for eval in output_data
-    )
-    total_criteria = sum(len(eval.evaluation.criterion_checks) for eval in output_data)
-    satisfaction_rate = total_satisfied / total_criteria if total_criteria > 0 else 0.0
-    print(f"Criteria satisfaction rate: {satisfaction_rate * 100:.1f}%")
-
-    if push_to_hub and upload_success:
-        print(f"Pushed to: {push_to_hub}")
-
-    print("=" * 60)
-
-
-if __name__ == "__main__":
-    evaluate_dataset_with_rubrics(
-        input_file="eval/qa_pairs_accepted.jsonl",
-        rubric_file="eval/qa_rubrics.jsonl",
-        ground_truth_file="eval/qa_pairs_accepted.jsonl",
-        output_file="rubric_evaluation.jsonl",
-        model="gpt-4o-mini",
-        max_concurrent=10,
-        limit=30,  # Set to None to evaluate all
-        push_to_hub="akseljoonas/hf-agent-benchmark@ground-truth",  # Set to "username/dataset@evaluations" to upload
-    )
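
A hedged sketch of calling the retained `evaluate_with_rubrics` directly, building `RubricData` from the stored rubric JSON the same way `eval/task.py` does; the dataset name comes from the README and the response string is a placeholder:

```python
# Illustrative only: score a single response outside of Inspect.
import json

from datasets import load_dataset

from eval.rubric_eval import RubricData, evaluate_with_rubrics

# Assumes the rubric dataset layout used by eval/task.py ("question", "solution", "rubric").
record = load_dataset("akseljoonas/hf-agent-rubrics", split="train")[0]
rubrics = [RubricData(**r) for r in json.loads(record["rubric"])["rubrics"]]

evaluation = evaluate_with_rubrics(
    question=record["question"],
    response="candidate answer to grade",  # placeholder response
    rubrics=rubrics,
    model="gpt-4o-mini",
)
print(evaluation.normalized_score, len(evaluation.criterion_checks))
```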
eval/solvers.py ADDED

@@ -0,0 +1,116 @@
+"""
+Collection of Inspect AI solvers used by the rubric task.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+from typing import Callable, Dict, List, Sequence
+
+from inspect_ai.model import ChatMessageAssistant, ModelOutput
+from inspect_ai.solver import Solver, solver
+from inspect_ai.solver._task_state import TaskState
+
+from eval.hf_agent_connector import AgentResponseGenerator
+
+
+async def _run_subprocess(command: Sequence[str]) -> str:
+    process = await asyncio.create_subprocess_exec(
+        *command,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.PIPE,
+    )
+    stdout, stderr = await process.communicate()
+    if process.returncode != 0:
+        raise RuntimeError(
+            f"Command {' '.join(command)} failed with code {process.returncode}:\n"
+            f"{stderr.decode().strip()}"
+        )
+    return stdout.decode().strip()
+
+
+@solver(name="hf_agent_solver")
+def hf_agent_solver(
+    config_path: str = "agent/config_mcp_example.json",
+    max_iterations: int = 10,
+) -> Solver:
+    runner = AgentResponseGenerator(
+        config_path=config_path,
+        max_iterations=max_iterations,
+    )
+
+    async def solve(state: TaskState, generate) -> TaskState:
+        response = await runner.run(state.input_text)
+        assistant_message = ChatMessageAssistant(
+            content=response,
+            model=runner.model_name,
+            source="generate",
+        )
+        state.messages.append(assistant_message)
+        state.output = ModelOutput.from_message(assistant_message)
+        state.completed = True
+        return state
+
+    return solve
+
+
+@solver(name="claude_code")
+def claude_code(
+    output_format: str = "json",
+    mcp_config: str | None = None,
+) -> Solver:
+    if output_format not in {"text", "json", "stream-json"}:
+        raise ValueError("output_format must be one of: text, json, stream-json")
+
+    async def solve(state: TaskState, generate) -> TaskState:
+        prompt = state.input_text
+
+        cmd: List[str] = ["claude", "-p", prompt, "--output-format", output_format]
+        if mcp_config:
+            cmd += ["--mcp-config", mcp_config]
+
+        stdout = await _run_subprocess(cmd)
+        response_text = stdout
+        session_id = None
+
+        if output_format in {"json", "stream-json"}:
+            # stream-json may emit multiple JSON objects; take the last complete line
+            candidate_line = stdout.strip().splitlines()[-1]
+            try:
+                payload = json.loads(candidate_line)
+                response_text = (
+                    payload.get("result") or payload.get("message", "") or stdout
+                )
+                session_id = payload.get("session_id")
+            except (json.JSONDecodeError, AttributeError):
+                response_text = stdout
+
+        assistant_message = ChatMessageAssistant(
+            content=response_text,
+            model="claude-code",
+            source="generate",
+            metadata={"session_id": session_id} if session_id else None,
+        )
+        state.messages.append(assistant_message)
+        state.output = ModelOutput.from_message(assistant_message)
+        state.completed = True
+        return state
+
+    return solve
+
+
+SOLVER_REGISTRY: Dict[str, Callable[..., Solver]] = {
+    "hf_agent_solver": hf_agent_solver,
+    "claude_code": claude_code,
+}
+
+
+def get_solver(name: str, **kwargs) -> Solver:
+    try:
+        factory = SOLVER_REGISTRY[name]
+    except KeyError as exc:
+        available = ", ".join(sorted(SOLVER_REGISTRY))
+        raise ValueError(f"Unknown solver '{name}'. Available: {available}") from exc
+
+    return factory(**kwargs)
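
The README notes that additional solvers should be registered in this file; a hedged sketch of what that might look like when appended inside `eval/solvers.py` (the `echo_solver` is purely illustrative and only useful as a pipeline smoke test):

```python
# Illustrative only: an extra solver selectable via -T solver_name=echo_solver.
# These imports already exist at the top of eval/solvers.py; repeated here for clarity.
from inspect_ai.model import ChatMessageAssistant, ModelOutput
from inspect_ai.solver import Solver, solver
from inspect_ai.solver._task_state import TaskState


@solver(name="echo_solver")
def echo_solver() -> Solver:
    async def solve(state: TaskState, generate) -> TaskState:
        # Answers with the question itself, exercising the scorer without any model call.
        message = ChatMessageAssistant(
            content=state.input_text, model="echo", source="generate"
        )
        state.messages.append(message)
        state.output = ModelOutput.from_message(message)
        state.completed = True
        return state

    return solve


# Register it alongside hf_agent_solver and claude_code.
SOLVER_REGISTRY["echo_solver"] = echo_solver
```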
eval/task.py ADDED

@@ -0,0 +1,119 @@
+"""
+Inspect AI task definition that runs the existing agent and reuses the rubric scorer.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import sys
+from pathlib import Path
+from typing import Any, Sequence
+
+from inspect_ai import Task, task
+from inspect_ai.dataset import Sample, hf_dataset
+from inspect_ai.scorer import Score, Target, mean, scorer
+from inspect_ai.solver._task_state import TaskState
+
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+from eval.rubric_eval import RubricData, evaluate_with_rubrics  # noqa: E402
+from eval.solvers import get_solver  # noqa: E402
+
+
+def _record_to_sample(record: dict[str, Any]) -> Sample:
+    rubric_payload = json.loads(record["rubric"])
+    rubrics = rubric_payload.get("rubrics", [])
+
+    metadata = {
+        "question": record["question"],
+        "discussion_title": record.get("discussion_title"),
+        "discussion_url": record.get("discussion_url"),
+        "rubric_title": rubric_payload.get("title"),
+        "rubric_description": rubric_payload.get("description"),
+        "rubrics": rubrics,
+    }
+
+    return Sample(
+        input=record["question"],
+        target=record["solution"],
+        id=record.get("discussion_topic_id"),
+        metadata=metadata,
+    )
+
+
+def _load_dataset(dataset_name: str, split: str, limit: int | None) -> Sequence[Sample]:
+    return hf_dataset(
+        dataset_name, sample_fields=_record_to_sample, split=split, limit=limit
+    )
+
+
+def _metadata_to_rubrics(metadata: dict[str, Any]) -> list[RubricData]:
+    raw_rubrics = metadata.get("rubrics", [])
+    return [RubricData(**rubric) for rubric in raw_rubrics]
+
+
+@scorer(metrics=[mean()], name="rubric_scorer")
+def rubric_scorer(judge_model: str = "gpt-4o-mini"):
+    async def score(state: TaskState, target: Target) -> Score:
+        response_text = state.output.completion or state.output.message.text
+        question = state.metadata.get("question", state.input_text)
+        rubrics = _metadata_to_rubrics(state.metadata)
+
+        evaluation = await asyncio.to_thread(
+            evaluate_with_rubrics,
+            question,
+            response_text,
+            rubrics,
+            judge_model,
+        )
+
+        score_metadata = {
+            "raw_score": evaluation.raw_score,
+            "criterion_checks": [
+                check.model_dump() for check in evaluation.criterion_checks
+            ],
+            "discussion_title": state.metadata.get("discussion_title"),
+            "discussion_url": state.metadata.get("discussion_url"),
+            "reference_answer": target.text,
+        }
+
+        return Score(
+            value=evaluation.normalized_score,
+            answer=response_text,
+            explanation=f"Normalized score {evaluation.normalized_score:.3f}",
+            metadata=score_metadata,
+        )
+
+    return score
+
+
+@task(name="hf-benchmark-with-rubrics")
+def hf_benchmark_with_rubrics(
+    solver_name: str = "hf_agent_solver",
+    solver_kwargs: dict[str, Any] = {
+        "max_iterations": 10,
+        "config_path": "agent/config_mcp_example.json",
+    },
+    dataset_name: str = "akseljoonas/hf-agent-rubrics@train",
+    limit: int | None = None,
+    judge_model: str = "gpt-4o-mini",
+) -> Task:
+    if "@" not in dataset_name:
+        raise ValueError("Dataset name must be in the format 'author/dataset@split'")
+    dataset_name, dataset_split = dataset_name.split("@")
+    dataset = _load_dataset(dataset_name, dataset_split, limit=limit)
+
+    return Task(
+        dataset=dataset,
+        solver=get_solver(solver_name, **solver_kwargs),
+        scorer=rubric_scorer(judge_model=judge_model),
+        metadata={
+            "dataset_name": dataset_name,
+            "dataset_split": dataset_split,
+            "solver_name": solver_name,
+            "judge_model": judge_model,
+        },
+    )
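
For completeness, a hedged sketch of invoking the task from Python instead of the CLI, assuming `inspect-ai`'s top-level `eval()` entry point; the arguments mirror the README command and the limit is illustrative:

```python
# Illustrative only: programmatic equivalent of `uv run inspect eval eval/task.py@...`.
from inspect_ai import eval as inspect_eval

from eval.task import hf_benchmark_with_rubrics

inspect_eval(
    hf_benchmark_with_rubrics(
        dataset_name="akseljoonas/hf-agent-rubrics@train",
        solver_name="hf_agent_solver",
        limit=5,  # illustrative
    ),
    log_dir="logs/inspect",
)
```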