Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Commit ·
bc84cfe
1
Parent(s): 202a610
eval script done
Browse files- .gitignore +3 -1
- eval/df.ipynb +0 -137
- eval/evaluate.py +256 -123
- eval/generate_rubrics.py +315 -0
- pyproject.toml +2 -0
- uv.lock +90 -0
.gitignore
CHANGED
|
@@ -10,4 +10,6 @@ wheels/
|
|
| 10 |
.venv
|
| 11 |
.env
|
| 12 |
.DS_Store
|
| 13 |
-
.claude/
|
|
|
|
|
|
|
|
|
| 10 |
.venv
|
| 11 |
.env
|
| 12 |
.DS_Store
|
| 13 |
+
.claude/
|
| 14 |
+
*.jsonl
|
| 15 |
+
*.csv
|
eval/df.ipynb
DELETED
|
@@ -1,137 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cells": [
|
| 3 |
-
{
|
| 4 |
-
"cell_type": "code",
|
| 5 |
-
"execution_count": 1,
|
| 6 |
-
"id": "b7f67653",
|
| 7 |
-
"metadata": {},
|
| 8 |
-
"outputs": [
|
| 9 |
-
{
|
| 10 |
-
"data": {
|
| 11 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 12 |
-
"model_id": "1fcf9d61b3664bc99d616101c201aca8",
|
| 13 |
-
"version_major": 2,
|
| 14 |
-
"version_minor": 0
|
| 15 |
-
},
|
| 16 |
-
"text/plain": [
|
| 17 |
-
"Generating train split: 0 examples [00:00, ? examples/s]"
|
| 18 |
-
]
|
| 19 |
-
},
|
| 20 |
-
"metadata": {},
|
| 21 |
-
"output_type": "display_data"
|
| 22 |
-
}
|
| 23 |
-
],
|
| 24 |
-
"source": [
|
| 25 |
-
"from datasets import load_dataset\n",
|
| 26 |
-
"\n",
|
| 27 |
-
"ds = load_dataset(\"json\", data_files=\"qa_pairs.jsonl\", split=\"train\")\n"
|
| 28 |
-
]
|
| 29 |
-
},
|
| 30 |
-
{
|
| 31 |
-
"cell_type": "code",
|
| 32 |
-
"execution_count": 2,
|
| 33 |
-
"id": "55cd7b9c",
|
| 34 |
-
"metadata": {},
|
| 35 |
-
"outputs": [
|
| 36 |
-
{
|
| 37 |
-
"data": {
|
| 38 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 39 |
-
"model_id": "cb9452a5789b4b20bd0b01cce111f961",
|
| 40 |
-
"version_major": 2,
|
| 41 |
-
"version_minor": 0
|
| 42 |
-
},
|
| 43 |
-
"text/plain": [
|
| 44 |
-
"Uploading the dataset shards: 0%| | 0/1 [00:00<?, ? shards/s]"
|
| 45 |
-
]
|
| 46 |
-
},
|
| 47 |
-
"metadata": {},
|
| 48 |
-
"output_type": "display_data"
|
| 49 |
-
},
|
| 50 |
-
{
|
| 51 |
-
"data": {
|
| 52 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 53 |
-
"model_id": "4a77e027bda5405991bfde524347013c",
|
| 54 |
-
"version_major": 2,
|
| 55 |
-
"version_minor": 0
|
| 56 |
-
},
|
| 57 |
-
"text/plain": [
|
| 58 |
-
"Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
|
| 59 |
-
]
|
| 60 |
-
},
|
| 61 |
-
"metadata": {},
|
| 62 |
-
"output_type": "display_data"
|
| 63 |
-
},
|
| 64 |
-
{
|
| 65 |
-
"data": {
|
| 66 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 67 |
-
"model_id": "4d70f759da55470fba30326d99d6ac1f",
|
| 68 |
-
"version_major": 2,
|
| 69 |
-
"version_minor": 0
|
| 70 |
-
},
|
| 71 |
-
"text/plain": [
|
| 72 |
-
"Processing Files (0 / 0): | | 0.00B / 0.00B "
|
| 73 |
-
]
|
| 74 |
-
},
|
| 75 |
-
"metadata": {},
|
| 76 |
-
"output_type": "display_data"
|
| 77 |
-
},
|
| 78 |
-
{
|
| 79 |
-
"data": {
|
| 80 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 81 |
-
"model_id": "617db8df8ac94a2889d8760fc7f0113a",
|
| 82 |
-
"version_major": 2,
|
| 83 |
-
"version_minor": 0
|
| 84 |
-
},
|
| 85 |
-
"text/plain": [
|
| 86 |
-
"New Data Upload: | | 0.00B / 0.00B "
|
| 87 |
-
]
|
| 88 |
-
},
|
| 89 |
-
"metadata": {},
|
| 90 |
-
"output_type": "display_data"
|
| 91 |
-
},
|
| 92 |
-
{
|
| 93 |
-
"data": {
|
| 94 |
-
"text/plain": [
|
| 95 |
-
"CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/qa_pairs/commit/6947117631cb56686c192533427bb4400382b4fd', commit_message='Upload dataset', commit_description='', oid='6947117631cb56686c192533427bb4400382b4fd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/qa_pairs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/qa_pairs'), pr_revision=None, pr_num=None)"
|
| 96 |
-
]
|
| 97 |
-
},
|
| 98 |
-
"execution_count": 2,
|
| 99 |
-
"metadata": {},
|
| 100 |
-
"output_type": "execute_result"
|
| 101 |
-
}
|
| 102 |
-
],
|
| 103 |
-
"source": [
|
| 104 |
-
"ds.push_to_hub(\"akseljoonas/qa_pairs\")"
|
| 105 |
-
]
|
| 106 |
-
},
|
| 107 |
-
{
|
| 108 |
-
"cell_type": "code",
|
| 109 |
-
"execution_count": null,
|
| 110 |
-
"id": "16d4760d",
|
| 111 |
-
"metadata": {},
|
| 112 |
-
"outputs": [],
|
| 113 |
-
"source": []
|
| 114 |
-
}
|
| 115 |
-
],
|
| 116 |
-
"metadata": {
|
| 117 |
-
"kernelspec": {
|
| 118 |
-
"display_name": ".venv",
|
| 119 |
-
"language": "python",
|
| 120 |
-
"name": "python3"
|
| 121 |
-
},
|
| 122 |
-
"language_info": {
|
| 123 |
-
"codemirror_mode": {
|
| 124 |
-
"name": "ipython",
|
| 125 |
-
"version": 3
|
| 126 |
-
},
|
| 127 |
-
"file_extension": ".py",
|
| 128 |
-
"mimetype": "text/x-python",
|
| 129 |
-
"name": "python",
|
| 130 |
-
"nbconvert_exporter": "python",
|
| 131 |
-
"pygments_lexer": "ipython3",
|
| 132 |
-
"version": "3.12.11"
|
| 133 |
-
}
|
| 134 |
-
},
|
| 135 |
-
"nbformat": 4,
|
| 136 |
-
"nbformat_minor": 5
|
| 137 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval/evaluate.py
CHANGED
|
@@ -1,201 +1,334 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 3 |
-
from typing import
|
| 4 |
|
| 5 |
import litellm
|
| 6 |
-
from
|
| 7 |
-
Correctness,
|
| 8 |
-
EvaluatedQuestionAndSolution,
|
| 9 |
-
JudgementResult,
|
| 10 |
-
QuestionAndSolution,
|
| 11 |
-
)
|
| 12 |
|
| 13 |
-
# from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_judge_results.py#L16-L33
|
| 14 |
-
GRADER_TEMPLATE = """
|
| 15 |
-
Judge whether the following [response] to [question] is correct or not based on if the [response] includes the precise and unambiguous [correct_answer] below.
|
| 16 |
|
| 17 |
-
|
|
|
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
Your judgement must be in the format and criteria specified below:
|
| 22 |
|
| 23 |
-
|
|
|
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
reasoning: Explain why the [correct_answer] is included or not included in the extracted_final_answer, focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
|
| 28 |
|
| 29 |
-
|
|
|
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
-
confidence: The extracted confidence score between 0|%| and 100|%| from [response]. Put 100 if there is no confidence score available.
|
| 33 |
-
""".strip()
|
| 34 |
|
| 35 |
-
|
| 36 |
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"""
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
Args:
|
| 48 |
question: The question being answered
|
| 49 |
response: The response to evaluate
|
| 50 |
-
|
| 51 |
-
model:
|
| 52 |
|
| 53 |
Returns:
|
| 54 |
-
|
| 55 |
"""
|
| 56 |
-
prompt =
|
| 57 |
-
question=question,
|
|
|
|
|
|
|
| 58 |
)
|
| 59 |
|
| 60 |
-
# Use litellm with structured output
|
| 61 |
llm_response = litellm.completion(
|
| 62 |
model=model,
|
| 63 |
messages=[
|
| 64 |
{
|
| 65 |
"role": "system",
|
| 66 |
-
"content": "You are an expert
|
| 67 |
},
|
| 68 |
{"role": "user", "content": prompt},
|
| 69 |
],
|
| 70 |
-
response_format=JudgementResult,
|
| 71 |
temperature=0.0,
|
|
|
|
| 72 |
)
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
llm_response.choices[0].message.content
|
| 77 |
-
)
|
| 78 |
return result
|
| 79 |
|
| 80 |
|
| 81 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
input_file: str,
|
| 83 |
-
|
| 84 |
-
|
|
|
|
| 85 |
model: str = "gpt-4o-mini",
|
| 86 |
-
max_concurrent: int =
|
| 87 |
-
limit: int = None,
|
| 88 |
) -> None:
|
| 89 |
"""
|
| 90 |
-
Evaluate all
|
| 91 |
|
| 92 |
Args:
|
| 93 |
-
input_file: Path to
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
| 98 |
"""
|
| 99 |
-
# Load
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
if limit:
|
| 104 |
-
|
|
|
|
| 105 |
|
| 106 |
-
print(f"Loaded {len(
|
|
|
|
| 107 |
|
| 108 |
-
#
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
|
|
|
|
|
|
| 119 |
with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
|
| 120 |
# Submit all tasks
|
| 121 |
future_to_idx = {}
|
| 122 |
-
for idx,
|
| 123 |
-
question = ground_truth.question
|
| 124 |
-
ground_truth_answer = ground_truth.solution
|
| 125 |
-
response = qa_pair.solution
|
| 126 |
-
|
| 127 |
future = executor.submit(
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
| 132 |
model=model,
|
| 133 |
)
|
| 134 |
future_to_idx[future] = idx
|
| 135 |
|
| 136 |
# Collect results in order
|
| 137 |
-
results = [None] * len(
|
|
|
|
| 138 |
for future in as_completed(future_to_idx):
|
| 139 |
idx = future_to_idx[future]
|
| 140 |
results[idx] = future.result()
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
)
|
| 155 |
-
output_data.append(
|
|
|
|
| 156 |
|
| 157 |
-
|
| 158 |
-
correct_count += 1
|
| 159 |
-
else:
|
| 160 |
-
error_count += 1
|
| 161 |
-
|
| 162 |
-
# Write results using proper model serialization
|
| 163 |
print(f"Writing results to {output_file}...")
|
| 164 |
with open(output_file, "w") as f:
|
| 165 |
for entry in output_data:
|
| 166 |
f.write(entry.model_dump_json() + "\n")
|
| 167 |
|
| 168 |
# Print summary
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
print("
|
| 174 |
-
print("
|
| 175 |
-
print("
|
| 176 |
-
print(f"
|
| 177 |
-
print(f"
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
#
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
def main():
|
| 189 |
-
"""Main entry point for the evaluation script"""
|
| 190 |
-
evaluate_dataset(
|
| 191 |
-
input_file="eval/qa_pairs.jsonl",
|
| 192 |
-
eval_file="eval/qa_pairs.jsonl",
|
| 193 |
-
output_file="evaluation_results.jsonl",
|
| 194 |
-
model="gpt-4o-mini",
|
| 195 |
-
max_concurrent=30,
|
| 196 |
-
limit=10, # Set to None to evaluate all, or a number to limit
|
| 197 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
|
| 200 |
if __name__ == "__main__":
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Rubric-based evaluation following the "Rubrics as Rewards" paper.
|
| 3 |
+
|
| 4 |
+
Implements RaR-Explicit: Weighted sum of individual criterion scores (Equation 1)
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
import json
|
| 8 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
+
from typing import Dict, List, Optional
|
| 10 |
|
| 11 |
import litellm
|
| 12 |
+
from pydantic import BaseModel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
class CriterionCheck(BaseModel):
|
| 16 |
+
"""Result of checking a single rubric criterion."""
|
| 17 |
|
| 18 |
+
title: str
|
| 19 |
+
description: str
|
| 20 |
+
weight: int
|
| 21 |
+
satisfied: bool
|
| 22 |
+
reasoning: Optional[str] = None
|
| 23 |
|
|
|
|
| 24 |
|
| 25 |
+
class RubricEvaluation(BaseModel):
|
| 26 |
+
"""Complete rubric-based evaluation result."""
|
| 27 |
|
| 28 |
+
criterion_checks: List[CriterionCheck]
|
| 29 |
+
raw_score: float # Unnormalized score
|
| 30 |
+
normalized_score: float # Score normalized to [0, 1]
|
| 31 |
|
|
|
|
| 32 |
|
| 33 |
+
class EvaluatedResponse(BaseModel):
|
| 34 |
+
"""Complete evaluated response with rubric scores."""
|
| 35 |
|
| 36 |
+
discussion_title: str
|
| 37 |
+
discussion_url: str
|
| 38 |
+
question: str
|
| 39 |
+
response: str
|
| 40 |
+
reference_answer: str
|
| 41 |
+
evaluation: RubricEvaluation
|
| 42 |
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.
|
| 45 |
|
| 46 |
+
Question: {question}
|
| 47 |
|
| 48 |
+
Response to evaluate: {response}
|
| 49 |
+
|
| 50 |
+
Evaluation Criterion:
|
| 51 |
+
{criterion_description}
|
| 52 |
+
|
| 53 |
+
Your task: Determine if the response satisfies this criterion.
|
| 54 |
+
|
| 55 |
+
Output a JSON object with:
|
| 56 |
+
- "satisfied": true or false
|
| 57 |
+
- "reasoning": Brief explanation (1-2 sentences) of why it does or doesn't satisfy the criterion
|
| 58 |
+
|
| 59 |
+
Be strict but fair. The criterion must be clearly satisfied for you to answer true."""
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class RubricData(BaseModel):
|
| 63 |
+
"""Rubric data loaded from file."""
|
| 64 |
+
|
| 65 |
+
title: str
|
| 66 |
+
description: str
|
| 67 |
+
weight: int
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def load_rubrics_from_file(rubric_file: str) -> Dict[str, List[RubricData]]:
|
| 71 |
+
"""
|
| 72 |
+
Load rubrics from JSONL file and index by question.
|
| 73 |
+
|
| 74 |
+
Args:
|
| 75 |
+
rubric_file: Path to rubric JSONL file
|
| 76 |
+
|
| 77 |
+
Returns:
|
| 78 |
+
Dictionary mapping questions to their rubrics
|
| 79 |
"""
|
| 80 |
+
rubrics_by_question = {}
|
| 81 |
+
|
| 82 |
+
with open(rubric_file, "r") as f:
|
| 83 |
+
for line in f:
|
| 84 |
+
entry = json.loads(line)
|
| 85 |
+
question = entry["question"]
|
| 86 |
+
|
| 87 |
+
# Parse rubric JSON string
|
| 88 |
+
rubric_data = json.loads(entry["rubric"])
|
| 89 |
+
rubrics = [RubricData(**r) for r in rubric_data["rubrics"]]
|
| 90 |
+
|
| 91 |
+
rubrics_by_question[question] = rubrics
|
| 92 |
+
|
| 93 |
+
return rubrics_by_question
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def check_criterion(
|
| 97 |
+
question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
|
| 98 |
+
) -> CriterionCheck:
|
| 99 |
+
"""
|
| 100 |
+
Check if response satisfies a single criterion.
|
| 101 |
|
| 102 |
Args:
|
| 103 |
question: The question being answered
|
| 104 |
response: The response to evaluate
|
| 105 |
+
criterion: The rubric criterion to check
|
| 106 |
+
model: LLM model for judging
|
| 107 |
|
| 108 |
Returns:
|
| 109 |
+
CriterionCheck with satisfaction result
|
| 110 |
"""
|
| 111 |
+
prompt = CRITERION_PROMPT.format(
|
| 112 |
+
question=question,
|
| 113 |
+
response=response,
|
| 114 |
+
criterion_description=criterion.description,
|
| 115 |
)
|
| 116 |
|
|
|
|
| 117 |
llm_response = litellm.completion(
|
| 118 |
model=model,
|
| 119 |
messages=[
|
| 120 |
{
|
| 121 |
"role": "system",
|
| 122 |
+
"content": "You are an expert evaluator for rubric-based assessment.",
|
| 123 |
},
|
| 124 |
{"role": "user", "content": prompt},
|
| 125 |
],
|
|
|
|
| 126 |
temperature=0.0,
|
| 127 |
+
response_format=CriterionCheck,
|
| 128 |
)
|
| 129 |
|
| 130 |
+
result = CriterionCheck.model_validate_json(llm_response.choices[0].message.content)
|
| 131 |
+
|
|
|
|
|
|
|
| 132 |
return result
|
| 133 |
|
| 134 |
|
| 135 |
+
def evaluate_with_rubrics(
|
| 136 |
+
question: str,
|
| 137 |
+
response: str,
|
| 138 |
+
reference_answer: str,
|
| 139 |
+
rubrics: List[RubricData],
|
| 140 |
+
model: str = "gpt-4o-mini",
|
| 141 |
+
) -> RubricEvaluation:
|
| 142 |
+
"""
|
| 143 |
+
Evaluate response using RaR-Explicit method (weighted sum).
|
| 144 |
+
|
| 145 |
+
Implements Equation 1 from paper:
|
| 146 |
+
r(x, ŷ) = Σ(w_j * c_j(x, ŷ)) / Σ(w_j)
|
| 147 |
+
|
| 148 |
+
Args:
|
| 149 |
+
question: The question
|
| 150 |
+
response: Response to evaluate
|
| 151 |
+
reference_answer: Reference answer (not directly used, but available)
|
| 152 |
+
rubrics: List of rubric criteria
|
| 153 |
+
model: LLM model for judging
|
| 154 |
+
|
| 155 |
+
Returns:
|
| 156 |
+
RubricEvaluation with normalized score
|
| 157 |
+
"""
|
| 158 |
+
# Check each criterion independently
|
| 159 |
+
checks = []
|
| 160 |
+
for rubric in rubrics:
|
| 161 |
+
check = check_criterion(question, response, rubric, model)
|
| 162 |
+
checks.append(check)
|
| 163 |
+
|
| 164 |
+
# Calculate weighted score (Equation 1)
|
| 165 |
+
# Only positive weights contribute to denominator
|
| 166 |
+
positive_weights = sum(abs(r.weight) for r in rubrics if r.weight > 0)
|
| 167 |
+
|
| 168 |
+
raw_score = 0.0
|
| 169 |
+
for check in checks:
|
| 170 |
+
if check.satisfied:
|
| 171 |
+
raw_score += check.weight
|
| 172 |
+
|
| 173 |
+
# Normalize to [0, 1]
|
| 174 |
+
normalized_score = raw_score / positive_weights if positive_weights > 0 else 0.0
|
| 175 |
+
# Clip to [0, 1] in case pitfalls make it negative
|
| 176 |
+
normalized_score = max(0.0, min(1.0, normalized_score))
|
| 177 |
+
|
| 178 |
+
return RubricEvaluation(
|
| 179 |
+
raw_score=raw_score,
|
| 180 |
+
normalized_score=normalized_score,
|
| 181 |
+
criterion_checks=checks,
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def evaluate_dataset_with_rubrics(
|
| 186 |
input_file: str,
|
| 187 |
+
rubric_file: str,
|
| 188 |
+
ground_truth_file: str,
|
| 189 |
+
output_file: str = "rubric_evaluation_results.jsonl",
|
| 190 |
model: str = "gpt-4o-mini",
|
| 191 |
+
max_concurrent: int = 10,
|
| 192 |
+
limit: Optional[int] = None,
|
| 193 |
) -> None:
|
| 194 |
"""
|
| 195 |
+
Evaluate all responses using rubric-based assessment.
|
| 196 |
|
| 197 |
Args:
|
| 198 |
+
input_file: Path to JSONL with responses to evaluate
|
| 199 |
+
rubric_file: Path to JSONL with rubrics (output from generate_rubrics.py)
|
| 200 |
+
ground_truth_file: Path to JSONL with ground truth answers
|
| 201 |
+
output_file: Path to output JSONL file
|
| 202 |
+
model: LLM model for judging
|
| 203 |
+
max_concurrent: Maximum concurrent evaluations
|
| 204 |
+
limit: Optional limit on number of examples
|
| 205 |
"""
|
| 206 |
+
# Load data
|
| 207 |
+
print(f"Loading responses from {input_file}...")
|
| 208 |
+
with open(input_file, "r") as f:
|
| 209 |
+
responses = [json.loads(line) for line in f]
|
| 210 |
+
|
| 211 |
+
print(f"Loading rubrics from {rubric_file}...")
|
| 212 |
+
rubrics_by_question = load_rubrics_from_file(rubric_file)
|
| 213 |
+
|
| 214 |
+
print(f"Loading ground truth from {ground_truth_file}...")
|
| 215 |
+
with open(ground_truth_file, "r") as f:
|
| 216 |
+
ground_truths = [json.loads(line) for line in f]
|
| 217 |
+
|
| 218 |
if limit:
|
| 219 |
+
responses = responses[:limit]
|
| 220 |
+
ground_truths = ground_truths[:limit]
|
| 221 |
|
| 222 |
+
print(f"Loaded {len(responses)} responses to evaluate")
|
| 223 |
+
print(f"Judge model: {model}")
|
| 224 |
|
| 225 |
+
# Match responses with rubrics and ground truth
|
| 226 |
+
evaluation_tasks = []
|
| 227 |
+
for response_data, gt_data in zip(responses, ground_truths):
|
| 228 |
+
question = gt_data["question"]
|
| 229 |
|
| 230 |
+
# Find rubrics for this question
|
| 231 |
+
rubrics = rubrics_by_question.get(question)
|
| 232 |
+
if not rubrics:
|
| 233 |
+
print(f"Warning: No rubrics found for question: {question[:50]}...")
|
| 234 |
+
continue
|
| 235 |
|
| 236 |
+
evaluation_tasks.append(
|
| 237 |
+
{
|
| 238 |
+
"question": question,
|
| 239 |
+
"response": response_data["solution"],
|
| 240 |
+
"reference_answer": gt_data["solution"],
|
| 241 |
+
"rubrics": rubrics,
|
| 242 |
+
"metadata": {
|
| 243 |
+
"discussion_title": response_data.get("discussion_title", ""),
|
| 244 |
+
"discussion_url": response_data.get("discussion_url", ""),
|
| 245 |
+
},
|
| 246 |
+
}
|
| 247 |
+
)
|
| 248 |
+
|
| 249 |
+
print(
|
| 250 |
+
f"Running {len(evaluation_tasks)} evaluations with {max_concurrent} parallel workers..."
|
| 251 |
+
)
|
| 252 |
|
| 253 |
+
# Run evaluations in parallel
|
| 254 |
+
results = []
|
| 255 |
with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
|
| 256 |
# Submit all tasks
|
| 257 |
future_to_idx = {}
|
| 258 |
+
for idx, task in enumerate(evaluation_tasks):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
future = executor.submit(
|
| 260 |
+
evaluate_with_rubrics,
|
| 261 |
+
question=task["question"],
|
| 262 |
+
response=task["response"],
|
| 263 |
+
reference_answer=task["reference_answer"],
|
| 264 |
+
rubrics=task["rubrics"],
|
| 265 |
model=model,
|
| 266 |
)
|
| 267 |
future_to_idx[future] = idx
|
| 268 |
|
| 269 |
# Collect results in order
|
| 270 |
+
results = [None] * len(evaluation_tasks)
|
| 271 |
+
completed = 0
|
| 272 |
for future in as_completed(future_to_idx):
|
| 273 |
idx = future_to_idx[future]
|
| 274 |
results[idx] = future.result()
|
| 275 |
+
completed += 1
|
| 276 |
+
print(f"Completed: {completed}/{len(evaluation_tasks)}", end="\r")
|
| 277 |
+
|
| 278 |
+
print() # New line after progress
|
| 279 |
+
|
| 280 |
+
# Combine results with metadata
|
| 281 |
+
output_data = []
|
| 282 |
+
total_score = 0.0
|
| 283 |
+
|
| 284 |
+
for task, evaluation in zip(evaluation_tasks, results):
|
| 285 |
+
evaluated_response = EvaluatedResponse(
|
| 286 |
+
discussion_title=task["metadata"]["discussion_title"],
|
| 287 |
+
discussion_url=task["metadata"]["discussion_url"],
|
| 288 |
+
question=task["question"],
|
| 289 |
+
response=task["response"],
|
| 290 |
+
reference_answer=task["reference_answer"],
|
| 291 |
+
evaluation=evaluation,
|
| 292 |
)
|
| 293 |
+
output_data.append(evaluated_response)
|
| 294 |
+
total_score += evaluation.normalized_score
|
| 295 |
|
| 296 |
+
# Write results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
print(f"Writing results to {output_file}...")
|
| 298 |
with open(output_file, "w") as f:
|
| 299 |
for entry in output_data:
|
| 300 |
f.write(entry.model_dump_json() + "\n")
|
| 301 |
|
| 302 |
# Print summary
|
| 303 |
+
avg_score = total_score / len(output_data) if output_data else 0.0
|
| 304 |
+
|
| 305 |
+
print("\n" + "=" * 60)
|
| 306 |
+
print("RUBRIC-BASED EVALUATION SUMMARY")
|
| 307 |
+
print("=" * 60)
|
| 308 |
+
print(f"Total examples: {len(output_data)}")
|
| 309 |
+
print(f"Judge model: {model}")
|
| 310 |
+
print(f"Average normalized score: {avg_score:.3f}")
|
| 311 |
+
print(f"Average percentage: {avg_score * 100:.1f}%")
|
| 312 |
+
|
| 313 |
+
# Per-criterion statistics
|
| 314 |
+
total_satisfied = sum(
|
| 315 |
+
sum(1 for check in eval.evaluation.criterion_checks if check.satisfied)
|
| 316 |
+
for eval in output_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
)
|
| 318 |
+
total_criteria = sum(len(eval.evaluation.criterion_checks) for eval in output_data)
|
| 319 |
+
satisfaction_rate = total_satisfied / total_criteria if total_criteria > 0 else 0.0
|
| 320 |
+
print(f"Criteria satisfaction rate: {satisfaction_rate * 100:.1f}%")
|
| 321 |
+
|
| 322 |
+
print("=" * 60)
|
| 323 |
|
| 324 |
|
| 325 |
if __name__ == "__main__":
|
| 326 |
+
evaluate_dataset_with_rubrics(
|
| 327 |
+
input_file="eval/qa_pairs_accepted.jsonl",
|
| 328 |
+
rubric_file="eval/qa_rubrics.jsonl",
|
| 329 |
+
ground_truth_file="eval/qa_pairs_accepted.jsonl",
|
| 330 |
+
output_file="rubric_evaluation.jsonl",
|
| 331 |
+
model="gpt-4o-mini",
|
| 332 |
+
max_concurrent=10,
|
| 333 |
+
limit=30, # Set to None to evaluate all
|
| 334 |
+
)
|
eval/generate_rubrics.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env env python3
|
| 2 |
+
"""
|
| 3 |
+
Rubric Generation Script for HF-Agent Benchmark
|
| 4 |
+
|
| 5 |
+
Generates instance-specific evaluation rubrics following the "Rubrics as Rewards" paper.
|
| 6 |
+
Uses LiteLLM to call LLM models for rubric synthesis with expert grounding via reference answers.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import argparse
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Any, Dict, List
|
| 16 |
+
|
| 17 |
+
import litellm
|
| 18 |
+
import pandas as pd
|
| 19 |
+
from dotenv import load_dotenv
|
| 20 |
+
from pydantic import BaseModel
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class Rubric(BaseModel):
|
| 24 |
+
title: str
|
| 25 |
+
description: str
|
| 26 |
+
weight: int
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class RubricList(BaseModel):
|
| 30 |
+
rubrics: List[Rubric]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Load environment variables
|
| 34 |
+
load_dotenv()
|
| 35 |
+
|
| 36 |
+
# Rubric generation prompt template based on RaR paper
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
PROMPT_TEMPLATE = """You are an expert rubric writer. Your job is to generate a self-contained set of evaluation criteria (“rubrics”) for judging
|
| 40 |
+
how good, helpful and complete an agent's trajectory is to a given user question/request.
|
| 41 |
+
|
| 42 |
+
Rubrics can cover aspects of a response such as, but not limited to, factual correctness, helpfulness, completeness, harmlessness, correctness of using Hugging Face best practices (based on HF documentation), depth of
|
| 43 |
+
reasoning, contextual relevance and usefulness. Each item must be self-contained – non expert readers should not need to
|
| 44 |
+
infer anything or consult external information. Begin each description with its category: “Essential Criteria: . . . ”, “Important
|
| 45 |
+
Criteria: . . . ”, “Optional Criteria: . . . ”, or “Pitfall Criteria: Does not mention . . . ”.
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
Inputs: !!!
|
| 49 |
+
- question: <<<{question}>>>
|
| 50 |
+
- reference_answer (ideal solution): <<<{reference_answer}>>>
|
| 51 |
+
- thread: <<<{thread}>>>
|
| 52 |
+
|
| 53 |
+
Total items:
|
| 54 |
+
• Choose 7–20 rubric items based on the complexity of the question.
|
| 55 |
+
|
| 56 |
+
Each rubric item:
|
| 57 |
+
• title (2–4 words).
|
| 58 |
+
• description: One sentence starting with its category prefix that explicitly states exactly what to look for. For example:
|
| 59 |
+
– Essential Criteria: Writes a up-to-date, correct, complete and working training loop using the latest Hugging Face best practices. Launches the training with hf-jobs.
|
| 60 |
+
– Pitfall Criteria: Deprecated launcher usage. Uses python -m torch.distributed.launch instead of torchrun / accelerate.
|
| 61 |
+
– Important Criteria: Explains common DDP knobs. Mentions ddp_find_unused_parameters=False for models with conditional branches; optional ddp_timeout; brief note on when they matter and why.
|
| 62 |
+
– Optional Criteria: Briefly notes --deepspeed ds_config.json as an alternative scaler when models get big (but stays on DDP for this Q).
|
| 63 |
+
• weight: For Essential/Important/Optional, use 1–5 (5 = most important); for Pitfall, use –1 or –2.
|
| 64 |
+
|
| 65 |
+
Category guidance:
|
| 66 |
+
• Essential: Critical actions to answer/complete the user's question/request; if missing, the response is invalid and useless (weight 5).
|
| 67 |
+
• Important: Key reasoning, completeness, or clarity; strongly affects quality and usefulness (weight 3–4).
|
| 68 |
+
• Optional: Helpfulness in educating the user or providing extra depth; nice to have but not deal-breaking (weight 1–2).
|
| 69 |
+
• Pitfall: Common mistakes or omissions specific to this prompt—identify things a respondent often forgets or misstates.
|
| 70 |
+
Each Pitfall description must begin with “Pitfall Criteria: Does not mention . . . ” or “Pitfall Criteria: Recommends . . . ”
|
| 71 |
+
and use weight –1 or –2.
|
| 72 |
+
|
| 73 |
+
To ensure self-contained guidance:
|
| 74 |
+
• When referring to answer choices, explicitly say “Identifies (A)”, “Identifies (B)”, etc., rather than vague phrasing.
|
| 75 |
+
• If the format requires an action like calling a tool or launching a training run, include a rubric item such as:
|
| 76 |
+
– Essential Criteria: Includes a clear statement "Launches the training with hf-jobs.".
|
| 77 |
+
• If reasoning should precede the answer, include a rubric like:
|
| 78 |
+
– Important Criteria: Presents the explanation and reasoning before stating the final answer.
|
| 79 |
+
• If brevity is valued, include a rubric like:
|
| 80 |
+
– Optional Criteria: Remains concise and avoids unnecessary detail.
|
| 81 |
+
• If the question context demands mention of specific findings/best practices, include that explicitly (e.g., “Essential Criteria: Mentions
|
| 82 |
+
that training data must be in "messages" column for LLM training”).
|
| 83 |
+
|
| 84 |
+
Output: Provide a JSON array of rubric objects. Each object must contain exactly three keys—title, description, and weight.
|
| 85 |
+
Do not copy large blocks of the question or reference_answer into the text. Each description must begin with its category
|
| 86 |
+
prefix, and no extra keys are allowed.
|
| 87 |
+
Now, given the question, thread and reference_answer, generate the rubric as described. The reference answer is an good and helpful response
|
| 88 |
+
but not necessarily exhaustive; use it only as guidance."""
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def build_prompt(
    question: str, reference_answer: str, thread: List[Dict[str, str]]
) -> List[Dict[str, str]]:
    """
    Build the messages list for LiteLLM completion.

    Args:
        question: The question/task to evaluate
        reference_answer: The reference/accepted solution
        thread: The discussion thread as a list of role/content message dicts

    Returns:
        List of message dicts for LiteLLM — a single user message containing
        the fully rendered rubric-generation prompt
    """
    # PROMPT_TEMPLATE is a module-level template with {question},
    # {reference_answer} and {thread} placeholders.
    prompt = PROMPT_TEMPLATE.format(
        question=question, reference_answer=reference_answer, thread=thread
    )

    return [{"role": "user", "content": prompt}]
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def validate_rubric(rubric_list: List[Dict[str, Any]]) -> bool:
    """
    Check that a generated rubric satisfies the structural requirements.

    A valid rubric has between 7 and 20 items (inclusive); every item has
    exactly the keys ``title``, ``description`` and ``weight``; and every
    description begins with one of the four category prefixes.

    Args:
        rubric_list: List of rubric items to validate

    Returns:
        True if valid, False otherwise
    """
    # Rubric size must fall within the allowed range.
    if len(rubric_list) < 7 or len(rubric_list) > 20:
        return False

    expected_keys = {"title", "description", "weight"}
    # str.startswith accepts a tuple, so one call covers all four prefixes.
    prefixes = (
        "Essential Criteria:",
        "Important Criteria:",
        "Optional Criteria:",
        "Pitfall Criteria:",
    )

    # Every item must have exactly the expected keys and a category prefix.
    return all(
        set(item.keys()) == expected_keys
        and item["description"].startswith(prefixes)
        for item in rubric_list
    )
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def generate_rubric(row: pd.Series, model: str, timeout: int = 120) -> "str | None":
    """
    Generate a rubric for a single dataset row using LiteLLM.

    Args:
        row: Dataset row containing at least the "question", "solution"
            and "thread" fields
        model: Model name for LiteLLM
        timeout: Request timeout in seconds

    Returns:
        The generated rubric serialized as a JSON string on success,
        or None on failure.
    """
    messages = build_prompt(row["question"], row["solution"], row["thread"])

    try:
        response = litellm.completion(
            model=model,
            messages=messages,
            timeout=timeout,
            # Ask LiteLLM for structured output matching the RubricList schema.
            response_format=RubricList,
        )

        # Parse and re-validate the structured output before serializing.
        rubric_list = RubricList.model_validate_json(
            response.choices[0].message.content
        )

        return rubric_list.model_dump_json()
    except Exception as e:
        # Best-effort: log and return None so one failing row does not
        # abort the whole batch (main() counts None as a failure).
        print(f"Error generating rubric: {e}", file=sys.stderr)
        return None
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def load_input_data(infile: str) -> pd.DataFrame:
    """
    Load input data from a CSV or JSONL file.

    Args:
        infile: Path to the input file (.csv or .jsonl; extension matching
            is case-insensitive)

    Returns:
        DataFrame with the loaded data

    Raises:
        FileNotFoundError: If the input file does not exist.
        ValueError: If the file extension is unsupported or a required
            column is missing.
    """
    path = Path(infile)

    if not path.exists():
        raise FileNotFoundError(f"Input file not found: {infile}")

    # Normalize the extension so .CSV / .JSONL are accepted too.
    suffix = path.suffix.lower()

    if suffix == ".csv":
        # sep=None with the python engine auto-detects the delimiter
        # (comma or semicolon).
        df = pd.read_csv(infile, sep=None, engine="python")
    elif suffix == ".jsonl":
        df = pd.read_json(infile, lines=True)
    else:
        raise ValueError(f"Unsupported file format: {path.suffix}. Use .csv or .jsonl")

    # Fail fast if any column required by downstream rubric generation
    # is missing.
    required_cols = [
        "discussion_title",
        "discussion_url",
        "question",
        "thread",
        "solution",
    ]
    missing_cols = [col for col in required_cols if col not in df.columns]

    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    return df
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def main():
    """CLI entry point: generate a rubric for every row of the input file."""
    parser = argparse.ArgumentParser(
        description="Generate rubrics for HF-agent benchmark evaluation"
    )
    parser.add_argument(
        "--infile", type=str, required=True, help="Input file path (.csv or .jsonl)"
    )
    parser.add_argument(
        "--outfile", type=str, required=True, help="Output JSONL file path"
    )
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        help=(
            "LiteLLM model name (default: LITELLM_MODEL env var, then "
            "anthropic/claude-sonnet-4-5-20250929)"
        ),
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=120,
        help="Request timeout in seconds (default: 120)",
    )
    parser.add_argument(
        "--max-concurrent",
        type=int,
        default=30,
        help="Maximum number of concurrent workers (default: 30)",
    )

    args = parser.parse_args()

    # Resolve the model: CLI flag wins, then the LITELLM_MODEL env var, then
    # a hard-coded default.  (Previously --model had a non-empty default, so
    # the env-var fallback promised by the help text could never fire.)
    model = args.model or os.getenv(
        "LITELLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"
    )
    print(f"Using model: {model}")

    # Load input data
    print(f"Loading data from {args.infile}...")
    df = load_input_data(args.infile)
    print(f"Loaded {len(df)} examples")

    # Run rubric generation in parallel using ThreadPoolExecutor
    print(f"Running generation with {args.max_concurrent} parallel workers...")

    # Materialize rows and use positional indices throughout, so results line
    # up with rows even when the DataFrame index is not a 0..n-1 RangeIndex
    # (df.iterrows() yields index *labels*, which previously indexed the
    # positional results list).
    rows = [row for _, row in df.iterrows()]
    results = [None] * len(rows)

    with ThreadPoolExecutor(max_workers=args.max_concurrent) as executor:
        # Submit all tasks, keyed by positional row index.
        future_to_pos = {
            executor.submit(
                generate_rubric,
                row=row,
                model=model,
                timeout=args.timeout,
            ): pos
            for pos, row in enumerate(rows)
        }

        # Collect results as they finish; generate_rubric catches its own
        # exceptions and returns None, so .result() does not re-raise here.
        completed = 0
        for future in as_completed(future_to_pos):
            pos = future_to_pos[future]
            results[pos] = future.result()
            completed += 1
            print(f"Completed: {completed}/{len(rows)}", end="\r")

    print()  # New line after the \r progress counter

    # Write results to file
    print(f"Writing results to {args.outfile}...")
    success_count = 0
    failure_count = 0

    with open(args.outfile, "w") as outf:
        for row, rubric_result in zip(rows, results):
            if rubric_result is None:
                failure_count += 1
                continue

            # Merge the generated rubric with the original row data.
            output_row = row.to_dict()
            output_row["rubric"] = rubric_result

            # default=str keeps non-JSON-native values (e.g. timestamps)
            # serializable.
            outf.write(json.dumps(output_row, default=str) + "\n")
            success_count += 1

    print("\nComplete!")
    print(f"Success: {success_count}/{len(df)}")
    print(f"Failures: {failure_count}/{len(df)}")
    print(f"Output written to: {args.outfile}")
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
pyproject.toml
CHANGED
|
@@ -10,4 +10,6 @@ dependencies = [
|
|
| 10 |
"pydantic>=2.12.3",
|
| 11 |
"litellm>=1.0.0",
|
| 12 |
"tenacity>=8.0.0",
|
|
|
|
|
|
|
| 13 |
]
|
|
|
|
| 10 |
"pydantic>=2.12.3",
|
| 11 |
"litellm>=1.0.0",
|
| 12 |
"tenacity>=8.0.0",
|
| 13 |
+
"pandas>=2.3.3",
|
| 14 |
+
"python-dotenv>=1.2.1",
|
| 15 |
]
|
uv.lock
CHANGED
|
@@ -401,7 +401,9 @@ source = { virtual = "." }
|
|
| 401 |
dependencies = [
|
| 402 |
{ name = "litellm" },
|
| 403 |
{ name = "numpy" },
|
|
|
|
| 404 |
{ name = "pydantic" },
|
|
|
|
| 405 |
{ name = "requests" },
|
| 406 |
{ name = "tenacity" },
|
| 407 |
]
|
|
@@ -410,7 +412,9 @@ dependencies = [
|
|
| 410 |
requires-dist = [
|
| 411 |
{ name = "litellm", specifier = ">=1.0.0" },
|
| 412 |
{ name = "numpy", specifier = ">=1.24.0" },
|
|
|
|
| 413 |
{ name = "pydantic", specifier = ">=2.12.3" },
|
|
|
|
| 414 |
{ name = "requests", specifier = ">=2.32.5" },
|
| 415 |
{ name = "tenacity", specifier = ">=8.0.0" },
|
| 416 |
]
|
|
@@ -897,6 +901,53 @@ wheels = [
|
|
| 897 |
{ url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" },
|
| 898 |
]
|
| 899 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 900 |
[[package]]
|
| 901 |
name = "propcache"
|
| 902 |
version = "0.4.1"
|
|
@@ -1063,6 +1114,18 @@ wheels = [
|
|
| 1063 |
{ url = "https://files.pythonhosted.org/packages/2b/c6/db8d13a1f8ab3f1eb08c88bd00fd62d44311e3456d1e85c0e59e0a0376e7/pydantic_core-2.41.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8a5028425820731d8c6c098ab642d7b8b999758e24acae03ed38a66eca8335", size = 2139008, upload-time = "2025-10-14T10:23:04.539Z" },
|
| 1064 |
]
|
| 1065 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1066 |
[[package]]
|
| 1067 |
name = "python-dotenv"
|
| 1068 |
version = "1.2.1"
|
|
@@ -1072,6 +1135,15 @@ wheels = [
|
|
| 1072 |
{ url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
|
| 1073 |
]
|
| 1074 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1075 |
[[package]]
|
| 1076 |
name = "pyyaml"
|
| 1077 |
version = "6.0.3"
|
|
@@ -1315,6 +1387,15 @@ wheels = [
|
|
| 1315 |
{ url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
|
| 1316 |
]
|
| 1317 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1318 |
[[package]]
|
| 1319 |
name = "sniffio"
|
| 1320 |
version = "1.3.1"
|
|
@@ -1451,6 +1532,15 @@ wheels = [
|
|
| 1451 |
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
|
| 1452 |
]
|
| 1453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1454 |
[[package]]
|
| 1455 |
name = "urllib3"
|
| 1456 |
version = "2.5.0"
|
|
|
|
| 401 |
dependencies = [
|
| 402 |
{ name = "litellm" },
|
| 403 |
{ name = "numpy" },
|
| 404 |
+
{ name = "pandas" },
|
| 405 |
{ name = "pydantic" },
|
| 406 |
+
{ name = "python-dotenv" },
|
| 407 |
{ name = "requests" },
|
| 408 |
{ name = "tenacity" },
|
| 409 |
]
|
|
|
|
| 412 |
requires-dist = [
|
| 413 |
{ name = "litellm", specifier = ">=1.0.0" },
|
| 414 |
{ name = "numpy", specifier = ">=1.24.0" },
|
| 415 |
+
{ name = "pandas", specifier = ">=2.3.3" },
|
| 416 |
{ name = "pydantic", specifier = ">=2.12.3" },
|
| 417 |
+
{ name = "python-dotenv", specifier = ">=1.2.1" },
|
| 418 |
{ name = "requests", specifier = ">=2.32.5" },
|
| 419 |
{ name = "tenacity", specifier = ">=8.0.0" },
|
| 420 |
]
|
|
|
|
| 901 |
{ url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" },
|
| 902 |
]
|
| 903 |
|
| 904 |
+
[[package]]
|
| 905 |
+
name = "pandas"
|
| 906 |
+
version = "2.3.3"
|
| 907 |
+
source = { registry = "https://pypi.org/simple" }
|
| 908 |
+
dependencies = [
|
| 909 |
+
{ name = "numpy" },
|
| 910 |
+
{ name = "python-dateutil" },
|
| 911 |
+
{ name = "pytz" },
|
| 912 |
+
{ name = "tzdata" },
|
| 913 |
+
]
|
| 914 |
+
sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
|
| 915 |
+
wheels = [
|
| 916 |
+
{ url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
|
| 917 |
+
{ url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
|
| 918 |
+
{ url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
|
| 919 |
+
{ url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
|
| 920 |
+
{ url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
|
| 921 |
+
{ url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
|
| 922 |
+
{ url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
|
| 923 |
+
{ url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
|
| 924 |
+
{ url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
|
| 925 |
+
{ url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
|
| 926 |
+
{ url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
|
| 927 |
+
{ url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
|
| 928 |
+
{ url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
|
| 929 |
+
{ url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
|
| 930 |
+
{ url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
|
| 931 |
+
{ url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
|
| 932 |
+
{ url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
|
| 933 |
+
{ url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
|
| 934 |
+
{ url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
|
| 935 |
+
{ url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
|
| 936 |
+
{ url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
|
| 937 |
+
{ url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
|
| 938 |
+
{ url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
|
| 939 |
+
{ url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
|
| 940 |
+
{ url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
|
| 941 |
+
{ url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
|
| 942 |
+
{ url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
|
| 943 |
+
{ url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
|
| 944 |
+
{ url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
|
| 945 |
+
{ url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
|
| 946 |
+
{ url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
|
| 947 |
+
{ url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
|
| 948 |
+
{ url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
|
| 949 |
+
]
|
| 950 |
+
|
| 951 |
[[package]]
|
| 952 |
name = "propcache"
|
| 953 |
version = "0.4.1"
|
|
|
|
| 1114 |
{ url = "https://files.pythonhosted.org/packages/2b/c6/db8d13a1f8ab3f1eb08c88bd00fd62d44311e3456d1e85c0e59e0a0376e7/pydantic_core-2.41.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8a5028425820731d8c6c098ab642d7b8b999758e24acae03ed38a66eca8335", size = 2139008, upload-time = "2025-10-14T10:23:04.539Z" },
|
| 1115 |
]
|
| 1116 |
|
| 1117 |
+
[[package]]
|
| 1118 |
+
name = "python-dateutil"
|
| 1119 |
+
version = "2.9.0.post0"
|
| 1120 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1121 |
+
dependencies = [
|
| 1122 |
+
{ name = "six" },
|
| 1123 |
+
]
|
| 1124 |
+
sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" }
|
| 1125 |
+
wheels = [
|
| 1126 |
+
{ url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
|
| 1127 |
+
]
|
| 1128 |
+
|
| 1129 |
[[package]]
|
| 1130 |
name = "python-dotenv"
|
| 1131 |
version = "1.2.1"
|
|
|
|
| 1135 |
{ url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
|
| 1136 |
]
|
| 1137 |
|
| 1138 |
+
[[package]]
|
| 1139 |
+
name = "pytz"
|
| 1140 |
+
version = "2025.2"
|
| 1141 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1142 |
+
sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
|
| 1143 |
+
wheels = [
|
| 1144 |
+
{ url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
|
| 1145 |
+
]
|
| 1146 |
+
|
| 1147 |
[[package]]
|
| 1148 |
name = "pyyaml"
|
| 1149 |
version = "6.0.3"
|
|
|
|
| 1387 |
{ url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
|
| 1388 |
]
|
| 1389 |
|
| 1390 |
+
[[package]]
|
| 1391 |
+
name = "six"
|
| 1392 |
+
version = "1.17.0"
|
| 1393 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1394 |
+
sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" }
|
| 1395 |
+
wheels = [
|
| 1396 |
+
{ url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
|
| 1397 |
+
]
|
| 1398 |
+
|
| 1399 |
[[package]]
|
| 1400 |
name = "sniffio"
|
| 1401 |
version = "1.3.1"
|
|
|
|
| 1532 |
{ url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
|
| 1533 |
]
|
| 1534 |
|
| 1535 |
+
[[package]]
|
| 1536 |
+
name = "tzdata"
|
| 1537 |
+
version = "2025.2"
|
| 1538 |
+
source = { registry = "https://pypi.org/simple" }
|
| 1539 |
+
sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
|
| 1540 |
+
wheels = [
|
| 1541 |
+
{ url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
|
| 1542 |
+
]
|
| 1543 |
+
|
| 1544 |
[[package]]
|
| 1545 |
name = "urllib3"
|
| 1546 |
version = "2.5.0"
|