akseljoonas (HF Staff) committed
Commit af80aa7 · 1 Parent(s): 4ce436f

eval script done

.gitignore CHANGED
@@ -10,4 +10,6 @@ wheels/
 .venv
 .env
 .DS_Store
-.claude/
+.claude/
+*.jsonl
+*.csv
eval/df.csv DELETED
The diff for this file is too large to render. See raw diff
 
eval/df.ipynb DELETED
@@ -1,137 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "b7f67653",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1fcf9d61b3664bc99d616101c201aca8",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating train split: 0 examples [00:00, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from datasets import load_dataset\n",
-    "\n",
-    "ds = load_dataset(\"json\", data_files=\"qa_pairs.jsonl\", split=\"train\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "55cd7b9c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "cb9452a5789b4b20bd0b01cce111f961",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ? shards/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4a77e027bda5405991bfde524347013c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4d70f759da55470fba30326d99d6ac1f",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Processing Files (0 / 0): | | 0.00B / 0.00B "
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "617db8df8ac94a2889d8760fc7f0113a",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "New Data Upload: | | 0.00B / 0.00B "
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/qa_pairs/commit/6947117631cb56686c192533427bb4400382b4fd', commit_message='Upload dataset', commit_description='', oid='6947117631cb56686c192533427bb4400382b4fd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/qa_pairs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/qa_pairs'), pr_revision=None, pr_num=None)"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "ds.push_to_hub(\"akseljoonas/qa_pairs\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "16d4760d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
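
For reference, the two executed cells of the deleted notebook amount to this standalone script (a minimal sketch; the JSONL path and dataset id are taken verbatim from the notebook's cells, and push_to_hub assumes an authenticated Hugging Face session):

from datasets import load_dataset

# Load the local QA pairs JSONL as a single train split.
ds = load_dataset("json", data_files="qa_pairs.jsonl", split="train")

# Upload to the same Hub dataset repo the notebook targeted.
ds.push_to_hub("akseljoonas/qa_pairs")
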
eval/evaluate.py CHANGED
@@ -1,201 +1,334 @@
+"""
+Rubric-based evaluation following the "Rubrics as Rewards" paper.
+
+Implements RaR-Explicit: Weighted sum of individual criterion scores (Equation 1)
+"""
+
 import json
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict
+from typing import Dict, List, Optional

 import litellm
-from models import (
-    Correctness,
-    EvaluatedQuestionAndSolution,
-    JudgementResult,
-    QuestionAndSolution,
-)
+from pydantic import BaseModel

-# from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_judge_results.py#L16-L33
-GRADER_TEMPLATE = """
-Judge whether the following [response] to [question] is correct or not based on if the [response] includes the precise and unambiguous [correct_answer] below.
-
-[question]: {question}
-
-[response]: {response}
-
-Your judgement must be in the format and criteria specified below:
-
-extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
-
-[correct_answer]: {correct_answer}
-
-reasoning: Explain why the [correct_answer] is included or not included in the extracted_final_answer, focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
-
-correct: Answer 'yes' if [correct_answer] is included in the extracted_final_answer given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
-
-confidence: The extracted confidence score between 0|%| and 100|%| from [response]. Put 100 if there is no confidence score available.
-""".strip()
-
-CHOICE_STRINGS = ["yes", "no"]
+
+class CriterionCheck(BaseModel):
+    """Result of checking a single rubric criterion."""
+
+    title: str
+    description: str
+    weight: int
+    satisfied: bool
+    reasoning: Optional[str] = None
+
+
+class RubricEvaluation(BaseModel):
+    """Complete rubric-based evaluation result."""
+
+    criterion_checks: List[CriterionCheck]
+    raw_score: float  # Unnormalized score
+    normalized_score: float  # Score normalized to [0, 1]
+
+
+class EvaluatedResponse(BaseModel):
+    """Complete evaluated response with rubric scores."""
+
+    discussion_title: str
+    discussion_url: str
+    question: str
+    response: str
+    reference_answer: str
+    evaluation: RubricEvaluation
+
+
+CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.
+
+Question: {question}
+
+Response to evaluate: {response}
+
+Evaluation Criterion:
+{criterion_description}
+
+Your task: Determine if the response satisfies this criterion.
+
+Output a JSON object with:
+- "satisfied": true or false
+- "reasoning": Brief explanation (1-2 sentences) of why it does or doesn't satisfy the criterion
+
+Be strict but fair. The criterion must be clearly satisfied for you to answer true."""
+
+
+class RubricData(BaseModel):
+    """Rubric data loaded from file."""
+
+    title: str
+    description: str
+    weight: int
+
+
+def load_rubrics_from_file(rubric_file: str) -> Dict[str, List[RubricData]]:
+    """
+    Load rubrics from JSONL file and index by question.
+
+    Args:
+        rubric_file: Path to rubric JSONL file
+
+    Returns:
+        Dictionary mapping questions to their rubrics
+    """
+    rubrics_by_question = {}
+
+    with open(rubric_file, "r") as f:
+        for line in f:
+            entry = json.loads(line)
+            question = entry["question"]
+
+            # Parse rubric JSON string
+            rubric_data = json.loads(entry["rubric"])
+            rubrics = [RubricData(**r) for r in rubric_data["rubrics"]]
+
+            rubrics_by_question[question] = rubrics
+
+    return rubrics_by_question


-def evaluate_single_response(
-    question: str,
-    response: str,
-    correct_answer: str,
-    model: str = "gpt-4o-mini",
-) -> Dict[str, Any]:
+def check_criterion(
+    question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
+) -> CriterionCheck:
     """
-    Evaluate a single response against the ground truth using LLM as judge.
+    Check if response satisfies a single criterion.

     Args:
         question: The question being answered
         response: The response to evaluate
-        correct_answer: The ground truth answer
-        model: The LLM model to use for judging
+        criterion: The rubric criterion to check
+        model: LLM model for judging

     Returns:
-        Dictionary containing the judgement result and metadata
+        CriterionCheck with satisfaction result
     """
-    prompt = GRADER_TEMPLATE.format(
-        question=question, response=response, correct_answer=correct_answer
+    prompt = CRITERION_PROMPT.format(
+        question=question,
+        response=response,
+        criterion_description=criterion.description,
     )

-    # Use litellm with structured output
     llm_response = litellm.completion(
         model=model,
         messages=[
             {
                 "role": "system",
-                "content": "You are an expert judge evaluating answers for accuracy and equivalence.",
+                "content": "You are an expert evaluator for rubric-based assessment.",
             },
             {"role": "user", "content": prompt},
         ],
-        response_format=JudgementResult,
         temperature=0.0,
+        response_format=CriterionCheck,
     )

-    # Parse structured output
-    result: JudgementResult = JudgementResult.model_validate_json(
-        llm_response.choices[0].message.content
-    )
+    result = CriterionCheck.model_validate_json(llm_response.choices[0].message.content)
+
     return result


-def evaluate_dataset(
+def evaluate_with_rubrics(
+    question: str,
+    response: str,
+    reference_answer: str,
+    rubrics: List[RubricData],
+    model: str = "gpt-4o-mini",
+) -> RubricEvaluation:
+    """
+    Evaluate response using RaR-Explicit method (weighted sum).
+
+    Implements Equation 1 from paper:
+    r(x, ŷ) = Σ(w_j * c_j(x, ŷ)) / Σ(w_j)
+
+    Args:
+        question: The question
+        response: Response to evaluate
+        reference_answer: Reference answer (not directly used, but available)
+        rubrics: List of rubric criteria
+        model: LLM model for judging
+
+    Returns:
+        RubricEvaluation with normalized score
+    """
+    # Check each criterion independently
+    checks = []
+    for rubric in rubrics:
+        check = check_criterion(question, response, rubric, model)
+        checks.append(check)
+
+    # Calculate weighted score (Equation 1)
+    # Only positive weights contribute to denominator
+    positive_weights = sum(abs(r.weight) for r in rubrics if r.weight > 0)
+
+    raw_score = 0.0
+    for check in checks:
+        if check.satisfied:
+            raw_score += check.weight
+
+    # Normalize to [0, 1]
+    normalized_score = raw_score / positive_weights if positive_weights > 0 else 0.0
+    # Clip to [0, 1] in case pitfalls make it negative
+    normalized_score = max(0.0, min(1.0, normalized_score))
+
+    return RubricEvaluation(
+        raw_score=raw_score,
+        normalized_score=normalized_score,
+        criterion_checks=checks,
+    )
+
+
+def evaluate_dataset_with_rubrics(
     input_file: str,
-    eval_file: str,
-    output_file: str = "evaluation_results.jsonl",
+    rubric_file: str,
+    ground_truth_file: str,
+    output_file: str = "rubric_evaluation_results.jsonl",
     model: str = "gpt-4o-mini",
-    max_concurrent: int = 30,
-    limit: int = None,
+    max_concurrent: int = 10,
+    limit: Optional[int] = None,
 ) -> None:
     """
-    Evaluate all QA pairs in the input file using LLM as judge.
+    Evaluate all responses using rubric-based assessment.

     Args:
-        input_file: Path to input JSONL file with QA pairs
-        output_file: Path to output JSONL file for results
-        model: The LLM model to use for judging
-        max_concurrent: Maximum number of concurrent threads
-        limit: Optional limit on number of examples to evaluate
+        input_file: Path to JSONL with responses to evaluate
+        rubric_file: Path to JSONL with rubrics (output from generate_rubrics.py)
+        ground_truth_file: Path to JSONL with ground truth answers
+        output_file: Path to output JSONL file
+        model: LLM model for judging
+        max_concurrent: Maximum concurrent evaluations
+        limit: Optional limit on number of examples
     """
-    # Load input data as proper models
-    to_evaluate = [
-        QuestionAndSolution.model_validate_json(line) for line in open(input_file, "r")
-    ]
+    # Load data
+    print(f"Loading responses from {input_file}...")
+    with open(input_file, "r") as f:
+        responses = [json.loads(line) for line in f]
+
+    print(f"Loading rubrics from {rubric_file}...")
+    rubrics_by_question = load_rubrics_from_file(rubric_file)
+
+    print(f"Loading ground truth from {ground_truth_file}...")
+    with open(ground_truth_file, "r") as f:
+        ground_truths = [json.loads(line) for line in f]
+
     if limit:
-        to_evaluate = to_evaluate[:limit]
+        responses = responses[:limit]
+        ground_truths = ground_truths[:limit]

-    print(f"Loaded {len(to_evaluate)} QA pairs to evaluate")
+    print(f"Loaded {len(responses)} responses to evaluate")
+    print(f"Judge model: {model}")

-    # Load ground truth dataset
-    print(f"Loading ground truth from {eval_file}...")
-    with open(eval_file, "r") as f:
-        ground_truths = [QuestionAndSolution.model_validate_json(line) for line in f]
+    # Match responses with rubrics and ground truth
+    evaluation_tasks = []
+    for response_data, gt_data in zip(responses, ground_truths):
+        question = gt_data["question"]

-    print(f"Loaded {len(ground_truths)} ground truths")
+        # Find rubrics for this question
+        rubrics = rubrics_by_question.get(question)
+        if not rubrics:
+            print(f"Warning: No rubrics found for question: {question[:50]}...")
+            continue

-    # Run evaluations in parallel using ThreadPoolExecutor
-    print(f"Running evaluations with {max_concurrent} parallel workers...")
-    results = []
+        evaluation_tasks.append(
+            {
+                "question": question,
+                "response": response_data["solution"],
+                "reference_answer": gt_data["solution"],
+                "rubrics": rubrics,
+                "metadata": {
+                    "discussion_title": response_data.get("discussion_title", ""),
+                    "discussion_url": response_data.get("discussion_url", ""),
+                },
+            }
+        )
+
+    print(
+        f"Running {len(evaluation_tasks)} evaluations with {max_concurrent} parallel workers..."
+    )

+    # Run evaluations in parallel
+    results = []
     with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
         # Submit all tasks
         future_to_idx = {}
-        for idx, (qa_pair, ground_truth) in enumerate(zip(to_evaluate, ground_truths)):
-            question = ground_truth.question
-            ground_truth_answer = ground_truth.solution
-            response = qa_pair.solution
-
+        for idx, task in enumerate(evaluation_tasks):
             future = executor.submit(
-                evaluate_single_response,
-                response=response,
-                question=question,
-                correct_answer=ground_truth_answer,
+                evaluate_with_rubrics,
+                question=task["question"],
+                response=task["response"],
+                reference_answer=task["reference_answer"],
+                rubrics=task["rubrics"],
                 model=model,
             )
             future_to_idx[future] = idx

         # Collect results in order
-        results = [None] * len(to_evaluate)
+        results = [None] * len(evaluation_tasks)
+        completed = 0
         for future in as_completed(future_to_idx):
             idx = future_to_idx[future]
             results[idx] = future.result()
-
-    # Combine results with original data using proper models
-    output_data: list[EvaluatedQuestionAndSolution] = []
-    correct_count = 0
-    error_count = 0
-
-    for qa_pair, result in zip(to_evaluate, results):
-        print(result.model_dump_json())
-
-        # Create proper evaluated model
-        output_entry = EvaluatedQuestionAndSolution(
-            **qa_pair.model_dump(),
-            evaluation=result
+            completed += 1
+            print(f"Completed: {completed}/{len(evaluation_tasks)}", end="\r")
+
+    print()  # New line after progress
+
+    # Combine results with metadata
+    output_data = []
+    total_score = 0.0
+
+    for task, evaluation in zip(evaluation_tasks, results):
+        evaluated_response = EvaluatedResponse(
+            discussion_title=task["metadata"]["discussion_title"],
+            discussion_url=task["metadata"]["discussion_url"],
+            question=task["question"],
+            response=task["response"],
+            reference_answer=task["reference_answer"],
+            evaluation=evaluation,
         )
-        output_data.append(output_entry)
+        output_data.append(evaluated_response)
+        total_score += evaluation.normalized_score

-        if result.correct == Correctness.yes:
-            correct_count += 1
-        else:
-            error_count += 1
-
-    # Write results using proper model serialization
+    # Write results
     print(f"Writing results to {output_file}...")
     with open(output_file, "w") as f:
         for entry in output_data:
             f.write(entry.model_dump_json() + "\n")

     # Print summary
-    total = len(to_evaluate)
-    success_rate = (total - error_count) / total * 100 if total > 0 else 0
-    accuracy = correct_count / total * 100 if total > 0 else 0
-
-    print("\n" + "=" * 50)
-    print("EVALUATION SUMMARY")
-    print("=" * 50)
-    print(f"Total examples: {total}")
-    print(f"Successful evaluations: {total - error_count}")
-    print(f"Errors: {error_count}")
-    print(f"Success rate: {success_rate:.2f}%")
-    print(f"Correct answers: {correct_count}")
-    print(f"Accuracy: {accuracy:.2f}%")
-    print("=" * 50)
-
-
-#
-
-
-def main():
-    """Main entry point for the evaluation script"""
-    evaluate_dataset(
-        input_file="eval/qa_pairs.jsonl",
-        eval_file="eval/qa_pairs.jsonl",
-        output_file="evaluation_results.jsonl",
-        model="gpt-4o-mini",
-        max_concurrent=30,
-        limit=10,  # Set to None to evaluate all, or a number to limit
+    avg_score = total_score / len(output_data) if output_data else 0.0
+
+    print("\n" + "=" * 60)
+    print("RUBRIC-BASED EVALUATION SUMMARY")
+    print("=" * 60)
+    print(f"Total examples: {len(output_data)}")
+    print(f"Judge model: {model}")
+    print(f"Average normalized score: {avg_score:.3f}")
+    print(f"Average percentage: {avg_score * 100:.1f}%")
+
+    # Per-criterion statistics
+    total_satisfied = sum(
+        sum(1 for check in eval.evaluation.criterion_checks if check.satisfied)
+        for eval in output_data
     )
+    total_criteria = sum(len(eval.evaluation.criterion_checks) for eval in output_data)
+    satisfaction_rate = total_satisfied / total_criteria if total_criteria > 0 else 0.0
+    print(f"Criteria satisfaction rate: {satisfaction_rate * 100:.1f}%")
+
+    print("=" * 60)


 if __name__ == "__main__":
-    main()
+    evaluate_dataset_with_rubrics(
+        input_file="eval/qa_pairs_accepted.jsonl",
+        rubric_file="eval/qa_rubrics.jsonl",
+        ground_truth_file="eval/qa_pairs_accepted.jsonl",
+        output_file="rubric_evaluation.jsonl",
+        model="gpt-4o-mini",
+        max_concurrent=10,
+        limit=30,  # Set to None to evaluate all
+    )
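
To make the new scoring concrete: evaluate_with_rubrics reduces Equation 1 to arithmetic over (weight, satisfied) pairs. A minimal sketch with hypothetical criterion outcomes; note that only positive weights enter the denominator, so a triggered pitfall can pull the raw score down before the final clip to [0, 1]:

# Hypothetical outcomes: Essential (5) and Important (3) satisfied,
# Optional (1) missed, Pitfall (-2) triggered.
checks = [(5, True), (3, True), (1, False), (-2, True)]

positive_weights = sum(w for w, _ in checks if w > 0)       # 5 + 3 + 1 = 9
raw_score = sum(w for w, satisfied in checks if satisfied)  # 5 + 3 - 2 = 6
normalized = max(0.0, min(1.0, raw_score / positive_weights))
print(raw_score, normalized)  # 6 0.6666666666666666
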
eval/generate_rubrics.py ADDED
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Rubric Generation Script for HF-Agent Benchmark
+
+Generates instance-specific evaluation rubrics following the "Rubrics as Rewards" paper.
+Uses LiteLLM to call LLM models for rubric synthesis with expert grounding via reference answers.
+"""
+
+import argparse
+import json
+import os
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Any, Dict, List
+
+import litellm
+import pandas as pd
+from dotenv import load_dotenv
+from pydantic import BaseModel
+
+
+class Rubric(BaseModel):
+    title: str
+    description: str
+    weight: int
+
+
+class RubricList(BaseModel):
+    rubrics: List[Rubric]
+
+
+# Load environment variables
+load_dotenv()
+
+# Rubric generation prompt template based on the RaR paper
+
+
+PROMPT_TEMPLATE = """You are an expert rubric writer. Your job is to generate a self-contained set of evaluation criteria ("rubrics") for judging
+how good, helpful and complete an agent's trajectory is for a given user question/request.
+
+Rubrics can cover aspects of a response such as, but not limited to, factual correctness, helpfulness, completeness, harmlessness, correctness of using Hugging Face best practices (based on HF documentation), depth of
+reasoning, contextual relevance and usefulness. Each item must be self-contained: non-expert readers should not need to
+infer anything or consult external information. Begin each description with its category: "Essential Criteria: ...", "Important
+Criteria: ...", "Optional Criteria: ...", or "Pitfall Criteria: Does not mention ...".
+
+Inputs:
+- question: <<<{question}>>>
+- reference_answer (ideal solution): <<<{reference_answer}>>>
+- thread: <<<{thread}>>>
+
+Total items:
+• Choose 7–20 rubric items based on the complexity of the question.
+
+Each rubric item:
+• title (2–4 words).
+• description: One sentence starting with its category prefix that explicitly states exactly what to look for. For example:
+  – Essential Criteria: Writes an up-to-date, correct, complete and working training loop using the latest Hugging Face best practices. Launches the training with hf-jobs.
+  – Pitfall Criteria: Deprecated launcher usage. Uses python -m torch.distributed.launch instead of torchrun / accelerate.
+  – Important Criteria: Explains common DDP knobs. Mentions ddp_find_unused_parameters=False for models with conditional branches; optional ddp_timeout; brief note on when they matter and why.
+  – Optional Criteria: Briefly notes --deepspeed ds_config.json as an alternative scaler when models get big (but stays on DDP for this Q).
+• weight: For Essential/Important/Optional, use 1–5 (5 = most important); for Pitfall, use -1 or -2.
+
+Category guidance:
+• Essential: Critical actions to answer/complete the user's question/request; if missing, the response is invalid and useless (weight 5).
+• Important: Key reasoning, completeness, or clarity; strongly affects quality and usefulness (weight 3–4).
+• Optional: Helpfulness in educating the user or providing extra depth; nice to have but not deal-breaking (weight 1–2).
+• Pitfall: Common mistakes or omissions specific to this prompt; identify things a respondent often forgets or misstates.
+  Each Pitfall description must begin with "Pitfall Criteria: Does not mention ..." or "Pitfall Criteria: Recommends ..."
+  and use weight -1 or -2.
+
+To ensure self-contained guidance:
+• When referring to answer choices, explicitly say "Identifies (A)", "Identifies (B)", etc., rather than vague phrasing.
+• If the format requires an action like calling a tool or launching a training run, include a rubric item such as:
+  – Essential Criteria: Includes a clear statement "Launches the training with hf-jobs.".
+• If reasoning should precede the answer, include a rubric like:
+  – Important Criteria: Presents the explanation and reasoning before stating the final answer.
+• If brevity is valued, include a rubric like:
+  – Optional Criteria: Remains concise and avoids unnecessary detail.
+• If the question context demands mention of specific findings/best practices, include that explicitly (e.g., "Essential Criteria: Mentions
+  that training data must be in the "messages" column for LLM training").
+
+Output: Provide a JSON array of rubric objects. Each object must contain exactly three keys—title, description, and weight.
+Do not copy large blocks of the question or reference_answer into the text. Each description must begin with its category
+prefix, and no extra keys are allowed.
+Now, given the question, thread and reference_answer, generate the rubric as described. The reference answer is a good and helpful response
+but not necessarily exhaustive; use it only as guidance."""
+
+
+def build_prompt(
+    question: str, reference_answer: str, thread: List[Dict[str, str]]
+) -> List[Dict[str, str]]:
+    """
+    Build the messages list for LiteLLM completion.
+
+    Args:
+        question: The question/task to evaluate
+        reference_answer: The reference/accepted solution
+        thread: The discussion thread providing context
+
+    Returns:
+        List of message dicts for LiteLLM
+    """
+    prompt = PROMPT_TEMPLATE.format(
+        question=question, reference_answer=reference_answer, thread=thread
+    )
+
+    return [{"role": "user", "content": prompt}]
+
+
+def validate_rubric(rubric_list: List[Dict[str, Any]]) -> bool:
+    """
+    Validate that rubric meets basic requirements.
+
+    Args:
+        rubric_list: List of rubric items to validate
+
+    Returns:
+        True if valid, False otherwise
+    """
+    # Check count
+    if not (7 <= len(rubric_list) <= 20):
+        return False
+
+    # Check each item
+    category_prefixes = [
+        "Essential Criteria:",
+        "Important Criteria:",
+        "Optional Criteria:",
+        "Pitfall Criteria:",
+    ]
+
+    for item in rubric_list:
+        # Check keys
+        if set(item.keys()) != {"title", "description", "weight"}:
+            return False
+
+        # Check description starts with category prefix
+        if not any(
+            item["description"].startswith(prefix) for prefix in category_prefixes
+        ):
+            return False
+
+    return True
+
+
+def generate_rubric(row: pd.Series, model: str, timeout: int = 120) -> Dict[str, Any]:
+    """
+    Generate rubric for a single question using LiteLLM.
+
+    Args:
+        row: Input row with "question", "solution", and "thread" fields
+        model: Model name for LiteLLM
+        timeout: Request timeout in seconds
+
+    Returns:
+        JSON string of the generated RubricList, or None on failure
+    """
+
+    messages = build_prompt(row["question"], row["solution"], row["thread"])
+
+    try:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+            timeout=timeout,
+            response_format=RubricList,
+        )
+
+        # Parse structured output
+        rubric_list: RubricList = RubricList.model_validate_json(
+            response.choices[0].message.content
+        )
+
+        return rubric_list.model_dump_json()
+    except Exception as e:
+        print(f"Error generating rubric: {e}", file=sys.stderr)
+        return None
+
+
+def load_input_data(infile: str) -> pd.DataFrame:
+    """
+    Load input data from CSV or JSONL file.
+
+    Args:
+        infile: Path to input file
+
+    Returns:
+        DataFrame with loaded data
+    """
+    path = Path(infile)
+
+    if not path.exists():
+        raise FileNotFoundError(f"Input file not found: {infile}")
+
+    if path.suffix == ".csv":
+        # Try to auto-detect delimiter (comma or semicolon)
+        df = pd.read_csv(infile, sep=None, engine="python")
+    elif path.suffix == ".jsonl":
+        df = pd.read_json(infile, lines=True)
+    else:
+        raise ValueError(f"Unsupported file format: {path.suffix}. Use .csv or .jsonl")
+
+    # Validate required columns
+    required_cols = [
+        "discussion_title",
+        "discussion_url",
+        "question",
+        "thread",
+        "solution",
+    ]
+    missing_cols = [col for col in required_cols if col not in df.columns]
+
+    if missing_cols:
+        raise ValueError(f"Missing required columns: {missing_cols}")
+
+    return df
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate rubrics for HF-agent benchmark evaluation"
+    )
+    parser.add_argument(
+        "--infile", type=str, required=True, help="Input file path (.csv or .jsonl)"
+    )
+    parser.add_argument(
+        "--outfile", type=str, required=True, help="Output JSONL file path"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="anthropic/claude-sonnet-4-5-20250929",
+        help="LiteLLM model name (falls back to LITELLM_MODEL env or gpt-4o-mini)",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=120,
+        help="Request timeout in seconds (default: 120)",
+    )
+    parser.add_argument(
+        "--max-concurrent",
+        type=int,
+        default=30,
+        help="Maximum number of concurrent workers (default: 30)",
+    )
+
+    args = parser.parse_args()
+
+    # Determine model
+    model = args.model or os.getenv("LITELLM_MODEL", "gpt-4o-mini")
+    print(f"Using model: {model}")
+
+    # Load input data
+    print(f"Loading data from {args.infile}...")
+    df = load_input_data(args.infile)
+    print(f"Loaded {len(df)} examples")
+
+    # Run rubric generation in parallel using ThreadPoolExecutor
+    print(f"Running generation with {args.max_concurrent} parallel workers...")
+
+    with ThreadPoolExecutor(max_workers=args.max_concurrent) as executor:
+        # Submit all tasks
+        future_to_idx = {}
+        for idx, row in df.iterrows():
+            future = executor.submit(
+                generate_rubric,
+                row=row,
+                model=model,
+                timeout=args.timeout,
+            )
+            future_to_idx[future] = idx
+
+        # Collect results in order
+        results = [None] * len(df)
+        completed = 0
+        for future in as_completed(future_to_idx):
+            idx = future_to_idx[future]
+            results[idx] = future.result()
+            completed += 1
+            print(f"Completed: {completed}/{len(df)}", end="\r")
+
+    print()  # New line after progress
+
+    # Write results to file
+    print(f"Writing results to {args.outfile}...")
+    success_count = 0
+    failure_count = 0
+
+    with open(args.outfile, "w") as outf:
+        for idx, (_, row) in enumerate(df.iterrows()):
+            rubric_result = results[idx]
+
+            if rubric_result is None:
+                failure_count += 1
+                continue
+
+            # Merge with original data
+            output_row = row.to_dict()
+            output_row["rubric"] = rubric_result
+
+            # Write JSONL line
+            outf.write(json.dumps(output_row, default=str) + "\n")
+            success_count += 1
+
+    print("\nComplete!")
+    print(f"Success: {success_count}/{len(df)}")
+    print(f"Failures: {failure_count}/{len(df)}")
+    print(f"Output written to: {args.outfile}")
+
+
+if __name__ == "__main__":
+    main()
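
For orientation, here is a sketch of one JSONL record this script writes to --outfile, in the shape that load_rubrics_from_file in eval/evaluate.py parses back (all field values are illustrative, not taken from the repo; the key detail is that "rubric" is a JSON string wrapping a {"rubrics": [...]} object, mirroring RubricList.model_dump_json()):

import json

record = {
    "discussion_title": "...",  # carried over from the input row
    "discussion_url": "...",
    "question": "...",
    "thread": "...",
    "solution": "...",  # the reference answer used for grounding
    "rubric": json.dumps(
        {
            "rubrics": [
                {"title": "Correct launcher", "description": "Essential Criteria: ...", "weight": 5},
                {"title": "Deprecated launcher", "description": "Pitfall Criteria: Recommends ...", "weight": -1},
            ]
        }
    ),
}

# evaluate.py re-parses each line with json.loads(entry["rubric"])["rubrics"].
print(json.dumps(record))
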
eval/qa_pairs_accepted.csv ADDED
The diff for this file is too large to render. See raw diff
 
evaluation_results.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -10,4 +10,6 @@ dependencies = [
     "pydantic>=2.12.3",
     "litellm>=1.0.0",
     "tenacity>=8.0.0",
+    "pandas>=2.3.3",
+    "python-dotenv>=1.2.1",
 ]
uv.lock CHANGED
@@ -401,7 +401,9 @@ source = { virtual = "." }
 dependencies = [
     { name = "litellm" },
     { name = "numpy" },
+    { name = "pandas" },
     { name = "pydantic" },
+    { name = "python-dotenv" },
     { name = "requests" },
     { name = "tenacity" },
 ]
@@ -410,7 +412,9 @@ dependencies = [
 requires-dist = [
     { name = "litellm", specifier = ">=1.0.0" },
     { name = "numpy", specifier = ">=1.24.0" },
+    { name = "pandas", specifier = ">=2.3.3" },
     { name = "pydantic", specifier = ">=2.12.3" },
+    { name = "python-dotenv", specifier = ">=1.2.1" },
     { name = "requests", specifier = ">=2.32.5" },
     { name = "tenacity", specifier = ">=8.0.0" },
 ]
@@ -897,6 +901,53 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" },
 ]

+[[package]]
+name = "pandas"
+version = "2.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "python-dateutil" },
+    { name = "pytz" },
+    { name = "tzdata" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
+    { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
+    { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
+    { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
+    { url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
+    { url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
+    { url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
+    { url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
+    { url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
+    { url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
+    { url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
+    { url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
+    { url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
+    { url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
+    { url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
+    { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
+    { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
+]
+
 [[package]]
 name = "propcache"
 version = "0.4.1"
@@ -1063,6 +1114,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2b/c6/db8d13a1f8ab3f1eb08c88bd00fd62d44311e3456d1e85c0e59e0a0376e7/pydantic_core-2.41.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8a5028425820731d8c6c098ab642d7b8b999758e24acae03ed38a66eca8335", size = 2139008, upload-time = "2025-10-14T10:23:04.539Z" },
 ]

+[[package]]
+name = "python-dateutil"
+version = "2.9.0.post0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "six" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
+]
+
 [[package]]
 name = "python-dotenv"
 version = "1.2.1"
@@ -1072,6 +1135,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
 ]

+[[package]]
+name = "pytz"
+version = "2025.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
+]
+
 [[package]]
 name = "pyyaml"
 version = "6.0.3"
@@ -1315,6 +1387,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
 ]

+[[package]]
+name = "six"
+version = "1.17.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
+]
+
 [[package]]
 name = "sniffio"
 version = "1.3.1"
@@ -1451,6 +1532,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
 ]

+[[package]]
+name = "tzdata"
+version = "2025.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
+]
+
 [[package]]
 name = "urllib3"
 version = "2.5.0"