akseljoonas (HF Staff) committed
Commit af80aa7 · 1 Parent(s): 4ce436f

eval script done

.gitignore CHANGED
@@ -10,4 +10,6 @@ wheels/
 .venv
 .env
 .DS_Store
-.claude/
+.claude/
+*.jsonl
+*.csv
eval/df.csv DELETED
The diff for this file is too large to render. See raw diff
 
eval/df.ipynb DELETED
@@ -1,137 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "b7f67653",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1fcf9d61b3664bc99d616101c201aca8",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Generating train split: 0 examples [00:00, ? examples/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "from datasets import load_dataset\n",
-    "\n",
-    "ds = load_dataset(\"json\", data_files=\"qa_pairs.jsonl\", split=\"train\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "55cd7b9c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "cb9452a5789b4b20bd0b01cce111f961",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ? shards/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4a77e027bda5405991bfde524347013c",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4d70f759da55470fba30326d99d6ac1f",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Processing Files (0 / 0): | | 0.00B / 0.00B "
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "617db8df8ac94a2889d8760fc7f0113a",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "New Data Upload: | | 0.00B / 0.00B "
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "CommitInfo(commit_url='https://huggingface.co/datasets/akseljoonas/qa_pairs/commit/6947117631cb56686c192533427bb4400382b4fd', commit_message='Upload dataset', commit_description='', oid='6947117631cb56686c192533427bb4400382b4fd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/akseljoonas/qa_pairs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='akseljoonas/qa_pairs'), pr_revision=None, pr_num=None)"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "ds.push_to_hub(\"akseljoonas/qa_pairs\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "16d4760d",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
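
For reference, the two executed cells of the deleted notebook amount to this standalone script (a minimal sketch; the JSONL path and dataset id are taken verbatim from the notebook's cells, and push_to_hub assumes an authenticated Hugging Face session):

from datasets import load_dataset

# Load the local QA pairs JSONL as a single train split.
ds = load_dataset("json", data_files="qa_pairs.jsonl", split="train")

# Upload to the same Hub dataset repo the notebook targeted.
ds.push_to_hub("akseljoonas/qa_pairs")
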
eval/evaluate.py CHANGED
@@ -1,201 +1,334 @@
+"""
+Rubric-based evaluation following the "Rubrics as Rewards" paper.
+
+Implements RaR-Explicit: Weighted sum of individual criterion scores (Equation 1)
+"""
+
 import json
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict
+from typing import Dict, List, Optional

 import litellm
-from models import (
-    Correctness,
-    EvaluatedQuestionAndSolution,
-    JudgementResult,
-    QuestionAndSolution,
-)
+from pydantic import BaseModel

-# from: https://github.com/centerforaisafety/hle/blob/7b6be5aad6f9b43af3857de7867f3b52f6e4acb3/hle_eval/run_judge_results.py#L16-L33
-GRADER_TEMPLATE = """
-Judge whether the following [response] to [question] is correct or not based on if the [response] includes the precise and unambiguous [correct_answer] below.
-
-[question]: {question}
-
-[response]: {response}
-
-Your judgement must be in the format and criteria specified below:
-
-extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
-
-[correct_answer]: {correct_answer}
-
-reasoning: Explain why the [correct_answer] is included or not included in the extracted_final_answer, focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
-
-correct: Answer 'yes' if [correct_answer] is included in the extracted_final_answer given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
-
-confidence: The extracted confidence score between 0|%| and 100|%| from [response]. Put 100 if there is no confidence score available.
-""".strip()
-
-CHOICE_STRINGS = ["yes", "no"]
+
+class CriterionCheck(BaseModel):
+    """Result of checking a single rubric criterion."""
+
+    title: str
+    description: str
+    weight: int
+    satisfied: bool
+    reasoning: Optional[str] = None
+
+
+class RubricEvaluation(BaseModel):
+    """Complete rubric-based evaluation result."""
+
+    criterion_checks: List[CriterionCheck]
+    raw_score: float  # Unnormalized score
+    normalized_score: float  # Score normalized to [0, 1]
+
+
+class EvaluatedResponse(BaseModel):
+    """Complete evaluated response with rubric scores."""
+
+    discussion_title: str
+    discussion_url: str
+    question: str
+    response: str
+    reference_answer: str
+    evaluation: RubricEvaluation
+
+
+CRITERION_PROMPT = """You are evaluating whether a response satisfies a specific evaluation criterion.
+
+Question: {question}
+
+Response to evaluate: {response}
+
+Evaluation Criterion:
+{criterion_description}
+
+Your task: Determine if the response satisfies this criterion.
+
+Output a JSON object with:
+- "satisfied": true or false
+- "reasoning": Brief explanation (1-2 sentences) of why it does or doesn't satisfy the criterion
+
+Be strict but fair. The criterion must be clearly satisfied for you to answer true."""
+
+
+class RubricData(BaseModel):
+    """Rubric data loaded from file."""
+
+    title: str
+    description: str
+    weight: int
+
+
+def load_rubrics_from_file(rubric_file: str) -> Dict[str, List[RubricData]]:
+    """
+    Load rubrics from JSONL file and index by question.
+
+    Args:
+        rubric_file: Path to rubric JSONL file
+
+    Returns:
+        Dictionary mapping questions to their rubrics
+    """
+    rubrics_by_question = {}
+
+    with open(rubric_file, "r") as f:
+        for line in f:
+            entry = json.loads(line)
+            question = entry["question"]
+
+            # Parse rubric JSON string
+            rubric_data = json.loads(entry["rubric"])
+            rubrics = [RubricData(**r) for r in rubric_data["rubrics"]]
+
+            rubrics_by_question[question] = rubrics
+
+    return rubrics_by_question


-def evaluate_single_response(
-    question: str,
-    response: str,
-    correct_answer: str,
-    model: str = "gpt-4o-mini",
-) -> Dict[str, Any]:
+def check_criterion(
+    question: str, response: str, criterion: RubricData, model: str = "gpt-4o-mini"
+) -> CriterionCheck:
     """
-    Evaluate a single response against the ground truth using LLM as judge.
+    Check if response satisfies a single criterion.

     Args:
         question: The question being answered
         response: The response to evaluate
-        correct_answer: The ground truth answer
-        model: The LLM model to use for judging
+        criterion: The rubric criterion to check
+        model: LLM model for judging

     Returns:
-        Dictionary containing the judgement result and metadata
+        CriterionCheck with satisfaction result
     """
-    prompt = GRADER_TEMPLATE.format(
-        question=question, response=response, correct_answer=correct_answer
+    prompt = CRITERION_PROMPT.format(
+        question=question,
+        response=response,
+        criterion_description=criterion.description,
     )

-    # Use litellm with structured output
     llm_response = litellm.completion(
         model=model,
         messages=[
             {
                 "role": "system",
-                "content": "You are an expert judge evaluating answers for accuracy and equivalence.",
+                "content": "You are an expert evaluator for rubric-based assessment.",
             },
             {"role": "user", "content": prompt},
         ],
-        response_format=JudgementResult,
         temperature=0.0,
+        response_format=CriterionCheck,
     )

-    # Parse structured output
-    result: JudgementResult = JudgementResult.model_validate_json(
-        llm_response.choices[0].message.content
-    )
+    result = CriterionCheck.model_validate_json(llm_response.choices[0].message.content)
+
     return result


-def evaluate_dataset(
+def evaluate_with_rubrics(
+    question: str,
+    response: str,
+    reference_answer: str,
+    rubrics: List[RubricData],
+    model: str = "gpt-4o-mini",
+) -> RubricEvaluation:
+    """
+    Evaluate response using RaR-Explicit method (weighted sum).
+
+    Implements Equation 1 from paper:
+    r(x, ŷ) = Σ(w_j * c_j(x, ŷ)) / Σ(w_j)
+
+    Args:
+        question: The question
+        response: Response to evaluate
+        reference_answer: Reference answer (not directly used, but available)
+        rubrics: List of rubric criteria
+        model: LLM model for judging
+
+    Returns:
+        RubricEvaluation with normalized score
+    """
+    # Check each criterion independently
+    checks = []
+    for rubric in rubrics:
+        check = check_criterion(question, response, rubric, model)
+        checks.append(check)
+
+    # Calculate weighted score (Equation 1)
+    # Only positive weights contribute to denominator
+    positive_weights = sum(abs(r.weight) for r in rubrics if r.weight > 0)
+
+    raw_score = 0.0
+    for check in checks:
+        if check.satisfied:
+            raw_score += check.weight
+
+    # Normalize to [0, 1]
+    normalized_score = raw_score / positive_weights if positive_weights > 0 else 0.0
+    # Clip to [0, 1] in case pitfalls make it negative
+    normalized_score = max(0.0, min(1.0, normalized_score))
+
+    return RubricEvaluation(
+        raw_score=raw_score,
+        normalized_score=normalized_score,
+        criterion_checks=checks,
+    )
+
+
+def evaluate_dataset_with_rubrics(
     input_file: str,
-    eval_file: str,
-    output_file: str = "evaluation_results.jsonl",
+    rubric_file: str,
+    ground_truth_file: str,
+    output_file: str = "rubric_evaluation_results.jsonl",
     model: str = "gpt-4o-mini",
-    max_concurrent: int = 30,
-    limit: int = None,
+    max_concurrent: int = 10,
+    limit: Optional[int] = None,
 ) -> None:
     """
-    Evaluate all QA pairs in the input file using LLM as judge.
+    Evaluate all responses using rubric-based assessment.

     Args:
-        input_file: Path to input JSONL file with QA pairs
-        output_file: Path to output JSONL file for results
-        model: The LLM model to use for judging
-        max_concurrent: Maximum number of concurrent threads
-        limit: Optional limit on number of examples to evaluate
+        input_file: Path to JSONL with responses to evaluate
+        rubric_file: Path to JSONL with rubrics (output from generate_rubrics.py)
+        ground_truth_file: Path to JSONL with ground truth answers
+        output_file: Path to output JSONL file
+        model: LLM model for judging
+        max_concurrent: Maximum concurrent evaluations
+        limit: Optional limit on number of examples
     """
-    # Load input data as proper models
-    to_evaluate = [
-        QuestionAndSolution.model_validate_json(line) for line in open(input_file, "r")
-    ]
+    # Load data
+    print(f"Loading responses from {input_file}...")
+    with open(input_file, "r") as f:
+        responses = [json.loads(line) for line in f]
+
+    print(f"Loading rubrics from {rubric_file}...")
+    rubrics_by_question = load_rubrics_from_file(rubric_file)
+
+    print(f"Loading ground truth from {ground_truth_file}...")
+    with open(ground_truth_file, "r") as f:
+        ground_truths = [json.loads(line) for line in f]
+
     if limit:
-        to_evaluate = to_evaluate[:limit]
+        responses = responses[:limit]
+        ground_truths = ground_truths[:limit]

-    print(f"Loaded {len(to_evaluate)} QA pairs to evaluate")
+    print(f"Loaded {len(responses)} responses to evaluate")
+    print(f"Judge model: {model}")

-    # Load ground truth dataset
-    print(f"Loading ground truth from {eval_file}...")
-    with open(eval_file, "r") as f:
-        ground_truths = [QuestionAndSolution.model_validate_json(line) for line in f]
+    # Match responses with rubrics and ground truth
+    evaluation_tasks = []
+    for response_data, gt_data in zip(responses, ground_truths):
+        question = gt_data["question"]

-    print(f"Loaded {len(ground_truths)} ground truths")
+        # Find rubrics for this question
+        rubrics = rubrics_by_question.get(question)
+        if not rubrics:
+            print(f"Warning: No rubrics found for question: {question[:50]}...")
+            continue

-    # Run evaluations in parallel using ThreadPoolExecutor
-    print(f"Running evaluations with {max_concurrent} parallel workers...")
-    results = []
+        evaluation_tasks.append(
+            {
+                "question": question,
+                "response": response_data["solution"],
+                "reference_answer": gt_data["solution"],
+                "rubrics": rubrics,
+                "metadata": {
+                    "discussion_title": response_data.get("discussion_title", ""),
+                    "discussion_url": response_data.get("discussion_url", ""),
+                },
+            }
+        )
+
+    print(
+        f"Running {len(evaluation_tasks)} evaluations with {max_concurrent} parallel workers..."
+    )

+    # Run evaluations in parallel
+    results = []
     with ThreadPoolExecutor(max_workers=max_concurrent) as executor:
         # Submit all tasks
         future_to_idx = {}
-        for idx, (qa_pair, ground_truth) in enumerate(zip(to_evaluate, ground_truths)):
-            question = ground_truth.question
-            ground_truth_answer = ground_truth.solution
-            response = qa_pair.solution
-
+        for idx, task in enumerate(evaluation_tasks):
             future = executor.submit(
-                evaluate_single_response,
-                response=response,
-                question=question,
-                correct_answer=ground_truth_answer,
+                evaluate_with_rubrics,
+                question=task["question"],
+                response=task["response"],
+                reference_answer=task["reference_answer"],
+                rubrics=task["rubrics"],
                 model=model,
             )
             future_to_idx[future] = idx

         # Collect results in order
-        results = [None] * len(to_evaluate)
+        results = [None] * len(evaluation_tasks)
+        completed = 0
         for future in as_completed(future_to_idx):
             idx = future_to_idx[future]
             results[idx] = future.result()
-
-    # Combine results with original data using proper models
-    output_data: list[EvaluatedQuestionAndSolution] = []
-    correct_count = 0
-    error_count = 0
-
-    for qa_pair, result in zip(to_evaluate, results):
-        print(result.model_dump_json())
-
-        # Create proper evaluated model
-        output_entry = EvaluatedQuestionAndSolution(
-            **qa_pair.model_dump(),
-            evaluation=result
+            completed += 1
+            print(f"Completed: {completed}/{len(evaluation_tasks)}", end="\r")
+
+    print()  # New line after progress
+
+    # Combine results with metadata
+    output_data = []
+    total_score = 0.0
+
+    for task, evaluation in zip(evaluation_tasks, results):
+        evaluated_response = EvaluatedResponse(
+            discussion_title=task["metadata"]["discussion_title"],
+            discussion_url=task["metadata"]["discussion_url"],
+            question=task["question"],
+            response=task["response"],
+            reference_answer=task["reference_answer"],
+            evaluation=evaluation,
         )
-        output_data.append(output_entry)
+        output_data.append(evaluated_response)
+        total_score += evaluation.normalized_score

-        if result.correct == Correctness.yes:
-            correct_count += 1
-        else:
-            error_count += 1
-
-    # Write results using proper model serialization
+    # Write results
     print(f"Writing results to {output_file}...")
     with open(output_file, "w") as f:
         for entry in output_data:
             f.write(entry.model_dump_json() + "\n")

     # Print summary
-    total = len(to_evaluate)
-    success_rate = (total - error_count) / total * 100 if total > 0 else 0
-    accuracy = correct_count / total * 100 if total > 0 else 0
-
-    print("\n" + "=" * 50)
-    print("EVALUATION SUMMARY")
-    print("=" * 50)
-    print(f"Total examples: {total}")
-    print(f"Successful evaluations: {total - error_count}")
-    print(f"Errors: {error_count}")
-    print(f"Success rate: {success_rate:.2f}%")
-    print(f"Correct answers: {correct_count}")
-    print(f"Accuracy: {accuracy:.2f}%")
-    print("=" * 50)
-
-
-#
-
-
-def main():
-    """Main entry point for the evaluation script"""
-    evaluate_dataset(
-        input_file="eval/qa_pairs.jsonl",
-        eval_file="eval/qa_pairs.jsonl",
-        output_file="evaluation_results.jsonl",
-        model="gpt-4o-mini",
-        max_concurrent=30,
-        limit=10,  # Set to None to evaluate all, or a number to limit
+    avg_score = total_score / len(output_data) if output_data else 0.0
+
+    print("\n" + "=" * 60)
+    print("RUBRIC-BASED EVALUATION SUMMARY")
+    print("=" * 60)
+    print(f"Total examples: {len(output_data)}")
+    print(f"Judge model: {model}")
+    print(f"Average normalized score: {avg_score:.3f}")
+    print(f"Average percentage: {avg_score * 100:.1f}%")
+
+    # Per-criterion statistics
+    total_satisfied = sum(
+        sum(1 for check in eval.evaluation.criterion_checks if check.satisfied)
+        for eval in output_data
     )
+    total_criteria = sum(len(eval.evaluation.criterion_checks) for eval in output_data)
+    satisfaction_rate = total_satisfied / total_criteria if total_criteria > 0 else 0.0
+    print(f"Criteria satisfaction rate: {satisfaction_rate * 100:.1f}%")
+
+    print("=" * 60)


 if __name__ == "__main__":
-    main()
+    evaluate_dataset_with_rubrics(
+        input_file="eval/qa_pairs_accepted.jsonl",
+        rubric_file="eval/qa_rubrics.jsonl",
+        ground_truth_file="eval/qa_pairs_accepted.jsonl",
+        output_file="rubric_evaluation.jsonl",
+        model="gpt-4o-mini",
+        max_concurrent=10,
+        limit=30,  # Set to None to evaluate all
+    )
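
To make the new scoring concrete: evaluate_with_rubrics reduces Equation 1 to arithmetic over (weight, satisfied) pairs. A minimal sketch with hypothetical criterion outcomes; note that only positive weights enter the denominator, so a triggered pitfall can pull the raw score down before the final clip to [0, 1]:

# Hypothetical outcomes: Essential (5) and Important (3) satisfied,
# Optional (1) missed, Pitfall (-2) triggered.
checks = [(5, True), (3, True), (1, False), (-2, True)]

positive_weights = sum(w for w, _ in checks if w > 0)       # 5 + 3 + 1 = 9
raw_score = sum(w for w, satisfied in checks if satisfied)  # 5 + 3 - 2 = 6
normalized = max(0.0, min(1.0, raw_score / positive_weights))
print(raw_score, normalized)  # 6 0.6666666666666666
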
eval/generate_rubrics.py ADDED
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+"""
+Rubric Generation Script for HF-Agent Benchmark
+
+Generates instance-specific evaluation rubrics following the "Rubrics as Rewards" paper.
+Uses LiteLLM to call LLM models for rubric synthesis with expert grounding via reference answers.
+"""
+
+import argparse
+import json
+import os
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Any, Dict, List
+
+import litellm
+import pandas as pd
+from dotenv import load_dotenv
+from pydantic import BaseModel
+
+
+class Rubric(BaseModel):
+    title: str
+    description: str
+    weight: int
+
+
+class RubricList(BaseModel):
+    rubrics: List[Rubric]
+
+
+# Load environment variables
+load_dotenv()
+
+# Rubric generation prompt template based on the RaR paper
+
+
+PROMPT_TEMPLATE = """You are an expert rubric writer. Your job is to generate a self-contained set of evaluation criteria ("rubrics") for judging
+how good, helpful and complete an agent's trajectory is for a given user question/request.
+
+Rubrics can cover aspects of a response such as, but not limited to, factual correctness, helpfulness, completeness, harmlessness, correctness of using Hugging Face best practices (based on HF documentation), depth of
+reasoning, contextual relevance and usefulness. Each item must be self-contained: non-expert readers should not need to
+infer anything or consult external information. Begin each description with its category: "Essential Criteria: ...", "Important
+Criteria: ...", "Optional Criteria: ...", or "Pitfall Criteria: Does not mention ...".
+
+Inputs:
+- question: <<<{question}>>>
+- reference_answer (ideal solution): <<<{reference_answer}>>>
+- thread: <<<{thread}>>>
+
+Total items:
+• Choose 7–20 rubric items based on the complexity of the question.
+
+Each rubric item:
+• title (2–4 words).
+• description: One sentence starting with its category prefix that explicitly states exactly what to look for. For example:
+  – Essential Criteria: Writes an up-to-date, correct, complete and working training loop using the latest Hugging Face best practices. Launches the training with hf-jobs.
+  – Pitfall Criteria: Deprecated launcher usage. Uses python -m torch.distributed.launch instead of torchrun / accelerate.
+  – Important Criteria: Explains common DDP knobs. Mentions ddp_find_unused_parameters=False for models with conditional branches; optional ddp_timeout; brief note on when they matter and why.
+  – Optional Criteria: Briefly notes --deepspeed ds_config.json as an alternative scaler when models get big (but stays on DDP for this Q).
+• weight: For Essential/Important/Optional, use 1–5 (5 = most important); for Pitfall, use -1 or -2.
+
+Category guidance:
+• Essential: Critical actions to answer/complete the user's question/request; if missing, the response is invalid and useless (weight 5).
+• Important: Key reasoning, completeness, or clarity; strongly affects quality and usefulness (weight 3–4).
+• Optional: Helpfulness in educating the user or providing extra depth; nice to have but not deal-breaking (weight 1–2).
+• Pitfall: Common mistakes or omissions specific to this prompt; identify things a respondent often forgets or misstates.
+  Each Pitfall description must begin with "Pitfall Criteria: Does not mention ..." or "Pitfall Criteria: Recommends ..."
+  and use weight -1 or -2.
+
+To ensure self-contained guidance:
+• When referring to answer choices, explicitly say "Identifies (A)", "Identifies (B)", etc., rather than vague phrasing.
+• If the format requires an action like calling a tool or launching a training run, include a rubric item such as:
+  – Essential Criteria: Includes a clear statement "Launches the training with hf-jobs.".
+• If reasoning should precede the answer, include a rubric like:
+  – Important Criteria: Presents the explanation and reasoning before stating the final answer.
+• If brevity is valued, include a rubric like:
+  – Optional Criteria: Remains concise and avoids unnecessary detail.
+• If the question context demands mention of specific findings/best practices, include that explicitly (e.g., "Essential Criteria: Mentions
+  that training data must be in the "messages" column for LLM training").
+
+Output: Provide a JSON array of rubric objects. Each object must contain exactly three keys—title, description, and weight.
+Do not copy large blocks of the question or reference_answer into the text. Each description must begin with its category
+prefix, and no extra keys are allowed.
+Now, given the question, thread and reference_answer, generate the rubric as described. The reference answer is a good and helpful response
+but not necessarily exhaustive; use it only as guidance."""
+
+
+def build_prompt(
+    question: str, reference_answer: str, thread: List[Dict[str, str]]
+) -> List[Dict[str, str]]:
+    """
+    Build the messages list for LiteLLM completion.
+
+    Args:
+        question: The question/task to evaluate
+        reference_answer: The reference/accepted solution
+        thread: The discussion thread providing context
+
+    Returns:
+        List of message dicts for LiteLLM
+    """
+    prompt = PROMPT_TEMPLATE.format(
+        question=question, reference_answer=reference_answer, thread=thread
+    )
+
+    return [{"role": "user", "content": prompt}]
+
+
+def validate_rubric(rubric_list: List[Dict[str, Any]]) -> bool:
+    """
+    Validate that rubric meets basic requirements.
+
+    Args:
+        rubric_list: List of rubric items to validate
+
+    Returns:
+        True if valid, False otherwise
+    """
+    # Check count
+    if not (7 <= len(rubric_list) <= 20):
+        return False
+
+    # Check each item
+    category_prefixes = [
+        "Essential Criteria:",
+        "Important Criteria:",
+        "Optional Criteria:",
+        "Pitfall Criteria:",
+    ]
+
+    for item in rubric_list:
+        # Check keys
+        if set(item.keys()) != {"title", "description", "weight"}:
+            return False
+
+        # Check description starts with category prefix
+        if not any(
+            item["description"].startswith(prefix) for prefix in category_prefixes
+        ):
+            return False
+
+    return True
+
+
+def generate_rubric(row: pd.Series, model: str, timeout: int = 120) -> Dict[str, Any]:
+    """
+    Generate rubric for a single question using LiteLLM.
+
+    Args:
+        row: Input row with "question", "solution", and "thread" fields
+        model: Model name for LiteLLM
+        timeout: Request timeout in seconds
+
+    Returns:
+        JSON string of the generated RubricList, or None on failure
+    """
+
+    messages = build_prompt(row["question"], row["solution"], row["thread"])
+
+    try:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+            timeout=timeout,
+            response_format=RubricList,
+        )
+
+        # Parse structured output
+        rubric_list: RubricList = RubricList.model_validate_json(
+            response.choices[0].message.content
+        )
+
+        return rubric_list.model_dump_json()
+    except Exception as e:
+        print(f"Error generating rubric: {e}", file=sys.stderr)
+        return None
+
+
+def load_input_data(infile: str) -> pd.DataFrame:
+    """
+    Load input data from CSV or JSONL file.
+
+    Args:
+        infile: Path to input file
+
+    Returns:
+        DataFrame with loaded data
+    """
+    path = Path(infile)
+
+    if not path.exists():
+        raise FileNotFoundError(f"Input file not found: {infile}")
+
+    if path.suffix == ".csv":
+        # Try to auto-detect delimiter (comma or semicolon)
+        df = pd.read_csv(infile, sep=None, engine="python")
+    elif path.suffix == ".jsonl":
+        df = pd.read_json(infile, lines=True)
+    else:
+        raise ValueError(f"Unsupported file format: {path.suffix}. Use .csv or .jsonl")
+
+    # Validate required columns
+    required_cols = [
+        "discussion_title",
+        "discussion_url",
+        "question",
+        "thread",
+        "solution",
+    ]
+    missing_cols = [col for col in required_cols if col not in df.columns]
+
+    if missing_cols:
+        raise ValueError(f"Missing required columns: {missing_cols}")
+
+    return df
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate rubrics for HF-agent benchmark evaluation"
+    )
+    parser.add_argument(
+        "--infile", type=str, required=True, help="Input file path (.csv or .jsonl)"
+    )
+    parser.add_argument(
+        "--outfile", type=str, required=True, help="Output JSONL file path"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="anthropic/claude-sonnet-4-5-20250929",
+        help="LiteLLM model name (falls back to LITELLM_MODEL env or gpt-4o-mini)",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=120,
+        help="Request timeout in seconds (default: 120)",
+    )
+    parser.add_argument(
+        "--max-concurrent",
+        type=int,
+        default=30,
+        help="Maximum number of concurrent workers (default: 30)",
+    )
+
+    args = parser.parse_args()
+
+    # Determine model
+    model = args.model or os.getenv("LITELLM_MODEL", "gpt-4o-mini")
+    print(f"Using model: {model}")
+
+    # Load input data
+    print(f"Loading data from {args.infile}...")
+    df = load_input_data(args.infile)
+    print(f"Loaded {len(df)} examples")
+
+    # Run rubric generation in parallel using ThreadPoolExecutor
+    print(f"Running generation with {args.max_concurrent} parallel workers...")
+
+    with ThreadPoolExecutor(max_workers=args.max_concurrent) as executor:
+        # Submit all tasks
+        future_to_idx = {}
+        for idx, row in df.iterrows():
+            future = executor.submit(
+                generate_rubric,
+                row=row,
+                model=model,
+                timeout=args.timeout,
+            )
+            future_to_idx[future] = idx
+
+        # Collect results in order
+        results = [None] * len(df)
+        completed = 0
+        for future in as_completed(future_to_idx):
+            idx = future_to_idx[future]
+            results[idx] = future.result()
+            completed += 1
+            print(f"Completed: {completed}/{len(df)}", end="\r")
+
+    print()  # New line after progress
+
+    # Write results to file
+    print(f"Writing results to {args.outfile}...")
+    success_count = 0
+    failure_count = 0
+
+    with open(args.outfile, "w") as outf:
+        for idx, (_, row) in enumerate(df.iterrows()):
+            rubric_result = results[idx]
+
+            if rubric_result is None:
+                failure_count += 1
+                continue
+
+            # Merge with original data
+            output_row = row.to_dict()
+            output_row["rubric"] = rubric_result
+
+            # Write JSONL line
+            outf.write(json.dumps(output_row, default=str) + "\n")
+            success_count += 1
+
+    print("\nComplete!")
+    print(f"Success: {success_count}/{len(df)}")
+    print(f"Failures: {failure_count}/{len(df)}")
+    print(f"Output written to: {args.outfile}")
+
+
+if __name__ == "__main__":
+    main()
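
For orientation, here is a sketch of one JSONL record this script writes to --outfile, in the shape that load_rubrics_from_file in eval/evaluate.py parses back (all field values are illustrative, not taken from the repo; the key detail is that "rubric" is a JSON string wrapping a {"rubrics": [...]} object, mirroring RubricList.model_dump_json()):

import json

record = {
    "discussion_title": "...",  # carried over from the input row
    "discussion_url": "...",
    "question": "...",
    "thread": "...",
    "solution": "...",  # the reference answer used for grounding
    "rubric": json.dumps(
        {
            "rubrics": [
                {"title": "Correct launcher", "description": "Essential Criteria: ...", "weight": 5},
                {"title": "Deprecated launcher", "description": "Pitfall Criteria: Recommends ...", "weight": -1},
            ]
        }
    ),
}

# evaluate.py re-parses each line with json.loads(entry["rubric"])["rubrics"].
print(json.dumps(record))
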
eval/qa_pairs_accepted.csv ADDED
The diff for this file is too large to render. See raw diff
 
evaluation_results.jsonl CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -10,4 +10,6 @@ dependencies = [
     "pydantic>=2.12.3",
     "litellm>=1.0.0",
     "tenacity>=8.0.0",
+    "pandas>=2.3.3",
+    "python-dotenv>=1.2.1",
 ]
uv.lock CHANGED
@@ -401,7 +401,9 @@ source = { virtual = "." }
 dependencies = [
     { name = "litellm" },
     { name = "numpy" },
+    { name = "pandas" },
     { name = "pydantic" },
+    { name = "python-dotenv" },
     { name = "requests" },
     { name = "tenacity" },
 ]
@@ -410,7 +412,9 @@ dependencies = [
 requires-dist = [
     { name = "litellm", specifier = ">=1.0.0" },
     { name = "numpy", specifier = ">=1.24.0" },
+    { name = "pandas", specifier = ">=2.3.3" },
     { name = "pydantic", specifier = ">=2.12.3" },
+    { name = "python-dotenv", specifier = ">=1.2.1" },
     { name = "requests", specifier = ">=2.32.5" },
     { name = "tenacity", specifier = ">=8.0.0" },
 ]
@@ -897,6 +901,53 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" },
 ]

+[[package]]
+name = "pandas"
+version = "2.3.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "python-dateutil" },
+    { name = "pytz" },
+    { name = "tzdata" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
+    { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
+    { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
+    { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/4b/18b035ee18f97c1040d94debd8f2e737000ad70ccc8f5513f4eefad75f4b/pandas-2.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713", size = 11544671, upload-time = "2025-09-29T23:21:05.024Z" },
+    { url = "https://files.pythonhosted.org/packages/31/94/72fac03573102779920099bcac1c3b05975c2cb5f01eac609faf34bed1ca/pandas-2.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8", size = 10680807, upload-time = "2025-09-29T23:21:15.979Z" },
+    { url = "https://files.pythonhosted.org/packages/16/87/9472cf4a487d848476865321de18cc8c920b8cab98453ab79dbbc98db63a/pandas-2.3.3-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d", size = 11709872, upload-time = "2025-09-29T23:21:27.165Z" },
+    { url = "https://files.pythonhosted.org/packages/15/07/284f757f63f8a8d69ed4472bfd85122bd086e637bf4ed09de572d575a693/pandas-2.3.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac", size = 12306371, upload-time = "2025-09-29T23:21:40.532Z" },
+    { url = "https://files.pythonhosted.org/packages/33/81/a3afc88fca4aa925804a27d2676d22dcd2031c2ebe08aabd0ae55b9ff282/pandas-2.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c", size = 12765333, upload-time = "2025-09-29T23:21:55.77Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/0f/b4d4ae743a83742f1153464cf1a8ecfafc3ac59722a0b5c8602310cb7158/pandas-2.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493", size = 13418120, upload-time = "2025-09-29T23:22:10.109Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/c7/e54682c96a895d0c808453269e0b5928a07a127a15704fedb643e9b0a4c8/pandas-2.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee", size = 10993991, upload-time = "2025-09-29T23:25:04.889Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/ca/3f8d4f49740799189e1395812f3bf23b5e8fc7c190827d55a610da72ce55/pandas-2.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5", size = 12048227, upload-time = "2025-09-29T23:22:24.343Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/5a/f43efec3e8c0cc92c4663ccad372dbdff72b60bdb56b2749f04aa1d07d7e/pandas-2.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21", size = 11411056, upload-time = "2025-09-29T23:22:37.762Z" },
+    { url = "https://files.pythonhosted.org/packages/46/b1/85331edfc591208c9d1a63a06baa67b21d332e63b7a591a5ba42a10bb507/pandas-2.3.3-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78", size = 11645189, upload-time = "2025-09-29T23:22:51.688Z" },
+    { url = "https://files.pythonhosted.org/packages/44/23/78d645adc35d94d1ac4f2a3c4112ab6f5b8999f4898b8cdf01252f8df4a9/pandas-2.3.3-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110", size = 12121912, upload-time = "2025-09-29T23:23:05.042Z" },
+    { url = "https://files.pythonhosted.org/packages/53/da/d10013df5e6aaef6b425aa0c32e1fc1f3e431e4bcabd420517dceadce354/pandas-2.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86", size = 12712160, upload-time = "2025-09-29T23:23:28.57Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/17/e756653095a083d8a37cbd816cb87148debcfcd920129b25f99dd8d04271/pandas-2.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc", size = 13199233, upload-time = "2025-09-29T23:24:24.876Z" },
+    { url = "https://files.pythonhosted.org/packages/04/fd/74903979833db8390b73b3a8a7d30d146d710bd32703724dd9083950386f/pandas-2.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0", size = 11540635, upload-time = "2025-09-29T23:25:52.486Z" },
+    { url = "https://files.pythonhosted.org/packages/21/00/266d6b357ad5e6d3ad55093a7e8efc7dd245f5a842b584db9f30b0f0a287/pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593", size = 10759079, upload-time = "2025-09-29T23:26:33.204Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/05/d01ef80a7a3a12b2f8bbf16daba1e17c98a2f039cbc8e2f77a2c5a63d382/pandas-2.3.3-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c", size = 11814049, upload-time = "2025-09-29T23:27:15.384Z" },
+    { url = "https://files.pythonhosted.org/packages/15/b2/0e62f78c0c5ba7e3d2c5945a82456f4fac76c480940f805e0b97fcbc2f65/pandas-2.3.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b", size = 12332638, upload-time = "2025-09-29T23:27:51.625Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/33/dd70400631b62b9b29c3c93d2feee1d0964dc2bae2e5ad7a6c73a7f25325/pandas-2.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6", size = 12886834, upload-time = "2025-09-29T23:28:21.289Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/18/b5d48f55821228d0d2692b34fd5034bb185e854bdb592e9c640f6290e012/pandas-2.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3", size = 13409925, upload-time = "2025-09-29T23:28:58.261Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/3d/124ac75fcd0ecc09b8fdccb0246ef65e35b012030defb0e0eba2cbbbe948/pandas-2.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5", size = 11109071, upload-time = "2025-09-29T23:32:27.484Z" },
+    { url = "https://files.pythonhosted.org/packages/89/9c/0e21c895c38a157e0faa1fb64587a9226d6dd46452cac4532d80c3c4a244/pandas-2.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec", size = 12048504, upload-time = "2025-09-29T23:29:31.47Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/82/b69a1c95df796858777b68fbe6a81d37443a33319761d7c652ce77797475/pandas-2.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7", size = 11410702, upload-time = "2025-09-29T23:29:54.591Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/88/702bde3ba0a94b8c73a0181e05144b10f13f29ebfc2150c3a79062a8195d/pandas-2.3.3-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450", size = 11634535, upload-time = "2025-09-29T23:30:21.003Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/1e/1bac1a839d12e6a82ec6cb40cda2edde64a2013a66963293696bbf31fbbb/pandas-2.3.3-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5", size = 12121582, upload-time = "2025-09-29T23:30:43.391Z" },
+    { url = "https://files.pythonhosted.org/packages/44/91/483de934193e12a3b1d6ae7c8645d083ff88dec75f46e827562f1e4b4da6/pandas-2.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788", size = 12699963, upload-time = "2025-09-29T23:31:10.009Z" },
+    { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
+]
+
 [[package]]
 name = "propcache"
 version = "0.4.1"
@@ -1063,6 +1114,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/2b/c6/db8d13a1f8ab3f1eb08c88bd00fd62d44311e3456d1e85c0e59e0a0376e7/pydantic_core-2.41.4-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8a5028425820731d8c6c098ab642d7b8b999758e24acae03ed38a66eca8335", size = 2139008, upload-time = "2025-10-14T10:23:04.539Z" },
 ]

+[[package]]
+name = "python-dateutil"
+version = "2.9.0.post0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "six" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
+]
+
 [[package]]
 name = "python-dotenv"
 version = "1.2.1"
@@ -1072,6 +1135,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
 ]

+[[package]]
+name = "pytz"
+version = "2025.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" },
+]
+
 [[package]]
 name = "pyyaml"
 version = "6.0.3"
@@ -1315,6 +1387,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
 ]

+[[package]]
+name = "six"
+version = "1.17.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
+]
+
 [[package]]
 name = "sniffio"
 version = "1.3.1"
@@ -1451,6 +1532,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
 ]

+[[package]]
+name = "tzdata"
+version = "2025.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/95/32/1a225d6164441be760d75c2c42e2780dc0873fe382da3e98a2e1e48361e5/tzdata-2025.2.tar.gz", hash = "sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9", size = 196380, upload-time = "2025-03-23T13:54:43.652Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
+]
+
 [[package]]
 name = "urllib3"
 version = "2.5.0"