| """Evaluation results utilities for the `.eval_results/*.yaml` format. |
| |
| See https://huggingface.co/docs/hub/eval-results for more details. |
| Specifications are available at https://github.com/huggingface/hub-docs/blob/main/eval_results.yaml. |
| """ |
|
|
| from dataclasses import dataclass |
| from typing import Any |
|
|
|
|
@dataclass
class EvalResultEntry:
| """ |
| Evaluation result entry for the `.eval_results/*.yaml` format. |
| |
| Represents evaluation scores stored in model repos that automatically appear on |
| the model page and the benchmark dataset's leaderboard. |
| |
| For the legacy `model-index` format in `README.md`, use [`EvalResult`] instead. |
| |
| See https://huggingface.co/docs/hub/eval-results for more details. |
| |
    Args:
        dataset_id (`str`):
            Benchmark dataset ID on the Hub. Examples: "cais/hle", "Idavidrein/gpqa".
        task_id (`str`):
            Task identifier within the benchmark. Example: "gpqa_diamond".
        value (`Any`):
            The metric value. Example: 20.90.
        dataset_revision (`str`, *optional*):
            Git SHA of the benchmark dataset.
        verify_token (`str`, *optional*):
            A signature used to prove that the evaluation is auditable and reproducible.
        date (`str`, *optional*):
            When the evaluation was run (ISO-8601 datetime). Defaults to the git commit time.
        source_url (`str`, *optional*):
            Link to the evaluation source (e.g., https://huggingface.co/spaces/SaylorTwift/smollm3-mmlu-pro).
            Required if `source_name`, `source_user`, or `source_org` is provided.
        source_name (`str`, *optional*):
            Display name for the source. Example: "Eval Logs".
        source_user (`str`, *optional*):
            HF username for attribution. Example: "celinah".
        source_org (`str`, *optional*):
            HF organization name for attribution. Example: "cais".
        notes (`str`, *optional*):
            Details about the evaluation setup. Examples: "tools", "no-tools", "chain-of-thought".

    Example:
    ```python
    >>> from huggingface_hub import EvalResultEntry
    >>> # Minimal example with required fields only
    >>> result = EvalResultEntry(
    ...     dataset_id="Idavidrein/gpqa",
    ...     task_id="gpqa_diamond",
    ...     value=0.412,
    ... )
    >>> # Full example with optional fields
    >>> result = EvalResultEntry(
    ...     dataset_id="cais/hle",
    ...     task_id="default",
    ...     value=20.90,
    ...     dataset_revision="5503434ddd753f426f4b38109466949a1217c2bb",
    ...     verify_token="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
    ...     date="2025-01-15T10:30:00Z",
    ...     source_url="https://huggingface.co/datasets/cais/hle",
    ...     source_name="CAIS HLE",
    ...     source_org="cais",
    ...     notes="no-tools",
    ... )
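    >>> # Providing `source_name`/`source_user`/`source_org` without `source_url` raises
    >>> EvalResultEntry(
    ...     dataset_id="cais/hle",
    ...     task_id="default",
    ...     value=20.90,
    ...     source_name="Eval Logs",
    ... )
    Traceback (most recent call last):
        ...
    ValueError: If `source_name`, `source_user`, or `source_org` is provided, `source_url` must also be provided.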

    ```
    """

    dataset_id: str
    task_id: str
    value: Any
    dataset_revision: str | None = None
    verify_token: str | None = None
    date: str | None = None
    source_url: str | None = None
    source_name: str | None = None
    source_user: str | None = None
    source_org: str | None = None
    notes: str | None = None

    def __post_init__(self) -> None:
        if (
            self.source_name is not None or self.source_user is not None or self.source_org is not None
        ) and self.source_url is None:
            raise ValueError(
                "If `source_name`, `source_user`, or `source_org` is provided, `source_url` must also be provided."
            )


def eval_result_entries_to_yaml(entries: list[EvalResultEntry]) -> list[dict[str, Any]]:
| """Convert a list of [`EvalResultEntry`] objects to a YAML-serializable list of dicts. |
| |
| This produces the format expected in `.eval_results/*.yaml` files. |
| |
| Args: |
| entries (`list[EvalResultEntry]`): |
| List of evaluation result entries to serialize. |
| |
| Returns: |
| `list[dict[str, Any]]`: A list of dictionaries ready to be dumped to YAML. |
| |
| Example: |
    ```python
    >>> from huggingface_hub import EvalResultEntry, eval_result_entries_to_yaml
    >>> entries = [
    ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
    ...     EvalResultEntry(dataset_id="Idavidrein/gpqa", task_id="gpqa_diamond", value=0.412),
    ... ]
    >>> yaml_data = eval_result_entries_to_yaml(entries)
    >>> yaml_data[0]
    {'dataset': {'id': 'cais/hle', 'task_id': 'default'}, 'value': 20.9}

    ```
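
    Optional fields are nested in the output: `dataset_revision` under the `dataset` key, and the
    `source_*` fields under a `source` key. A short sketch of that shape:
    ```python
    >>> entry = EvalResultEntry(
    ...     dataset_id="cais/hle",
    ...     task_id="default",
    ...     value=20.90,
    ...     source_url="https://huggingface.co/datasets/cais/hle",
    ...     source_org="cais",
    ... )
    >>> eval_result_entries_to_yaml([entry])[0]["source"]
    {'url': 'https://huggingface.co/datasets/cais/hle', 'org': 'cais'}

    ```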

    To upload eval results to the Hub:
    ```python
    >>> import yaml
    >>> from huggingface_hub import upload_file, EvalResultEntry, eval_result_entries_to_yaml
    >>> entries = [
    ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
    ... ]
    >>> yaml_content = yaml.dump(eval_result_entries_to_yaml(entries))
    >>> upload_file(
    ...     path_or_fileobj=yaml_content.encode(),
    ...     path_in_repo=".eval_results/hle.yaml",
    ...     repo_id="your-username/your-model",
    ... )

    ```
    """
    result = []
    for entry in entries:
        # Dataset info (and the optional revision) is nested under a `dataset` key.
        dataset: dict[str, Any] = {"id": entry.dataset_id, "task_id": entry.task_id}
        if entry.dataset_revision is not None:
            dataset["revision"] = entry.dataset_revision

        data: dict[str, Any] = {"dataset": dataset, "value": entry.value}
        if entry.verify_token is not None:
            data["verifyToken"] = entry.verify_token
        if entry.date is not None:
            data["date"] = entry.date

        # Source attribution is nested under a `source` key, anchored on `source_url`.
        if entry.source_url is not None:
            source: dict[str, Any] = {"url": entry.source_url}
            if entry.source_name is not None:
                source["name"] = entry.source_name
            if entry.source_user is not None:
                source["user"] = entry.source_user
            if entry.source_org is not None:
                source["org"] = entry.source_org
            data["source"] = source
        if entry.notes is not None:
            data["notes"] = entry.notes

        result.append(data)
    return result


def parse_eval_result_entries(data: list[dict[str, Any]]) -> list[EvalResultEntry]:
| """Parse a list of dicts into [`EvalResultEntry`] objects. |
| |
| This parses the `.eval_results/*.yaml` format. For the legacy `model-index` format, |
| use [`model_index_to_eval_results`] instead. |
| |
| Args: |
| data (`list[dict[str, Any]]`): |
| A list of dictionaries (e.g., parsed from YAML or API response). |
| |
| Returns: |
| `list[EvalResultEntry]`: A list of evaluation result entry objects. |
| |
| Example: |
    ```python
    >>> from huggingface_hub import parse_eval_result_entries
    >>> data = [
    ...     {"dataset": {"id": "cais/hle", "task_id": "default"}, "value": 20.90},
    ...     {"dataset": {"id": "Idavidrein/gpqa", "task_id": "gpqa_diamond"}, "value": 0.412},
    ... ]
    >>> entries = parse_eval_result_entries(data)
    >>> entries[0].dataset_id
    'cais/hle'
    >>> entries[0].value
    20.9
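    >>> # Round-trip: serializing and re-parsing yields equal entries
    >>> from huggingface_hub import eval_result_entries_to_yaml
    >>> parse_eval_result_entries(eval_result_entries_to_yaml(entries)) == entries
    True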

    ```
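
    To read eval results back from a model repo (a sketch; the repo id is a placeholder and the
    `.eval_results/hle.yaml` file is assumed to exist in the repo):
    ```python
    >>> import yaml
    >>> from huggingface_hub import hf_hub_download
    >>> path = hf_hub_download(repo_id="your-username/your-model", filename=".eval_results/hle.yaml")
    >>> with open(path) as f:
    ...     entries = parse_eval_result_entries(yaml.safe_load(f))

    ```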
| """ |
    entries = []
    for item in data:
        # API responses may wrap each entry under a `data` key; raw YAML entries are not wrapped.
        entry_data = item.get("data", item)
        dataset = entry_data.get("dataset", {})
        source = entry_data.get("source", {})
        entry = EvalResultEntry(
            dataset_id=dataset["id"],
            task_id=dataset["task_id"],
            value=entry_data["value"],
            dataset_revision=dataset.get("revision"),
            verify_token=entry_data.get("verifyToken"),
            date=entry_data.get("date"),
            source_url=source.get("url") if source else None,
            source_name=source.get("name") if source else None,
            source_user=source.get("user") if source else None,
            source_org=source.get("org") if source else None,
            notes=entry_data.get("notes"),
        )
        entries.append(entry)
    return entries