Spaces:

ycwhencpp
/

train-new

Paused

File size: 8,126 Bytes

5e9fb2f

"""Evaluation results utilities for the `.eval_results/*.yaml` format.

See https://huggingface.co/docs/hub/eval-results for more details.
Specifications are available at https://github.com/huggingface/hub-docs/blob/main/eval_results.yaml.
"""

from dataclasses import dataclass
from typing import Any


@dataclass
class EvalResultEntry:
    """
    Evaluation result entry for the `.eval_results/*.yaml` format.

    Represents evaluation scores stored in model repos that automatically appear on
    the model page and the benchmark dataset's leaderboard.

    For the legacy `model-index` format in `README.md`, use [`EvalResult`] instead.

    See https://huggingface.co/docs/hub/eval-results for more details.

    Args:
        dataset_id (`str`):
            Benchmark dataset ID from the Hub. Example: "cais/hle", "Idavidrein/gpqa".
        task_id (`str`):
            Task identifier within the benchmark. Example: "gpqa_diamond".
        value (`Any`):
            The metric value. Example: 20.90.
        dataset_revision (`str`, *optional*):
            Git SHA of the benchmark dataset.
        verify_token (`str`, *optional*):
            A signature that can be used to prove that evaluation is provably auditable and reproducible.
        date (`str`, *optional*):
            When the evaluation was run (ISO-8601 datetime). Defaults to git commit time.
        source_url (`str`, *optional*):
            Link to the evaluation source (e.g., https://huggingface.co/spaces/SaylorTwift/smollm3-mmlu-pro). Required if `source_name`, `source_user`, or `source_org` is provided.
        source_name (`str`, *optional*):
            Display name for the source. Example: "Eval Logs".
        source_user (`str`, *optional*):
            HF user name for attribution. Example: "celinah".
        source_org (`str`, *optional*):
            HF org name for attribution. Example: "cais".
        notes (`str`, *optional*):
            Details about the evaluation setup. Example: "tools", "no-tools", "chain-of-thought".

    Example:
        ```python
        >>> from huggingface_hub import EvalResultEntry
        >>> # Minimal example with required fields only
        >>> result = EvalResultEntry(
        ...     dataset_id="Idavidrein/gpqa",
        ...     task_id="gpqa_diamond",
        ...     value=0.412,
        ... )
        >>> # Full example with all fields
        >>> result = EvalResultEntry(
        ...     dataset_id="cais/hle",
        ...     task_id="default",
        ...     value=20.90,
        ...     dataset_revision="5503434ddd753f426f4b38109466949a1217c2bb",
        ...     verify_token="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
        ...     date="2025-01-15T10:30:00Z",
        ...     source_url="https://huggingface.co/datasets/cais/hle",
        ...     source_name="CAIS HLE",
        ...     source_org="cais",
        ...     notes="no-tools",
        ... )

        ```
    """

    dataset_id: str
    task_id: str
    value: Any
    dataset_revision: str | None = None
    verify_token: str | None = None
    date: str | None = None
    source_url: str | None = None
    source_name: str | None = None
    source_user: str | None = None
    source_org: str | None = None
    notes: str | None = None

    def __post_init__(self) -> None:
        if (
            self.source_name is not None or self.source_user is not None or self.source_org is not None
        ) and self.source_url is None:
            raise ValueError(
                "If `source_name`, `source_user`, or `source_org` is provided, `source_url` must also be provided."
            )


def eval_result_entries_to_yaml(entries: list[EvalResultEntry]) -> list[dict[str, Any]]:
    """Serialize [`EvalResultEntry`] objects into a list of plain dicts that can be dumped to YAML.

    The output follows the layout expected in `.eval_results/*.yaml` files. Optional
    fields that are `None` are left out. The `source` object is only emitted when
    `source_url` is set.

    Args:
        entries (`list[EvalResultEntry]`):
            Evaluation result entries to serialize.

    Returns:
        `list[dict[str, Any]]`: One dictionary per entry, in the same order as `entries`.

    Example:
        ```python
        >>> from huggingface_hub import EvalResultEntry, eval_result_entries_to_yaml
        >>> entries = [
        ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
        ...     EvalResultEntry(dataset_id="Idavidrein/gpqa", task_id="gpqa_diamond", value=0.412),
        ... ]
        >>> yaml_data = eval_result_entries_to_yaml(entries)
        >>> yaml_data[0]
        {'dataset': {'id': 'cais/hle', 'task_id': 'default'}, 'value': 20.9}

        ```

        To upload eval results to the Hub:
        ```python
        >>> import yaml
        >>> from huggingface_hub import upload_file, EvalResultEntry, eval_result_entries_to_yaml
        >>> entries = [
        ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
        ... ]
        >>> yaml_content = yaml.dump(eval_result_entries_to_yaml(entries))
        >>> upload_file(
        ...     path_or_fileobj=yaml_content.encode(),
        ...     path_in_repo=".eval_results/hle.yaml",
        ...     repo_id="your-username/your-model",
        ... )

        ```
    """
    serialized: list[dict[str, Any]] = []
    for entry in entries:
        # "dataset" object: id and task_id are always present, revision is optional.
        dataset_block: dict[str, Any] = {"id": entry.dataset_id, "task_id": entry.task_id}
        if entry.dataset_revision is not None:
            dataset_block["revision"] = entry.dataset_revision

        record: dict[str, Any] = {"dataset": dataset_block, "value": entry.value}

        # Optional top-level keys that are written before the "source" object.
        for key, field_value in (("verifyToken", entry.verify_token), ("date", entry.date)):
            if field_value is not None:
                record[key] = field_value

        # "source" object: only written when a URL is available.
        if entry.source_url is not None:
            source_block: dict[str, Any] = {"url": entry.source_url}
            attribution = (
                ("name", entry.source_name),
                ("user", entry.source_user),
                ("org", entry.source_org),
            )
            for key, field_value in attribution:
                if field_value is not None:
                    source_block[key] = field_value
            record["source"] = source_block

        if entry.notes is not None:
            record["notes"] = entry.notes

        serialized.append(record)
    return serialized


def parse_eval_result_entries(data: list[dict[str, Any]]) -> list[EvalResultEntry]:
    """Build [`EvalResultEntry`] objects from a list of dicts.

    The input follows the `.eval_results/*.yaml` format. For the legacy `model-index`
    format, use [`model_index_to_eval_results`] instead.

    When an item has a `"data"` key, the value under that key is used as the entry
    payload; otherwise the item itself is the payload.

    Args:
        data (`list[dict[str, Any]]`):
            A list of dictionaries (e.g., parsed from YAML or API response).

    Returns:
        `list[EvalResultEntry]`: One evaluation result entry per input item, in input order.

    Raises:
        `KeyError`: If a payload lacks `"value"`, or its `"dataset"` object lacks `"id"` or `"task_id"`.

    Example:
        ```python
        >>> from huggingface_hub import parse_eval_result_entries
        >>> data = [
        ...     {"dataset": {"id": "cais/hle", "task_id": "default"}, "value": 20.90},
        ...     {"dataset": {"id": "Idavidrein/gpqa", "task_id": "gpqa_diamond"}, "value": 0.412},
        ... ]
        >>> entries = parse_eval_result_entries(data)
        >>> entries[0].dataset_id
        'cais/hle'
        >>> entries[0].value
        20.9

        ```
    """
    parsed: list[EvalResultEntry] = []
    for raw_item in data:
        payload = raw_item.get("data", raw_item)
        dataset_info = payload.get("dataset", {})
        # A missing, null or empty "source" all mean "no source information".
        source_info = payload.get("source") or {}
        parsed.append(
            EvalResultEntry(
                dataset_id=dataset_info["id"],
                value=payload["value"],
                task_id=dataset_info["task_id"],
                dataset_revision=dataset_info.get("revision"),
                verify_token=payload.get("verifyToken"),
                date=payload.get("date"),
                source_url=source_info.get("url"),
                source_name=source_info.get("name"),
                source_user=source_info.get("user"),
                source_org=source_info.get("org"),
                notes=payload.get("notes"),
            )
        )
    return parsed