File size: 8,126 Bytes
5e9fb2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
"""Evaluation results utilities for the `.eval_results/*.yaml` format.

See https://huggingface.co/docs/hub/eval-results for more details.
Specifications are available at https://github.com/huggingface/hub-docs/blob/main/eval_results.yaml.
"""

from dataclasses import dataclass
from typing import Any


@dataclass
class EvalResultEntry:
    """
    Evaluation result entry for the `.eval_results/*.yaml` format.

    Represents evaluation scores stored in model repos that automatically appear on
    the model page and the benchmark dataset's leaderboard.

    For the legacy `model-index` format in `README.md`, use [`EvalResult`] instead.

    See https://huggingface.co/docs/hub/eval-results for more details.

    Args:
        dataset_id (`str`):
            Benchmark dataset ID from the Hub. Example: "cais/hle", "Idavidrein/gpqa".
        task_id (`str`):
            Task identifier within the benchmark. Example: "gpqa_diamond".
        value (`Any`):
            The metric value. Example: 20.90.
        dataset_revision (`str`, *optional*):
            Git SHA of the benchmark dataset.
        verify_token (`str`, *optional*):
            A signature that can be used to prove that evaluation is provably auditable and reproducible.
        date (`str`, *optional*):
            When the evaluation was run (ISO-8601 datetime). Defaults to git commit time.
        source_url (`str`, *optional*):
            Link to the evaluation source (e.g., https://huggingface.co/spaces/SaylorTwift/smollm3-mmlu-pro). Required if `source_name`, `source_user`, or `source_org` is provided.
        source_name (`str`, *optional*):
            Display name for the source. Example: "Eval Logs".
        source_user (`str`, *optional*):
            HF user name for attribution. Example: "celinah".
        source_org (`str`, *optional*):
            HF org name for attribution. Example: "cais".
        notes (`str`, *optional*):
            Details about the evaluation setup. Example: "tools", "no-tools", "chain-of-thought".

    Example:
        ```python
        >>> from huggingface_hub import EvalResultEntry
        >>> # Minimal example with required fields only
        >>> result = EvalResultEntry(
        ...     dataset_id="Idavidrein/gpqa",
        ...     task_id="gpqa_diamond",
        ...     value=0.412,
        ... )
        >>> # Full example with all fields
        >>> result = EvalResultEntry(
        ...     dataset_id="cais/hle",
        ...     task_id="default",
        ...     value=20.90,
        ...     dataset_revision="5503434ddd753f426f4b38109466949a1217c2bb",
        ...     verify_token="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
        ...     date="2025-01-15T10:30:00Z",
        ...     source_url="https://huggingface.co/datasets/cais/hle",
        ...     source_name="CAIS HLE",
        ...     source_org="cais",
        ...     notes="no-tools",
        ... )

        ```
    """

    dataset_id: str
    task_id: str
    value: Any
    dataset_revision: str | None = None
    verify_token: str | None = None
    date: str | None = None
    source_url: str | None = None
    source_name: str | None = None
    source_user: str | None = None
    source_org: str | None = None
    notes: str | None = None

    def __post_init__(self) -> None:
        if (
            self.source_name is not None or self.source_user is not None or self.source_org is not None
        ) and self.source_url is None:
            raise ValueError(
                "If `source_name`, `source_user`, or `source_org` is provided, `source_url` must also be provided."
            )


def eval_result_entries_to_yaml(entries: list[EvalResultEntry]) -> list[dict[str, Any]]:
    """Convert a list of [`EvalResultEntry`] objects to a YAML-serializable list of dicts.

    This produces the format expected in `.eval_results/*.yaml` files.

    Args:
        entries (`list[EvalResultEntry]`):
            List of evaluation result entries to serialize.

    Returns:
        `list[dict[str, Any]]`: A list of dictionaries ready to be dumped to YAML.

    Example:
        ```python
        >>> from huggingface_hub import EvalResultEntry, eval_result_entries_to_yaml
        >>> entries = [
        ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
        ...     EvalResultEntry(dataset_id="Idavidrein/gpqa", task_id="gpqa_diamond", value=0.412),
        ... ]
        >>> yaml_data = eval_result_entries_to_yaml(entries)
        >>> yaml_data[0]
        {'dataset': {'id': 'cais/hle', 'task_id': 'default'}, 'value': 20.9}

        ```

        To upload eval results to the Hub:
        ```python
        >>> import yaml
        >>> from huggingface_hub import upload_file, EvalResultEntry, eval_result_entries_to_yaml
        >>> entries = [
        ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
        ... ]
        >>> yaml_content = yaml.dump(eval_result_entries_to_yaml(entries))
        >>> upload_file(
        ...     path_or_fileobj=yaml_content.encode(),
        ...     path_in_repo=".eval_results/hle.yaml",
        ...     repo_id="your-username/your-model",
        ... )

        ```
    """
    result = []
    for entry in entries:
        # build the dataset object
        dataset: dict[str, Any] = {"id": entry.dataset_id, "task_id": entry.task_id}
        if entry.dataset_revision is not None:
            dataset["revision"] = entry.dataset_revision

        data: dict[str, Any] = {"dataset": dataset, "value": entry.value}
        if entry.verify_token is not None:
            data["verifyToken"] = entry.verify_token
        if entry.date is not None:
            data["date"] = entry.date
        # build the source object
        if entry.source_url is not None:
            source: dict[str, Any] = {"url": entry.source_url}
            if entry.source_name is not None:
                source["name"] = entry.source_name
            if entry.source_user is not None:
                source["user"] = entry.source_user
            if entry.source_org is not None:
                source["org"] = entry.source_org
            data["source"] = source
        if entry.notes is not None:
            data["notes"] = entry.notes

        result.append(data)
    return result


def parse_eval_result_entries(data: list[dict[str, Any]]) -> list[EvalResultEntry]:
    """Parse a list of dicts into [`EvalResultEntry`] objects.

    This parses the `.eval_results/*.yaml` format. For the legacy `model-index` format,
    use [`model_index_to_eval_results`] instead.

    Args:
        data (`list[dict[str, Any]]`):
            A list of dictionaries (e.g., parsed from YAML or API response).

    Returns:
        `list[EvalResultEntry]`: A list of evaluation result entry objects.

    Example:
        ```python
        >>> from huggingface_hub import parse_eval_result_entries
        >>> data = [
        ...     {"dataset": {"id": "cais/hle", "task_id": "default"}, "value": 20.90},
        ...     {"dataset": {"id": "Idavidrein/gpqa", "task_id": "gpqa_diamond"}, "value": 0.412},
        ... ]
        >>> entries = parse_eval_result_entries(data)
        >>> entries[0].dataset_id
        'cais/hle'
        >>> entries[0].value
        20.9

        ```
    """
    entries = []
    for item in data:
        entry_data = item.get("data", item)
        dataset = entry_data.get("dataset", {})
        source = entry_data.get("source", {})
        entry = EvalResultEntry(
            dataset_id=dataset["id"],
            value=entry_data["value"],
            task_id=dataset["task_id"],
            dataset_revision=dataset.get("revision"),
            verify_token=entry_data.get("verifyToken"),
            date=entry_data.get("date"),
            source_url=source.get("url") if source else None,
            source_name=source.get("name") if source else None,
            source_user=source.get("user") if source else None,
            source_org=source.get("org") if source else None,
            notes=entry_data.get("notes"),
        )
        entries.append(entry)
    return entries