| """Evaluation results utilities for the `.eval_results/*.yaml` format. |
| |
| See https://huggingface.co/docs/hub/eval-results for more details. |
| Specifications are available at https://github.com/huggingface/hub-docs/blob/main/eval_results.yaml. |
| """ |
|
|
| from dataclasses import dataclass |
| from typing import Any |
|
|
|
|
@dataclass
class EvalResultEntry:
| """ |
| Evaluation result entry for the `.eval_results/*.yaml` format. |
| |
| Represents evaluation scores stored in model repos that automatically appear on |
| the model page and the benchmark dataset's leaderboard. |
| |
| For the legacy `model-index` format in `README.md`, use [`EvalResult`] instead. |
| |
| See https://huggingface.co/docs/hub/eval-results for more details. |
| |
    Args:
        dataset_id (`str`):
            Benchmark dataset ID on the Hub. Examples: "cais/hle", "Idavidrein/gpqa".
        task_id (`str`):
            Task identifier within the benchmark. Example: "gpqa_diamond".
        value (`Any`):
            The metric value. Example: 20.90.
        dataset_revision (`str`, *optional*):
            Git SHA of the benchmark dataset.
        verify_token (`str`, *optional*):
            A signature used to prove that the evaluation is auditable and reproducible.
        date (`str`, *optional*):
            When the evaluation was run (ISO-8601 datetime). Defaults to the git commit time.
        source_url (`str`, *optional*):
            Link to the evaluation source (e.g., https://huggingface.co/spaces/SaylorTwift/smollm3-mmlu-pro).
            Required if `source_name`, `source_user`, or `source_org` is provided.
        source_name (`str`, *optional*):
            Display name for the source. Example: "Eval Logs".
        source_user (`str`, *optional*):
            HF username for attribution. Example: "celinah".
        source_org (`str`, *optional*):
            HF organization name for attribution. Example: "cais".
        notes (`str`, *optional*):
            Details about the evaluation setup. Examples: "tools", "no-tools", "chain-of-thought".

    Example:
    ```python
    >>> from huggingface_hub import EvalResultEntry
    >>> # Minimal example with required fields only
    >>> result = EvalResultEntry(
    ...     dataset_id="Idavidrein/gpqa",
    ...     task_id="gpqa_diamond",
    ...     value=0.412,
    ... )
    >>> # Full example with optional fields
    >>> result = EvalResultEntry(
    ...     dataset_id="cais/hle",
    ...     task_id="default",
    ...     value=20.90,
    ...     dataset_revision="5503434ddd753f426f4b38109466949a1217c2bb",
    ...     verify_token="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
    ...     date="2025-01-15T10:30:00Z",
    ...     source_url="https://huggingface.co/datasets/cais/hle",
    ...     source_name="CAIS HLE",
    ...     source_org="cais",
    ...     notes="no-tools",
    ... )
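    >>> # Providing `source_name`/`source_user`/`source_org` without `source_url` raises
    >>> EvalResultEntry(
    ...     dataset_id="cais/hle",
    ...     task_id="default",
    ...     value=20.90,
    ...     source_name="Eval Logs",
    ... )
    Traceback (most recent call last):
        ...
    ValueError: If `source_name`, `source_user`, or `source_org` is provided, `source_url` must also be provided.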

    ```
    """

    dataset_id: str
    task_id: str
    value: Any
    dataset_revision: str | None = None
    verify_token: str | None = None
    date: str | None = None
    source_url: str | None = None
    source_name: str | None = None
    source_user: str | None = None
    source_org: str | None = None
    notes: str | None = None

    def __post_init__(self) -> None:
        if (
            self.source_name is not None or self.source_user is not None or self.source_org is not None
        ) and self.source_url is None:
            raise ValueError(
                "If `source_name`, `source_user`, or `source_org` is provided, `source_url` must also be provided."
            )


def eval_result_entries_to_yaml(entries: list[EvalResultEntry]) -> list[dict[str, Any]]:
| """Convert a list of [`EvalResultEntry`] objects to a YAML-serializable list of dicts. |
| |
| This produces the format expected in `.eval_results/*.yaml` files. |
| |
| Args: |
| entries (`list[EvalResultEntry]`): |
| List of evaluation result entries to serialize. |
| |
| Returns: |
| `list[dict[str, Any]]`: A list of dictionaries ready to be dumped to YAML. |
| |
| Example: |
    ```python
    >>> from huggingface_hub import EvalResultEntry, eval_result_entries_to_yaml
    >>> entries = [
    ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
    ...     EvalResultEntry(dataset_id="Idavidrein/gpqa", task_id="gpqa_diamond", value=0.412),
    ... ]
    >>> yaml_data = eval_result_entries_to_yaml(entries)
    >>> yaml_data[0]
    {'dataset': {'id': 'cais/hle', 'task_id': 'default'}, 'value': 20.9}

    ```
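
    Optional fields are nested in the output: `dataset_revision` under the `dataset` key, and the
    `source_*` fields under a `source` key. A short sketch of that shape:
    ```python
    >>> entry = EvalResultEntry(
    ...     dataset_id="cais/hle",
    ...     task_id="default",
    ...     value=20.90,
    ...     source_url="https://huggingface.co/datasets/cais/hle",
    ...     source_org="cais",
    ... )
    >>> eval_result_entries_to_yaml([entry])[0]["source"]
    {'url': 'https://huggingface.co/datasets/cais/hle', 'org': 'cais'}

    ```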

    To upload eval results to the Hub:
    ```python
    >>> import yaml
    >>> from huggingface_hub import upload_file, EvalResultEntry, eval_result_entries_to_yaml
    >>> entries = [
    ...     EvalResultEntry(dataset_id="cais/hle", task_id="default", value=20.90),
    ... ]
    >>> yaml_content = yaml.dump(eval_result_entries_to_yaml(entries))
    >>> upload_file(
    ...     path_or_fileobj=yaml_content.encode(),
    ...     path_in_repo=".eval_results/hle.yaml",
    ...     repo_id="your-username/your-model",
    ... )

    ```
    """
    result = []
    for entry in entries:
        # Dataset info (and the optional revision) is nested under a `dataset` key.
        dataset: dict[str, Any] = {"id": entry.dataset_id, "task_id": entry.task_id}
        if entry.dataset_revision is not None:
            dataset["revision"] = entry.dataset_revision

        data: dict[str, Any] = {"dataset": dataset, "value": entry.value}
        if entry.verify_token is not None:
            data["verifyToken"] = entry.verify_token
        if entry.date is not None:
            data["date"] = entry.date

        # Source attribution is nested under a `source` key, anchored on `source_url`.
        if entry.source_url is not None:
            source: dict[str, Any] = {"url": entry.source_url}
            if entry.source_name is not None:
                source["name"] = entry.source_name
            if entry.source_user is not None:
                source["user"] = entry.source_user
            if entry.source_org is not None:
                source["org"] = entry.source_org
            data["source"] = source
        if entry.notes is not None:
            data["notes"] = entry.notes

        result.append(data)
    return result


def parse_eval_result_entries(data: list[dict[str, Any]]) -> list[EvalResultEntry]:
| """Parse a list of dicts into [`EvalResultEntry`] objects. |
| |
| This parses the `.eval_results/*.yaml` format. For the legacy `model-index` format, |
| use [`model_index_to_eval_results`] instead. |
| |
| Args: |
| data (`list[dict[str, Any]]`): |
| A list of dictionaries (e.g., parsed from YAML or API response). |
| |
| Returns: |
| `list[EvalResultEntry]`: A list of evaluation result entry objects. |
| |
| Example: |
    ```python
    >>> from huggingface_hub import parse_eval_result_entries
    >>> data = [
    ...     {"dataset": {"id": "cais/hle", "task_id": "default"}, "value": 20.90},
    ...     {"dataset": {"id": "Idavidrein/gpqa", "task_id": "gpqa_diamond"}, "value": 0.412},
    ... ]
    >>> entries = parse_eval_result_entries(data)
    >>> entries[0].dataset_id
    'cais/hle'
    >>> entries[0].value
    20.9
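    >>> # Round-trip: serializing and re-parsing yields equal entries
    >>> from huggingface_hub import eval_result_entries_to_yaml
    >>> parse_eval_result_entries(eval_result_entries_to_yaml(entries)) == entries
    True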

    ```
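
    To read eval results back from a model repo (a sketch; the repo id is a placeholder and the
    `.eval_results/hle.yaml` file is assumed to exist in the repo):
    ```python
    >>> import yaml
    >>> from huggingface_hub import hf_hub_download
    >>> path = hf_hub_download(repo_id="your-username/your-model", filename=".eval_results/hle.yaml")
    >>> with open(path) as f:
    ...     entries = parse_eval_result_entries(yaml.safe_load(f))

    ```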
| """ |
    entries = []
    for item in data:
        # API responses may wrap each entry under a `data` key; raw YAML entries are not wrapped.
        entry_data = item.get("data", item)
        dataset = entry_data.get("dataset", {})
        source = entry_data.get("source", {})
        entry = EvalResultEntry(
            dataset_id=dataset["id"],
            task_id=dataset["task_id"],
            value=entry_data["value"],
            dataset_revision=dataset.get("revision"),
            verify_token=entry_data.get("verifyToken"),
            date=entry_data.get("date"),
            source_url=source.get("url") if source else None,
            source_name=source.get("name") if source else None,
            source_user=source.get("user") if source else None,
            source_org=source.get("org") if source else None,
            notes=entry_data.get("notes"),
        )
        entries.append(entry)
    return entries