import json from pathlib import Path from typing import Any REQUIRED_FIELDS = {"x", "y", "reasoning_type"} def load_dataset(path: str | Path) -> list[dict[str, Any]]: """Load and validate the small CARB-style seed dataset.""" dataset_path = Path(path) with dataset_path.open("r", encoding="utf-8") as f: rows = json.load(f) if not isinstance(rows, list): raise ValueError("Dataset must be a JSON list.") for index, row in enumerate(rows): missing = REQUIRED_FIELDS.difference(row) if missing: raise ValueError(f"Row {index} is missing required fields: {sorted(missing)}") if row["y"] not in (0, 1): raise ValueError(f"Row {index} has non-binary label: {row['y']!r}") if not isinstance(row["x"], str) or not row["x"].strip(): raise ValueError(f"Row {index} has an empty input string.") if not isinstance(row["reasoning_type"], str) or not row["reasoning_type"].strip(): raise ValueError(f"Row {index} has an empty reasoning_type.") return rows