"""HuggingFace dataset publisher for the CI-Triage scenario corpus.

Imports ``huggingface_hub`` and ``datasets`` lazily so the rest of the package
remains usable without those dependencies installed.
"""

from __future__ import annotations

import json
from pathlib import Path

# Split sub-directories of the corpus, in the order they are published.
_SPLITS = ("train", "val", "held_out")


def _scenario_files(split_dir: Path) -> list[Path]:
    """Return the ``*.json`` files directly inside *split_dir*, sorted by path.

    A missing split directory yields an empty list, so callers can treat
    "absent" and "empty" splits the same way.
    """
    if not split_dir.is_dir():
        return []
    return sorted(split_dir.glob("*.json"))


def _load_split_records(split_dir: Path) -> list[dict[str, str]]:
    """Read every scenario file in *split_dir* into a flat dataset row.

    Raises:
        KeyError: If a scenario lacks ``scenario_id``, ``family`` or
            ``metadata.difficulty``; the message names the offending file.
        json.JSONDecodeError: If a file is not valid JSON.
    """
    records: list[dict[str, str]] = []
    for path in _scenario_files(split_dir):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        scenario = json.loads(path.read_text(encoding="utf-8"))
        try:
            records.append(
                {
                    "scenario_id": scenario["scenario_id"],
                    "family": scenario["family"],
                    "scenario_json": json.dumps(scenario),
                    "difficulty": scenario["metadata"]["difficulty"],
                }
            )
        except KeyError as err:
            raise KeyError(f"{path}: missing required scenario field {err}") from err
    return records


def generate_dataset_readme(scenarios_dir: Path) -> str:
    """Build the dataset-card README, with split counts taken from disk.

    Args:
        scenarios_dir: Corpus root containing the ``train/``, ``val/`` and
            ``held_out/`` sub-directories. Missing sub-directories count as 0.

    Returns:
        The README as markdown text, starting with the YAML front matter.
    """
    train_n, val_n, held_out_n = (
        len(_scenario_files(scenarios_dir / split)) for split in _SPLITS
    )

    return f"""---
license: cc-by-4.0
task_categories:
- text-classification
language:
- en
tags:
- ci-triage
- openenv
- rl-environment
- failure-diagnosis
---

# CI-Triage Scenarios

A corpus of CI-failure scenarios for the **CI-Triage-Env** OpenEnv RL environment.
Generated from public OSS CI logs (anonymized) and open-license datasets
(DeFlaker, iDFlakies, FlakeFlagger, LogHub).

## Splits

| Split | Count | Notes |
|-------|-------|-------|
| train | {train_n} | Unambiguous families only |
| val | {val_n} | Unambiguous families only |
| held_out | {held_out_n} | Includes ALL ambiguous instances (calibration probe) |

## Schema

Each row contains:

- `scenario_id` (string): unique identifier (`-s-`)
- `family` (string): one of `real_bug`, `race_flake`, `timing_flake`, `infra_network`, `infra_resource`, `dependency_drift`, `ambiguous`
- `scenario_json` (string): full `Scenario` JSON, validates against `ci_triage_env.schemas.scenario.Scenario`
- `difficulty` (string): `easy` / `medium` / `hard`

## Failure Families

| Family | Description |
|--------|-------------|
| `real_bug` | A genuine code defect introduced by a recent commit |
| `race_flake` | Non-deterministic failure from a data race / goroutine conflict |
| `timing_flake` | Intermittent timeout under CI scheduler load |
| `infra_network` | DNS / TLS / connectivity failure on the CI node |
| `infra_resource` | OOM-kill, disk full, or file-descriptor exhaustion |
| `dependency_drift` | Breaking change from a dependency version bump |
| `ambiguous` | Multiple plausible causes — correct response is low confidence |

## License

CC-BY-4.0. Generated from public OSS CI logs (anonymized) and open-license
datasets (DeFlaker, iDFlakies, FlakeFlagger, LogHub).

## Citation

If you use this corpus, please cite the originating datasets and the
Meta PyTorch OpenEnv hackathon submission (CI-Triage-Env).
"""


def publish_to_hf(
    scenarios_dir: Path,
    dataset_name: str,
    token: str | None = None,
) -> None:
    """Upload the generated corpus to the HuggingFace dataset hub.

    All scenario files are read and validated locally *before* any Hub call,
    so a broken or empty corpus never leaves a half-created repository behind.

    Args:
        scenarios_dir: Directory produced by ``CorpusBuilder.build()``,
            containing ``train/``, ``val/``, and ``held_out/`` subdirectories.
        dataset_name: HF repo id, e.g. ``"your-org/ci-triage-scenarios"``.
        token: HF API token. Falls back to ``HF_TOKEN`` env var if ``None``.

    Raises:
        ValueError: If no split directory contains any scenario file.
        KeyError: If a scenario file lacks a required field.
    """
    # Lazy imports: these heavy optional dependencies are only needed here.
    from datasets import Dataset, DatasetDict
    from huggingface_hub import HfApi, create_repo

    # Missing or empty splits are skipped rather than published as empty.
    records_by_split: dict[str, list[dict[str, str]]] = {}
    for split_name in _SPLITS:
        records = _load_split_records(scenarios_dir / split_name)
        if records:
            records_by_split[split_name] = records

    if not records_by_split:
        raise ValueError(
            f"No scenario JSON files found under {scenarios_dir} "
            f"(looked in: {', '.join(_SPLITS)})"
        )

    api = HfApi(token=token)
    create_repo(repo_id=dataset_name, repo_type="dataset", exist_ok=True, token=token)

    splits: dict[str, Dataset] = {
        split_name: Dataset.from_list(records)
        for split_name, records in records_by_split.items()
    }
    DatasetDict(splits).push_to_hub(dataset_name, token=token)

    # NOTE(review): ``push_to_hub`` may write its own README.md with dataset
    # metadata; this upload replaces it. Confirm the Hub still resolves the
    # splits from the uploaded data files.
    readme = generate_dataset_readme(scenarios_dir)
    api.upload_file(
        path_or_fileobj=readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
        token=token,
    )

    total = sum(len(records) for records in records_by_split.values())
    print(
        f"Published {total} scenarios to "
        f"https://huggingface.co/datasets/{dataset_name}"
    )