Spaces:
Sleeping
Sleeping
| """HuggingFace dataset publisher for the CI-Triage scenario corpus. | |
| Imports ``huggingface_hub`` and ``datasets`` lazily so the rest of the package | |
| remains usable without those dependencies installed. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| def generate_dataset_readme(scenarios_dir: Path) -> str: | |
| """Auto-generated README for the HF dataset, accurate to the actual counts.""" | |
| def _count(split: str) -> int: | |
| d = scenarios_dir / split | |
| return len(list(d.glob("*.json"))) if d.exists() else 0 | |
| train_n = _count("train") | |
| val_n = _count("val") | |
| held_out_n = _count("held_out") | |
| return f"""--- | |
| license: cc-by-4.0 | |
| task_categories: | |
| - text-classification | |
| language: | |
| - en | |
| tags: | |
| - ci-triage | |
| - openenv | |
| - rl-environment | |
| - failure-diagnosis | |
| --- | |
| # CI-Triage Scenarios | |
| A corpus of CI-failure scenarios for the **CI-Triage-Env** OpenEnv RL environment. | |
| Generated from public OSS CI logs (anonymized) and open-license datasets | |
| (DeFlaker, iDFlakies, FlakeFlagger, LogHub). | |
| ## Splits | |
| | Split | Count | Notes | | |
| |-------|-------|-------| | |
| | train | {train_n} | Unambiguous families only | | |
| | val | {val_n} | Unambiguous families only | | |
| | held_out | {held_out_n} | Includes ALL ambiguous instances (calibration probe) | | |
| ## Schema | |
| Each row contains: | |
| - `scenario_id` (string): unique identifier (`<family>-s<seed>-<hash>`) | |
| - `family` (string): one of `real_bug`, `race_flake`, `timing_flake`, | |
| `infra_network`, `infra_resource`, `dependency_drift`, `ambiguous` | |
| - `scenario_json` (string): full `Scenario` JSON, validates against | |
| `ci_triage_env.schemas.scenario.Scenario` | |
| - `difficulty` (string): `easy` / `medium` / `hard` | |
| ## Failure Families | |
| | Family | Description | | |
| |--------|-------------| | |
| | `real_bug` | A genuine code defect introduced by a recent commit | | |
| | `race_flake` | Non-deterministic failure from a data race / goroutine conflict | | |
| | `timing_flake` | Intermittent timeout under CI scheduler load | | |
| | `infra_network` | DNS / TLS / connectivity failure on the CI node | | |
| | `infra_resource` | OOM-kill, disk full, or file-descriptor exhaustion | | |
| | `dependency_drift` | Breaking change from a dependency version bump | | |
| | `ambiguous` | Multiple plausible causes — correct response is low confidence | | |
| ## License | |
| CC-BY-4.0. Generated from public OSS CI logs (anonymized) and open-license | |
| datasets (DeFlaker, iDFlakies, FlakeFlagger, LogHub). | |
| ## Citation | |
| If you use this corpus, please cite the originating datasets and the | |
| Meta PyTorch OpenEnv hackathon submission (CI-Triage-Env). | |
| """ | |
| def publish_to_hf( | |
| scenarios_dir: Path, | |
| dataset_name: str, | |
| token: str | None = None, | |
| ) -> None: | |
| """Upload the generated corpus to the HuggingFace dataset hub. | |
| Args: | |
| scenarios_dir: Directory produced by ``CorpusBuilder.build()``, | |
| containing ``train/``, ``val/``, and ``held_out/`` subdirectories. | |
| dataset_name: HF repo id, e.g. ``"your-org/ci-triage-scenarios"``. | |
| token: HF API token. Falls back to ``HF_TOKEN`` env var if ``None``. | |
| """ | |
| from datasets import Dataset, DatasetDict | |
| from huggingface_hub import HfApi, create_repo | |
| api = HfApi(token=token) | |
| create_repo(repo_id=dataset_name, repo_type="dataset", exist_ok=True, token=token) | |
| splits: dict[str, Dataset] = {} | |
| for split_name in ("train", "val", "held_out"): | |
| split_dir = scenarios_dir / split_name | |
| if not split_dir.exists(): | |
| continue | |
| records = [] | |
| for path in sorted(split_dir.glob("*.json")): | |
| scenario_dict = json.loads(path.read_text()) | |
| records.append( | |
| { | |
| "scenario_id": scenario_dict["scenario_id"], | |
| "family": scenario_dict["family"], | |
| "scenario_json": json.dumps(scenario_dict), | |
| "difficulty": scenario_dict["metadata"]["difficulty"], | |
| } | |
| ) | |
| if records: | |
| splits[split_name] = Dataset.from_list(records) | |
| dataset_dict = DatasetDict(splits) | |
| dataset_dict.push_to_hub(dataset_name, token=token) | |
| readme = generate_dataset_readme(scenarios_dir) | |
| api.upload_file( | |
| path_or_fileobj=readme.encode(), | |
| path_in_repo="README.md", | |
| repo_id=dataset_name, | |
| repo_type="dataset", | |
| token=token, | |
| ) | |
| print( | |
| f"Published {sum(len(ds) for ds in splits.values())} scenarios to " | |
| f"https://huggingface.co/datasets/{dataset_name}" | |
| ) | |