# Prasham.Jain
# feat(data): Phase B5 — corpus instantiation, HF publish, annotations
# 18a3fbf
"""HuggingFace dataset publisher for the CI-Triage scenario corpus.
Imports ``huggingface_hub`` and ``datasets`` lazily so the rest of the package
remains usable without those dependencies installed.
"""
from __future__ import annotations
import json
from pathlib import Path
def generate_dataset_readme(scenarios_dir: Path) -> str:
    """Render the dataset-card (README.md) text for the HF dataset repo.

    The numbers in the "Splits" table are the count of ``*.json`` files found
    directly inside ``scenarios_dir/<split>`` for each of ``train``, ``val``
    and ``held_out``. A split whose directory does not exist is reported as 0.

    Args:
        scenarios_dir: Root directory holding the per-split subdirectories.

    Returns:
        The full README contents, YAML front matter included.
    """
    totals: dict[str, int] = {}
    for split in ("train", "val", "held_out"):
        split_path = scenarios_dir / split
        if split_path.exists():
            totals[split] = sum(1 for _ in split_path.glob("*.json"))
        else:
            # Missing split directory: show an explicit zero in the table.
            totals[split] = 0

    return f"""---
license: cc-by-4.0
task_categories:
- text-classification
language:
- en
tags:
- ci-triage
- openenv
- rl-environment
- failure-diagnosis
---
# CI-Triage Scenarios
A corpus of CI-failure scenarios for the **CI-Triage-Env** OpenEnv RL environment.
Generated from public OSS CI logs (anonymized) and open-license datasets
(DeFlaker, iDFlakies, FlakeFlagger, LogHub).
## Splits
| Split | Count | Notes |
|-------|-------|-------|
| train | {totals['train']} | Unambiguous families only |
| val | {totals['val']} | Unambiguous families only |
| held_out | {totals['held_out']} | Includes ALL ambiguous instances (calibration probe) |
## Schema
Each row contains:
- `scenario_id` (string): unique identifier (`<family>-s<seed>-<hash>`)
- `family` (string): one of `real_bug`, `race_flake`, `timing_flake`,
`infra_network`, `infra_resource`, `dependency_drift`, `ambiguous`
- `scenario_json` (string): full `Scenario` JSON, validates against
`ci_triage_env.schemas.scenario.Scenario`
- `difficulty` (string): `easy` / `medium` / `hard`
## Failure Families
| Family | Description |
|--------|-------------|
| `real_bug` | A genuine code defect introduced by a recent commit |
| `race_flake` | Non-deterministic failure from a data race / goroutine conflict |
| `timing_flake` | Intermittent timeout under CI scheduler load |
| `infra_network` | DNS / TLS / connectivity failure on the CI node |
| `infra_resource` | OOM-kill, disk full, or file-descriptor exhaustion |
| `dependency_drift` | Breaking change from a dependency version bump |
| `ambiguous` | Multiple plausible causes — correct response is low confidence |
## License
CC-BY-4.0. Generated from public OSS CI logs (anonymized) and open-license
datasets (DeFlaker, iDFlakies, FlakeFlagger, LogHub).
## Citation
If you use this corpus, please cite the originating datasets and the
Meta PyTorch OpenEnv hackathon submission (CI-Triage-Env).
"""
def _load_split_records(split_dir: Path) -> list[dict]:
    """Read every ``*.json`` scenario in *split_dir* into a flat dataset row."""
    records = []
    # Sorted so the row order is deterministic across filesystems.
    for path in sorted(split_dir.glob("*.json")):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which is not UTF-8 everywhere.
        scenario_dict = json.loads(path.read_text(encoding="utf-8"))
        records.append(
            {
                "scenario_id": scenario_dict["scenario_id"],
                "family": scenario_dict["family"],
                "scenario_json": json.dumps(scenario_dict),
                "difficulty": scenario_dict["metadata"]["difficulty"],
            }
        )
    return records


def publish_to_hf(
    scenarios_dir: Path,
    dataset_name: str,
    token: str | None = None,
) -> None:
    """Upload the generated corpus to the HuggingFace dataset hub.

    All scenario files are read and converted locally *before* anything is
    created on the hub, so a missing, empty, or malformed corpus fails
    without leaving a half-initialised remote repository behind.

    Args:
        scenarios_dir: Directory produced by ``CorpusBuilder.build()``,
            containing ``train/``, ``val/``, and ``held_out/`` subdirectories.
            Splits whose directory is missing or holds no ``*.json`` files
            are skipped.
        dataset_name: HF repo id, e.g. ``"your-org/ci-triage-scenarios"``.
        token: HF API token. Falls back to ``HF_TOKEN`` env var if ``None``.

    Raises:
        ValueError: If no scenario files are found in any split.
        KeyError: If a scenario file lacks ``scenario_id``, ``family`` or
            ``metadata.difficulty``.
        json.JSONDecodeError: If a scenario file is not valid JSON.
    """
    split_records: dict[str, list[dict]] = {}
    for split_name in ("train", "val", "held_out"):
        split_dir = scenarios_dir / split_name
        if not split_dir.exists():
            continue
        records = _load_split_records(split_dir)
        if records:
            split_records[split_name] = records

    if not split_records:
        # Nothing to publish: stop before creating a repo or pushing an
        # empty DatasetDict.
        raise ValueError(
            f"No scenario JSON files found under {scenarios_dir} "
            "(expected train/, val/ and/or held_out/ subdirectories)"
        )

    # Imported lazily (and only once there is something to publish) so the
    # rest of the package works without these optional dependencies.
    from datasets import Dataset, DatasetDict
    from huggingface_hub import HfApi, create_repo

    api = HfApi(token=token)
    create_repo(repo_id=dataset_name, repo_type="dataset", exist_ok=True, token=token)

    splits: dict[str, Dataset] = {
        split_name: Dataset.from_list(records)
        for split_name, records in split_records.items()
    }
    dataset_dict = DatasetDict(splits)
    dataset_dict.push_to_hub(dataset_name, token=token)

    # NOTE(review): this upload replaces any README.md already in the repo,
    # including one that push_to_hub may have written — confirm that
    # overwriting its auto-generated card metadata is intended.
    readme = generate_dataset_readme(scenarios_dir)
    api.upload_file(
        path_or_fileobj=readme.encode(),
        path_in_repo="README.md",
        repo_id=dataset_name,
        repo_type="dataset",
        token=token,
    )
    print(
        f"Published {sum(len(ds) for ds in splits.values())} scenarios to "
        f"https://huggingface.co/datasets/{dataset_name}"
    )