Spaces:

Prasham1710
/

ci-triage-training

Sleeping

ci-triage-training / src /ci_triage_env /data /publish.py

Prasham.Jain

feat(data): Phase B5 — corpus instantiation, HF publish, annotations

18a3fbf 29 days ago

4.51 kB

	"""HuggingFace dataset publisher for the CI-Triage scenario corpus.

	Imports ``huggingface_hub`` and ``datasets`` lazily so the rest of the package
	remains usable without those dependencies installed.
	"""

	from __future__ import annotations

	import json
	from pathlib import Path


	def generate_dataset_readme(scenarios_dir: Path) -> str:
	"""Auto-generated README for the HF dataset, accurate to the actual counts."""

	def _count(split: str) -> int:
	d = scenarios_dir / split
	return len(list(d.glob("*.json"))) if d.exists() else 0

	train_n = _count("train")
	val_n = _count("val")
	held_out_n = _count("held_out")

	return f"""---
	license: cc-by-4.0
	task_categories:
	- text-classification
	language:
	- en
	tags:
	- ci-triage
	- openenv
	- rl-environment
	- failure-diagnosis
	---

	# CI-Triage Scenarios

	A corpus of CI-failure scenarios for the CI-Triage-Env OpenEnv RL environment.
	Generated from public OSS CI logs (anonymized) and open-license datasets
	(DeFlaker, iDFlakies, FlakeFlagger, LogHub).

	## Splits

	\| Split \| Count \| Notes \|
	\|-------\|-------\|-------\|
	\| train \| {train_n} \| Unambiguous families only \|
	\| val \| {val_n} \| Unambiguous families only \|
	\| held_out \| {held_out_n} \| Includes ALL ambiguous instances (calibration probe) \|

	## Schema

	Each row contains:

	- `scenario_id` (string): unique identifier (`<family>-s<seed>-<hash>`)
	- `family` (string): one of `real_bug`, `race_flake`, `timing_flake`,
	`infra_network`, `infra_resource`, `dependency_drift`, `ambiguous`
	- `scenario_json` (string): full `Scenario` JSON, validates against
	`ci_triage_env.schemas.scenario.Scenario`
	- `difficulty` (string): `easy` / `medium` / `hard`

	## Failure Families

	\| Family \| Description \|
	\|--------\|-------------\|
	\| `real_bug` \| A genuine code defect introduced by a recent commit \|
	\| `race_flake` \| Non-deterministic failure from a data race / goroutine conflict \|
	\| `timing_flake` \| Intermittent timeout under CI scheduler load \|
	\| `infra_network` \| DNS / TLS / connectivity failure on the CI node \|
	\| `infra_resource` \| OOM-kill, disk full, or file-descriptor exhaustion \|
	\| `dependency_drift` \| Breaking change from a dependency version bump \|
	\| `ambiguous` \| Multiple plausible causes — correct response is low confidence \|

	## License

	CC-BY-4.0. Generated from public OSS CI logs (anonymized) and open-license
	datasets (DeFlaker, iDFlakies, FlakeFlagger, LogHub).

	## Citation

	If you use this corpus, please cite the originating datasets and the
	Meta PyTorch OpenEnv hackathon submission (CI-Triage-Env).
	"""


	def publish_to_hf(
	scenarios_dir: Path,
	dataset_name: str,
	token: str \| None = None,
	) -> None:
	"""Upload the generated corpus to the HuggingFace dataset hub.

	Args:
	scenarios_dir: Directory produced by ``CorpusBuilder.build()``,
	containing ``train/``, ``val/``, and ``held_out/`` subdirectories.
	dataset_name: HF repo id, e.g. ``"your-org/ci-triage-scenarios"``.
	token: HF API token. Falls back to ``HF_TOKEN`` env var if ``None``.
	"""
	from datasets import Dataset, DatasetDict
	from huggingface_hub import HfApi, create_repo

	api = HfApi(token=token)
	create_repo(repo_id=dataset_name, repo_type="dataset", exist_ok=True, token=token)

	splits: dict[str, Dataset] = {}
	for split_name in ("train", "val", "held_out"):
	split_dir = scenarios_dir / split_name
	if not split_dir.exists():
	continue
	records = []
	for path in sorted(split_dir.glob("*.json")):
	scenario_dict = json.loads(path.read_text())
	records.append(
	{
	"scenario_id": scenario_dict["scenario_id"],
	"family": scenario_dict["family"],
	"scenario_json": json.dumps(scenario_dict),
	"difficulty": scenario_dict["metadata"]["difficulty"],
	}
	)
	if records:
	splits[split_name] = Dataset.from_list(records)

	dataset_dict = DatasetDict(splits)
	dataset_dict.push_to_hub(dataset_name, token=token)

	readme = generate_dataset_readme(scenarios_dir)
	api.upload_file(
	path_or_fileobj=readme.encode(),
	path_in_repo="README.md",
	repo_id=dataset_name,
	repo_type="dataset",
	token=token,
	)
	print(
	f"Published {sum(len(ds) for ds in splits.values())} scenarios to "
	f"https://huggingface.co/datasets/{dataset_name}"
	)