Prasham.Jain
feat(branch-b): B1 public dataset ingest — DeFlaker, iDFlakies, FlakeFlagger, LogHub loaders + CLI
54627d8
"""DeFlaker (Bell et al., ICSE 2018) loader.
Source paper: `DeFlaker: Automatically Detecting Flaky Tests
<https://www.jonbell.net/icse18-deflaker.pdf>`_.
DeFlaker labels test-failure events on 26 OSS Java projects as either ``flaky``
(test fails on a commit that didn't change code reachable from the test) or
``real`` (the test fails because the code under test changed). The published
artifact ships as a CSV-ish dump per project; the canonical mirror has rotted
in the past, so we read whatever local file the user points us at.
Expected on-disk format under ``data_path/`` is **one CSV** with columns
``project, test, label, commit_sha, log`` (``label`` ∈ {``flaky``, ``real``}).
This matches the reduced shape the published artifact uses after extracting
its per-project ``failures.csv`` files; concatenate them if you start from the
raw release. Document any deviation in the loader's caller.
"""
from __future__ import annotations
import csv
from collections.abc import Iterator
from ci_triage_env.data.datasets._base import DatasetLoader, FailureRecord
_REQUIRED_COLUMNS = {"project", "test", "label", "commit_sha"}
class DeFlakerLoader(DatasetLoader):
    """Load DeFlaker failure rows from a single local CSV file.

    The CSV must have the header columns listed in ``_REQUIRED_COLUMNS``
    (``project``, ``test``, ``label``, ``commit_sha``); a ``log`` column is
    optional. Each usable row is turned into one ``FailureRecord``.
    """

    name = "deflaker"
    env_var = "DEFLAKER_DATA_PATH"
    download_instructions = (
        "Visit https://github.com/jonbell-/deflaker (or the ICSE 2018 paper's "
        "supplementary materials) and download the per-project failures.csv "
        "files. Concatenate into one CSV with columns "
        "(project, test, label, commit_sha, log) and point "
        "$DEFLAKER_DATA_PATH at that file."
    )

    def fetch(self) -> Iterator[FailureRecord]:
        """Yield one ``FailureRecord`` per usable row of the DeFlaker CSV.

        The file location comes from ``self._require_data_path()``
        (presumably resolved from ``env_var`` by the base class -- confirm
        against ``DatasetLoader``).

        Rows with an empty ``project``, ``test`` or ``commit_sha`` are
        skipped silently. ``label`` is stripped and lower-cased; an empty
        label becomes ``None``. A missing or empty ``log`` becomes ``""``.

        Raises:
            ValueError: if the CSV header lacks any required column. Because
                this method is a generator, the error surfaces when the
                first record is requested, not when ``fetch()`` is called.
        """
        path = self._require_data_path()
        # "utf-8-sig" drops a leading byte-order mark if one is present and
        # otherwise decodes exactly like "utf-8". Without it, a BOM-prefixed
        # export makes the first header "\ufeffproject" and the column check
        # below would wrongly report "project" as missing.
        with open(path, newline="", encoding="utf-8-sig") as fh:
            reader = csv.DictReader(fh)
            # fieldnames is None for an empty file; treat that as "no columns".
            missing = _REQUIRED_COLUMNS - set(reader.fieldnames or [])
            if missing:
                raise ValueError(
                    f"deflaker CSV at {path} is missing required columns: {sorted(missing)}"
                )
            for row in reader:
                # Short rows give None for absent cells, hence the `or ""`.
                project = (row.get("project") or "").strip()
                test = (row.get("test") or "").strip()
                commit_sha = (row.get("commit_sha") or "").strip()
                label = (row.get("label") or "").strip().lower() or None
                # A record without its identifying fields cannot be keyed.
                if not project or not test or not commit_sha:
                    continue
                yield FailureRecord(
                    # The id uses only the first 12 characters of the SHA.
                    record_id=f"deflaker-{commit_sha[:12]}-{test}",
                    source_dataset="deflaker",
                    project=project,
                    test_name=test,
                    failure_type_label=label,
                    log_text=row.get("log", "") or "",
                    # The full, untruncated SHA is kept in metadata.
                    metadata={"commit_sha": commit_sha},
                )