"""Cache layout for B1 dataset loaders.
One subdir per dataset under ``CI_TRIAGE_DATA_CACHE`` (default
``data_artifacts/datasets_cache``); each ``FailureRecord`` is written as
``<record_id>.json``. Re-loading is content-addressable: same ``record_id``
overwrites the same file, so re-running ``cli load`` is idempotent.
The cache dir is gitignored (see ``.gitignore`` for ``data_artifacts/*``);
only the *generated scenarios* (Phase B5) are committed / published.
"""
from __future__ import annotations

import logging
import os
from collections.abc import Iterator
from pathlib import Path
DEFAULT_CACHE_ROOT = Path("data_artifacts/datasets_cache")
def cache_root() -> Path:
    """Return the cache root, honoring the ``CI_TRIAGE_DATA_CACHE`` override.

    Falls back to ``DEFAULT_CACHE_ROOT`` when the environment variable is
    unset.
    """
    return Path(os.environ.get("CI_TRIAGE_DATA_CACHE") or str(DEFAULT_CACHE_ROOT))
def cache_dir_for(dataset_name: str) -> Path:
    """Return the per-dataset cache sub-directory (one subdir per dataset)."""
    root = cache_root()
    return root / dataset_name
def is_cached(record_id: str, dataset_name: str) -> bool:
    """True when ``<record_id>.json`` already exists in the dataset's cache dir."""
    record_path = cache_dir_for(dataset_name) / f"{record_id}.json"
    return record_path.exists()
def load_cached(dataset_name: str) -> Iterator:
    """Yield cached ``FailureRecord``s for a dataset, if any.

    Yields nothing when the dataset's cache sub-directory does not exist.
    Files are visited in sorted (deterministic) filename order.
    """
    # Imported lazily: the datasets package imports this module at load time.
    from ci_triage_env.data.datasets._base import FailureRecord  # circular guard

    dataset_dir = cache_dir_for(dataset_name)
    if not dataset_dir.exists():
        return
    for json_file in sorted(dataset_dir.glob("*.json")):
        yield FailureRecord.model_validate_json(json_file.read_text())
def load_all_cached() -> list:
    """Return all cached ``FailureRecord``s across every dataset sub-directory.

    Walks ``cache_root()`` recursively in sorted path order so results are
    deterministic. Files that fail to read or validate are skipped — a single
    corrupt cache entry must not block loading the rest — but each skip is
    now logged at WARNING level instead of being silently swallowed.
    """
    # Imported lazily: the datasets package imports this module at load time.
    from ci_triage_env.data.datasets._base import FailureRecord  # circular guard

    logger = logging.getLogger(__name__)
    root = cache_root()
    records: list = []
    if not root.exists():
        return records
    for json_path in sorted(root.rglob("*.json")):
        try:
            records.append(FailureRecord.model_validate_json(json_path.read_text()))
        except Exception:
            # Best-effort skip, but leave a trace for debugging bad caches.
            logger.warning("Skipping unreadable/invalid cache file: %s", json_path)
    return records
|