Spaces:
Sleeping
Sleeping
File size: 9,998 Bytes
7a658b7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | """Tests for Phase B4 — ScenarioFamilyGenerators (one per failure family)."""
from __future__ import annotations
import pytest
from ci_triage_env.data.generators import GENERATOR_REGISTRY, ScenarioFamilyGenerator
from ci_triage_env.schemas.scenario import Scenario
from ci_triage_env.schemas.tools import ALL_TOOLS
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Family names in registry order; used as the parametrize values for the tests.
ALL_FAMILIES = list(GENERATOR_REGISTRY)
# Name of every tool declared in ALL_TOOLS.
ALL_TOOL_NAMES = {tool.name for tool in ALL_TOOLS}
# The exact set of family keys the registry is expected to expose.
EXPECTED_FAMILIES = {
    "real_bug",
    "race_flake",
    "timing_flake",
    "infra_network",
    "infra_resource",
    "dependency_drift",
    "ambiguous",
}
# ---------------------------------------------------------------------------
# Registry sanity
# ---------------------------------------------------------------------------
def test_registry_has_all_seven_families() -> None:
    """The registry keys must equal EXPECTED_FAMILIES exactly (no extras, none missing)."""
    registered_families = set(GENERATOR_REGISTRY)
    assert registered_families == EXPECTED_FAMILIES
def test_registry_values_are_generator_subclasses() -> None:
    """Every class stored in the registry must derive from ScenarioFamilyGenerator."""
    assert all(
        issubclass(generator_cls, ScenarioFamilyGenerator)
        for generator_cls in GENERATOR_REGISTRY.values()
    )
# ---------------------------------------------------------------------------
# Per-family parametrized tests
# ---------------------------------------------------------------------------
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_generator_determinism(family: str) -> None:
    """Same seed → identical Scenario, even across separate generator instances."""
    generator_cls = GENERATOR_REGISTRY[family]
    # Build both scenarios from fresh instances before dumping either one.
    scenarios = [generator_cls().generate(seed=42) for _ in range(2)]
    first_dump, second_dump = (scenario.model_dump() for scenario in scenarios)
    assert first_dump == second_dump
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_generator_different_seeds_produce_different_ids(family: str) -> None:
    """Different seeds → different scenario_ids (checked for seeds 0 through 4)."""
    generator_cls = GENERATOR_REGISTRY[family]
    scenario_ids = [
        generator_cls().generate(seed=seed).scenario_id for seed in range(5)
    ]
    # Collapsing to a set drops duplicates; all five IDs must survive.
    assert len(set(scenario_ids)) == 5
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_generator_validates_schema(family: str) -> None:
    """A dumped scenario must be accepted by Scenario.model_validate."""
    generator = GENERATOR_REGISTRY[family]()
    dumped = generator.generate(seed=7).model_dump()
    # No assert needed: the call itself is the check and fails the test if it raises.
    Scenario.model_validate(dumped)
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_ground_truth_label_matches_family(family: str) -> None:
    """The ground-truth label's value must equal the registry key of its generator."""
    ground_truth = GENERATOR_REGISTRY[family]().generate(seed=99).ground_truth
    assert ground_truth.label.value == family
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_all_tools_have_outputs(family: str) -> None:
    """Each tool in ALL_TOOL_NAMES has at least one matching tool_outputs key.

    A key matches when it equals the tool name or starts with ``"<tool_name>:"``.
    """
    scenario = GENERATOR_REGISTRY[family]().generate(seed=3)
    output_keys = list(scenario.tool_outputs)
    for tool_name in ALL_TOOL_NAMES:
        variant_prefix = tool_name + ":"
        matching_keys = [
            key
            for key in output_keys
            if key == tool_name or key.startswith(variant_prefix)
        ]
        assert matching_keys, (
            f"family={family!r}: no tool_outputs key for tool={tool_name!r}. "
            f"Keys present: {sorted(scenario.tool_outputs)}"
        )
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_informative_tools_nonempty(family: str) -> None:
    """informative_tools must have ≥ 2 entries for every family, ambiguous included."""
    scenario = GENERATOR_REGISTRY[family]().generate(seed=5)
    # The former ambiguous/non-ambiguous branches ran the same assertion, so a
    # single check covers every family.
    assert len(scenario.informative_tools) >= 2, (
        f"family={family!r}: expected at least 2 informative_tools, "
        f"got {scenario.informative_tools}"
    )
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_minimal_evidence_subset_of_informative(family: str) -> None:
    """minimal_evidence_set ⊆ informative_tools, unless minimal_evidence_set is empty."""
    scenario = GENERATOR_REGISTRY[family]().generate(seed=11)
    minimal = scenario.minimal_evidence_set
    if not minimal:
        # An empty minimal set is accepted without further checks.
        return
    informative = scenario.informative_tools
    assert set(minimal).issubset(set(informative)), (
        f"family={family}: minimal_evidence_set not a subset of informative_tools. "
        f"minimal={minimal}, informative={informative}"
    )
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_difficulty_is_valid(family: str) -> None:
    """metadata.difficulty must be one of "easy", "medium" or "hard"."""
    allowed_difficulties = {"easy", "medium", "hard"}
    difficulty = GENERATOR_REGISTRY[family]().generate(seed=21).metadata.difficulty
    assert difficulty in allowed_difficulties
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_scenario_id_contains_family(family: str) -> None:
    """The family name must appear somewhere inside the generated scenario_id."""
    scenario_id = GENERATOR_REGISTRY[family]().generate(seed=13).scenario_id
    assert family in scenario_id
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_schema_version_is_1_0(family: str) -> None:
    """Generated scenarios must report schema_version "1.0"."""
    schema_version = GENERATOR_REGISTRY[family]().generate(seed=77).schema_version
    assert schema_version == "1.0"
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_tool_output_cost_units_nonnegative(family: str) -> None:
    """No entry in tool_outputs may carry a negative cost_units value."""
    tool_outputs = GENERATOR_REGISTRY[family]().generate(seed=55).tool_outputs
    costs_by_key = {key: entry.cost_units for key, entry in tool_outputs.items()}
    for key, cost in costs_by_key.items():
        assert cost >= 0.0, f"negative cost_units on key={key!r}"
@pytest.mark.parametrize("family", ALL_FAMILIES)
def test_failure_summary_populated(family: str) -> None:
    """failure_summary has a test name, a branch, a known suite and a 40-char commit."""
    known_suites = {"unit", "integration", "benchmark"}
    full_sha_length = 40  # length of a full (unabbreviated) SHA
    summary = GENERATOR_REGISTRY[family]().generate(seed=33).failure_summary
    assert summary.test_name
    assert summary.branch
    assert summary.suite in known_suites
    assert len(summary.last_passing_commit) == full_sha_length
# ---------------------------------------------------------------------------
# Ambiguous-specific tests
# ---------------------------------------------------------------------------
def test_ambiguous_confidence_target_below_one() -> None:
    """The ambiguous family's ground-truth confidence_target must be strictly below 1.0."""
    generator = GENERATOR_REGISTRY["ambiguous"]()
    ground_truth = generator.generate(seed=42).ground_truth
    assert ground_truth.confidence_target < 1.0
def test_ambiguous_is_flagged() -> None:
    """The ambiguous family's ground truth must have is_ambiguous set to exactly True."""
    generator = GENERATOR_REGISTRY["ambiguous"]()
    ground_truth = generator.generate(seed=42).ground_truth
    assert ground_truth.is_ambiguous is True
def test_ambiguous_minimal_evidence_is_empty() -> None:
    """The ambiguous family's minimal_evidence_set must compare equal to an empty list."""
    generator = GENERATOR_REGISTRY["ambiguous"]()
    minimal_evidence = generator.generate(seed=42).minimal_evidence_set
    assert minimal_evidence == []
def test_ambiguous_correct_action_has_low_confidence() -> None:
    """The ambiguous family's correct terminal action must carry confidence below 0.6."""
    generator = GENERATOR_REGISTRY["ambiguous"]()
    action_args = generator.generate(seed=42).correct_terminal_action.args
    # A missing "confidence" entry defaults to 1.0, which fails the check below.
    reported_confidence = action_args.get("confidence", 1.0)
    assert reported_confidence < 0.6
# ---------------------------------------------------------------------------
# Non-ambiguous families: is_ambiguous == False
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
    "family", [name for name in ALL_FAMILIES if name != "ambiguous"]
)
def test_non_ambiguous_scenarios_not_flagged(family: str) -> None:
    """Every family other than "ambiguous" must have is_ambiguous set to exactly False."""
    ground_truth = GENERATOR_REGISTRY[family]().generate(seed=42).ground_truth
    assert ground_truth.is_ambiguous is False
@pytest.mark.parametrize(
    "family", [name for name in ALL_FAMILIES if name != "ambiguous"]
)
def test_non_ambiguous_confidence_target_is_1(family: str) -> None:
    """Every family other than "ambiguous" must have confidence_target equal to 1.0."""
    ground_truth = GENERATOR_REGISTRY[family]().generate(seed=42).ground_truth
    assert ground_truth.confidence_target == 1.0
# ---------------------------------------------------------------------------
# Integration tests
# ---------------------------------------------------------------------------
def test_all_generators_produce_distinct_scenario_ids() -> None:
    """Seeds 0-9 for every family yield scenario IDs that never repeat, even across families."""
    issued_ids: set[str] = set()
    for family, generator_cls in GENERATOR_REGISTRY.items():
        # One generator instance per family is reused for all ten seeds.
        generator = generator_cls()
        for seed in range(10):
            scenario_id = generator.generate(seed=seed).scenario_id
            assert scenario_id not in issued_ids, f"Duplicate scenario_id={scenario_id!r} for family={family!r} seed={seed}"
            issued_ids.add(scenario_id)
def test_generators_use_default_archetypes_when_no_clustering_data(tmp_path) -> None:
    """Generators still produce a scenario of their family when archetypes_dir is empty."""
    for family, generator_cls in GENERATOR_REGISTRY.items():
        # tmp_path is a fresh, empty directory → no archetype files to load.
        generator = generator_cls(archetypes_dir=tmp_path)
        generated = generator.generate(seed=42)
        assert generated.family == family
def test_generators_use_archetypes_when_available(tmp_path) -> None:
    """A generator given a directory containing an archetypes.json still yields a valid scenario.

    NOTE(review): this only checks the family and schema validity of the
    result; it does not assert that the custom archetype was actually
    selected — confirm whether a stronger assertion is wanted.
    """
    import json

    from ci_triage_env.data.clustering.archetypes import Archetype

    family = "real_bug"
    archetype = Archetype(
        archetype_id="custom_001",
        family=family,
        pattern_summary="Custom archetype for test",
        log_template="CUSTOM_LOG_LINE_{NUM}",
        slot_distributions={"NUM": ["42", "99"]},
        informative_tools_hint=["read_logs:full"],
        minimal_evidence_hint=["read_logs:full"],
    )

    # Lay out <tmp_path>/<family>/archetypes.json holding the single archetype.
    family_dir = tmp_path / family
    family_dir.mkdir()
    serialized = json.dumps([archetype.model_dump()], indent=2)
    (family_dir / "archetypes.json").write_text(serialized)

    # A brand-new generator instance is pointed at the populated directory.
    generator = GENERATOR_REGISTRY[family](archetypes_dir=tmp_path)
    generated = generator.generate(seed=42)

    assert generated.family == family
    Scenario.model_validate(generated.model_dump())
def test_generator_seed_embedded_in_scenario() -> None:
    """The seed passed to generate() must be stored on the scenario's seed attribute."""
    expected_seed = 1337
    for generator_cls in GENERATOR_REGISTRY.values():
        generated = generator_cls().generate(seed=expected_seed)
        assert generated.seed == expected_seed
def test_read_logs_full_has_content() -> None:
    """Every family exposes a "read_logs:full" output whose dict payload has non-empty "lines"."""
    for family, generator_cls in GENERATOR_REGISTRY.items():
        tool_outputs = generator_cls().generate(seed=42).tool_outputs
        log_output = tool_outputs.get("read_logs:full")
        assert log_output is not None, f"{family}: missing read_logs:full"
        log_payload = log_output.payload
        assert isinstance(log_payload, dict)
        # A missing "lines" key is treated the same as an empty list.
        log_lines = log_payload.get("lines", [])
        assert len(log_lines) > 0, f"{family}: read_logs:full has empty lines"
|