Spaces:
Sleeping
Sleeping
| """RealBugGenerator — scenario family: real_bug.""" | |
| from __future__ import annotations | |
| import random | |
| from ci_triage_env.data.clustering.archetypes import Archetype | |
| from ci_triage_env.data.generators._helpers import ( | |
| ArchetypedGenerator, | |
| build_base_outputs, | |
| fake_short_sha, | |
| fake_timestamp, | |
| fill_template, | |
| make_failure_summary, | |
| pick_test_name, | |
| scenario_id_for, | |
| ) | |
| from ci_triage_env.schemas.diagnosis import DiagnosisLabel | |
| from ci_triage_env.schemas.scenario import ( | |
| GroundTruth, | |
| Scenario, | |
| ScenarioMetadata, | |
| TerminalActionSpec, | |
| ToolOutput, | |
| ) | |
| _DEFAULT_LOG_TEMPLATE = ( | |
| "FAILED {TEST_MODULE}::{TEST_FUNC} - AssertionError\n" | |
| " assert result == expected\n" | |
| " where result = {ACTUAL}\n" | |
| " and expected = {EXPECTED}\n" | |
| "E AssertionError: assertion failed at line {LINENO}\n" | |
| "short test summary info\n" | |
| "FAILED {TEST_MODULE}::{TEST_FUNC}" | |
| ) | |
| _DEFAULT_BUGGY_CODE = ( | |
| "def {TEST_FUNC}(self):\n" | |
| " result = self.service.compute({INPUT})\n" | |
| " assert result == {EXPECTED} # broke after {COMMIT_MSG}\n" | |
| ) | |
| class RealBugGenerator(ArchetypedGenerator): | |
| family_name = "real_bug" | |
| label = DiagnosisLabel.REAL_BUG | |
def informative_tools(self) -> list[str]:
    """Return the tool names reported as informative for this family.

    A fresh list is built on every call; ``generate`` stores it in the
    scenario's ``informative_tools`` field.
    """
    tool_names = [
        "read_logs",
        "inspect_test_code",
        "recent_commits",
        "rerun_test",
    ]
    return tool_names
def minimal_evidence_set(self) -> list[str]:
    """Return the tool names that make up the minimal evidence set.

    A fresh list is built on every call; ``generate`` stores it in the
    scenario's ``minimal_evidence_set`` field.
    """
    required_tools = [
        "recent_commits",
        "inspect_test_code",
    ]
    return required_tools
def _default_archetypes(self) -> list[Archetype]:
    """Return the two built-in ``real_bug`` archetypes.

    The first models an ``AssertionError`` following a changed return value,
    the second an ``AttributeError`` on a ``None`` object.  Each archetype
    carries its own log template and the candidate values for every
    placeholder in that template.
    """
    # Archetype 1: assertion failure, rendered from the module-level template.
    assertion_failure = Archetype(
        archetype_id="real_bug_001",
        family="real_bug",
        pattern_summary="AssertionError after recent commit changed return value",
        log_template=_DEFAULT_LOG_TEMPLATE,
        slot_distributions={
            "TEST_MODULE": ["tests/unit/test_core", "tests/unit/test_api"],
            "TEST_FUNC": ["test_compute", "test_process", "test_validate"],
            "ACTUAL": ["None", "0", "-1", "[]"],
            "EXPECTED": ["42", "True", "{'ok': True}"],
            "LINENO": ["42", "87", "115", "203"],
        },
        informative_tools_hint=["read_logs:full", "inspect_test_code", "recent_commits"],
        minimal_evidence_hint=["recent_commits", "inspect_test_code"],
    )

    # Archetype 2: attribute access on None, with an inline traceback template.
    none_attribute_template = (
        "AttributeError: 'NoneType' object has no attribute '{ATTR}'\n"
        ' File "{TEST_MODULE}.py", line {LINENO}, in {TEST_FUNC}\n'
        " return obj.{ATTR}\n"
        "FAILED {TEST_MODULE}::{TEST_FUNC}"
    )
    none_attribute_error = Archetype(
        archetype_id="real_bug_002",
        family="real_bug",
        pattern_summary="AttributeError / NullPointerException in core logic",
        log_template=none_attribute_template,
        slot_distributions={
            "ATTR": ["name", "id", "value", "data", "result"],
            "TEST_MODULE": ["tests/unit/test_models", "tests/unit/test_service"],
            "TEST_FUNC": ["test_create", "test_update", "test_fetch"],
            "LINENO": ["33", "67", "91", "144"],
        },
        informative_tools_hint=["read_logs:full", "inspect_test_code", "recent_commits"],
        minimal_evidence_hint=["inspect_test_code"],
    )

    return [assertion_failure, none_attribute_error]
def generate(self, seed: int, source_log_hash: str | None = None) -> Scenario:
    """Build one ``real_bug`` scenario from *seed*.

    Every random choice is drawn from a single ``random.Random(seed)``, so
    the order of the draws below determines the output; do not reorder them.

    Args:
        seed: Seed for the scenario's private RNG; also passed to
            ``scenario_id_for`` and stored on the scenario.
        source_log_hash: Optional value stored verbatim in the metadata.

    Returns:
        A ``Scenario`` labelled ``self.label`` whose tool outputs contain a
        breaking commit, the failing test source, a failing rerun and a
        flake history without failures.
    """
    rng = random.Random(seed)
    archetype = self._pick_archetype(rng)
    log_text = fill_template(archetype.log_template, archetype.slot_distributions, rng)
    test_name = pick_test_name(rng)
    summary = make_failure_summary(
        self.family_name, rng, test_name=test_name, log_excerpt=log_text
    )
    branch = summary.branch
    outputs = build_base_outputs(
        test_name, branch, rng,
        log_lines=log_text.splitlines(),
        rerun_passes=False,
    )

    # --- pieces of the test id used to fabricate the breaking commit ---
    # None of this touches the RNG, so it cannot shift the draw sequence.
    func_name = test_name.split("::")[-1]
    subject = func_name.replace("test_", "")
    func_words = func_name.split("_")
    # A function name without "_" used to raise IndexError here; fall back
    # to the whole name instead.
    api_name = func_words[1] if len(func_words) > 1 else func_name
    path_segments = test_name.split("/")
    # A test id without "/" used to raise IndexError here; fall back to the
    # only segment.
    # NOTE(review): for nested ids such as "tests/unit/test_x.py::test_y"
    # segment [1] is a directory name -- confirm against pick_test_name.
    source_stem = path_segments[1] if len(path_segments) > 1 else path_segments[0]
    source_stem = source_stem.replace("test_", "")

    # --- informative overrides ---
    breaking_author = rng.choice(["@alice", "@bob", "@carol"])
    breaking_sha = fake_short_sha(rng)
    breaking_commit = {
        "sha": breaking_sha,
        "author": breaking_author,
        "msg": rng.choice([
            f"fix: update {subject} logic",
            "refactor: change return contract of compute()",
            f"feat: extend {api_name} API",
        ]),
        "files": [
            f"src/{source_stem}.py",
            test_name.rsplit("::", 1)[0],
        ],
    }
    # Unrelated second commit listed after the breaking one.
    unrelated_commit = {
        "sha": fake_short_sha(rng),
        "author": rng.choice(["@dave", "@eve"]),
        "msg": "chore: update lockfile",
        "files": ["pyproject.toml"],
    }
    outputs[f"recent_commits:{branch}"] = ToolOutput(
        tool_name="recent_commits",
        payload={"commits": [breaking_commit, unrelated_commit]},
        cost_units=0.002,
    )

    buggy_code = self._pick_buggy_code(rng)
    outputs[f"inspect_test_code:{test_name}"] = ToolOutput(
        tool_name="inspect_test_code",
        payload={"source": buggy_code, "fixtures": []},
        cost_units=0.002,
    )

    # Rerun also fails: it's a real bug, not a flake.
    # NOTE(review): unlike the other overrides this key has no ":<arg>"
    # suffix -- confirm it matches the key build_base_outputs uses.
    outputs["rerun_test"] = ToolOutput(
        tool_name="rerun_test",
        payload={"results": [{
            "passed": False,
            "duration_s": round(rng.uniform(5, 30), 2),
            "log_excerpt": log_text.splitlines()[:3],
        }]},
        cost_units=0.01,
    )

    # Flake history is clean (test was stable before the bad commit).
    outputs[f"query_flake_history:{test_name}"] = ToolOutput(
        tool_name="query_flake_history",
        payload={"failure_count": 0, "pass_count": 50, "recent_failures": []},
        cost_units=0.002,
    )

    difficulty = rng.choice(["easy", "medium", "hard"])
    rationale = (
        f"The commit {breaking_sha} by {breaking_author} changed the return contract "
        f"of the production code exercised by {test_name}. "
        "inspect_test_code shows the assertion that now fails; "
        f"recent_commits:{branch} shows the introducing commit. "
        "query_flake_history shows no prior failures — not a flake. "
        "rerun_test fails again — confirms deterministic breakage."
    )

    return Scenario(
        schema_version="1.0",
        scenario_id=scenario_id_for(self.family_name, seed),
        family=self.family_name,
        seed=seed,
        ground_truth=GroundTruth(
            label=self.label,
            rationale=rationale,
            is_ambiguous=False,
            confidence_target=1.0,
        ),
        failure_summary=summary,
        tool_outputs=outputs,
        informative_tools=self.informative_tools(),
        minimal_evidence_set=self.minimal_evidence_set(),
        correct_terminal_action=TerminalActionSpec(
            primary="submit_diagnosis",
            args={
                "diagnosis": self.label.value,
                "confidence": 1.0,
                "secondary_actions": [{"name": "file_bug", "owner": breaking_author}],
            },
            acceptable_alternatives=[],
        ),
        metadata=ScenarioMetadata(
            generator_version="1.0",
            # Last RNG draw of the method; keep it inside this call.
            generated_at=fake_timestamp(rng),
            source_log_hash=source_log_hash,
            difficulty=difficulty,
        ),
    )