Spaces:

Prasham1710
/

ci-triage-training

Sleeping

File size: 10,962 Bytes

7a658b7

"""AmbiguousGenerator — scenario family: ambiguous.

These scenarios deliberately blend signals from multiple families so that no
single tool reading uniquely determines the label.  The correct response is
``submit_diagnosis(ambiguous, confidence ≈ 0.4)``, NOT a high-confidence single
label.  Branch C's Brier-score reward penalises overconfidence here.
"""

from __future__ import annotations

import random

from ci_triage_env.data.clustering.archetypes import Archetype
from ci_triage_env.data.generators._helpers import (
    ArchetypedGenerator,
    build_base_outputs,
    fake_short_sha,
    fake_timestamp,
    fill_template,
    make_failure_summary,
    pick_test_name,
    scenario_id_for,
)
from ci_triage_env.schemas.diagnosis import DiagnosisLabel
from ci_triage_env.schemas.scenario import (
    GroundTruth,
    Scenario,
    ScenarioMetadata,
    TerminalActionSpec,
    ToolOutput,
)

# Confidence that a calibrated agent should express for ambiguous scenarios
_AMBIGUOUS_CONFIDENCE_TARGET = 0.4


class AmbiguousGenerator(ArchetypedGenerator):
    """Scenario generator for the ``ambiguous`` family.

    Every generated scenario mixes borderline cluster metrics, a short flake
    history, a recent commit and a low-but-not-critical memory diagnostic.
    The expected terminal action is ``submit_diagnosis`` with ``self.label``
    at ``_AMBIGUOUS_CONFIDENCE_TARGET`` confidence.
    """

    family_name = "ambiguous"
    label = DiagnosisLabel.AMBIGUOUS

    def informative_tools(self) -> list[str]:
        """Return the tool names recorded as informative for this family.

        Every tool is listed; none of them is meant to be decisive alone.
        """
        return [
            "read_logs",
            "query_flake_history",
            "recent_commits",
            "cluster_metrics",
            "inspect_test_code",
            "run_diagnostic",
        ]

    def minimal_evidence_set(self) -> list[str]:
        """Return an empty list: by design no tool subset determines the label."""
        return []

    def _default_archetypes(self) -> list[Archetype]:
        """Return the two built-in log archetypes of the ambiguous family.

        Both archetypes blend several failure signals in a single log template
        and declare an empty ``minimal_evidence_hint``.
        """
        return [
            Archetype(
                archetype_id="ambiguous_001",
                family="ambiguous",
                pattern_summary="Memory pressure coincides with recent commit change",
                log_template=(
                    "FAILED {TEST_MODULE}::{TEST_FUNC} — signal: killed\n"
                    "kernel: Out of memory: Killed process {NUM} (pytest)\n"
                    "  Also noted: recent commit changed {MODULE} (author: {AUTHOR})\n"
                    "  query_flake_history shows {FAIL_COUNT}/{TOTAL} failures (borderline)"
                ),
                slot_distributions={
                    "TEST_MODULE": ["tests/unit/test_core", "tests/integration/test_batch"],
                    "TEST_FUNC": ["test_process_large", "test_batch_compute"],
                    "NUM": ["1234", "5678", "9012"],
                    "MODULE": ["src/core/processor.py", "src/batch/runner.py"],
                    "AUTHOR": ["@alice", "@bob"],
                    "FAIL_COUNT": ["3", "4"],
                    "TOTAL": ["15", "20"],
                },
                informative_tools_hint=[
                    "read_logs:kernel",
                    "cluster_metrics:queue_depth",
                    "recent_commits",
                    "query_flake_history",
                ],
                minimal_evidence_hint=[],
            ),
            Archetype(
                archetype_id="ambiguous_002",
                family="ambiguous",
                pattern_summary="Timeout overlaps with flake history AND borderline cluster load",
                log_template=(
                    "context canceled: deadline exceeded after {DURATION}s\n"
                    "FAILED {TEST_MODULE}::{TEST_FUNC}\n"
                    "  — queue_depth: {QUEUE} (borderline elevated)\n"
                    "  — flake_rate: {FAIL_COUNT}/{TOTAL} runs failed (inconclusive)\n"
                    "  — recent change to {MODULE} by {AUTHOR} ({COMMIT_MSG})"
                ),
                slot_distributions={
                    "DURATION": ["30", "60"],
                    "TEST_MODULE": ["tests/integration/test_rpc", "tests/unit/test_worker"],
                    "TEST_FUNC": ["test_rpc_call", "test_worker_drain"],
                    "QUEUE": ["0.45", "0.52", "0.48"],
                    "FAIL_COUNT": ["2", "3"],
                    "TOTAL": ["12", "18"],
                    "MODULE": ["src/rpc/client.py", "src/worker/loop.py"],
                    "AUTHOR": ["@carol", "@dave"],
                    "COMMIT_MSG": ["refactor: simplify timeout handling", "fix: adjust backoff"],
                },
                informative_tools_hint=[
                    "read_logs:full",
                    "query_flake_history",
                    "cluster_metrics:queue_depth",
                    "recent_commits",
                ],
                minimal_evidence_hint=[],
            ),
        ]

    @staticmethod
    def _metric_samples(
        rng: random.Random,
        metric: str,
        low: float,
        high: float,
        base: float = 0.0,
        count: int = 5,
    ) -> list[dict]:
        """Build ``count`` ``cluster_metrics`` samples for one metric.

        Each sample draws a timestamp first and then a value equal to
        ``base + uniform(low, high)`` rounded to three decimals; every sample
        is flagged ``"ok": True``.
        """
        return [
            {
                "t": fake_timestamp(rng),
                metric: round(base + rng.uniform(low, high), 3),
                "ok": True,
            }
            for _ in range(count)
        ]

    def generate(self, seed: int, source_log_hash: str | None = None) -> Scenario:
        """Build one ambiguous scenario from *seed*.

        All random draws made in this method come from a single
        ``random.Random(seed)`` instance, and the order of those draws is part
        of the output: reordering them changes the scenario for a given seed.

        Args:
            seed: Seeds the RNG, decides the rerun outcome (the rerun passes
                when ``seed % 3 == 0``) and is passed to ``scenario_id_for``.
            source_log_hash: Optional value stored unchanged in the scenario
                metadata.

        Returns:
            A ``Scenario`` whose ground truth is ``self.label`` with
            ``is_ambiguous=True`` and difficulty ``"hard"``.
        """
        rng = random.Random(seed)
        archetype = self._pick_archetype(rng)
        # NOTE(review): the template slots (author, fail counts, queue depth,
        # test name) are filled independently of the tool payloads built
        # below, so the log text may disagree with them — confirm intended.
        log_text = fill_template(archetype.log_template, archetype.slot_distributions, rng)
        test_name = pick_test_name(rng)
        # Last "::"-separated component, i.e. the bare test function name.
        test_func = test_name.rsplit("::", 1)[-1]

        summary = make_failure_summary(
            self.family_name, rng, test_name=test_name, log_excerpt=log_text
        )
        branch = summary.branch

        # Rerun result is mixed — consistent with ambiguity
        rerun_passes = (seed % 3 == 0)
        outputs = build_base_outputs(
            test_name, branch, rng,
            log_lines=log_text.splitlines(),
            rerun_passes=rerun_passes,
        )

        # --- mixed signals: no single tool tells the full story ---

        # Borderline queue depth — elevated but not extreme
        borderline_queue = rng.uniform(0.42, 0.58)
        outputs["cluster_metrics:queue_depth"] = ToolOutput(
            tool_name="cluster_metrics",
            payload={
                "samples": self._metric_samples(
                    rng, "queue_depth", -0.05, 0.05, base=borderline_queue
                )
            },
            cost_units=0.003,
        )

        # Mid-range node health — not clearly unhealthy, not clearly fine
        outputs["cluster_metrics:node_health"] = ToolOutput(
            tool_name="cluster_metrics",
            payload={"samples": self._metric_samples(rng, "node_health", 0.35, 0.55)},
            cost_units=0.003,
        )

        # Flake history: small sample, recently added test — inconclusive
        total_runs = rng.randint(8, 15)
        failures = rng.randint(2, 4)
        outputs[f"query_flake_history:{test_name}"] = ToolOutput(
            tool_name="query_flake_history",
            payload={
                "failure_count": failures,
                "pass_count": total_runs - failures,
                "recent_failures": [
                    {"run_id": fake_short_sha(rng), "at": fake_timestamp(rng)}
                    for _ in range(failures)
                ],
                "note": "Test was recently added — insufficient history for confident judgement",
            },
            cost_units=0.002,
        )

        # Recent commit touched related code but not obviously buggy
        change_author = rng.choice(["@alice", "@bob", "@carol"])
        change_sha = fake_short_sha(rng)
        # Strip only the leading "test_" prefix; removing every occurrence
        # would mangle names such as "test_latest_value" ("latest_" contains
        # "test_").
        if test_func.startswith("test_"):
            module_stem = test_func[len("test_"):]
        else:
            module_stem = test_func
        change_file = rng.choice([
            f"src/core/{module_stem}.py",
            "src/middleware/timeout.py",
        ])
        outputs[f"recent_commits:{branch}"] = ToolOutput(
            tool_name="recent_commits",
            payload={
                "commits": [
                    {
                        "sha": change_sha,
                        "author": change_author,
                        "msg": rng.choice([
                            "refactor: simplify retry logic",
                            "fix: adjust timeout constants",
                            "perf: reduce allocation in hot path",
                        ]),
                        "files": [change_file],
                    }
                ]
            },
            cost_units=0.002,
        )

        # Test code looks plausibly related to both timeout and memory
        outputs[f"inspect_test_code:{test_name}"] = ToolOutput(
            tool_name="inspect_test_code",
            payload={
                "source": (
                    f"def {test_func}(self):\n"
                    "    # This test exercises a code path that was recently modified.\n"
                    "    with timeout(30):\n"
                    "        result = self.service.process_batch(self.large_fixture)\n"
                    "    self.assertIsNotNone(result)\n"
                ),
                "fixtures": [],
            },
            cost_units=0.002,
        )

        # run_diagnostic is borderline — not clearly broken
        outputs["run_diagnostic:memory"] = ToolOutput(
            tool_name="run_diagnostic",
            payload={
                "ok": True,
                "details": {
                    "available_gb": round(rng.uniform(0.8, 2.0), 2),
                    "note": "Low but not critical — borderline",
                },
            },
            cost_units=0.005,
        )

        difficulty = "hard"  # ambiguous scenarios are always hard
        rationale = (
            "Multiple plausible causes: "
            f"(1) borderline queue_depth ({borderline_queue:.2f}) could cause timeout; "
            f"(2) commit {change_sha} by {change_author} touched related code; "
            f"(3) flake history is inconclusive ({failures}/{total_runs} — new test). "
            "No single tool reading is decisive. Correct response: ambiguous, confidence ~0.4."
        )

        # Alternatives sit 0.05 either side of the target and use the same
        # label value as the primary action, so the three stay in agreement.
        label_value = self.label.value
        acceptable_alternatives = [
            {
                "primary": "submit_diagnosis",
                "args": {
                    "diagnosis": label_value,
                    "confidence": round(_AMBIGUOUS_CONFIDENCE_TARGET + delta, 2),
                },
            }
            for delta in (-0.05, 0.05)
        ]

        return Scenario(
            schema_version="1.0",
            scenario_id=scenario_id_for(self.family_name, seed),
            family=self.family_name,
            seed=seed,
            ground_truth=GroundTruth(
                label=self.label,
                rationale=rationale,
                is_ambiguous=True,
                confidence_target=_AMBIGUOUS_CONFIDENCE_TARGET,
            ),
            failure_summary=summary,
            tool_outputs=outputs,
            informative_tools=self.informative_tools(),
            minimal_evidence_set=self.minimal_evidence_set(),
            correct_terminal_action=TerminalActionSpec(
                primary="submit_diagnosis",
                args={
                    "diagnosis": label_value,
                    "confidence": _AMBIGUOUS_CONFIDENCE_TARGET,
                },
                acceptable_alternatives=acceptable_alternatives,
            ),
            metadata=ScenarioMetadata(
                generator_version="1.0",
                # Last RNG draw of the method; keep it after all tool outputs.
                generated_at=fake_timestamp(rng),
                source_log_hash=source_log_hash,
                difficulty=difficulty,
            ),
        )