Spaces:
Sleeping
feat(rewards): Phase C1 — all 9 reward components implemented
Browse files- FormatGate: schema-validates all ToolCall args via jsonschema + TerminalAction bounds
- DiagnosisReward: asymmetric 7×7 confusion-matrix with operational consequence weights
- ActionQualityReward: secondary action × failure-family matrix; quarantine-on-bug is most catastrophic
- CostEfficiencyReward: linear penalty inversely proportional to budget consumed
- InvestigationReward: coverage × ordering × redundancy shaping reward
- TimePenaltyReward: per-step penalty beyond 6-step reference
- AntiGamingReward: no-info-action guard + rolling quarantine-rate guard + Brier calibration probe
- MinimalEvidenceReward: bonus for correct diagnosis using only the minimal evidence set (weight=0 in v1)
- CounterfactualPredictReward: dormant in v1 (weight=0); implementation preserved for v2
- 67 new unit tests across 9 test files (bringing the suite to 329 tests total, all passing)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- src/ci_triage_env/rewards/__init__.py +21 -0
- src/ci_triage_env/rewards/action_quality.py +96 -0
- src/ci_triage_env/rewards/anti_gaming.py +86 -0
- src/ci_triage_env/rewards/cost_efficiency.py +36 -0
- src/ci_triage_env/rewards/counterfactual_predict.py +56 -0
- src/ci_triage_env/rewards/diagnosis.py +88 -0
- src/ci_triage_env/rewards/format_gate.py +71 -0
- src/ci_triage_env/rewards/investigation.py +87 -0
- src/ci_triage_env/rewards/minimal_evidence.py +65 -0
- src/ci_triage_env/rewards/time_penalty.py +37 -0
- tests/rewards/__init__.py +0 -0
- tests/rewards/test_action_quality.py +90 -0
- tests/rewards/test_anti_gaming.py +102 -0
- tests/rewards/test_cost_efficiency.py +70 -0
- tests/rewards/test_counterfactual_predict.py +78 -0
- tests/rewards/test_diagnosis.py +75 -0
- tests/rewards/test_format_gate.py +144 -0
- tests/rewards/test_investigation.py +111 -0
- tests/rewards/test_minimal_evidence.py +120 -0
- tests/rewards/test_time_penalty.py +87 -0
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reward components for the CI-triage environment (Phase C1).

Re-exports every reward component class so callers can import them
directly from ``ci_triage_env.rewards``.
"""

from ci_triage_env.rewards.action_quality import ActionQualityReward
from ci_triage_env.rewards.anti_gaming import AntiGamingReward
from ci_triage_env.rewards.cost_efficiency import CostEfficiencyReward
from ci_triage_env.rewards.counterfactual_predict import CounterfactualPredictReward
from ci_triage_env.rewards.diagnosis import DiagnosisReward
from ci_triage_env.rewards.format_gate import FormatGate
from ci_triage_env.rewards.investigation import InvestigationReward
from ci_triage_env.rewards.minimal_evidence import MinimalEvidenceReward
from ci_triage_env.rewards.time_penalty import TimePenaltyReward

__all__ = [
    "ActionQualityReward",
    "AntiGamingReward",
    "CostEfficiencyReward",
    "CounterfactualPredictReward",
    "DiagnosisReward",
    "FormatGate",
    "InvestigationReward",
    "MinimalEvidenceReward",
    "TimePenaltyReward",
]
|
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ActionQualityReward — secondary action × failure-family matrix.
|
| 2 |
+
|
| 3 |
+
Raw score range: [-2.0, 1.5] (capped). Default weight: 0.20.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
from ci_triage_env.rewards.base import RewardComponent
|
| 9 |
+
from ci_triage_env.schemas.episode import EpisodeTrace
|
| 10 |
+
from ci_triage_env.schemas.reward import ComponentScore
|
| 11 |
+
from ci_triage_env.schemas.scenario import Scenario
|
| 12 |
+
|
| 13 |
+
# (action_name, ground_truth_family) → reward
|
| 14 |
+
ACTION_REWARD_MATRIX: dict[tuple[str, str], float] = {
|
| 15 |
+
("file_bug", "real_bug"): 1.0,
|
| 16 |
+
("file_bug", "dependency_drift"): 0.7,
|
| 17 |
+
("file_bug", "race_flake"): -0.5,
|
| 18 |
+
("file_bug", "timing_flake"): -0.3,
|
| 19 |
+
("file_bug", "infra_network"): -0.5,
|
| 20 |
+
("file_bug", "infra_resource"): -0.5,
|
| 21 |
+
("file_bug", "ambiguous"): -0.2,
|
| 22 |
+
# Quarantine: ideal for flakes, catastrophic for real bugs
|
| 23 |
+
("quarantine_test", "race_flake"): 1.0,
|
| 24 |
+
("quarantine_test", "timing_flake"): 0.8,
|
| 25 |
+
("quarantine_test", "real_bug"): -1.5,
|
| 26 |
+
("quarantine_test", "infra_network"): -0.3,
|
| 27 |
+
("quarantine_test", "infra_resource"): -0.3,
|
| 28 |
+
("quarantine_test", "dependency_drift"): -0.5,
|
| 29 |
+
("quarantine_test", "ambiguous"): -0.3,
|
| 30 |
+
# Rerun: right for transient failures, bad for bugs
|
| 31 |
+
("rerun_test", "race_flake"): 0.6,
|
| 32 |
+
("rerun_test", "timing_flake"): 0.6,
|
| 33 |
+
("rerun_test", "infra_network"): 0.8,
|
| 34 |
+
("rerun_test", "infra_resource"): 0.5,
|
| 35 |
+
("rerun_test", "real_bug"): -0.6,
|
| 36 |
+
("rerun_test", "dependency_drift"): -0.3,
|
| 37 |
+
("rerun_test", "ambiguous"): 0.2,
|
| 38 |
+
# Ping owner: escalates to the right team
|
| 39 |
+
("ping_owner", "infra_resource"): 0.7,
|
| 40 |
+
("ping_owner", "infra_network"): 0.5,
|
| 41 |
+
("ping_owner", "real_bug"): 0.4,
|
| 42 |
+
("ping_owner", "dependency_drift"): 0.6,
|
| 43 |
+
("ping_owner", "race_flake"): 0.0,
|
| 44 |
+
("ping_owner", "timing_flake"): 0.0,
|
| 45 |
+
("ping_owner", "ambiguous"): 0.3,
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
_RAW_MIN = -2.0
|
| 49 |
+
_RAW_MAX = 1.5
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ActionQualityReward(RewardComponent):
    """Reward for secondary actions taken alongside the diagnosis.

    Rewards for each secondary action are looked up in
    ACTION_REWARD_MATRIX (unknown pairs score 0.0), summed, and capped to
    [-2.0, 1.5]. No secondary actions → neutral (0.0); no terminal
    action at all → -0.5.
    """

    name = "action_quality"
    default_weight = 0.20

    def score(self, trace: EpisodeTrace, scenario: Scenario) -> ComponentScore:
        terminal = trace.episode.final_action
        if terminal is None:
            return self._result(-0.5, {"no_action": -0.5})

        truth = scenario.ground_truth.label.value
        if not terminal.secondary_actions:
            return self._result(0.0, {"no_secondary": 0.0})

        # One (name, reward) pair per secondary action; duplicates all count
        # toward the total, but the sub_scores dict keeps the last occurrence.
        scored = [
            (sa.name, ACTION_REWARD_MATRIX.get((sa.name, truth), 0.0))
            for sa in terminal.secondary_actions
        ]
        total = sum(reward for _, reward in scored)
        capped = min(max(total, _RAW_MIN), _RAW_MAX)
        return self._result(capped, dict(scored))

    def _result(self, raw: float, sub_scores: dict[str, float]) -> ComponentScore:
        """Package a raw value with this component's fixed weight."""
        return ComponentScore(
            raw=raw,
            weighted=raw * self.default_weight,
            weight=self.default_weight,
            sub_scores=sub_scores,
        )
|
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""AntiGamingReward — three guards against reward exploitation.
|
| 2 |
+
|
| 3 |
+
Guards:
|
| 4 |
+
1. No-info-action: terminal with < 2 tool calls → -0.5
|
| 5 |
+
2. Quarantine-rate: rolling-window over-use of quarantine_test → penalty
|
| 6 |
+
3. Brier calibration: on ambiguous scenarios, penalises mis-calibrated confidence
|
| 7 |
+
|
| 8 |
+
Raw score range: [-1.5, 1.0]. Default weight: 0.15.
|
| 9 |
+
Quarantine-rate state is injected at construction; empty list → no penalty.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
from ci_triage_env.rewards.base import RewardComponent
|
| 15 |
+
from ci_triage_env.schemas.action import ToolCall
|
| 16 |
+
from ci_triage_env.schemas.episode import EpisodeTrace
|
| 17 |
+
from ci_triage_env.schemas.reward import ComponentScore
|
| 18 |
+
from ci_triage_env.schemas.scenario import Scenario
|
| 19 |
+
|
| 20 |
+
# Quarantine-rate guard trips once the rolling-window rate exceeds this.
_QUARANTINE_THRESHOLD = 0.30
_RAW_MIN = -1.5
_RAW_MAX = 1.0


class AntiGamingReward(RewardComponent):
    """Guards against common reward-gaming strategies.

    Raw score range: [-1.5, 1.0].

    Args:
        recent_episode_actions: Action names from the last N episodes,
            supplied by the trainer's rolling-window state. ``None`` or an
            empty list disables the quarantine-rate guard (e.g. unit tests).
    """

    name = "anti_gaming"
    default_weight = 0.15

    def __init__(self, recent_episode_actions: list[str] | None = None) -> None:
        self.recent_actions: list[str] = recent_episode_actions or []

    def score(self, trace: EpisodeTrace, scenario: Scenario) -> ComponentScore:
        breakdown: dict[str, float] = {}
        episode = trace.episode

        # Guard 1: diagnosing after fewer than 2 tool calls is penalised.
        tool_call_count = sum(
            isinstance(step.action, ToolCall) for step in episode.history
        )
        info_pen = (
            -0.5 if episode.final_action is not None and tool_call_count < 2 else 0.0
        )
        breakdown["no_info_penalty"] = info_pen

        # Guard 2: over-use of quarantine_test across the rolling window.
        rate = self._compute_quarantine_rate()
        q_pen = -2.0 * (rate - _QUARANTINE_THRESHOLD) if rate > _QUARANTINE_THRESHOLD else 0.0
        breakdown["quarantine_rate"] = rate
        breakdown["quarantine_penalty"] = q_pen

        # Guard 3: Brier-style calibration probe, ambiguous scenarios only.
        calibration = 0.0
        if scenario.ground_truth.is_ambiguous:
            if episode.final_action is None:
                calibration = -0.5
            else:
                err = (
                    episode.final_action.confidence
                    - scenario.ground_truth.confidence_target
                ) ** 2
                calibration = 0.5 * (1.0 - err)
        breakdown["brier_bonus"] = calibration

        clamped = min(max(info_pen + q_pen + calibration, _RAW_MIN), _RAW_MAX)
        return ComponentScore(
            raw=clamped,
            weighted=clamped * self.default_weight,
            weight=self.default_weight,
            sub_scores=breakdown,
        )

    def _compute_quarantine_rate(self) -> float:
        """Fraction of recent episodes whose action was quarantine_test."""
        window = self.recent_actions
        if not window:
            return 0.0
        return window.count("quarantine_test") / len(window)
|
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CostEfficiencyReward — penalises high tool-call cost spend.
|
| 2 |
+
|
| 3 |
+
Raw score range: [-1.0, 1.0]. Default weight: 0.15.
|
| 4 |
+
Mapping: 0 cost → 1.0; full BUDGET_REFERENCE spend → -1.0.
|
| 5 |
+
Over-budget episodes are not possible (env enforces budget), so ratio is clamped at 1.0.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from ci_triage_env.rewards.base import RewardComponent
|
| 11 |
+
from ci_triage_env.schemas.episode import EpisodeTrace
|
| 12 |
+
from ci_triage_env.schemas.reward import ComponentScore
|
| 13 |
+
from ci_triage_env.schemas.scenario import Scenario
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CostEfficiencyReward(RewardComponent):
    """Linear reward inversely proportional to total cost spent.

    Zero spend maps to +1.0 and a full BUDGET_REFERENCE spend to -1.0;
    the spend ratio is clamped at 1.0 before mapping. Raw score range:
    [-1.0, 1.0].
    """

    name = "cost_efficiency"
    default_weight = 0.15

    BUDGET_REFERENCE: float = 5.0

    def score(self, trace: EpisodeTrace, scenario: Scenario) -> ComponentScore:
        spent = 0.0
        for step in trace.episode.history:
            spent += step.cost_charged
        fraction = spent / self.BUDGET_REFERENCE
        # Linear map: fraction 0 → +1.0, fraction ≥ 1 → -1.0.
        value = 1.0 - 2.0 * min(fraction, 1.0)
        return ComponentScore(
            raw=value,
            weighted=value * self.default_weight,
            weight=self.default_weight,
            sub_scores={"total_cost": spent, "ratio": fraction},
        )
|
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CounterfactualPredictReward — DORMANT in v1.
|
| 2 |
+
|
| 3 |
+
Counterfactual probe is deferred to v2. In v1 the env never fires probes
|
| 4 |
+
(trace.counterfactual_replay is always None), so this component always returns
|
| 5 |
+
(raw=0.0, weight=0.0). The implementation is preserved so v2 re-enable is a
|
| 6 |
+
purely additive change: set default_weight to 0.10 in weights.py.
|
| 7 |
+
|
| 8 |
+
Raw score range: [-0.5, 1.0]. Default weight: 0.0 (dormant).
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from ci_triage_env.rewards.base import RewardComponent
|
| 14 |
+
from ci_triage_env.schemas.episode import EpisodeTrace
|
| 15 |
+
from ci_triage_env.schemas.reward import ComponentScore
|
| 16 |
+
from ci_triage_env.schemas.scenario import Scenario
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class CounterfactualPredictReward(RewardComponent):
    """Rewards correct prediction of the counterfactual probe outcome.

    DORMANT in v1: default_weight=0.0 and trace.counterfactual_replay is
    always None, so score() never contributes to the composite.
    Raw score range: [-0.5, 1.0].
    """

    name = "counterfactual"
    default_weight = 0.0

    def score(self, trace: EpisodeTrace, scenario: Scenario) -> ComponentScore:
        replay = trace.counterfactual_replay
        # v1: probes never fire, so the replay is None or empty.
        if not replay:
            return ComponentScore(
                raw=0.0,
                weighted=0.0,
                weight=self.default_weight,
                sub_scores={"fired": 0.0},
            )

        # v2 path (reachable only once probes are enabled): the last replay
        # record carries the probe's action; compare it against the episode's
        # final recorded action.
        history = trace.episode.history
        last_actual = history[-1] if history else None
        matched = last_actual is not None and replay[-1].action == last_actual.action
        outcome = 1.0 if matched else -0.5

        return ComponentScore(
            raw=outcome,
            weighted=outcome * self.default_weight,
            weight=self.default_weight,
            sub_scores={"fired": 1.0},
        )
|
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""DiagnosisReward — asymmetric confusion-matrix reward.
|
| 2 |
+
|
| 3 |
+
Raw score range: [-1.0, 1.0]. Default weight: 0.25.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
from ci_triage_env.rewards.base import RewardComponent
|
| 9 |
+
from ci_triage_env.schemas.episode import EpisodeTrace
|
| 10 |
+
from ci_triage_env.schemas.reward import ComponentScore
|
| 11 |
+
from ci_triage_env.schemas.scenario import Scenario
|
| 12 |
+
|
| 13 |
+
# (predicted, true) → reward
# Diagonal = 1.0; off-diagonal is asymmetric based on operational consequence.
# Worst: quarantine a real bug (ships to prod); best: correct identification.
_FAMILIES = (
    "real_bug",
    "race_flake",
    "timing_flake",
    "infra_network",
    "infra_resource",
    "dependency_drift",
    "ambiguous",
)

DIAGNOSIS_REWARD_MATRIX: dict[tuple[str, str], float] = {
    (family, family): 1.0 for family in _FAMILIES
}
DIAGNOSIS_REWARD_MATRIX.update({
    # Worst: predicting flake when it's a real bug (ships to prod)
    ("race_flake", "real_bug"): -1.0,
    ("timing_flake", "real_bug"): -1.0,
    ("ambiguous", "real_bug"): -0.7,
    # Bad: predicting infra when it's a real bug (file with wrong team)
    ("infra_network", "real_bug"): -0.5,
    ("infra_resource", "real_bug"): -0.5,
    ("dependency_drift", "real_bug"): -0.4,
    # Bad: predicting bug when it's a flake (false-alarm noise)
    ("real_bug", "race_flake"): -0.3,
    ("real_bug", "timing_flake"): -0.3,
    # Bad: predicting bug when it's infra (wastes engineering time)
    ("real_bug", "infra_network"): -0.4,
    ("real_bug", "infra_resource"): -0.4,
    ("real_bug", "dependency_drift"): -0.2,
    # Mild: confusing similar families
    ("race_flake", "timing_flake"): 0.2,
    ("timing_flake", "race_flake"): 0.2,
    ("infra_network", "infra_resource"): 0.1,
    ("infra_resource", "infra_network"): 0.1,
    # Abstaining on clear non-real-bug causes
    ("ambiguous", "race_flake"): 0.0,
    ("ambiguous", "timing_flake"): 0.0,
    ("ambiguous", "infra_network"): 0.0,
    ("ambiguous", "infra_resource"): 0.0,
    ("ambiguous", "dependency_drift"): 0.0,
})

# Any (predicted, true) pair not listed above is an unanticipated confusion.
_DEFAULT_OFF_DIAGONAL = -0.5


def lookup_reward(predicted: str, true: str) -> float:
    """Matrix lookup with a flat default for unlisted confusions."""
    return DIAGNOSIS_REWARD_MATRIX.get((predicted, true), _DEFAULT_OFF_DIAGONAL)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class DiagnosisReward(RewardComponent):
    """Reward based on predicted vs. true failure family.

    Raw score range: [-1.0, 1.0]. Missing terminal action → -1.0.
    """

    name = "diagnosis"
    default_weight = 0.25

    def score(self, trace: EpisodeTrace, scenario: Scenario) -> ComponentScore:
        final = trace.episode.final_action
        if final is None:
            # Never committing to a diagnosis scores like the worst confusion.
            return ComponentScore(
                raw=-1.0,
                weighted=-1.0 * self.default_weight,
                weight=self.default_weight,
                sub_scores={"no_diagnosis": -1.0},
            )

        guess = final.diagnosis.value
        actual = scenario.ground_truth.label.value
        value = lookup_reward(guess, actual)
        return ComponentScore(
            raw=value,
            weighted=value * self.default_weight,
            weight=self.default_weight,
            sub_scores={
                "matrix_lookup": value,
                # 1.0 flags an exact-match diagnosis, 0.0 any confusion.
                "predicted": 1.0 if guess == actual else 0.0,
            },
        )
|
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FormatGate — validates trajectory schema compliance.
|
| 2 |
+
|
| 3 |
+
Returns 1.0 (all records valid) or 0.0 (first violation found).
|
| 4 |
+
Raw score range: {0.0, 1.0}. Used as a multiplicative gate in composite.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import jsonschema
|
| 10 |
+
|
| 11 |
+
from ci_triage_env.rewards.base import RewardComponent
|
| 12 |
+
from ci_triage_env.schemas.action import TerminalAction, ToolCall
|
| 13 |
+
from ci_triage_env.schemas.diagnosis import DiagnosisLabel
|
| 14 |
+
from ci_triage_env.schemas.episode import EpisodeTrace
|
| 15 |
+
from ci_triage_env.schemas.reward import ComponentScore
|
| 16 |
+
from ci_triage_env.schemas.scenario import Scenario
|
| 17 |
+
from ci_triage_env.schemas.tools import ALL_TOOLS
|
| 18 |
+
|
| 19 |
+
# Tool definitions indexed by name for O(1) lookup during validation.
TOOL_DEF_BY_NAME: dict = {t.name: t for t in ALL_TOOLS}


class FormatGate(RewardComponent):
    """Validates every ToolCall args against the tool's args_schema and every
    TerminalAction against the DiagnosisLabel enum + confidence bounds.

    Returns 0.0 (gate fails) or 1.0 (passes). The composite uses this as a
    multiplicative gate: total = format_gate * weighted_sum.
    """

    name = "format_gate"
    default_weight = 1.0

    def score(self, trace: EpisodeTrace, scenario: Scenario) -> ComponentScore:
        for record in trace.episode.history:
            if isinstance(record.action, ToolCall):
                tool_def = TOOL_DEF_BY_NAME.get(record.action.tool_name)
                if tool_def is None:
                    return self._fail("unknown_tool")
                try:
                    jsonschema.validate(record.action.args, tool_def.args_schema)
                except jsonschema.ValidationError:
                    return self._fail("args_invalid")
            elif isinstance(record.action, TerminalAction):
                if record.action.diagnosis not in DiagnosisLabel:
                    return self._fail("invalid_diagnosis")
                if not (0.0 <= record.action.confidence <= 1.0):
                    return self._fail("confidence_oob")

        # v1: counterfactual_replay is a list of StepRecords or None; probes
        # never fire. Any v2 probe records must themselves contain valid actions.
        if trace.counterfactual_replay is not None and len(trace.counterfactual_replay) > 0:
            for record in trace.counterfactual_replay:
                if isinstance(record.action, ToolCall):
                    tool_def = TOOL_DEF_BY_NAME.get(record.action.tool_name)
                    if tool_def is None:
                        return self._fail("probe_unknown_tool")
                    # Probe args get the same schema validation as episode tool
                    # calls (previously only the tool name was checked, which
                    # contradicted the "valid actions" requirement above).
                    try:
                        jsonschema.validate(record.action.args, tool_def.args_schema)
                    except jsonschema.ValidationError:
                        return self._fail("probe_args_invalid")

        return ComponentScore(
            raw=1.0,
            weighted=1.0,
            weight=self.default_weight,
            sub_scores={"valid": 1.0},
        )

    def _fail(self, reason: str) -> ComponentScore:
        """Zero-score ComponentScore recording *which* check tripped the gate.

        The reason code becomes the sub_scores key (sub_scores values must be
        floats); previously the reason argument was silently dropped and
        placeholder keys were emitted, so callers could not tell why the gate
        returned 0.0.
        """
        return ComponentScore(
            raw=0.0,
            weighted=0.0,
            weight=self.default_weight,
            sub_scores={reason: 0.0},
        )
|
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""InvestigationReward — shaping reward for evidence-gathering quality.
|
| 2 |
+
|
| 3 |
+
Combines:
|
| 4 |
+
- coverage: fraction of informative_tools that were called (weight 0.6)
|
| 5 |
+
- ordering: cheap-before-expensive bonus (weight 0.2)
|
| 6 |
+
- redundancy_penalty: -0.1 per duplicate (tool_name, args) call
|
| 7 |
+
|
| 8 |
+
Raw score range: [-1.0, 1.0]. Default weight: 0.15.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
|
| 15 |
+
from ci_triage_env.rewards.base import RewardComponent
|
| 16 |
+
from ci_triage_env.schemas.action import ToolCall
|
| 17 |
+
from ci_triage_env.schemas.episode import EpisodeTrace
|
| 18 |
+
from ci_triage_env.schemas.reward import ComponentScore
|
| 19 |
+
from ci_triage_env.schemas.scenario import Scenario
|
| 20 |
+
|
| 21 |
+
# Read-only, low-cost information-gathering tools.
_CHEAP_TOOLS = frozenset({
    "read_logs", "query_flake_history", "recent_commits",
    "check_owner", "inspect_test_code", "cluster_metrics",
})
# Tools that spend real budget or have side effects.
_EXPENSIVE_TOOLS = frozenset({
    "rerun_test", "run_diagnostic", "file_bug", "ping_owner", "quarantine_test",
})


class InvestigationReward(RewardComponent):
    """Shaping reward for how well the agent investigates the failure.

    raw = 0.6 * coverage + 0.2 * ordering + redundancy_penalty,
    clamped to [-1.0, 1.0].
    """

    name = "investigation"
    default_weight = 0.15

    def score(self, trace: EpisodeTrace, scenario: Scenario) -> ComponentScore:
        called_tools = [
            rec.action.tool_name
            for rec in trace.episode.history
            if isinstance(rec.action, ToolCall)
        ]

        # Coverage: fraction of informative_tools called at least once.
        # Uses DISTINCT tool names — counting raw occurrences (as before)
        # let repeated calls of one informative tool inflate coverage past
        # 1.0, rewarding exactly the redundancy this component penalises.
        informative = set(scenario.informative_tools)
        coverage = len(informative & set(called_tools)) / max(len(informative), 1)

        # Redundancy: duplicate (tool_name, sorted-args-json) calls
        seen_calls: set[tuple[str, str]] = set()
        redundancy_count = 0
        for rec in trace.episode.history:
            if isinstance(rec.action, ToolCall):
                key = (rec.action.tool_name, json.dumps(rec.action.args, sort_keys=True))
                if key in seen_calls:
                    redundancy_count += 1
                seen_calls.add(key)
        redundancy_penalty = -0.1 * redundancy_count

        # Ordering: cheap tools should precede expensive tools
        ordering = self._compute_ordering_score(called_tools)

        raw = 0.6 * coverage + 0.2 * ordering + redundancy_penalty
        raw = max(min(raw, 1.0), -1.0)

        return ComponentScore(
            raw=raw,
            weighted=raw * self.default_weight,
            weight=self.default_weight,
            sub_scores={
                "coverage": coverage,
                "ordering": ordering,
                "redundancy_penalty": redundancy_penalty,
            },
        )

    def _compute_ordering_score(self, tools: list[str]) -> float:
        """Score 1.0 minus 0.2 per cheap call made after any expensive call."""
        violations = 0
        seen_expensive = False
        for t in tools:
            if t in _EXPENSIVE_TOOLS:
                seen_expensive = True
            elif t in _CHEAP_TOOLS and seen_expensive:
                violations += 1
        return max(1.0 - 0.2 * violations, 0.0)
|
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""MinimalEvidenceReward — bonus for diagnosing correctly with the minimal tool set.
|
| 2 |
+
|
| 3 |
+
Default weight: 0.0 — this component is NOT in the additive composite directly.
|
| 4 |
+
In Phase C2 its score modifies the InvestigationReward via a multiplier.
|
| 5 |
+
Raw score range: [-0.5, 1.0].
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from ci_triage_env.rewards.base import RewardComponent
|
| 11 |
+
from ci_triage_env.schemas.action import ToolCall
|
| 12 |
+
from ci_triage_env.schemas.episode import EpisodeTrace
|
| 13 |
+
from ci_triage_env.schemas.reward import ComponentScore
|
| 14 |
+
from ci_triage_env.schemas.scenario import Scenario
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class MinimalEvidenceReward(RewardComponent):
    """Bonus when the agent reaches the correct diagnosis using only the
    minimal evidence set.

    Scenarios with an empty minimal_evidence_set (ambiguous ones) score 0.0.
    Raw score range: [-0.5, 1.0]. Default weight: 0.0 (folded into
    InvestigationReward in Phase C2).
    """

    name = "minimal_evidence"
    default_weight = 0.0

    def score(self, trace: EpisodeTrace, scenario: Scenario) -> ComponentScore:
        required = set(scenario.minimal_evidence_set)
        if not required:
            return ComponentScore(
                raw=0.0, weighted=0.0, weight=self.default_weight, sub_scores={}
            )

        used = {
            step.action.tool_name
            for step in trace.episode.history
            if isinstance(step.action, ToolCall)
        }

        terminal = trace.episode.final_action
        is_correct = (
            terminal is not None
            and terminal.diagnosis.value == scenario.ground_truth.label.value
        )

        if not is_correct:
            bonus = 0.0
        elif required <= used:
            # All minimal evidence gathered; dock 0.1 per superfluous tool,
            # floored at -0.5.
            surplus = len(used - required)
            bonus = max(min(1.0 - 0.1 * surplus, 1.0), -0.5)
        else:
            # Right answer, but some key evidence was skipped.
            bonus = 0.3

        return ComponentScore(
            raw=bonus,
            weighted=bonus * self.default_weight,
            weight=self.default_weight,
            sub_scores={
                "min_set_used": float(len(used & required)),
                "extras": float(len(used - required)),
            },
        )
|
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""TimePenaltyReward — penalises episodes that take more than REFERENCE_STEPS tool calls.
|
| 2 |
+
|
| 3 |
+
Raw score range: [-1.0, 0.0]. Default weight: 0.10.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
from ci_triage_env.rewards.base import RewardComponent
|
| 9 |
+
from ci_triage_env.schemas.action import ToolCall
|
| 10 |
+
from ci_triage_env.schemas.episode import EpisodeTrace
|
| 11 |
+
from ci_triage_env.schemas.reward import ComponentScore
|
| 12 |
+
from ci_triage_env.schemas.scenario import Scenario
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TimePenaltyReward(RewardComponent):
    """Linear per-step penalty beyond REFERENCE_STEPS tool calls.

    0 to REFERENCE_STEPS calls → 0.0. Each extra step → -PER_STEP_PENALTY.
    Floor at -1.0. Raw score range: [-1.0, 0.0].
    """

    name = "time"
    default_weight = 0.10

    PER_STEP_PENALTY: float = 0.02
    REFERENCE_STEPS: int = 6

    def score(self, trace: EpisodeTrace, scenario: Scenario) -> ComponentScore:
        """Penalise every tool call past the reference budget, floored at -1.0."""
        # Count only ToolCall steps; terminal/secondary actions are free.
        tool_steps = len(
            [r for r in trace.episode.history if isinstance(r.action, ToolCall)]
        )
        over_budget = tool_steps - self.REFERENCE_STEPS
        if over_budget < 0:
            over_budget = 0
        penalty = -self.PER_STEP_PENALTY * over_budget
        if penalty < -1.0:
            penalty = -1.0  # documented floor of the raw score range
        return ComponentScore(
            raw=penalty,
            weighted=penalty * self.default_weight,
            weight=self.default_weight,
            sub_scores={"steps": float(tool_steps), "excess": float(over_budget)},
        )
|
|
File without changes
|
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for ActionQualityReward component."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from ci_triage_env.mock import make_mock_scenario, make_mock_trajectory
|
| 6 |
+
from ci_triage_env.rewards.action_quality import ACTION_REWARD_MATRIX, ActionQualityReward
|
| 7 |
+
from ci_triage_env.schemas.action import SecondaryAction, TerminalAction
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _patch_secondary(trace, secondary_actions):
    # Swap in a terminal action carrying *secondary_actions* while preserving
    # the original diagnosis and confidence; returns a modified copy of the trace.
    new_terminal = TerminalAction(
        action_type="submit_diagnosis",
        diagnosis=trace.episode.final_action.diagnosis,
        confidence=trace.episode.final_action.confidence,
        secondary_actions=secondary_actions,
    )
    return trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"final_action": new_terminal})}
    )


def test_action_quality_correct_case_returns_high_score() -> None:
    """An appropriate secondary action (file_bug on real_bug) scores well."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    patched = _patch_secondary(trace, [SecondaryAction(name="file_bug", args={})])
    score = ActionQualityReward().score(patched, scenario)
    assert score.raw > 0.5


def test_action_quality_wrong_case_returns_low_score() -> None:
    """A harmful secondary action is penalised below zero."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # quarantine_test on real_bug is catastrophically bad
    patched = _patch_secondary(trace, [SecondaryAction(name="quarantine_test", args={})])
    score = ActionQualityReward().score(patched, scenario)
    assert score.raw < 0.0


def test_action_quality_handles_no_terminal_action() -> None:
    """A missing terminal action yields the -0.5 sentinel score."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    no_terminal = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"final_action": None})}
    )
    score = ActionQualityReward().score(no_terminal, scenario)
    assert score.raw == -0.5


def test_action_quality_deterministic() -> None:
    """Scoring the same trace twice gives identical results."""
    scenario = make_mock_scenario("race_flake")
    trace = make_mock_trajectory(scenario, outcome="good")
    comp = ActionQualityReward()
    s1 = comp.score(trace, scenario)
    s2 = comp.score(trace, scenario)
    assert s1.raw == s2.raw


def test_action_quality_score_is_in_documented_range() -> None:
    """Even with stacked secondary actions the raw score stays in range."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # Maximum stacking: multiple good actions
    patched = _patch_secondary(trace, [
        SecondaryAction(name="file_bug", args={}),
        SecondaryAction(name="ping_owner", args={}),
    ])
    score = ActionQualityReward().score(patched, scenario)
    assert -2.0 <= score.raw <= 1.5


def test_action_quality_subscores_are_meaningful() -> None:
    """Each secondary action is reported under its own sub-score key."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    patched = _patch_secondary(trace, [SecondaryAction(name="file_bug", args={})])
    score = ActionQualityReward().score(patched, scenario)
    assert "file_bug" in score.sub_scores


def test_quarantine_real_bug_is_worst() -> None:
    """quarantine_test × real_bug is the single worst matrix entry (-1.5)."""
    worst = ACTION_REWARD_MATRIX[("quarantine_test", "real_bug")]
    assert worst == -1.5
    all_values = list(ACTION_REWARD_MATRIX.values())
    assert all(v >= worst for v in all_values)


def test_action_quality_no_secondary_neutral() -> None:
    """No secondary actions at all is scored as neutral (0.0)."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # Default mock trajectory has no secondary actions
    score = ActionQualityReward().score(trace, scenario)
    assert score.raw == 0.0
|
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for AntiGamingReward component."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from ci_triage_env.mock import make_mock_scenario, make_mock_trajectory
|
| 8 |
+
from ci_triage_env.rewards.anti_gaming import AntiGamingReward
|
| 9 |
+
from ci_triage_env.schemas.action import ToolCall
|
| 10 |
+
from ci_triage_env.schemas.episode import StepRecord
|
| 11 |
+
from ci_triage_env.schemas.observation import BudgetState, Observation
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _dummy_obs() -> Observation:
    # Minimal observation stub used when fabricating StepRecords for a trace.
    return Observation(
        episode_id="test",
        step=0,
        failure_summary=None,
        tool_response=None,
        budget_remaining=BudgetState(tool_calls_remaining=10, cost_remaining=1.0),
        is_terminal=False,
        probe_question=None,
    )


def test_anti_gaming_correct_case_returns_high_score() -> None:
    """A well-behaved trajectory is not penalised."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    score = AntiGamingReward().score(trace, scenario)
    # Good trajectory: ≥ 2 tool calls, no quarantine abuse, non-ambiguous
    assert score.raw >= 0.0


def test_anti_gaming_wrong_case_returns_low_score() -> None:
    """Submitting after a single tool call triggers the no-info-action guard."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # Simulate no-info-action: only 1 tool call before terminal
    one_tool = [
        StepRecord(step=0, action=ToolCall(tool_name="read_logs", args={"scope": "full"}),
                   observation=_dummy_obs(), cost_charged=0.001)
    ]
    patched = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"history": one_tool})}
    )
    score = AntiGamingReward().score(patched, scenario)
    assert score.raw <= -0.5


def test_anti_gaming_handles_no_terminal_action() -> None:
    """An absent terminal action does not crash; score stays in range."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    no_terminal = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"final_action": None})}
    )
    score = AntiGamingReward().score(no_terminal, scenario)
    # No terminal → no no-info-action penalty; result depends on other guards
    assert -1.5 <= score.raw <= 1.0


def test_anti_gaming_deterministic() -> None:
    """Scoring is pure: same inputs produce the same output."""
    scenario = make_mock_scenario("race_flake")
    trace = make_mock_trajectory(scenario, outcome="good")
    comp = AntiGamingReward()
    s1 = comp.score(trace, scenario)
    s2 = comp.score(trace, scenario)
    assert s1.raw == s2.raw


def test_anti_gaming_score_is_in_documented_range() -> None:
    """Raw score stays within [-1.5, 1.0] across families and outcomes."""
    for family in ["real_bug", "race_flake", "ambiguous"]:
        scenario = make_mock_scenario(family)
        for outcome in ["good", "bad"]:
            trace = make_mock_trajectory(scenario, outcome=outcome)
            score = AntiGamingReward().score(trace, scenario)
            assert -1.5 <= score.raw <= 1.0


def test_anti_gaming_subscores_are_meaningful() -> None:
    """All three guard sub-scores are always reported."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    score = AntiGamingReward().score(trace, scenario)
    assert "no_info_penalty" in score.sub_scores
    assert "quarantine_rate" in score.sub_scores
    assert "brier_bonus" in score.sub_scores


def test_brier_calibration_perfect_match_bonus() -> None:
    """Confidence exactly matching the target earns the full 0.5 Brier bonus."""
    scenario = make_mock_scenario("ambiguous")
    # confidence_target=0.5 for ambiguous mock; abstain trajectory uses confidence=0.5
    trace = make_mock_trajectory(scenario, outcome="abstain")
    score = AntiGamingReward().score(trace, scenario)
    # Perfect match: brier=(0.5-0.5)^2=0; bonus=0.5*(1-0)=0.5
    assert score.sub_scores["brier_bonus"] == pytest.approx(0.5, abs=1e-6)


def test_quarantine_rate_above_threshold_penalizes() -> None:
    """A rolling quarantine rate far above the threshold produces a penalty."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # 100% quarantine rate → well above 30% threshold
    comp = AntiGamingReward(recent_episode_actions=["quarantine_test"] * 50)
    score = comp.score(trace, scenario)
    assert score.sub_scores["quarantine_penalty"] < 0.0
|
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for CostEfficiencyReward component."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from ci_triage_env.mock import make_mock_scenario, make_mock_trajectory
|
| 6 |
+
from ci_triage_env.rewards.cost_efficiency import CostEfficiencyReward
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_cost_efficiency_correct_case_returns_high_score() -> None:
    """A cheap trajectory earns a positive efficiency score."""
    # Low-cost trajectory (mock costs ~0.016) → should be positive
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    score = CostEfficiencyReward().score(trace, scenario)
    assert score.raw > 0.0


def test_cost_efficiency_wrong_case_returns_low_score() -> None:
    """Spending the full reference budget drives the score to ≤ -0.9."""
    # Simulate a trajectory that spent the full budget
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # Patch each step so the trajectory's total cost sums to BUDGET_REFERENCE.
    # (Fix: use the module-level import directly instead of re-importing the
    # class locally under an alias, which shadowed the top-level import.)
    budget_each = CostEfficiencyReward.BUDGET_REFERENCE / max(len(trace.episode.history), 1)
    patched_history = [
        r.model_copy(update={"cost_charged": budget_each})
        for r in trace.episode.history
    ]
    patched = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"history": patched_history})}
    )
    score = CostEfficiencyReward().score(patched, scenario)
    assert score.raw <= -0.9


def test_cost_efficiency_handles_no_terminal_action() -> None:
    """Cost scoring is independent of whether a terminal action exists."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    no_terminal = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"final_action": None})}
    )
    # Still scores based on cost; no terminal doesn't affect this component
    score = CostEfficiencyReward().score(no_terminal, scenario)
    assert -1.0 <= score.raw <= 1.0


def test_cost_efficiency_deterministic() -> None:
    """Scoring the same trace twice gives identical results."""
    scenario = make_mock_scenario("race_flake")
    trace = make_mock_trajectory(scenario, outcome="good")
    comp = CostEfficiencyReward()
    s1 = comp.score(trace, scenario)
    s2 = comp.score(trace, scenario)
    assert s1.raw == s2.raw


def test_cost_efficiency_score_is_in_documented_range() -> None:
    """Raw score stays within [-1.0, 1.0] across families and outcomes."""
    for family in ["real_bug", "race_flake", "ambiguous"]:
        scenario = make_mock_scenario(family)
        for outcome in ["good", "bad"]:
            trace = make_mock_trajectory(scenario, outcome=outcome)
            score = CostEfficiencyReward().score(trace, scenario)
            assert -1.0 <= score.raw <= 1.0


def test_cost_efficiency_subscores_are_meaningful() -> None:
    """Sub-scores expose total cost spent and the budget ratio."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    score = CostEfficiencyReward().score(trace, scenario)
    assert "total_cost" in score.sub_scores
    assert "ratio" in score.sub_scores
    assert score.sub_scores["total_cost"] >= 0.0
|
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for CounterfactualPredictReward component — dormant in v1."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from ci_triage_env.mock import make_mock_scenario, make_mock_trajectory
|
| 6 |
+
from ci_triage_env.rewards.counterfactual_predict import CounterfactualPredictReward
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_counterfactual_correct_case_returns_high_score() -> None:
    """Dormant in v1: even a good trajectory scores 0.0."""
    # In v1 probes never fire → always returns 0.0 even in "good" trajectory
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    score = CounterfactualPredictReward().score(trace, scenario)
    assert score.raw == 0.0


def test_counterfactual_wrong_case_returns_low_score() -> None:
    """Dormant in v1: a bad trajectory also scores 0.0."""
    # v1: still 0.0 since counterfactual_replay is always None
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="bad")
    score = CounterfactualPredictReward().score(trace, scenario)
    assert score.raw == 0.0


def test_no_probe_returns_zero() -> None:
    """Without a counterfactual replay both raw and weighted are zero."""
    scenario = make_mock_scenario("race_flake")
    trace = make_mock_trajectory(scenario, outcome="good")
    assert trace.counterfactual_replay is None
    score = CounterfactualPredictReward().score(trace, scenario)
    assert score.raw == 0.0
    assert score.weighted == 0.0


def test_counterfactual_handles_no_terminal_action() -> None:
    """A missing terminal action still yields the dormant 0.0 score."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    no_terminal = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"final_action": None})}
    )
    score = CounterfactualPredictReward().score(no_terminal, scenario)
    assert score.raw == 0.0


def test_counterfactual_deterministic() -> None:
    """Scoring the same trace twice gives identical results."""
    scenario = make_mock_scenario("race_flake")
    trace = make_mock_trajectory(scenario, outcome="good")
    comp = CounterfactualPredictReward()
    s1 = comp.score(trace, scenario)
    s2 = comp.score(trace, scenario)
    assert s1.raw == s2.raw


def test_counterfactual_score_is_in_documented_range() -> None:
    """Raw score stays within the documented [-0.5, 1.0] range."""
    for family in ["real_bug", "race_flake", "ambiguous"]:
        scenario = make_mock_scenario(family)
        trace = make_mock_trajectory(scenario, outcome="good")
        score = CounterfactualPredictReward().score(trace, scenario)
        # v1: always 0.0; generally in [-0.5, 1.0]
        assert -0.5 <= score.raw <= 1.0


def test_counterfactual_subscores_are_meaningful() -> None:
    """Sub-scores report whether the probe fired."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    score = CounterfactualPredictReward().score(trace, scenario)
    assert "fired" in score.sub_scores


def test_v1_default_weight_is_zero() -> None:
    """The component is weight-0 (dormant) in v1."""
    assert CounterfactualPredictReward.default_weight == 0.0


def test_v1_weighted_always_zero() -> None:
    """With weight 0, the weighted contribution is always zero."""
    for family in ["real_bug", "ambiguous"]:
        scenario = make_mock_scenario(family)
        trace = make_mock_trajectory(scenario, outcome="good")
        score = CounterfactualPredictReward().score(trace, scenario)
        assert score.weighted == 0.0
|
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for DiagnosisReward component."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from ci_triage_env.mock import make_mock_scenario, make_mock_trajectory
|
| 6 |
+
from ci_triage_env.rewards.diagnosis import DIAGNOSIS_REWARD_MATRIX, DiagnosisReward
|
| 7 |
+
|
| 8 |
+
# Every failure family covered by the 7×7 diagnosis confusion matrix.
ALL_FAMILIES = ["real_bug", "race_flake", "timing_flake", "infra_network", "infra_resource",
                "dependency_drift", "ambiguous"]


def test_diagnosis_correct_case_returns_high_score() -> None:
    """A correct diagnosis earns the full 1.0 for every family."""
    for family in ALL_FAMILIES:
        scenario = make_mock_scenario(family)
        trace = make_mock_trajectory(scenario, outcome="good")
        score = DiagnosisReward().score(trace, scenario)
        assert score.raw == 1.0, f"family={family}: expected 1.0 got {score.raw}"


def test_diagnosis_wrong_case_returns_low_score() -> None:
    """A wrong diagnosis is penalised below zero."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="bad")
    score = DiagnosisReward().score(trace, scenario)
    assert score.raw < 0.0


def test_diagnosis_handles_no_terminal_action() -> None:
    """A missing terminal action yields the -1.0 no-diagnosis sentinel."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    no_terminal = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"final_action": None})}
    )
    score = DiagnosisReward().score(no_terminal, scenario)
    assert score.raw == -1.0
    assert score.sub_scores.get("no_diagnosis") == -1.0


def test_diagnosis_deterministic() -> None:
    """Scoring the same trace twice gives identical results."""
    scenario = make_mock_scenario("race_flake")
    trace = make_mock_trajectory(scenario, outcome="good")
    comp = DiagnosisReward()
    s1 = comp.score(trace, scenario)
    s2 = comp.score(trace, scenario)
    assert s1.raw == s2.raw


def test_diagnosis_score_is_in_documented_range() -> None:
    """Raw score stays within [-1.0, 1.0] across families and outcomes."""
    for family in ALL_FAMILIES:
        scenario = make_mock_scenario(family)
        for outcome in ["good", "bad"]:
            trace = make_mock_trajectory(scenario, outcome=outcome)
            score = DiagnosisReward().score(trace, scenario)
            assert -1.0 <= score.raw <= 1.0, f"out of range: family={family} outcome={outcome}"


def test_diagnosis_subscores_are_meaningful() -> None:
    """Sub-scores expose the raw matrix lookup value."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    score = DiagnosisReward().score(trace, scenario)
    assert "matrix_lookup" in score.sub_scores


def test_diagonal_matches_return_one() -> None:
    """Every (family, family) diagonal entry of the matrix is exactly 1.0."""
    families = ALL_FAMILIES
    for f in families:
        assert DIAGNOSIS_REWARD_MATRIX.get((f, f)) == 1.0, f"diagonal {f} is not 1.0"


def test_quarantine_real_bug_is_most_negative_action() -> None:
    """Predicting race_flake on a real_bug is the worst real_bug mistake."""
    # The diagnosis matrix's worst entry for predicting flake on real_bug is -1.0
    flake_on_real = DIAGNOSIS_REWARD_MATRIX[("race_flake", "real_bug")]
    assert flake_on_real == -1.0
    # Every other (predicted, "real_bug") entry should be >= flake_on_real
    real_bug_penalties = [v for (p, t), v in DIAGNOSIS_REWARD_MATRIX.items() if t == "real_bug" and p != "real_bug"]
    assert all(v >= flake_on_real for v in real_bug_penalties)
|
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for FormatGate reward component."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from ci_triage_env.mock import make_mock_scenario, make_mock_trajectory
|
| 6 |
+
from ci_triage_env.rewards.format_gate import FormatGate
|
| 7 |
+
from ci_triage_env.schemas.action import ToolCall
|
| 8 |
+
from ci_triage_env.schemas.episode import StepRecord
|
| 9 |
+
from ci_triage_env.schemas.observation import BudgetState, Observation
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _dummy_obs() -> Observation:
    # Minimal observation stub used when fabricating StepRecords for a trace.
    return Observation(
        episode_id="test",
        step=0,
        failure_summary=None,
        tool_response=None,
        budget_remaining=BudgetState(tool_calls_remaining=10, cost_remaining=1.0),
        is_terminal=False,
        probe_question=None,
    )


def test_format_gate_correct_case_returns_high_score() -> None:
    """A history of schema-valid tool calls passes the gate (1.0)."""
    # Build a trajectory with only valid read_logs calls (scope is required)
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # Replace history with a single valid tool call
    valid_record = StepRecord(
        step=0,
        action=ToolCall(tool_name="read_logs", args={"scope": "full"}),
        observation=_dummy_obs(),
        cost_charged=0.001,
    )
    patched = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"history": [valid_record]})}
    )
    score = FormatGate().score(patched, scenario)
    assert score.raw == 1.0


def test_format_gate_wrong_case_returns_low_score() -> None:
    """Any unknown tool name anywhere in the history fails the gate (0.0)."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")

    # Inject an unknown tool name
    bad_action = ToolCall(tool_name="__nonexistent_tool__", args={})
    bad_record = StepRecord(step=99, action=bad_action, observation=_dummy_obs(), cost_charged=0.0)
    patched = trace.model_copy(
        update={"episode": trace.episode.model_copy(
            update={"history": trace.episode.history + [bad_record]}
        )}
    )
    score = FormatGate().score(patched, scenario)
    assert score.raw == 0.0


def test_format_gate_handles_no_terminal_action() -> None:
    """The gate validates tool calls only; a missing terminal is fine."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # Build trajectory with only valid tool calls and no terminal action
    valid_record = StepRecord(
        step=0,
        action=ToolCall(tool_name="read_logs", args={"scope": "full"}),
        observation=_dummy_obs(),
        cost_charged=0.001,
    )
    patched = trace.model_copy(
        update={"episode": trace.episode.model_copy(
            update={"history": [valid_record], "final_action": None}
        )}
    )
    score = FormatGate().score(patched, scenario)
    # No terminal action → still valid if all tool calls are valid
    assert score.raw == 1.0


def test_format_gate_deterministic() -> None:
    """Scoring the same trace twice gives identical results."""
    scenario = make_mock_scenario("race_flake")
    trace = make_mock_trajectory(scenario, outcome="good")
    gate = FormatGate()
    s1 = gate.score(trace, scenario)
    s2 = gate.score(trace, scenario)
    assert s1.raw == s2.raw


def test_format_gate_score_is_in_documented_range() -> None:
    """The gate is binary: 1.0 for valid traces, 0.0 for invalid ones."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # Valid trajectory (single valid tool call)
    valid_record = StepRecord(
        step=0,
        action=ToolCall(tool_name="read_logs", args={"scope": "full"}),
        observation=_dummy_obs(),
        cost_charged=0.001,
    )
    valid_trace = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"history": [valid_record]})}
    )
    # Invalid trajectory (unknown tool)
    bad_record = StepRecord(
        step=0,
        action=ToolCall(tool_name="__bad__", args={}),
        observation=_dummy_obs(),
        cost_charged=0.0,
    )
    invalid_trace = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"history": [bad_record]})}
    )
    assert FormatGate().score(valid_trace, scenario).raw == 1.0
    assert FormatGate().score(invalid_trace, scenario).raw == 0.0


def test_format_gate_subscores_are_meaningful() -> None:
    """Sub-scores report validity under the "valid" key."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")
    # Use a valid trajectory to get the "valid" key
    valid_record = StepRecord(
        step=0,
        action=ToolCall(tool_name="read_logs", args={"scope": "full"}),
        observation=_dummy_obs(),
        cost_charged=0.001,
    )
    patched = trace.model_copy(
        update={"episode": trace.episode.model_copy(update={"history": [valid_record]})}
    )
    score = FormatGate().score(patched, scenario)
    assert "valid" in score.sub_scores


def test_format_gate_invalid_args_fails() -> None:
    """A known tool with schema-invalid args also fails the gate."""
    scenario = make_mock_scenario("real_bug")
    trace = make_mock_trajectory(scenario, outcome="good")

    # read_logs requires "scope" arg — inject one without it
    bad_action = ToolCall(tool_name="read_logs", args={})  # missing required "scope"
    bad_record = StepRecord(step=99, action=bad_action, observation=_dummy_obs(), cost_charged=0.0)
    patched = trace.model_copy(
        update={"episode": trace.episode.model_copy(
            update={"history": [bad_record]}
        )}
    )
    score = FormatGate().score(patched, scenario)
    assert score.raw == 0.0
|
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for InvestigationReward component."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from ci_triage_env.mock import make_mock_scenario, make_mock_trajectory
|
| 6 |
+
from ci_triage_env.rewards.investigation import InvestigationReward
|
| 7 |
+
from ci_triage_env.schemas.action import ToolCall
|
| 8 |
+
from ci_triage_env.schemas.episode import StepRecord
|
| 9 |
+
from ci_triage_env.schemas.observation import BudgetState, Observation
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _dummy_obs(step: int = 0) -> Observation:
    """Build a minimal non-terminal Observation to use as StepRecord filler."""
    budget = BudgetState(tool_calls_remaining=10, cost_remaining=1.0)
    return Observation(
        episode_id="test",
        step=step,
        failure_summary=None,
        tool_response=None,
        budget_remaining=budget,
        is_terminal=False,
        probe_question=None,
    )
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _make_tool_record(tool_name: str, args: dict, step: int = 0) -> StepRecord:
    """Wrap a single ToolCall into a StepRecord backed by a dummy observation."""
    call = ToolCall(tool_name=tool_name, args=args)
    return StepRecord(
        step=step,
        action=call,
        observation=_dummy_obs(step),
        cost_charged=0.001,
    )
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_investigation_correct_case_returns_high_score() -> None:
    """A well-ordered trajectory that covers informative tools scores above zero."""
    scenario = make_mock_scenario("real_bug")
    traj = make_mock_trajectory(scenario, outcome="good")
    result = InvestigationReward().score(traj, scenario)
    # The mock "good" trajectory calls read_logs, query_flake_history,
    # recent_commits and rerun_test; the informative set for real_bug is
    # read_logs, query_flake_history, rerun_test.
    assert result.raw > 0.0
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_investigation_wrong_case_returns_low_score() -> None:
    """Zero coverage plus an ordering violation keeps the score near the floor."""
    scenario = make_mock_scenario("real_bug")
    base = make_mock_trajectory(scenario, outcome="good")
    # Only tools OUTSIDE the informative set (read_logs, query_flake_history,
    # rerun_test), with an expensive call before a cheap one — one ordering
    # violation.
    history = [
        _make_tool_record("ping_owner", {}, step=0),
        _make_tool_record("recent_commits", {}, step=1),
    ]
    episode = base.episode.model_copy(update={"history": history})
    patched = base.model_copy(update={"episode": episode})
    result = InvestigationReward().score(patched, scenario)
    # coverage=0.0, ordering=0.8 (1 violation), redundancy=0
    # → raw = 0.6*0 + 0.2*0.8 = 0.16
    assert result.raw <= 0.2
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_investigation_handles_no_terminal_action() -> None:
    """A trajectory missing its terminal action still scores within [-1, 1]."""
    scenario = make_mock_scenario("real_bug")
    base = make_mock_trajectory(scenario, outcome="good")
    episode = base.episode.model_copy(update={"final_action": None})
    stripped = base.model_copy(update={"episode": episode})
    assert -1.0 <= InvestigationReward().score(stripped, scenario).raw <= 1.0
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def test_investigation_deterministic() -> None:
    """Scoring the same trajectory twice yields the same raw value."""
    scenario = make_mock_scenario("race_flake")
    traj = make_mock_trajectory(scenario, outcome="good")
    component = InvestigationReward()
    assert component.score(traj, scenario).raw == component.score(traj, scenario).raw
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def test_investigation_score_is_in_documented_range() -> None:
    """raw stays within [-1, 1] across every family/outcome combination."""
    for family in ("real_bug", "race_flake", "timing_flake", "ambiguous"):
        scenario = make_mock_scenario(family)
        for outcome in ("good", "bad"):
            traj = make_mock_trajectory(scenario, outcome=outcome)
            assert -1.0 <= InvestigationReward().score(traj, scenario).raw <= 1.0
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def test_investigation_subscores_are_meaningful() -> None:
    """The component exposes its three shaping sub-scores by name."""
    scenario = make_mock_scenario("real_bug")
    traj = make_mock_trajectory(scenario, outcome="good")
    subs = InvestigationReward().score(traj, scenario).sub_scores
    for key in ("coverage", "ordering", "redundancy_penalty"):
        assert key in subs
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def test_investigation_redundancy_penalised() -> None:
    """An exact duplicate tool call lowers the redundancy sub-score."""
    scenario = make_mock_scenario("real_bug")
    base = make_mock_trajectory(scenario, outcome="good")
    first = _make_tool_record("read_logs", {"scope": "full"}, step=0)
    repeat = _make_tool_record("read_logs", {"scope": "full"}, step=1)

    def _with_history(history: list) -> object:
        # Rebuild the trajectory around a replacement history.
        episode = base.episode.model_copy(update={"history": history})
        return base.model_copy(update={"episode": episode})

    with_dup = InvestigationReward().score(_with_history([first, repeat]), scenario)
    without_dup = InvestigationReward().score(_with_history([first]), scenario)
    assert (
        with_dup.sub_scores["redundancy_penalty"]
        < without_dup.sub_scores["redundancy_penalty"]
    )
|
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for MinimalEvidenceReward component."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
from ci_triage_env.mock import make_mock_scenario, make_mock_trajectory
|
| 8 |
+
from ci_triage_env.rewards.minimal_evidence import MinimalEvidenceReward
|
| 9 |
+
from ci_triage_env.schemas.action import TerminalAction, ToolCall
|
| 10 |
+
from ci_triage_env.schemas.episode import StepRecord
|
| 11 |
+
from ci_triage_env.schemas.observation import BudgetState, Observation
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def _dummy_obs(step: int = 0) -> Observation:
    """Build a minimal non-terminal Observation to use as StepRecord filler."""
    budget = BudgetState(tool_calls_remaining=10, cost_remaining=1.0)
    return Observation(
        episode_id="test",
        step=step,
        failure_summary=None,
        tool_response=None,
        budget_remaining=budget,
        is_terminal=False,
        probe_question=None,
    )
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _make_tool_records(tool_names: list[str]) -> list[StepRecord]:
    """Turn a list of tool names into sequential zero-arg StepRecords."""
    records: list[StepRecord] = []
    for i, name in enumerate(tool_names):
        records.append(
            StepRecord(
                step=i,
                action=ToolCall(tool_name=name, args={}),
                observation=_dummy_obs(i),
                cost_charged=0.001,
            )
        )
    return records
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_minimal_evidence_correct_case_returns_high_score() -> None:
    """A correct diagnosis never produces a negative minimal-evidence bonus."""
    scenario = make_mock_scenario("real_bug")
    traj = make_mock_trajectory(scenario, outcome="good")
    # The component's weight is 0 in v1, but raw must still be non-negative.
    assert MinimalEvidenceReward().score(traj, scenario).raw >= 0.0
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_minimal_evidence_wrong_case_returns_low_score() -> None:
    """A wrong diagnosis earns no evidence bonus at all."""
    scenario = make_mock_scenario("real_bug")
    traj = make_mock_trajectory(scenario, outcome="bad")  # wrong diagnosis
    assert MinimalEvidenceReward().score(traj, scenario).raw == 0.0
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def test_minimal_evidence_handles_no_terminal_action() -> None:
    """No terminal action means no correct diagnosis, hence no bonus."""
    scenario = make_mock_scenario("real_bug")
    base = make_mock_trajectory(scenario, outcome="good")
    episode = base.episode.model_copy(update={"final_action": None})
    stripped = base.model_copy(update={"episode": episode})
    assert MinimalEvidenceReward().score(stripped, scenario).raw == 0.0
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def test_minimal_evidence_deterministic() -> None:
    """Two scorings of the same trajectory agree exactly."""
    scenario = make_mock_scenario("race_flake")
    traj = make_mock_trajectory(scenario, outcome="good")
    component = MinimalEvidenceReward()
    assert component.score(traj, scenario).raw == component.score(traj, scenario).raw
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def test_minimal_evidence_score_is_in_documented_range() -> None:
    """raw stays within the documented [-0.5, 1.0] band for every case."""
    for family in ("real_bug", "race_flake", "timing_flake"):
        scenario = make_mock_scenario(family)
        for outcome in ("good", "bad"):
            traj = make_mock_trajectory(scenario, outcome=outcome)
            assert -0.5 <= MinimalEvidenceReward().score(traj, scenario).raw <= 1.0
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def test_minimal_evidence_subscores_are_meaningful() -> None:
    """Sub-scores are either empty (no min set) or carry both expected keys."""
    scenario = make_mock_scenario("real_bug")
    traj = make_mock_trajectory(scenario, outcome="good")
    subs = MinimalEvidenceReward().score(traj, scenario).sub_scores
    if subs:
        assert "min_set_used" in subs
        assert "extras" in subs
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def test_using_only_min_set_max_bonus() -> None:
    """Correct diagnosis via exactly the minimal evidence set earns the full bonus."""
    scenario = make_mock_scenario("real_bug")
    min_set = scenario.minimal_evidence_set
    assert min_set  # this scenario must define a minimal set

    base = make_mock_trajectory(scenario, outcome="good")
    # History containing ONLY the minimal-evidence tools, ending in a correct
    # submit_diagnosis at full confidence.
    terminal = TerminalAction(
        action_type="submit_diagnosis",
        diagnosis=scenario.ground_truth.label,
        confidence=1.0,
        secondary_actions=[],
    )
    episode = base.episode.model_copy(
        update={"history": _make_tool_records(min_set), "final_action": terminal}
    )
    patched = base.model_copy(update={"episode": episode})
    # extras == 0 → bonus = 1.0 - 0.1 * 0 = 1.0
    assert MinimalEvidenceReward().score(patched, scenario).raw == pytest.approx(
        1.0, abs=1e-6
    )
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def test_empty_min_set_returns_zero() -> None:
    """An ambiguous scenario with an empty minimal evidence set yields no bonus.

    Uses a real generator scenario (not a mock), since the ambiguous
    generator is the one that legitimately produces an empty min set.
    """
    # Local import: the generator registry is needed only by this test.
    from ci_triage_env.data.generators import GENERATOR_REGISTRY

    # NOTE: the original also re-imported make_mock_trajectory here, but it
    # is already imported at module scope — the redundant import is removed.
    scenario = GENERATOR_REGISTRY["ambiguous"]().generate(seed=42)
    assert scenario.minimal_evidence_set == []
    trace = make_mock_trajectory(scenario, outcome="good")
    score = MinimalEvidenceReward().score(trace, scenario)
    assert score.raw == 0.0
|
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for TimePenaltyReward component."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from ci_triage_env.mock import make_mock_scenario, make_mock_trajectory
|
| 6 |
+
from ci_triage_env.rewards.time_penalty import TimePenaltyReward
|
| 7 |
+
from ci_triage_env.schemas.action import ToolCall
|
| 8 |
+
from ci_triage_env.schemas.episode import StepRecord
|
| 9 |
+
from ci_triage_env.schemas.observation import BudgetState, Observation
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _make_tool_records(n: int) -> list[StepRecord]:
    """Produce n read_logs StepRecords that all share one dummy observation."""
    shared_obs = Observation(
        episode_id="test",
        step=0,
        failure_summary=None,
        tool_response=None,
        budget_remaining=BudgetState(tool_calls_remaining=10, cost_remaining=1.0),
        is_terminal=False,
        probe_question=None,
    )
    return [
        StepRecord(
            step=i,
            action=ToolCall(tool_name="read_logs", args={"scope": "full"}),
            observation=shared_obs,
            cost_charged=0.001,
        )
        for i in range(n)
    ]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_time_penalty_correct_case_returns_high_score() -> None:
    """At or below the reference step count there is no penalty."""
    scenario = make_mock_scenario("real_bug")
    base = make_mock_trajectory(scenario, outcome="good")
    episode = base.episode.model_copy(update={"history": _make_tool_records(4)})
    patched = base.model_copy(update={"episode": episode})
    assert TimePenaltyReward().score(patched, scenario).raw == 0.0
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_time_penalty_wrong_case_returns_low_score() -> None:
    """A very long trajectory is clamped at the -1.0 floor."""
    scenario = make_mock_scenario("real_bug")
    base = make_mock_trajectory(scenario, outcome="good")
    episode = base.episode.model_copy(update={"history": _make_tool_records(56)})
    patched = base.model_copy(update={"episode": episode})
    assert TimePenaltyReward().score(patched, scenario).raw == -1.0
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def test_time_penalty_handles_no_terminal_action() -> None:
    """A trajectory lacking its terminal action still scores within [-1, 0]."""
    scenario = make_mock_scenario("real_bug")
    base = make_mock_trajectory(scenario, outcome="good")
    episode = base.episode.model_copy(update={"final_action": None})
    stripped = base.model_copy(update={"episode": episode})
    assert -1.0 <= TimePenaltyReward().score(stripped, scenario).raw <= 0.0
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def test_time_penalty_deterministic() -> None:
    """Scoring the same trajectory twice yields the same raw value."""
    scenario = make_mock_scenario("race_flake")
    traj = make_mock_trajectory(scenario, outcome="good")
    component = TimePenaltyReward()
    assert component.score(traj, scenario).raw == component.score(traj, scenario).raw
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def test_time_penalty_score_is_in_documented_range() -> None:
    """raw stays within [-1, 0] for trajectories of widely varying length."""
    scenario = make_mock_scenario("real_bug")
    for n in (0, 3, 6, 10, 60):
        base = make_mock_trajectory(scenario, outcome="good")
        episode = base.episode.model_copy(update={"history": _make_tool_records(n)})
        patched = base.model_copy(update={"episode": episode})
        assert -1.0 <= TimePenaltyReward().score(patched, scenario).raw <= 0.0
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_time_penalty_subscores_are_meaningful() -> None:
    """Sub-scores expose the step count and the excess beyond the reference."""
    scenario = make_mock_scenario("real_bug")
    traj = make_mock_trajectory(scenario, outcome="good")
    subs = TimePenaltyReward().score(traj, scenario).sub_scores
    assert "steps" in subs
    assert "excess" in subs
    assert subs["steps"] >= 0
|