Spaces:
Sleeping
Sleeping
Prasham.Jain Claude Sonnet 4.6 committed on
Commit ·
e46f00b
1
Parent(s): 93e68bc
feat(training): Phase C6 — ablations, training curves, readme finalization
Browse filesAdds 4-entry ablation matrix (no_diagnosis, no_action_quality, no_investigation,
no_anti_gaming), each running 1000-step GRPO with one reward component zeroed.
Extends run_grpo/TrainingRollout with weights_override to thread ablation weights
into CompositeReward. Adds W&B curve plotting, ablation summary charts,
populate_readme for [FILL] marker replacement, and Colab eval notebook.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
notebooks/eval.ipynb
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "cell-0",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# CI-Triage-Env — Evaluation & Ablation Notebook\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"Colab-runnable notebook for judges to reproduce all results.\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"Steps:\n",
|
| 13 |
+
"1. Install dependencies\n",
|
| 14 |
+
"2. Load trained checkpoint from HF Hub\n",
|
| 15 |
+
"3. Run full 5-baseline evaluation\n",
|
| 16 |
+
"4. Generate all metric plots\n",
|
| 17 |
+
"5. Run reward-layer ablations (optional, GPU, ~5h)\n",
|
| 18 |
+
"6. Populate README with results\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"**Prerequisites**: `HF_TOKEN`, `WANDB_API_KEY` as Colab secrets."
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": null,
|
| 26 |
+
"id": "cell-1",
|
| 27 |
+
"metadata": {},
|
| 28 |
+
"outputs": [],
|
| 29 |
+
"source": [
|
| 30 |
+
"# Cell 1: Install dependencies\n",
|
| 31 |
+
"!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121\n",
|
| 32 |
+
"!pip install -q unsloth trl transformers accelerate peft\n",
|
| 33 |
+
"!pip install -q wandb datasets huggingface_hub openai httpx fastapi uvicorn pydantic jsonschema\n",
|
| 34 |
+
"!pip install -q matplotlib seaborn pandas tabulate\n",
|
| 35 |
+
"!pip install -q -e . # install ci_triage_env package"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"cell_type": "code",
|
| 40 |
+
"execution_count": null,
|
| 41 |
+
"id": "cell-2",
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"outputs": [],
|
| 44 |
+
"source": [
|
| 45 |
+
"# Cell 2: Environment setup\n",
|
| 46 |
+
"import os\n",
|
| 47 |
+
"from google.colab import userdata\n",
|
| 48 |
+
"\n",
|
| 49 |
+
"os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n",
|
| 50 |
+
"os.environ['WANDB_API_KEY'] = userdata.get('WANDB_API_KEY')\n",
|
| 51 |
+
"os.environ['WANDB_PROJECT'] = 'ci-triage-env'\n",
|
| 52 |
+
"\n",
|
| 53 |
+
"# Config — replace with your values\n",
|
| 54 |
+
"HF_DATASET_REPO = 'YOUR_ORG/ci-triage-scenarios'\n",
|
| 55 |
+
"HF_MODEL_REPO = 'YOUR_ORG/ci-triage-trained-qwen3.5-4b'\n",
|
| 56 |
+
"WANDB_RUN_ID = 'YOUR_WANDB_RUN_ID' # e.g. 'entity/ci-triage-env/abc123'"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "code",
|
| 61 |
+
"execution_count": null,
|
| 62 |
+
"id": "cell-3",
|
| 63 |
+
"metadata": {},
|
| 64 |
+
"outputs": [],
|
| 65 |
+
"source": [
|
| 66 |
+
"# Cell 3: Download scenario corpus and trained checkpoint\n",
|
| 67 |
+
"from huggingface_hub import snapshot_download\n",
|
| 68 |
+
"\n",
|
| 69 |
+
"scen_dir = snapshot_download(\n",
|
| 70 |
+
" HF_DATASET_REPO, repo_type='dataset',\n",
|
| 71 |
+
" local_dir='data_artifacts/scenarios'\n",
|
| 72 |
+
")\n",
|
| 73 |
+
"ckpt_dir = snapshot_download(\n",
|
| 74 |
+
" HF_MODEL_REPO, repo_type='model',\n",
|
| 75 |
+
" local_dir='checkpoints/grpo_full'\n",
|
| 76 |
+
")\n",
|
| 77 |
+
"print(f'Scenarios: {scen_dir}')\n",
|
| 78 |
+
"print(f'Checkpoint: {ckpt_dir}')"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "code",
|
| 83 |
+
"execution_count": null,
|
| 84 |
+
"id": "cell-4",
|
| 85 |
+
"metadata": {},
|
| 86 |
+
"outputs": [],
|
| 87 |
+
"source": [
|
| 88 |
+
"# Cell 4: Start env server in background\n",
|
| 89 |
+
"import subprocess, time\n",
|
| 90 |
+
"server_proc = subprocess.Popen(\n",
|
| 91 |
+
" ['python', '-m', 'ci_triage_env.env.server'],\n",
|
| 92 |
+
" stdout=subprocess.PIPE, stderr=subprocess.PIPE\n",
|
| 93 |
+
")\n",
|
| 94 |
+
"time.sleep(4)\n",
|
| 95 |
+
"print('Env server started, PID:', server_proc.pid)"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "code",
|
| 100 |
+
"execution_count": null,
|
| 101 |
+
"id": "cell-5",
|
| 102 |
+
"metadata": {},
|
| 103 |
+
"outputs": [],
|
| 104 |
+
"source": [
|
| 105 |
+
"# Cell 5: Run full 5-baseline evaluation\n",
|
| 106 |
+
"from pathlib import Path\n",
|
| 107 |
+
"import pandas as pd\n",
|
| 108 |
+
"from ci_triage_env.training.eval import Evaluator\n",
|
| 109 |
+
"from ci_triage_env.training.plotting import plot_all_eval_metrics\n",
|
| 110 |
+
"\n",
|
| 111 |
+
"evaluator = Evaluator(\n",
|
| 112 |
+
" eval_set_path='data_artifacts/scenarios/held_out/',\n",
|
| 113 |
+
" trained_checkpoint='checkpoints/grpo_full/',\n",
|
| 114 |
+
")\n",
|
| 115 |
+
"df_eval = evaluator.run_all(seeds=[1, 2, 3])\n",
|
| 116 |
+
"\n",
|
| 117 |
+
"out = Path('data_artifacts/results/')\n",
|
| 118 |
+
"out.mkdir(parents=True, exist_ok=True)\n",
|
| 119 |
+
"df_eval.to_csv(out / 'eval.csv', index=False)\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"print(df_eval.groupby('baseline').agg({\n",
|
| 122 |
+
" 'diagnosis_correct': 'mean',\n",
|
| 123 |
+
" 'total_reward': 'mean',\n",
|
| 124 |
+
" 'tool_call_count': 'mean',\n",
|
| 125 |
+
"}))\n",
|
| 126 |
+
"\n",
|
| 127 |
+
"plot_all_eval_metrics(df_eval, out / 'plots/')\n",
|
| 128 |
+
"print('Plots saved to data_artifacts/results/plots/')"
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"cell_type": "code",
|
| 133 |
+
"execution_count": null,
|
| 134 |
+
"id": "cell-6",
|
| 135 |
+
"metadata": {},
|
| 136 |
+
"outputs": [],
|
| 137 |
+
"source": [
|
| 138 |
+
"# Cell 6: Pull training curves from W&B\n",
|
| 139 |
+
"from ci_triage_env.training.curves import plot_training_curves_from_wandb\n",
|
| 140 |
+
"\n",
|
| 141 |
+
"plot_training_curves_from_wandb(\n",
|
| 142 |
+
" run_id=WANDB_RUN_ID,\n",
|
| 143 |
+
" output_dir=Path('data_artifacts/results/plots/'),\n",
|
| 144 |
+
")\n",
|
| 145 |
+
"print('Training curves saved.')"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"cell_type": "code",
|
| 150 |
+
"execution_count": null,
|
| 151 |
+
"id": "cell-7",
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"outputs": [],
|
| 154 |
+
"source": [
|
| 155 |
+
"# Cell 7: Run reward-layer ablations (~5h on A100; set RUN_ABLATIONS=True to enable)\n",
|
| 156 |
+
"RUN_ABLATIONS = False\n",
|
| 157 |
+
"\n",
|
| 158 |
+
"if RUN_ABLATIONS:\n",
|
| 159 |
+
" from ci_triage_env.training.ablations import ABLATIONS, run_ablation\n",
|
| 160 |
+
" from ci_triage_env.training.curves import plot_ablation_summary\n",
|
| 161 |
+
"\n",
|
| 162 |
+
" abl_results = []\n",
|
| 163 |
+
" for name, overrides in ABLATIONS.items():\n",
|
| 164 |
+
" print(f'=== Ablation: {name} ===')\n",
|
| 165 |
+
" df_abl = run_ablation(name, overrides, total_steps=1000)\n",
|
| 166 |
+
" abl_results.append(df_abl)\n",
|
| 167 |
+
" print(df_abl.groupby('baseline')['diagnosis_correct'].mean())\n",
|
| 168 |
+
"\n",
|
| 169 |
+
" df_full_abl = pd.concat(abl_results, ignore_index=True)\n",
|
| 170 |
+
" df_full_abl.to_csv(out / 'ablations.csv', index=False)\n",
|
| 171 |
+
" plot_ablation_summary(df_full_abl, output_dir=out / 'plots/')\n",
|
| 172 |
+
" print('Ablations saved.')\n",
|
| 173 |
+
"else:\n",
|
| 174 |
+
" print('Ablations skipped (set RUN_ABLATIONS=True to run).')"
|
| 175 |
+
]
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
"cell_type": "code",
|
| 179 |
+
"execution_count": null,
|
| 180 |
+
"id": "cell-8",
|
| 181 |
+
"metadata": {},
|
| 182 |
+
"outputs": [],
|
| 183 |
+
"source": [
|
| 184 |
+
"# Cell 8: Populate README with results\n",
|
| 185 |
+
"from ci_triage_env.training.finalize_readme import populate_readme\n",
|
| 186 |
+
"\n",
|
| 187 |
+
"n = populate_readme(\n",
|
| 188 |
+
" eval_csv=out / 'eval.csv',\n",
|
| 189 |
+
" ablation_csv=out / 'ablations.csv',\n",
|
| 190 |
+
" plots_dir=out / 'plots/',\n",
|
| 191 |
+
")\n",
|
| 192 |
+
"print(f'Replaced {n} markers in README.md')\n",
|
| 193 |
+
"\n",
|
| 194 |
+
"# Check for any remaining unfilled markers\n",
|
| 195 |
+
"import subprocess\n",
|
| 196 |
+
"result = subprocess.run(['grep', '-c', r'\\[FILL', 'README.md'], capture_output=True, text=True)\n",
|
| 197 |
+
"remaining = int(result.stdout.strip() or 0)\n",
|
| 198 |
+
"if remaining:\n",
|
| 199 |
+
" print(f'WARNING: {remaining} unfilled [FILL] marker(s) remain in README.md')\n",
|
| 200 |
+
"else:\n",
|
| 201 |
+
" print('README.md is clean — no unfilled markers.')"
|
| 202 |
+
]
|
| 203 |
+
}
|
| 204 |
+
],
|
| 205 |
+
"metadata": {
|
| 206 |
+
"kernelspec": {
|
| 207 |
+
"display_name": "Python 3",
|
| 208 |
+
"language": "python",
|
| 209 |
+
"name": "python3"
|
| 210 |
+
},
|
| 211 |
+
"language_info": {
|
| 212 |
+
"name": "python",
|
| 213 |
+
"version": "3.11.0"
|
| 214 |
+
}
|
| 215 |
+
},
|
| 216 |
+
"nbformat": 4,
|
| 217 |
+
"nbformat_minor": 5
|
| 218 |
+
}
|
src/ci_triage_env/training/ablations.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reward-layer ablation runs for CI-Triage-Env.
|
| 2 |
+
|
| 3 |
+
Each ablation zeroes one reward component weight, runs 1000-step GRPO from the
|
| 4 |
+
SFT checkpoint, then evaluates against the held-out set. Results are saved to
|
| 5 |
+
data_artifacts/results/ablations.csv.
|
| 6 |
+
|
| 7 |
+
All GPU-heavy imports are lazy; this module is importable without GPU.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from ci_triage_env.rewards.weights import REWARD_WEIGHTS
|
| 13 |
+
from ci_triage_env.training.eval import Evaluator
|
| 14 |
+
from ci_triage_env.training.grpo import run_grpo
|
| 15 |
+
|
| 16 |
+
# Ablation matrix: each entry zeroes exactly one reward-component weight.
# Keys are run names (used as checkpoint-directory suffixes and as the
# "ablation" column in results CSVs); values are merged over REWARD_WEIGHTS
# by run_ablation before training.
ABLATIONS: dict[str, dict[str, float]] = {
    "no_diagnosis": {"diagnosis": 0.0},
    "no_action_quality": {"action_quality": 0.0},
    "no_investigation": {"investigation": 0.0},
    "no_anti_gaming": {"anti_gaming": 0.0},
    # counterfactual ablation deferred to v2 — its weight is already 0.0 in v1.
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def run_ablation(
    name: str,
    weight_overrides: dict[str, float],
    total_steps: int = 1000,
    base_sft_checkpoint: str = "checkpoints/sft/",
    env_client=None,
):
    """Train one ablated GRPO run and evaluate it against the held-out set.

    Args:
        name: Ablation identifier; suffixes the checkpoint directory and tags
            every row of the returned results.
        weight_overrides: Component weights merged over REWARD_WEIGHTS
            (the zeroed components for this ablation).
        total_steps: GRPO step budget for the ablation (default 1000, not 3000).
        base_sft_checkpoint: SFT warm-start checkpoint to train from.
        env_client: Optional env client override (MockEnvClient in tests).

    Returns:
        Eval DataFrame with an extra "ablation" column set to *name*.
    """
    merged_weights = dict(REWARD_WEIGHTS)
    merged_weights.update(weight_overrides)
    ckpt_dir = f"checkpoints/ablation_{name}/"

    run_grpo(
        sft_checkpoint_dir=base_sft_checkpoint,
        output_dir=ckpt_dir,
        total_steps=total_steps,
        weights_override=merged_weights,
        env_client=env_client,
    )

    ev = Evaluator(trained_checkpoint=ckpt_dir, env_client=env_client)
    # Ablations skip the expensive frontier baselines; a single seed keeps the
    # 4-run matrix affordable.
    ev.BASELINES = ["random", "heuristic", "trained"]
    result = ev.run_all(seeds=[1])
    result["ablation"] = name
    return result
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def main(argv=None) -> None:
    """CLI entry point: run the whole ablation matrix and save results/plots."""
    import argparse
    from pathlib import Path

    import pandas as pd

    from ci_triage_env.training.curves import plot_ablation_summary

    ap = argparse.ArgumentParser(description="Run reward-layer ablation matrix")
    ap.add_argument("--steps", type=int, default=1000)
    ap.add_argument("--sft-checkpoint", default="checkpoints/sft/")
    ap.add_argument("--output", default="data_artifacts/results/")
    opts = ap.parse_args(argv)

    frames = []
    for abl_name, abl_overrides in ABLATIONS.items():
        print(f"=== Ablation: {abl_name} ===")
        frame = run_ablation(
            abl_name,
            abl_overrides,
            total_steps=opts.steps,
            base_sft_checkpoint=opts.sft_checkpoint,
        )
        frames.append(frame)
        print(frame.groupby("baseline")["diagnosis_correct"].mean())

    combined = pd.concat(frames, ignore_index=True)
    results_dir = Path(opts.output)
    results_dir.mkdir(parents=True, exist_ok=True)
    combined.to_csv(results_dir / "ablations.csv", index=False)
    print(f"\nAblations saved to {results_dir / 'ablations.csv'}")

    plot_ablation_summary(combined, output_dir=results_dir / "plots/")


if __name__ == "__main__":
    main()
|
src/ci_triage_env/training/curves.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Training curve plots and ablation summary charts for CI-Triage-Env.
|
| 2 |
+
|
| 3 |
+
Module-level try/except lets the module import without matplotlib; tests patch
|
| 4 |
+
`ci_triage_env.training.curves.plt` and `.sns` directly.
|
| 5 |
+
W&B imports are lazy (inside functions) since wandb is Colab-only.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
try:
|
| 13 |
+
import matplotlib.pyplot as plt # type: ignore[import]
|
| 14 |
+
import seaborn as sns # type: ignore[import]
|
| 15 |
+
except ImportError:
|
| 16 |
+
plt = None # type: ignore[assignment]
|
| 17 |
+
sns = None # type: ignore[assignment]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def plot_training_curves_from_wandb(run_id: str, output_dir: Path) -> None:
    """Fetch per-step metrics from a W&B run and write training-curve PNGs.

    Saves up to four figures into *output_dir*: the reward curve, the
    per-component reward breakdown, the KL-divergence curve (when a "kl"
    column exists) and the format-gate pass rate (when that column exists).

    Args:
        run_id: W&B run path, e.g. "<entity>/ci-triage-env/<run_id>".
        output_dir: Directory to write PNG files.

    Raises:
        ImportError: If matplotlib is not installed.
    """
    if plt is None:
        raise ImportError("matplotlib required — install with: pip install matplotlib")

    import wandb  # type: ignore[import]

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    history = wandb.Api().run(run_id).history(samples=10000)
    steps = history["step"]

    # 1. Reward curve — raw trace plus a 20-step rolling mean.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(steps, history["mean_reward"], alpha=0.4, label="raw")
    rolled = history["mean_reward"].rolling(window=20, min_periods=1).mean()
    ax.plot(steps, rolled, label="smoothed (window=20)", linewidth=2)
    ax.set_xlabel("Training step")
    ax.set_ylabel("Mean episode reward")
    ax.set_title("GRPO training: reward over time")
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(output_dir / "training_reward_curve.png", dpi=120)
    plt.close(fig)

    # 2. Per-component reward breakdown (only columns the run actually logged).
    component_keys = [
        "reward/diagnosis", "reward/action_quality", "reward/cost_efficiency",
        "reward/investigation", "reward/anti_gaming",
    ]
    fig, ax = plt.subplots(figsize=(10, 5))
    for key in component_keys:
        if key in history.columns:
            rolled = history[key].rolling(window=20, min_periods=1).mean()
            ax.plot(steps, rolled, label=key.replace("reward/", ""))
    ax.set_xlabel("Training step")
    ax.set_ylabel("Mean component reward (smoothed)")
    ax.set_title("Per-component reward breakdown over training")
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(output_dir / "per_component_curve.png", dpi=120)
    plt.close(fig)

    # 3. KL divergence to the SFT reference policy, if logged.
    if "kl" in history.columns:
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.plot(steps, history["kl"])
        ax.set_xlabel("Training step")
        ax.set_ylabel("KL(policy || ref)")
        ax.set_title("KL divergence to SFT reference")
        ax.grid(alpha=0.3)
        fig.tight_layout()
        fig.savefig(output_dir / "kl_divergence.png", dpi=120)
        plt.close(fig)

    # 4. Format-gate pass rate, if logged (a rate, so fix the y-axis to [0, 1]).
    if "format_gate_pass_rate" in history.columns:
        fig, ax = plt.subplots(figsize=(10, 4))
        ax.plot(steps, history["format_gate_pass_rate"])
        ax.set_ylim(0, 1)
        ax.set_xlabel("Training step")
        ax.set_ylabel("Format gate pass rate")
        ax.set_title("Fraction of episodes passing format gate")
        ax.grid(alpha=0.3)
        fig.tight_layout()
        fig.savefig(output_dir / "format_gate.png", dpi=120)
        plt.close(fig)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def plot_ablation_summary(df, output_dir=None) -> None:
    """Bar charts comparing diagnosis accuracy and total reward across ablations.

    Args:
        df: DataFrame with at least [ablation, diagnosis_correct, total_reward]
            columns (one row per evaluated episode).
        output_dir: Directory to write ablation_summary.png. Defaults to
            data_artifacts/results/plots/.

    Raises:
        ImportError: If matplotlib is not installed.
    """
    if plt is None:
        raise ImportError("matplotlib required — install with: pip install matplotlib")

    if output_dir is None:
        output_dir = Path("data_artifacts/results/plots/")
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Aggregate only the two metrics actually plotted. (A previous
    # action_quality aggregation was computed but never used, and forced
    # callers to supply a column this function doesn't need.)
    summary = df.groupby("ablation").agg(
        diagnosis_acc=("diagnosis_correct", "mean"),
        total_reward=("total_reward", "mean"),
    ).reset_index()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    labels = list(summary["ablation"])

    axes[0].bar(labels, list(summary["diagnosis_acc"]))
    axes[0].set_title("Diagnosis Accuracy by Ablation")
    axes[0].set_ylabel("Diagnosis Accuracy")
    axes[0].set_xlabel("Ablation")
    axes[0].tick_params(axis="x", rotation=30)

    axes[1].bar(labels, list(summary["total_reward"]))
    axes[1].set_title("Mean Total Reward by Ablation")
    axes[1].set_ylabel("Mean Total Reward")
    axes[1].set_xlabel("Ablation")
    axes[1].tick_params(axis="x", rotation=30)

    fig.tight_layout()
    fig.savefig(output_dir / "ablation_summary.png", dpi=120)
    plt.close(fig)
|
src/ci_triage_env/training/finalize_readme.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Populate README's [FILL POST-TRAIN] markers after training is complete.
|
| 2 |
+
|
| 3 |
+
Run after eval.csv, ablations.csv, and plots/ are all present:
|
| 4 |
+
python -c "from ci_triage_env.training.finalize_readme import populate_readme; populate_readme()"
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def populate_readme(
    template_path: Path = Path("README.md"),
    eval_csv: Path = Path("data_artifacts/results/eval.csv"),
    ablation_csv: Path = Path("data_artifacts/results/ablations.csv"),
    plots_dir: Path = Path("data_artifacts/results/plots/"),
) -> int:
    """Fill [FILL …] markers in README.md in-place.

    Args:
        template_path: Path to README.md.
        eval_csv: Path to the master eval CSV from Phase C5.
        ablation_csv: Path to the ablation results CSV from Phase C6.
            NOTE(review): accepted for interface stability but not yet
            consumed here — the ablation-table marker is still TODO.
        plots_dir: Directory containing PNG plots.

    Returns:
        Number of markers replaced.
    """
    import re

    text = template_path.read_text()
    replaced = 0

    # 1. Results table. Heavy deps (pandas + the table generator) are imported
    # lazily so the function still works when no eval results exist yet.
    if eval_csv.exists():
        import pandas as pd

        from ci_triage_env.training.readme_table import generate_results_table

        df_eval = pd.read_csv(eval_csv)
        table_md = generate_results_table(df_eval)
        marker = "[FILL: 5-row × 6-metric table]"
        if marker in text:
            text = text.replace(marker, table_md)
            replaced += 1

    # 2. Embed plot images — replace "[FILL: <stem with spaces>]" markers with
    # markdown image tags. (Previously the replacement was an empty string,
    # which silently deleted the markers instead of embedding the images.)
    if plots_dir.exists():
        for png in sorted(plots_dir.glob("*.png")):
            stem_words = png.stem.replace("_", " ")
            marker = f"[FILL: {stem_words}]"
            rel = png.relative_to(template_path.parent)
            embed = f"![{stem_words}]({rel})"
            if marker in text:
                text = text.replace(marker, embed)
                replaced += 1

    # 3. Warn about any [FILL …] markers still unfilled so the README is not
    # silently published with placeholders.
    generic = re.compile(r"\[FILL[^\]]*\]")
    remaining = generic.findall(text)
    if remaining:
        print(f"WARNING: {len(remaining)} unfilled marker(s) remain: {remaining[:5]}")

    template_path.write_text(text)
    return replaced
|
src/ci_triage_env/training/grpo.py
CHANGED
|
@@ -34,6 +34,7 @@ def run_grpo(
|
|
| 34 |
env_url: str = "http://localhost:8000",
|
| 35 |
scenarios_train_path: str = "data_artifacts/scenarios/train",
|
| 36 |
hyperparams: dict | None = None,
|
|
|
|
| 37 |
) -> str:
|
| 38 |
"""Launch GRPO fine-tuning from an SFT checkpoint.
|
| 39 |
|
|
@@ -46,6 +47,8 @@ def run_grpo(
|
|
| 46 |
env_url: URL of the running env server (used only when env_client is None).
|
| 47 |
scenarios_train_path: Directory containing train split scenario JSON files.
|
| 48 |
hyperparams: Override specific GRPO hyperparameters. Merged over GRPO_HYPERPARAMS.
|
|
|
|
|
|
|
| 49 |
|
| 50 |
Returns:
|
| 51 |
output_dir path as a string.
|
|
@@ -64,6 +67,7 @@ def run_grpo(
|
|
| 64 |
rollout = TrainingRollout(
|
| 65 |
env_client=env_client,
|
| 66 |
scenarios_train=scenario_ids,
|
|
|
|
| 67 |
)
|
| 68 |
|
| 69 |
model, tokenizer = load_model_for_sft(model_name=sft_checkpoint_dir)
|
|
|
|
| 34 |
env_url: str = "http://localhost:8000",
|
| 35 |
scenarios_train_path: str = "data_artifacts/scenarios/train",
|
| 36 |
hyperparams: dict | None = None,
|
| 37 |
+
weights_override: dict | None = None,
|
| 38 |
) -> str:
|
| 39 |
"""Launch GRPO fine-tuning from an SFT checkpoint.
|
| 40 |
|
|
|
|
| 47 |
env_url: URL of the running env server (used only when env_client is None).
|
| 48 |
scenarios_train_path: Directory containing train split scenario JSON files.
|
| 49 |
hyperparams: Override specific GRPO hyperparameters. Merged over GRPO_HYPERPARAMS.
|
| 50 |
+
weights_override: Override reward component weights passed to CompositeReward.
|
| 51 |
+
Used by ablation runs to zero out individual reward terms.
|
| 52 |
|
| 53 |
Returns:
|
| 54 |
output_dir path as a string.
|
|
|
|
| 67 |
rollout = TrainingRollout(
|
| 68 |
env_client=env_client,
|
| 69 |
scenarios_train=scenario_ids,
|
| 70 |
+
weights=weights_override,
|
| 71 |
)
|
| 72 |
|
| 73 |
model, tokenizer = load_model_for_sft(model_name=sft_checkpoint_dir)
|
src/ci_triage_env/training/rollout.py
CHANGED
|
@@ -47,10 +47,12 @@ class TrainingRollout:
|
|
| 47 |
env_client,
|
| 48 |
scenarios_train: list[str] | None = None,
|
| 49 |
max_turns: int = 12,
|
|
|
|
| 50 |
) -> None:
|
| 51 |
self.env = env_client
|
| 52 |
self.scenarios_train: list[str] = scenarios_train or []
|
| 53 |
self.max_turns = max_turns
|
|
|
|
| 54 |
self._quarantine_window: list[str] = []
|
| 55 |
self._tools_listing: str | None = None
|
| 56 |
|
|
@@ -114,9 +116,10 @@ class TrainingRollout:
|
|
| 114 |
|
| 115 |
trace = self.env.get_trace(episode_id)
|
| 116 |
scenario = self._load_scenario(trace, episode_id)
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
| 120 |
|
| 121 |
# Update rolling quarantine window (last 50 secondary actions)
|
| 122 |
if trace.episode.final_action:
|
|
|
|
| 47 |
env_client,
|
| 48 |
scenarios_train: list[str] | None = None,
|
| 49 |
max_turns: int = 12,
|
| 50 |
+
weights: dict | None = None,
|
| 51 |
) -> None:
|
| 52 |
self.env = env_client
|
| 53 |
self.scenarios_train: list[str] = scenarios_train or []
|
| 54 |
self.max_turns = max_turns
|
| 55 |
+
self.weights = weights # None → CompositeReward uses frozen defaults
|
| 56 |
self._quarantine_window: list[str] = []
|
| 57 |
self._tools_listing: str | None = None
|
| 58 |
|
|
|
|
| 116 |
|
| 117 |
trace = self.env.get_trace(episode_id)
|
| 118 |
scenario = self._load_scenario(trace, episode_id)
|
| 119 |
+
reward_kwargs: dict = {"quarantine_window": self._quarantine_window}
|
| 120 |
+
if self.weights is not None:
|
| 121 |
+
reward_kwargs["weights"] = self.weights
|
| 122 |
+
reward = compute_reward(trace, scenario, **reward_kwargs)
|
| 123 |
|
| 124 |
# Update rolling quarantine window (last 50 secondary actions)
|
| 125 |
if trace.episode.final_action:
|
tests/training/test_ablations.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for Phase C6 — ablations, curves, and readme finalization (no GPU)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from unittest.mock import MagicMock, patch
|
| 7 |
+
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
from ci_triage_env.training.ablations import ABLATIONS, run_ablation
|
| 11 |
+
from ci_triage_env.training.finalize_readme import populate_readme
|
| 12 |
+
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
# ABLATIONS dict
|
| 15 |
+
# ---------------------------------------------------------------------------
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_ablations_dict_has_4_entries() -> None:
    """v1 ships exactly four ablations; the counterfactual one is deferred to v2."""
    assert len(ABLATIONS) == 4
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_each_ablation_zeros_exactly_one_weight() -> None:
    """Every ablation entry must zero one — and only one — reward weight."""
    for abl_name, overrides in ABLATIONS.items():
        zero_keys = [key for key, weight in overrides.items() if weight == 0.0]
        assert len(zero_keys) == 1, f"Ablation '{abl_name}' should zero exactly 1 weight, got {zero_keys}"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def test_ablation_names_reference_valid_reward_keys() -> None:
    """Every override key must exist in the canonical REWARD_WEIGHTS mapping."""
    from ci_triage_env.rewards.weights import REWARD_WEIGHTS

    for abl_name, overrides in ABLATIONS.items():
        for reward_key in overrides:
            assert reward_key in REWARD_WEIGHTS, (
                f"Ablation '{abl_name}' references unknown reward key '{reward_key}'"
            )
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# run_ablation smoke (mock run_grpo + Evaluator)
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_run_ablation_smoke() -> None:
    """Mock run_grpo and Evaluator; verify run_ablation returns a DataFrame."""
    # Minimal three-row eval result: one scenario scored by all three baselines.
    eval_rows = {
        "baseline": ["random", "heuristic", "trained"],
        "scenario_id": ["s1", "s1", "s1"],
        "family": ["real_bug"] * 3,
        "difficulty": ["easy"] * 3,
        "seed": [1, 1, 1],
        "total_reward": [0.1, 0.5, 0.7],
        "format_gate": [True] * 3,
        "diagnosis_correct": [False, True, True],
        "predicted_diagnosis": ["ambiguous", "real_bug", "real_bug"],
        "true_diagnosis": ["real_bug"] * 3,
        "action_quality": [0.0, 0.3, 0.5],
        "tool_call_count": [3, 4, 5],
        "total_cost": [0.03, 0.04, 0.05],
        "confidence": [0.5, 0.8, 0.9],
        "is_ambiguous_scenario": [False] * 3,
        "brier_on_ambiguous": [None] * 3,
    }
    stub_evaluator = MagicMock()
    stub_evaluator.run_all.return_value = pd.DataFrame(eval_rows)

    with patch("ci_triage_env.training.ablations.run_grpo") as mock_grpo, \
            patch("ci_triage_env.training.ablations.Evaluator", return_value=stub_evaluator):
        mock_grpo.return_value = "checkpoints/ablation_test/"
        result = run_ablation(
            "no_diagnosis",
            {"diagnosis": 0.0},
            total_steps=10,
        )

    # run_ablation tags every row with the ablation name.
    assert isinstance(result, pd.DataFrame)
    assert "ablation" in result.columns
    assert (result["ablation"] == "no_diagnosis").all()
    # Training was invoked once, with the zeroed weight threaded through.
    mock_grpo.assert_called_once()
    assert mock_grpo.call_args.kwargs["weights_override"]["diagnosis"] == 0.0
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def test_run_ablation_passes_weights_to_grpo() -> None:
    """Confirm the merged weights dict reaches run_grpo."""
    from ci_triage_env.rewards.weights import REWARD_WEIGHTS

    # Evaluator stub returns an empty frame with the full eval schema.
    empty_columns = [
        "baseline", "total_reward", "diagnosis_correct", "scenario_id", "family",
        "difficulty", "seed", "format_gate", "predicted_diagnosis", "true_diagnosis",
        "action_quality", "tool_call_count", "total_cost", "confidence",
        "is_ambiguous_scenario", "brier_on_ambiguous",
    ]
    stub_evaluator = MagicMock()
    stub_evaluator.run_all.return_value = pd.DataFrame({col: [] for col in empty_columns})

    with patch("ci_triage_env.training.ablations.run_grpo") as mock_grpo, \
            patch("ci_triage_env.training.ablations.Evaluator", return_value=stub_evaluator):
        mock_grpo.return_value = "checkpoints/ablation_no_anti_gaming/"
        run_ablation("no_anti_gaming", {"anti_gaming": 0.0}, total_steps=5)

    weights_sent = mock_grpo.call_args.kwargs["weights_override"]
    assert weights_sent["anti_gaming"] == 0.0
    # All other weights preserved from REWARD_WEIGHTS.
    for key, value in REWARD_WEIGHTS.items():
        if key != "anti_gaming":
            assert weights_sent[key] == value
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# ---------------------------------------------------------------------------
|
| 110 |
+
# plot_ablation_summary (matplotlib mocked)
|
| 111 |
+
# ---------------------------------------------------------------------------
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def test_plot_ablation_summary_writes_png(tmp_path: Path) -> None:
    """plot_ablation_summary saves at least one figure named 'ablation_summary*'."""
    summary = pd.DataFrame({
        "ablation": ["no_diagnosis", "no_action_quality", "no_diagnosis", "no_action_quality"],
        "baseline": ["random", "random", "heuristic", "heuristic"],
        "diagnosis_correct": [0.2, 0.4, 0.5, 0.6],
        "total_reward": [0.1, 0.3, 0.4, 0.5],
        "action_quality": [0.0, 0.2, 0.3, 0.4],
    })

    # Capture every savefig destination instead of writing real images.
    saved_paths: list[str] = []
    fig_stub = MagicMock()
    fig_stub.savefig.side_effect = lambda path, **kwargs: saved_paths.append(str(path))
    plt_stub = MagicMock()
    plt_stub.subplots.return_value = (fig_stub, [MagicMock(), MagicMock()])
    sns_stub = MagicMock()

    with patch("ci_triage_env.training.curves.plt", plt_stub), \
            patch("ci_triage_env.training.curves.sns", sns_stub):
        from ci_triage_env.training.curves import plot_ablation_summary
        plot_ablation_summary(summary, output_dir=tmp_path / "plots")

    assert len(saved_paths) >= 1
    assert any("ablation_summary" in path for path in saved_paths)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ---------------------------------------------------------------------------
|
| 145 |
+
# populate_readme
|
| 146 |
+
# ---------------------------------------------------------------------------
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def test_finalize_readme_replaces_table_marker(tmp_path: Path) -> None:
    """The results-table [FILL] marker is swapped for a markdown table."""
    readme = tmp_path / "README.md"
    readme.write_text(
        "# Results\n\n[FILL: 5-row × 6-metric table]\n\nMore text.\n"
    )
    eval_csv = tmp_path / "eval.csv"
    metrics = {
        "baseline": ["random", "heuristic"],
        "diagnosis_correct": [0.3, 0.6],
        "action_quality": [0.1, 0.4],
        "total_cost": [0.05, 0.03],
        "tool_call_count": [4, 4],
        "total_reward": [0.2, 0.5],
    }
    pd.DataFrame(metrics).to_csv(eval_csv, index=False)

    # Ablation CSV and plots directory intentionally absent.
    replaced = populate_readme(
        template_path=readme,
        eval_csv=eval_csv,
        ablation_csv=tmp_path / "ablations.csv",
        plots_dir=tmp_path / "plots",
    )

    rendered = readme.read_text()
    assert "[FILL: 5-row × 6-metric table]" not in rendered
    assert "|" in rendered  # table was inserted
    assert replaced >= 1
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def test_finalize_readme_embeds_plot_images(tmp_path: Path) -> None:
    """A [FILL] plot marker is replaced with a reference to the matching PNG."""
    readme = tmp_path / "README.md"
    readme.write_text("# Plots\n\n[FILL: diagnosis accuracy]\n")

    plots_dir = tmp_path / "plots"
    plots_dir.mkdir()
    # An empty file is enough: populate_readme only needs the path to exist.
    (plots_dir / "diagnosis_accuracy.png").touch()

    replaced = populate_readme(
        template_path=readme,
        eval_csv=tmp_path / "eval.csv",
        ablation_csv=tmp_path / "ablations.csv",
        plots_dir=plots_dir,
    )

    rendered = readme.read_text()
    assert "[FILL: diagnosis accuracy]" not in rendered
    assert "diagnosis_accuracy.png" in rendered
    assert replaced >= 1
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def test_finalize_readme_missing_csv_does_not_crash(tmp_path: Path) -> None:
    """Absent inputs leave a marker-free README untouched and report zero fills."""
    readme = tmp_path / "README.md"
    original_text = "# CI Triage\n\nNo markers here.\n"
    readme.write_text(original_text)

    replaced = populate_readme(
        template_path=readme,
        eval_csv=tmp_path / "nonexistent.csv",
        ablation_csv=tmp_path / "nonexistent2.csv",
        plots_dir=tmp_path / "no_plots",
    )
    assert replaced == 0
    assert readme.read_text() == original_text
|