Spaces:

helloAK96
/

chaosops

Running

helloAK96 Claude Opus 4.7 committed 14 days ago

Commit

e6e88e7

1 Parent(s): 6e35cec

Add A/B comparison Job for trained-policy showdown

scripts/ab_compare.sh evaluates two LoRA repos on the same 540-episode
sweep, builds a side-by-side report and an overlay plot, and uploads
both to whichever repo wins on summed trained mean reward across tiers.
Used to pick between the Phase-1 and Phase-2 LoRAs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (1) hide show

scripts/ab_compare.sh +150 -0

scripts/ab_compare.sh ADDED Viewed

	@@ -0,0 +1,150 @@

+#!/usr/bin/env bash
+# ChaosOps AI — A/B comparison Job entry-point.
+#
+# Pulls two LoRA adapters, evaluates each as the `trained` policy across
+# the full curriculum, writes a single side-by-side report plus an overlay
+# plot, and uploads both files to the WINNER's model repo.
+#
+# Required env (the script aborts if either is unset or empty):
+#   ADAPTER_A         repo id, e.g. helloAK96/chaosops-grpo-lora-p1
+#   ADAPTER_B         repo id, e.g. helloAK96/chaosops-grpo-lora-p2
+#
+# Optional env:
+#   EPISODES_PER_TYPE value passed to the evaluator's --episodes-per-type (default 5)
+#
+# Output (uploaded to whichever repo wins on summed mean trained reward;
+# a tie goes to ADAPTER_A):
+#   ab_report.txt              — side-by-side per-tier table
+#   ab_comparison_curve.png    — both trained lines overlaid on baselines
+#
+# Fail fast: exit on any command error, unset variable, or failed pipeline stage.
+set -euo pipefail
+EPISODES_PER_TYPE="${EPISODES_PER_TYPE:-5}"
+ADAPTER_A="${ADAPTER_A:?ADAPTER_A required}"
+ADAPTER_B="${ADAPTER_B:?ADAPTER_B required}"
+echo "==[chaosops]== installing deps"
+pip install --quiet --upgrade pip
+pip install --quiet --no-deps "torch==2.4.1+cu124" \
+    --index-url https://download.pytorch.org/whl/cu124 || true
+pip install --quiet \
+    "transformers>=4.44.0,<4.50.0" \
+    "peft>=0.12.0,<0.14.0" \
+    "accelerate>=0.33.0,<0.36.0" \
+    "huggingface_hub>=0.24.0" \
+    "pydantic>=2.0.0" \
+    "matplotlib>=3.7.0" \
+    "datasets>=2.20.0,<3.0.0" \
+    "bitsandbytes==0.43.3"
+# Expose /data as /tmp/chaosops and put /tmp on PYTHONPATH so that
+# `python -m chaosops.train.evaluate` below can import the `chaosops` package.
+# NOTE(review): assumes /data holds the chaosops package source — confirm.
+ln -sfn /data /tmp/chaosops
+export PYTHONPATH="/tmp:${PYTHONPATH:-}"
+mkdir -p /workspace/{a,b}
+cd /workspace
+# Download and evaluate each adapter in turn; the evaluator is pointed at
+# /workspace/<tag>/eval as its output directory.
+for tag in a b; do
+    case "$tag" in
+        a) repo="$ADAPTER_A" ;;
+        b) repo="$ADAPTER_B" ;;
+    esac
+    echo "==[chaosops]== downloading $repo → /workspace/$tag/lora_adapter"
+    hf download "$repo" --repo-type model --local-dir "/workspace/$tag/lora_adapter" >/dev/null
+    echo "==[chaosops]== evaluating $tag ($repo)"
+    python -m chaosops.train.evaluate \
+        --policies random heuristic oracle trained \
+        --adapter-path "/workspace/$tag/lora_adapter" \
+        --episodes-per-type "${EPISODES_PER_TYPE}" \
+        --out-dir "/workspace/$tag/eval"
+done
+echo "==[chaosops]== building A/B report and overlay plot"
+# The heredoc delimiter is quoted ('PY'), so the shell expands nothing inside
+# it; the repo ids are handed to Python through the environment instead.
+ADAPTER_A="$ADAPTER_A" ADAPTER_B="$ADAPTER_B" python - <<'PY'
+import json, os
+from pathlib import Path
+from huggingface_hub import HfApi
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+# Repo ids are supplied through the environment by the wrapping shell command.
+repo_a = os.environ["ADAPTER_A"]
+repo_b = os.environ["ADAPTER_B"]
+def load(tag):
+    """Parse and return /workspace/<tag>/eval/evaluation.json for the given tag."""
+    return json.loads(Path(f"/workspace/{tag}/eval/evaluation.json").read_text())
+a = load("a")
+b = load("b")
+def by(agg, policy, tier):
+    """Return the first entry of `agg` matching `policy` and `tier`, or None if there is none."""
+    return next((x for x in agg if x["policy"] == policy and x["tier"] == tier), None)
+# Tiers in display order; the same list drives the report rows, the winner
+# selection and the x-axis of the plot.
+tiers = ["easy", "medium", "hard"]
+# Report header: which repo is A / B, then the column titles and a rule.
+report_lines = [
+    "ChaosOps AI — A/B comparison",
+    f"  A = {repo_a}",
+    f"  B = {repo_b}",
+    "",
+    f"{'tier':<8} {'policy':<10} {'A.reward':>10} {'B.reward':>10}  Δ(B-A)",
+    "-" * 60,
+]
+# One row per (tier, policy) pair with A's and B's mean reward and their difference.
+for tier in tiers:
+    for policy in ["random", "heuristic", "oracle", "trained"]:
+        ax = by(a["aggregates"], policy, tier)
+        bx = by(b["aggregates"], policy, tier)
+        # Skip the row when either side has no (or an empty) entry for this pair.
+        if not ax or not bx:
+            continue
+        delta = bx["mean_reward"] - ax["mean_reward"]
+        report_lines.append(
+            f"{tier:<8} {policy:<10} {ax['mean_reward']:>+10.1f} {bx['mean_reward']:>+10.1f}  {delta:+10.1f}"
+        )
+report = "\n".join(report_lines)
+# Persist the report for the upload step and echo it to the job log.
+Path("/workspace/ab_report.txt").write_text(report + "\n")
+print(report)
+# Determine winner by sum of trained mean rewards across tiers
+sum_a = sum(by(a["aggregates"], "trained", t)["mean_reward"] for t in tiers if by(a["aggregates"], "trained", t))
+sum_b = sum(by(b["aggregates"], "trained", t)["mean_reward"] for t in tiers if by(b["aggregates"], "trained", t))
+winner_repo = repo_a if sum_a >= sum_b else repo_b
+print(f"\nWINNER (higher summed mean trained reward): {winner_repo}  ({max(sum_a, sum_b):+.1f} vs {min(sum_a, sum_b):+.1f})")
+# Build overlay plot (baselines from A; trained-A and trained-B both shown)
+# Note: `ax` is rebound here to the matplotlib Axes; in the report loop it
+# held a single aggregate row.
+fig, ax = plt.subplots(figsize=(10, 5.5), dpi=160)
+color = {"random": "#c0392b", "heuristic": "#2980b9", "oracle": "#27ae60",
+         "trained_a": "#8e44ad", "trained_b": "#d35400"}
+# Baseline policies: solid lines, values taken from A's evaluation only.
+for policy in ["random", "heuristic", "oracle"]:
+    xs, ys = [], []
+    for t in tiers:
+        m = by(a["aggregates"], policy, t)
+        # Tiers without an entry are simply left out of the line.
+        if m: xs.append(t); ys.append(m["mean_reward"])
+    ax.plot(xs, ys, marker="o", label=policy, color=color[policy], linewidth=2.4, markersize=8)
+# Trained policies: one dashed line per adapter, labelled with the repo's short name.
+for tag, repo, key in [("A", repo_a, "trained_a"), ("B", repo_b, "trained_b")]:
+    src = a if tag == "A" else b
+    xs, ys = [], []
+    for t in tiers:
+        m = by(src["aggregates"], "trained", t)
+        if m: xs.append(t); ys.append(m["mean_reward"])
+    ax.plot(xs, ys, marker="s", label=f"trained ({tag}: {repo.split('/')[-1]})",
+            color=color[key], linewidth=2.4, markersize=8, linestyle="--")
+# Thin horizontal reference line at zero reward.
+ax.axhline(0, color="#888", linewidth=0.6)
+ax.set_title("ChaosOps AI — A/B trained-policy comparison vs. baselines", fontsize=13)
+ax.set_xlabel("Difficulty tier", fontsize=12)
+ax.set_ylabel("Mean cumulative episode reward (per-episode points)", fontsize=12)
+ax.grid(True, linestyle=":", alpha=0.4)
+ax.legend(loc="lower left", fontsize=10, framealpha=0.95)
+fig.tight_layout()
+fig.savefig("/workspace/ab_comparison_curve.png")
+# Upload to WINNER repo
+# Each file is pushed as its own commit to the winner's model repo.
+# NOTE(review): no token is passed explicitly; presumably HfApi picks up
+# credentials from the job environment — confirm.
+api = HfApi()
+api.upload_file(path_or_fileobj="/workspace/ab_report.txt",
+                path_in_repo="ab_report.txt",
+                repo_id=winner_repo, repo_type="model",
+                commit_message="A/B comparison report")
+api.upload_file(path_or_fileobj="/workspace/ab_comparison_curve.png",
+                path_in_repo="ab_comparison_curve.png",
+                repo_id=winner_repo, repo_type="model",
+                commit_message="A/B comparison curve")
+print("uploaded to", winner_repo)
+PY
+echo "==[chaosops]== done"