# chaosops/scripts/ab_compare.sh
# Author: helloAK96
# Commit e6e88e7: Add A/B comparison Job for trained-policy showdown
#!/usr/bin/env bash
# ChaosOps AI — A/B comparison Job entry-point.
#
# Pulls two LoRA adapters, evaluates each as the `trained` policy across
# the full curriculum, writes a single side-by-side report, and uploads
# everything to the WINNER's model repo.
#
# Required env:
# ADAPTER_A repo id, e.g. helloAK96/chaosops-grpo-lora-p1
# ADAPTER_B repo id, e.g. helloAK96/chaosops-grpo-lora-p2
# EPISODES_PER_TYPE default 5
#
# Output (uploaded to whichever repo wins on summed mean reward):
# ab_report.txt — side-by-side per-tier table
# ab_comparison_curve.png — both trained lines overlaid on baselines
set -euo pipefail
# EPISODES_PER_TYPE is optional (defaults to 5); both adapter repo ids are mandatory
# and abort the Job with a message via ${VAR:?...} if missing.
EPISODES_PER_TYPE="${EPISODES_PER_TYPE:-5}"
ADAPTER_A="${ADAPTER_A:?ADAPTER_A required}"
ADAPTER_B="${ADAPTER_B:?ADAPTER_B required}"
echo "==[chaosops]== installing deps"
pip install --quiet --upgrade pip
# Best-effort CUDA 12.4 torch pin (`|| true`): if the cu124 wheel cannot be
# fetched, keep whatever torch the base image already provides.
pip install --quiet --no-deps "torch==2.4.1+cu124" \
--index-url https://download.pytorch.org/whl/cu124 || true
pip install --quiet \
"transformers>=4.44.0,<4.50.0" \
"peft>=0.12.0,<0.14.0" \
"accelerate>=0.33.0,<0.36.0" \
"huggingface_hub>=0.24.0" \
"pydantic>=2.0.0" \
"matplotlib>=3.7.0" \
"datasets>=2.20.0,<3.0.0" \
"bitsandbytes==0.43.3"
# Make `chaosops` importable: symlink /tmp/chaosops -> /data and put /tmp on
# PYTHONPATH. NOTE(review): assumes the chaosops source tree is mounted at
# /data — confirm against the Job's volume configuration.
ln -sfn /data /tmp/chaosops
export PYTHONPATH="/tmp:${PYTHONPATH:-}"
# Per-adapter working dirs: /workspace/a and /workspace/b.
mkdir -p /workspace/{a,b}
cd /workspace
# Download each adapter and evaluate it as the `trained` policy across the
# full curriculum. Baselines (random/heuristic/oracle) are re-evaluated in
# both passes; the report step reads baselines from run A only.
# Fix: restored loop indentation and the UTF-8 arrow in the progress line
# (both mangled by a copy/paste round-trip).
for tag in a b; do
  case "$tag" in
    a) repo="$ADAPTER_A" ;;
    b) repo="$ADAPTER_B" ;;
  esac
  echo "==[chaosops]== downloading $repo → /workspace/$tag/lora_adapter"
  hf download "$repo" --repo-type model --local-dir "/workspace/$tag/lora_adapter" >/dev/null
  echo "==[chaosops]== evaluating $tag ($repo)"
  python -m chaosops.train.evaluate \
    --policies random heuristic oracle trained \
    --adapter-path "/workspace/$tag/lora_adapter" \
    --episodes-per-type "${EPISODES_PER_TYPE}" \
    --out-dir "/workspace/$tag/eval"
done
echo "==[chaosops]== building A/B report and overlay plot"
ADAPTER_A="$ADAPTER_A" ADAPTER_B="$ADAPTER_B" python - <<'PY'
import json, os
from pathlib import Path
from huggingface_hub import HfApi
import matplotlib
matplotlib.use("Agg")  # headless Job container: select a non-GUI backend before pyplot import
import matplotlib.pyplot as plt

# Adapter repo ids, forwarded from the shell wrapper via the environment.
repo_a = os.environ["ADAPTER_A"]
repo_b = os.environ["ADAPTER_B"]


def load(tag):
    """Read the evaluation summary JSON written for side `tag` ('a' or 'b')."""
    return json.loads(Path(f"/workspace/{tag}/eval/evaluation.json").read_text())


a = load("a")
b = load("b")
def by(agg, policy, tier):
    """Return the first aggregate row matching (policy, tier), or None if absent.

    `agg` is a list of dicts, each carrying at least "policy" and "tier" keys
    (the evaluation.json "aggregates" list).
    Fix: restored the function-body indentation lost in a whitespace-mangled paste.
    """
    return next((x for x in agg if x["policy"] == policy and x["tier"] == tier), None)
tiers = ["easy", "medium", "hard"]
report_lines = [
"ChaosOps AI β€” A/B comparison",
f" A = {repo_a}",
f" B = {repo_b}",
"",
f"{'tier':<8} {'policy':<10} {'A.reward':>10} {'B.reward':>10} Ξ”(B-A)",
"-" * 60,
]
for tier in tiers:
for policy in ["random", "heuristic", "oracle", "trained"]:
ax = by(a["aggregates"], policy, tier)
bx = by(b["aggregates"], policy, tier)
if not ax or not bx:
continue
delta = bx["mean_reward"] - ax["mean_reward"]
report_lines.append(
f"{tier:<8} {policy:<10} {ax['mean_reward']:>+10.1f} {bx['mean_reward']:>+10.1f} {delta:+10.1f}"
)
report = "\n".join(report_lines)
Path("/workspace/ab_report.txt").write_text(report + "\n")
print(report)
# Determine winner by sum of trained mean rewards across tiers
sum_a = sum(by(a["aggregates"], "trained", t)["mean_reward"] for t in tiers if by(a["aggregates"], "trained", t))
sum_b = sum(by(b["aggregates"], "trained", t)["mean_reward"] for t in tiers if by(b["aggregates"], "trained", t))
winner_repo = repo_a if sum_a >= sum_b else repo_b
print(f"\nWINNER (higher summed mean trained reward): {winner_repo} ({max(sum_a, sum_b):+.1f} vs {min(sum_a, sum_b):+.1f})")
# Overlay plot: baseline curves come from run A (they do not depend on the
# adapter); both trained policies are drawn dashed on top.
# Fixes: restored loop/branch indentation and repaired the mojibake em dash
# in the title.
fig, ax = plt.subplots(figsize=(10, 5.5), dpi=160)
color = {"random": "#c0392b", "heuristic": "#2980b9", "oracle": "#27ae60",
         "trained_a": "#8e44ad", "trained_b": "#d35400"}
for policy in ["random", "heuristic", "oracle"]:
    xs, ys = [], []
    for t in tiers:
        m = by(a["aggregates"], policy, t)
        if m:
            xs.append(t)
            ys.append(m["mean_reward"])
    ax.plot(xs, ys, marker="o", label=policy, color=color[policy], linewidth=2.4, markersize=8)
for tag, repo, key in [("A", repo_a, "trained_a"), ("B", repo_b, "trained_b")]:
    src = a if tag == "A" else b
    xs, ys = [], []
    for t in tiers:
        m = by(src["aggregates"], "trained", t)
        if m:
            xs.append(t)
            ys.append(m["mean_reward"])
    ax.plot(xs, ys, marker="s", label=f"trained ({tag}: {repo.split('/')[-1]})",
            color=color[key], linewidth=2.4, markersize=8, linestyle="--")
ax.axhline(0, color="#888", linewidth=0.6)  # zero-reward reference line
ax.set_title("ChaosOps AI — A/B trained-policy comparison vs. baselines", fontsize=13)
ax.set_xlabel("Difficulty tier", fontsize=12)
ax.set_ylabel("Mean cumulative episode reward (per-episode points)", fontsize=12)
ax.grid(True, linestyle=":", alpha=0.4)
ax.legend(loc="lower left", fontsize=10, framealpha=0.95)
fig.tight_layout()
fig.savefig("/workspace/ab_comparison_curve.png")
# Push both artifacts to the winning adapter's model repo, each in its own
# commit with a descriptive message.
api = HfApi()
artifacts = [
    ("/workspace/ab_report.txt", "ab_report.txt", "A/B comparison report"),
    ("/workspace/ab_comparison_curve.png", "ab_comparison_curve.png", "A/B comparison curve"),
]
for local_path, repo_path, message in artifacts:
    api.upload_file(path_or_fileobj=local_path,
                    path_in_repo=repo_path,
                    repo_id=winner_repo, repo_type="model",
                    commit_message=message)
print("uploaded to", winner_repo)
PY
# End of embedded Python: ab_report.txt and ab_comparison_curve.png now exist
# under /workspace and have been uploaded to the winning repo.
echo "==[chaosops]== done"