#!/usr/bin/env bash
# ChaosOps AI — A/B comparison Job entry-point.
#
# Pulls two LoRA adapters, evaluates each as the `trained` policy across
# the full curriculum, writes a single side-by-side report, and uploads
# everything to the WINNER's model repo.
#
# Required env:
#   ADAPTER_A          repo id, e.g. helloAK96/chaosops-grpo-lora-p1
#   ADAPTER_B          repo id, e.g. helloAK96/chaosops-grpo-lora-p2
#   EPISODES_PER_TYPE  default 5
#
# Output (uploaded to whichever repo wins on summed mean reward):
#   ab_report.txt            — side-by-side per-tier table
#   ab_comparison_curve.png  — both trained lines overlaid on baselines
# Fail fast: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

# Episodes evaluated per chaos type; tunable via env, defaults to 5.
EPISODES_PER_TYPE="${EPISODES_PER_TYPE:-5}"

# Both adapter repo ids are mandatory — abort with a clear message otherwise.
: "${ADAPTER_A:?ADAPTER_A required}"
: "${ADAPTER_B:?ADAPTER_B required}"
echo "==[chaosops]== installing deps"
pip install --quiet --upgrade pip

# CUDA torch wheel first; best-effort, since the image may already ship one.
pip install --quiet --no-deps "torch==2.4.1+cu124" \
  --index-url https://download.pytorch.org/whl/cu124 || true

# Remaining pinned runtime dependencies, resolved in a single pass.
runtime_deps=(
  "transformers>=4.44.0,<4.50.0"
  "peft>=0.12.0,<0.14.0"
  "accelerate>=0.33.0,<0.36.0"
  "huggingface_hub>=0.24.0"
  "pydantic>=2.0.0"
  "matplotlib>=3.7.0"
  "datasets>=2.20.0,<3.0.0"
  "bitsandbytes==0.43.3"
)
pip install --quiet "${runtime_deps[@]}"
# Expose the persistent /data volume at the path the package expects, and put
# /tmp on the import path so `chaosops` resolves as a module.
ln -sfn /data /tmp/chaosops
export PYTHONPATH="/tmp:${PYTHONPATH:-}"

# One scratch directory per adapter under /workspace.
mkdir -p /workspace/a /workspace/b
cd /workspace
# Download each LoRA adapter and evaluate it as the `trained` policy across
# the full curriculum. Baselines (random/heuristic/oracle) are re-run for each
# adapter so both evaluation.json files are self-contained.
# Fix: the download log message contained a mojibake 'β' where an arrow belongs.
for tag in a b; do
  case "$tag" in
    a) repo="$ADAPTER_A" ;;
    b) repo="$ADAPTER_B" ;;
  esac

  echo "==[chaosops]== downloading $repo → /workspace/$tag/lora_adapter"
  hf download "$repo" --repo-type model --local-dir "/workspace/$tag/lora_adapter" >/dev/null

  echo "==[chaosops]== evaluating $tag ($repo)"
  python -m chaosops.train.evaluate \
    --policies random heuristic oracle trained \
    --adapter-path "/workspace/$tag/lora_adapter" \
    --episodes-per-type "${EPISODES_PER_TYPE}" \
    --out-dir "/workspace/$tag/eval"
done
echo "==[chaosops]== building A/B report and overlay plot"
# Fixes vs. original: mojibake repaired in runtime strings ('β' → '—',
# 'Ξ(B-A)' → 'Δ(B-A)'); duplicated by() lookups in the winner sums removed.
ADAPTER_A="$ADAPTER_A" ADAPTER_B="$ADAPTER_B" python - <<'PY'
"""Build the A/B report and overlay plot, then upload both to the winner.

Reads /workspace/{a,b}/eval/evaluation.json produced by the evaluation runs,
writes /workspace/ab_report.txt and /workspace/ab_comparison_curve.png, and
uploads both artifacts to whichever adapter repo has the higher summed mean
`trained` reward across tiers (ties go to A).
"""
import json, os
from pathlib import Path
from huggingface_hub import HfApi
import matplotlib
matplotlib.use("Agg")  # headless rendering — no display inside the Job container
import matplotlib.pyplot as plt

repo_a = os.environ["ADAPTER_A"]
repo_b = os.environ["ADAPTER_B"]


def load(tag):
    """Load the evaluation summary JSON for adapter `tag` ('a' or 'b')."""
    return json.loads(Path(f"/workspace/{tag}/eval/evaluation.json").read_text())


a = load("a")
b = load("b")


def by(agg, policy, tier):
    """Return the aggregate row matching (policy, tier), or None if absent."""
    return next((x for x in agg if x["policy"] == policy and x["tier"] == tier), None)


tiers = ["easy", "medium", "hard"]

# ---- Side-by-side per-tier reward table -----------------------------------
report_lines = [
    "ChaosOps AI — A/B comparison",
    f"  A = {repo_a}",
    f"  B = {repo_b}",
    "",
    f"{'tier':<8} {'policy':<10} {'A.reward':>10} {'B.reward':>10} Δ(B-A)",
    "-" * 60,
]
for tier in tiers:
    for policy in ["random", "heuristic", "oracle", "trained"]:
        ax = by(a["aggregates"], policy, tier)
        bx = by(b["aggregates"], policy, tier)
        if not ax or not bx:
            continue  # row missing on one side: skip rather than crash
        delta = bx["mean_reward"] - ax["mean_reward"]
        report_lines.append(
            f"{tier:<8} {policy:<10} {ax['mean_reward']:>+10.1f} {bx['mean_reward']:>+10.1f} {delta:+10.1f}"
        )
report = "\n".join(report_lines)
Path("/workspace/ab_report.txt").write_text(report + "\n")
print(report)


# ---- Winner: higher summed mean trained reward across tiers (ties → A) ----
def trained_sum(results):
    """Sum mean `trained` reward over tiers present in `results`."""
    rows = (by(results["aggregates"], "trained", t) for t in tiers)
    return sum(r["mean_reward"] for r in rows if r)


sum_a = trained_sum(a)
sum_b = trained_sum(b)
winner_repo = repo_a if sum_a >= sum_b else repo_b
print(f"\nWINNER (higher summed mean trained reward): {winner_repo} ({max(sum_a, sum_b):+.1f} vs {min(sum_a, sum_b):+.1f})")

# ---- Overlay plot: baselines (from A's run) plus both trained curves ------
fig, ax = plt.subplots(figsize=(10, 5.5), dpi=160)
palette = {"random": "#c0392b", "heuristic": "#2980b9", "oracle": "#27ae60",
           "trained_a": "#8e44ad", "trained_b": "#d35400"}


def curve(results, policy):
    """Collect the (tier, mean_reward) series available for `policy`."""
    xs, ys = [], []
    for t in tiers:
        m = by(results["aggregates"], policy, t)
        if m:
            xs.append(t)
            ys.append(m["mean_reward"])
    return xs, ys


# Baselines are shared, so plot them once from A's evaluation run.
for policy in ["random", "heuristic", "oracle"]:
    xs, ys = curve(a, policy)
    ax.plot(xs, ys, marker="o", label=policy, color=palette[policy], linewidth=2.4, markersize=8)

for tag, repo, key, src in [("A", repo_a, "trained_a", a), ("B", repo_b, "trained_b", b)]:
    xs, ys = curve(src, "trained")
    ax.plot(xs, ys, marker="s", label=f"trained ({tag}: {repo.split('/')[-1]})",
            color=palette[key], linewidth=2.4, markersize=8, linestyle="--")

ax.axhline(0, color="#888", linewidth=0.6)  # zero-reward reference line
ax.set_title("ChaosOps AI — A/B trained-policy comparison vs. baselines", fontsize=13)
ax.set_xlabel("Difficulty tier", fontsize=12)
ax.set_ylabel("Mean cumulative episode reward (per-episode points)", fontsize=12)
ax.grid(True, linestyle=":", alpha=0.4)
ax.legend(loc="lower left", fontsize=10, framealpha=0.95)
fig.tight_layout()
fig.savefig("/workspace/ab_comparison_curve.png")

# ---- Upload both artifacts to the winner's model repo ---------------------
api = HfApi()
api.upload_file(path_or_fileobj="/workspace/ab_report.txt",
                path_in_repo="ab_report.txt",
                repo_id=winner_repo, repo_type="model",
                commit_message="A/B comparison report")
api.upload_file(path_or_fileobj="/workspace/ab_comparison_curve.png",
                path_in_repo="ab_comparison_curve.png",
                repo_id=winner_repo, repo_type="model",
                commit_message="A/B comparison curve")
print("uploaded to", winner_repo)
PY
# Final marker so Job logs show a clean completion.
printf '%s\n' "==[chaosops]== done"