#!/usr/bin/env bash
# ChaosOps AI — A/B comparison Job entry-point.
#
# Pulls two LoRA adapters, evaluates each as the `trained` policy across
# the full curriculum, writes a single side-by-side report, and uploads
# everything to the WINNER's model repo.
#
# Required env:
#   ADAPTER_A          repo id, e.g. helloAK96/chaosops-grpo-lora-p1
#   ADAPTER_B          repo id, e.g. helloAK96/chaosops-grpo-lora-p2
#   EPISODES_PER_TYPE  default 5
#
# Output (uploaded to whichever repo wins on summed mean reward):
#   ab_report.txt            — side-by-side per-tier table
#   ab_comparison_curve.png  — both trained lines overlaid on baselines

set -euo pipefail

EPISODES_PER_TYPE="${EPISODES_PER_TYPE:-5}"
ADAPTER_A="${ADAPTER_A:?ADAPTER_A required}"
ADAPTER_B="${ADAPTER_B:?ADAPTER_B required}"

echo "==[chaosops]== installing deps"
pip install --quiet --upgrade pip
# Best-effort: the base image may already ship a CUDA build of torch, so a
# failed (re)install here is tolerated — if torch is truly absent, the
# evaluate step below fails loudly anyway.
pip install --quiet --no-deps "torch==2.4.1+cu124" \
  --index-url https://download.pytorch.org/whl/cu124 || true
pip install --quiet \
  "transformers>=4.44.0,<4.50.0" \
  "peft>=0.12.0,<0.14.0" \
  "accelerate>=0.33.0,<0.36.0" \
  "huggingface_hub>=0.24.0" \
  "pydantic>=2.0.0" \
  "matplotlib>=3.7.0" \
  "datasets>=2.20.0,<3.0.0" \
  "bitsandbytes==0.43.3"

# Expose the chaosops package (mounted at /data) as /tmp/chaosops on
# PYTHONPATH so `python -m chaosops.train.evaluate` resolves.
ln -sfn /data /tmp/chaosops
export PYTHONPATH="/tmp:${PYTHONPATH:-}"

# `hf` is the modern CLI entry point; older huggingface_hub releases
# (the floor here is 0.24.0) only ship `huggingface-cli`. Use whichever
# is actually installed.
if command -v hf >/dev/null 2>&1; then
  hf_cli=hf
else
  hf_cli=huggingface-cli
fi

mkdir -p /workspace/{a,b}
cd /workspace

for tag in a b; do
  case "$tag" in
    a) repo="$ADAPTER_A" ;;
    b) repo="$ADAPTER_B" ;;
  esac

  echo "==[chaosops]== downloading $repo → /workspace/$tag/lora_adapter"
  "$hf_cli" download "$repo" --repo-type model \
    --local-dir "/workspace/$tag/lora_adapter" >/dev/null

  echo "==[chaosops]== evaluating $tag ($repo)"
  python -m chaosops.train.evaluate \
    --policies random heuristic oracle trained \
    --adapter-path "/workspace/$tag/lora_adapter" \
    --episodes-per-type "${EPISODES_PER_TYPE}" \
    --out-dir "/workspace/$tag/eval"

  # Fail fast with a clear message instead of a Python traceback later
  # if the evaluator exited 0 but wrote no summary.
  if [[ ! -f "/workspace/$tag/eval/evaluation.json" ]]; then
    echo "==[chaosops]== ERROR: /workspace/$tag/eval/evaluation.json missing" >&2
    exit 1
  fi
done

echo "==[chaosops]== building A/B report and overlay plot"
ADAPTER_A="$ADAPTER_A" ADAPTER_B="$ADAPTER_B" python - <<'PY'
"""Build the side-by-side A/B report and overlay plot, then upload both
artifacts to the winning adapter's model repo."""
import json
import os
from pathlib import Path

from huggingface_hub import HfApi
import matplotlib

matplotlib.use("Agg")  # headless rendering inside the job container
import matplotlib.pyplot as plt

repo_a = os.environ["ADAPTER_A"]
repo_b = os.environ["ADAPTER_B"]


def load(tag):
    """Load the evaluation summary written by chaosops.train.evaluate."""
    return json.loads(Path(f"/workspace/{tag}/eval/evaluation.json").read_text())


a = load("a")
b = load("b")


def by(agg, policy, tier):
    """Return the first aggregate row matching (policy, tier), or None."""
    return next((x for x in agg if x["policy"] == policy and x["tier"] == tier), None)


tiers = ["easy", "medium", "hard"]

report_lines = [
    "ChaosOps AI — A/B comparison",
    f"  A = {repo_a}",
    f"  B = {repo_b}",
    "",
    f"{'tier':<8} {'policy':<10} {'A.reward':>10} {'B.reward':>10} Δ(B-A)",
    "-" * 60,
]
for tier in tiers:
    for policy in ["random", "heuristic", "oracle", "trained"]:
        # NOTE: previously named ax/bx, which shadowed the matplotlib Axes
        # variable used further down; renamed for clarity.
        row_a = by(a["aggregates"], policy, tier)
        row_b = by(b["aggregates"], policy, tier)
        if not row_a or not row_b:
            continue  # policy absent from one run — skip rather than crash
        delta = row_b["mean_reward"] - row_a["mean_reward"]
        report_lines.append(
            f"{tier:<8} {policy:<10} {row_a['mean_reward']:>+10.1f} {row_b['mean_reward']:>+10.1f} {delta:+10.1f}"
        )

report = "\n".join(report_lines)
Path("/workspace/ab_report.txt").write_text(report + "\n")
print(report)

# Determine winner by sum of trained mean rewards across tiers (hoist the
# duplicate by() lookups; ties go to A).
trained_a = [r for r in (by(a["aggregates"], "trained", t) for t in tiers) if r]
trained_b = [r for r in (by(b["aggregates"], "trained", t) for t in tiers) if r]
sum_a = sum(r["mean_reward"] for r in trained_a)
sum_b = sum(r["mean_reward"] for r in trained_b)
winner_repo = repo_a if sum_a >= sum_b else repo_b
print(f"\nWINNER (higher summed mean trained reward): {winner_repo} ({max(sum_a, sum_b):+.1f} vs {min(sum_a, sum_b):+.1f})")

# Build overlay plot (baselines from A; trained-A and trained-B both shown)
fig, ax = plt.subplots(figsize=(10, 5.5), dpi=160)
color = {"random": "#c0392b", "heuristic": "#2980b9", "oracle": "#27ae60",
         "trained_a": "#8e44ad", "trained_b": "#d35400"}
for policy in ["random", "heuristic", "oracle"]:
    xs, ys = [], []
    for t in tiers:
        m = by(a["aggregates"], policy, t)
        if m:
            xs.append(t)
            ys.append(m["mean_reward"])
    ax.plot(xs, ys, marker="o", label=policy, color=color[policy],
            linewidth=2.4, markersize=8)
for tag, repo, key in [("A", repo_a, "trained_a"), ("B", repo_b, "trained_b")]:
    src = a if tag == "A" else b
    xs, ys = [], []
    for t in tiers:
        m = by(src["aggregates"], "trained", t)
        if m:
            xs.append(t)
            ys.append(m["mean_reward"])
    ax.plot(xs, ys, marker="s", label=f"trained ({tag}: {repo.split('/')[-1]})",
            color=color[key], linewidth=2.4, markersize=8, linestyle="--")
ax.axhline(0, color="#888", linewidth=0.6)
ax.set_title("ChaosOps AI — A/B trained-policy comparison vs. baselines", fontsize=13)
ax.set_xlabel("Difficulty tier", fontsize=12)
ax.set_ylabel("Mean cumulative episode reward (per-episode points)", fontsize=12)
ax.grid(True, linestyle=":", alpha=0.4)
ax.legend(loc="lower left", fontsize=10, framealpha=0.95)
fig.tight_layout()
fig.savefig("/workspace/ab_comparison_curve.png")

# Upload both artifacts to the WINNER repo.
api = HfApi()
api.upload_file(path_or_fileobj="/workspace/ab_report.txt",
                path_in_repo="ab_report.txt",
                repo_id=winner_repo, repo_type="model",
                commit_message="A/B comparison report")
api.upload_file(path_or_fileobj="/workspace/ab_comparison_curve.png",
                path_in_repo="ab_comparison_curve.png",
                repo_id=winner_repo, repo_type="model",
                commit_message="A/B comparison curve")
print("uploaded to", winner_repo)
PY

echo "==[chaosops]== done"