# chaosops/scripts/ab_compare.sh
# Author: helloAK96
# Commit e6e88e7: Add A/B comparison Job for trained-policy showdown
#!/usr/bin/env bash
# ChaosOps AI — A/B comparison Job entry-point.
#
# Pulls two LoRA adapters, evaluates each as the `trained` policy across
# the full curriculum, writes a single side-by-side report, and uploads
# everything to the WINNER's model repo.
#
# Required env:
# ADAPTER_A repo id, e.g. helloAK96/chaosops-grpo-lora-p1
# ADAPTER_B repo id, e.g. helloAK96/chaosops-grpo-lora-p2
# EPISODES_PER_TYPE default 5
#
# Output (uploaded to whichever repo wins on summed mean reward):
# ab_report.txt — side-by-side per-tier table
# ab_comparison_curve.png — both trained lines overlaid on baselines
set -euo pipefail
# EPISODES_PER_TYPE is optional (defaults to 5); both adapter repo ids are mandatory
# and abort the Job with a message via ${VAR:?...} if missing.
EPISODES_PER_TYPE="${EPISODES_PER_TYPE:-5}"
ADAPTER_A="${ADAPTER_A:?ADAPTER_A required}"
ADAPTER_B="${ADAPTER_B:?ADAPTER_B required}"
echo "==[chaosops]== installing deps"
pip install --quiet --upgrade pip
# Best-effort CUDA 12.4 torch pin (`|| true`): if the cu124 wheel cannot be
# fetched, keep whatever torch the base image already provides.
pip install --quiet --no-deps "torch==2.4.1+cu124" \
--index-url https://download.pytorch.org/whl/cu124 || true
pip install --quiet \
"transformers>=4.44.0,<4.50.0" \
"peft>=0.12.0,<0.14.0" \
"accelerate>=0.33.0,<0.36.0" \
"huggingface_hub>=0.24.0" \
"pydantic>=2.0.0" \
"matplotlib>=3.7.0" \
"datasets>=2.20.0,<3.0.0" \
"bitsandbytes==0.43.3"
# Make `chaosops` importable: symlink /tmp/chaosops -> /data and put /tmp on
# PYTHONPATH. NOTE(review): assumes the chaosops source tree is mounted at
# /data — confirm against the Job's volume configuration.
ln -sfn /data /tmp/chaosops
export PYTHONPATH="/tmp:${PYTHONPATH:-}"
# Per-adapter working dirs: /workspace/a and /workspace/b.
mkdir -p /workspace/{a,b}
cd /workspace
# Download each adapter and evaluate it as the `trained` policy across the
# full curriculum. Baselines (random/heuristic/oracle) are re-evaluated in
# both passes; the report step reads baselines from run A only.
# Fix: restored loop indentation and the UTF-8 arrow in the progress line
# (both mangled by a copy/paste round-trip).
for tag in a b; do
  case "$tag" in
    a) repo="$ADAPTER_A" ;;
    b) repo="$ADAPTER_B" ;;
  esac
  echo "==[chaosops]== downloading $repo → /workspace/$tag/lora_adapter"
  hf download "$repo" --repo-type model --local-dir "/workspace/$tag/lora_adapter" >/dev/null
  echo "==[chaosops]== evaluating $tag ($repo)"
  python -m chaosops.train.evaluate \
    --policies random heuristic oracle trained \
    --adapter-path "/workspace/$tag/lora_adapter" \
    --episodes-per-type "${EPISODES_PER_TYPE}" \
    --out-dir "/workspace/$tag/eval"
done
echo "==[chaosops]== building A/B report and overlay plot"
ADAPTER_A="$ADAPTER_A" ADAPTER_B="$ADAPTER_B" python - <<'PY'
import json, os
from pathlib import Path
from huggingface_hub import HfApi
import matplotlib
matplotlib.use("Agg")  # headless Job container: select a non-GUI backend before pyplot import
import matplotlib.pyplot as plt

# Adapter repo ids, forwarded from the shell wrapper via the environment.
repo_a = os.environ["ADAPTER_A"]
repo_b = os.environ["ADAPTER_B"]


def load(tag):
    """Read the evaluation summary JSON written for side `tag` ('a' or 'b')."""
    return json.loads(Path(f"/workspace/{tag}/eval/evaluation.json").read_text())


a = load("a")
b = load("b")
def by(agg, policy, tier):
    """Return the first aggregate row matching (policy, tier), or None if absent.

    `agg` is a list of dicts, each carrying at least "policy" and "tier" keys
    (the evaluation.json "aggregates" list).
    Fix: restored the function-body indentation lost in a whitespace-mangled paste.
    """
    return next((x for x in agg if x["policy"] == policy and x["tier"] == tier), None)
tiers = ["easy", "medium", "hard"]
report_lines = [
"ChaosOps AI β€” A/B comparison",
f" A = {repo_a}",
f" B = {repo_b}",
"",
f"{'tier':<8} {'policy':<10} {'A.reward':>10} {'B.reward':>10} Ξ”(B-A)",
"-" * 60,
]
for tier in tiers:
for policy in ["random", "heuristic", "oracle", "trained"]:
ax = by(a["aggregates"], policy, tier)
bx = by(b["aggregates"], policy, tier)
if not ax or not bx:
continue
delta = bx["mean_reward"] - ax["mean_reward"]
report_lines.append(
f"{tier:<8} {policy:<10} {ax['mean_reward']:>+10.1f} {bx['mean_reward']:>+10.1f} {delta:+10.1f}"
)
report = "\n".join(report_lines)
Path("/workspace/ab_report.txt").write_text(report + "\n")
print(report)
# Determine winner by sum of trained mean rewards across tiers
sum_a = sum(by(a["aggregates"], "trained", t)["mean_reward"] for t in tiers if by(a["aggregates"], "trained", t))
sum_b = sum(by(b["aggregates"], "trained", t)["mean_reward"] for t in tiers if by(b["aggregates"], "trained", t))
winner_repo = repo_a if sum_a >= sum_b else repo_b
print(f"\nWINNER (higher summed mean trained reward): {winner_repo} ({max(sum_a, sum_b):+.1f} vs {min(sum_a, sum_b):+.1f})")
# Overlay plot: baseline curves come from run A (they do not depend on the
# adapter); both trained policies are drawn dashed on top.
# Fixes: restored loop/branch indentation and repaired the mojibake em dash
# in the title.
fig, ax = plt.subplots(figsize=(10, 5.5), dpi=160)
color = {"random": "#c0392b", "heuristic": "#2980b9", "oracle": "#27ae60",
         "trained_a": "#8e44ad", "trained_b": "#d35400"}
for policy in ["random", "heuristic", "oracle"]:
    xs, ys = [], []
    for t in tiers:
        m = by(a["aggregates"], policy, t)
        if m:
            xs.append(t)
            ys.append(m["mean_reward"])
    ax.plot(xs, ys, marker="o", label=policy, color=color[policy], linewidth=2.4, markersize=8)
for tag, repo, key in [("A", repo_a, "trained_a"), ("B", repo_b, "trained_b")]:
    src = a if tag == "A" else b
    xs, ys = [], []
    for t in tiers:
        m = by(src["aggregates"], "trained", t)
        if m:
            xs.append(t)
            ys.append(m["mean_reward"])
    ax.plot(xs, ys, marker="s", label=f"trained ({tag}: {repo.split('/')[-1]})",
            color=color[key], linewidth=2.4, markersize=8, linestyle="--")
ax.axhline(0, color="#888", linewidth=0.6)  # zero-reward reference line
ax.set_title("ChaosOps AI — A/B trained-policy comparison vs. baselines", fontsize=13)
ax.set_xlabel("Difficulty tier", fontsize=12)
ax.set_ylabel("Mean cumulative episode reward (per-episode points)", fontsize=12)
ax.grid(True, linestyle=":", alpha=0.4)
ax.legend(loc="lower left", fontsize=10, framealpha=0.95)
fig.tight_layout()
fig.savefig("/workspace/ab_comparison_curve.png")
# Push both artifacts to the winning adapter's model repo, each in its own
# commit with a descriptive message.
api = HfApi()
artifacts = [
    ("/workspace/ab_report.txt", "ab_report.txt", "A/B comparison report"),
    ("/workspace/ab_comparison_curve.png", "ab_comparison_curve.png", "A/B comparison curve"),
]
for local_path, repo_path, message in artifacts:
    api.upload_file(path_or_fileobj=local_path,
                    path_in_repo=repo_path,
                    repo_id=winner_repo, repo_type="model",
                    commit_message=message)
print("uploaded to", winner_repo)
PY
# End of embedded Python: ab_report.txt and ab_comparison_curve.png now exist
# under /workspace and have been uploaded to the winning repo.
echo "==[chaosops]== done"