InosLihka committed on
Commit
73c7ea0
·
1 Parent(s): 7340206

feat: HF Jobs training script + plot generator

Browse files

Adds:
- scripts/train_on_hf.py: PEP 723 self-contained job script.
Submit with: hf jobs uv run --flavor a10g-large --secrets HF_TOKEN scripts/train_on_hf.py
Clones repo, trains, evals, generates plots, uploads to HF Hub.
- scripts/plot_from_log.py: regenerates all 5 plots from saved log_history.json.
Usable both inside the HF Job AND locally to re-plot without re-training.
- training/train.py: now saves trainer.state.log_history to log_history.json
so plots can be generated offline.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

scripts/plot_from_log.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate training plots from a saved trainer.state.log_history JSON.
3
+
4
+ Used by HF Jobs (where there's no notebook to call trainer.state directly)
5
+ and locally to regenerate plots after training without re-running.
6
+
7
+ Usage:
8
+ python scripts/plot_from_log.py --log outputs/.../log_history.json --out plots/
9
+ """
10
+
11
+ import argparse
12
+ import json
13
+ import os
14
+
15
+ import matplotlib.pyplot as plt
16
+ import numpy as np
17
+
18
+
19
def series(log, *keys):
    """Return (steps, values, key) for the first of *keys* found in any log entry.

    Keys are tried in the order given; the first one that matches at least one
    entry wins. Entries without an explicit "step" field fall back to their
    match index so the x-axis still lines up.
    """
    for key in keys:
        matched = [entry for entry in log if key in entry]
        if matched:
            steps = [entry.get("step", i) for i, entry in enumerate(matched)]
            values = [entry[key] for entry in matched]
            return steps, values, key
    # Nothing matched any key: empty series, no key name.
    return [], [], None
30
+
31
+
32
def main():
    """CLI entry point: load a log_history JSON and write every available plot.

    Each plot is skipped independently when its keys are absent from the log,
    so this works on partial logs from interrupted runs.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--log", required=True)
    p.add_argument("--out", default="plots")
    args = p.parse_args()

    with open(args.log) as f:
        log = json.load(f)

    os.makedirs(args.out, exist_ok=True)

    # Discover keys to help debug
    all_keys = set()
    for entry in log:
        all_keys.update(entry.keys())
    print(f"Available log keys: {sorted(all_keys)}")

    _plot_training_loss(log, args.out)
    _plot_mean_reward(log, args.out)
    _plot_reward_components(log, args.out)
    _plot_belief_accuracy(log, args.out)
    _plot_eval_comparison(args.out)


def _plot_training_loss(log, out):
    """Plot 1: raw GRPO training loss per step (skipped when no loss is logged)."""
    steps, losses, _ = series(log, "loss", "train/loss")
    if not losses:
        return
    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(steps, losses, color="#2563eb", linewidth=1.5, alpha=0.8)
    ax.set_xlabel("Training Step")
    ax.set_ylabel("Loss")
    ax.set_title("GRPO Training Loss - RhythmEnv Meta-RL")
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{out}/training_loss.png", dpi=150)
    plt.close()
    print(f"Saved: {out}/training_loss.png ({len(losses)} points)")


def _plot_mean_reward(log, out):
    """Plot 2: mean total reward, with a +/-1 std band when a std series exists."""
    rs, rv, rk = series(log, "reward", "rewards/mean", "rewards/total/mean")
    _, sv, _ = series(log, "reward_std", "rewards/std", "rewards/total/std")
    if not rv:
        return
    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(rs, rv, color="#16a34a", linewidth=1.5, label=f"Mean Reward ({rk})")
    # Only draw the band when the std series aligns 1:1 with the mean series.
    if sv and len(sv) == len(rv):
        r, s = np.array(rv), np.array(sv)
        ax.fill_between(rs, r - s, r + s, color="#16a34a", alpha=0.15, label="+/-1 std")
    ax.set_xlabel("Training Step")
    ax.set_ylabel("Mean Total Reward")
    ax.set_title("GRPO Mean Reward over Training - RhythmEnv Meta-RL")
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{out}/reward_curve.png", dpi=150)
    plt.close()
    print(f"Saved: {out}/reward_curve.png ({len(rv)} points)")


def _plot_reward_components(log, out):
    """Plot 3: each of the 4 reward-stack components that appears in the log."""
    components = [
        ("format_valid", ["rewards/format_valid/mean", "rewards/format_valid", "format_valid_reward"]),
        ("action_legal", ["rewards/action_legal/mean", "rewards/action_legal", "action_legal_reward"]),
        ("env_reward", ["rewards/env_reward/mean", "rewards/env_reward", "env_reward_reward"]),
        ("belief_accuracy", ["rewards/belief_accuracy/mean", "rewards/belief_accuracy", "belief_accuracy_reward"]),
    ]
    found = []
    for name, keys in components:
        s, v, k = series(log, *keys)
        if v:
            found.append((name, s, v))
            print(f" {name}: matched key '{k}'")
        else:
            print(f" {name}: NOT FOUND")

    if not found:
        return
    _, ax = plt.subplots(figsize=(12, 6))
    colors = {"format_valid": "#94a3b8", "action_legal": "#60a5fa", "env_reward": "#22c55e", "belief_accuracy": "#a855f7"}
    for name, s, v in found:
        ax.plot(s, v, color=colors.get(name, "#000"), linewidth=1.5, alpha=0.85, label=name)
    ax.axhline(0, color="k", linewidth=0.4)
    ax.set_xlabel("Training Step")
    ax.set_ylabel("Mean Reward Component")
    ax.set_title("4-Layer Reward Stack over Training (RhythmEnv Meta-RL)")
    ax.legend(loc="best")
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{out}/reward_components.png", dpi=150)
    plt.close()
    print(f"Saved: {out}/reward_components.png ({len(found)} components)")


def _plot_belief_accuracy(log, out):
    """Plot 4: belief-accuracy reward, plus a rolling mean once >20 points exist."""
    bs, bv, _ = series(log, "rewards/belief_accuracy/mean", "rewards/belief_accuracy", "belief_accuracy_reward")
    if not bv:
        return
    _, ax = plt.subplots(figsize=(10, 5))
    ax.plot(bs, bv, color="#a855f7", linewidth=2.0, alpha=0.9, label="Belief reward")
    if len(bv) > 20:
        # Window scales with series length; mode="valid" trims the warm-up edge,
        # so the smoothed curve starts at step bs[win - 1].
        win = max(10, len(bv) // 30)
        kernel = np.ones(win) / win
        smooth = np.convolve(bv, kernel, mode="valid")
        ax.plot(bs[win - 1:], smooth, color="#7e22ce", linewidth=2.5, label=f"Rolling mean ({win}-step)")
    ax.axhline(0.0, color="k", linewidth=0.5, linestyle="--", alpha=0.5, label="neutral baseline")
    ax.set_xlabel("Training Step")
    ax.set_ylabel("Mean belief_accuracy reward (-0.5 to +0.5)")
    ax.set_title("Belief-Accuracy Reward over Training (proof agent learned to model user)")
    ax.legend(loc="best")
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{out}/belief_accuracy.png", dpi=150)
    plt.close()
    print(f"Saved: {out}/belief_accuracy.png ({len(bv)} points)")


def _plot_eval_comparison(out):
    """Plot 5: baseline-vs-trained bar charts; only runs when ./eval_results.json exists.

    NOTE(review): the path is cwd-relative — presumably the caller (train_on_hf.py)
    has already chdir'd into the repo root; verify against caller.
    """
    eval_path = "eval_results.json"
    if not os.path.exists(eval_path):
        return
    with open(eval_path) as f:
        results = json.load(f)
    conditions = ["discrete-3-profiles (legacy)", "continuous-in-distribution", "continuous-OOD (generalization)"]

    def avg(cond, strat, key="final_score"):
        # Mean of `key` over episodes matching (condition, strategy); 0.0 when absent.
        rs = [r[key] for r in results if r["condition"] == cond and r["strategy"] == strat]
        return float(np.mean(rs)) if rs else 0.0

    x = np.arange(len(conditions))
    width = 0.27
    _, axes = plt.subplots(1, 2, figsize=(14, 5))
    rand = [avg(c, "random") for c in conditions]
    heur = [avg(c, "heuristic") for c in conditions]
    trnd = [avg(c, "model") for c in conditions]
    axes[0].bar(x - width, rand, width, label="Random", color="#94a3b8")
    axes[0].bar(x, heur, width, label="Heuristic", color="#60a5fa")
    axes[0].bar(x + width, trnd, width, label="Trained Qwen", color="#22c55e")
    axes[0].set_ylabel("Final score (0-1)")
    axes[0].set_title("Final score by condition")
    axes[0].set_xticks(x)
    axes[0].set_xticklabels([c.split(" ")[0] for c in conditions], fontsize=10)
    axes[0].legend()
    axes[0].grid(axis="y", alpha=0.3)

    rand_a = [avg(c, "random", "adaptation") for c in conditions]
    heur_a = [avg(c, "heuristic", "adaptation") for c in conditions]
    trnd_a = [avg(c, "model", "adaptation") for c in conditions]
    axes[1].bar(x - width, rand_a, width, label="Random", color="#94a3b8")
    axes[1].bar(x, heur_a, width, label="Heuristic", color="#60a5fa")
    axes[1].bar(x + width, trnd_a, width, label="Trained Qwen", color="#22c55e")
    axes[1].set_ylabel("Adaptation (late-half - early-half mean reward)")
    axes[1].set_title("Adaptation: did agent get better mid-episode?")
    axes[1].set_xticks(x)
    axes[1].set_xticklabels([c.split(" ")[0] for c in conditions], fontsize=10)
    axes[1].axhline(0, color="k", linewidth=0.5)
    axes[1].legend()
    axes[1].grid(axis="y", alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{out}/baseline_vs_trained.png", dpi=150)
    plt.close()
    print(f"Saved: {out}/baseline_vs_trained.png")

    # The summary table stays inside this function (i.e. inside the existence
    # guard) so a missing eval_results.json can never raise a NameError on
    # conditions/rand/heur/trnd.
    print()
    print(f"{'Condition':<40} {'Random':>10} {'Heuristic':>10} {'Trained':>10} {'vs Heuristic':>14}")
    print("-" * 90)
    for c, r, h, t in zip(conditions, rand, heur, trnd):
        print(f"{c:<40} {r:>10.3f} {h:>10.3f} {t:>10.3f} {(t - h):>+14.3f}")


if __name__ == "__main__":
    main()
scripts/train_on_hf.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "torch",
5
+ # "transformers==4.56.2",
6
+ # "trl==0.22.2",
7
+ # "datasets",
8
+ # "peft",
9
+ # "accelerate",
10
+ # "bitsandbytes",
11
+ # "unsloth",
12
+ # "openenv-core",
13
+ # "fastapi",
14
+ # "uvicorn",
15
+ # "pydantic",
16
+ # "matplotlib",
17
+ # "huggingface_hub",
18
+ # ]
19
+ # ///
20
+ """
21
+ End-to-end training job for HF Jobs.
22
+
23
+ Submit from local machine with:
24
+ hf jobs uv run --flavor a10g-large --secrets HF_TOKEN scripts/train_on_hf.py
25
+
26
+ What it does (no babysitting required):
27
+ 1. Clone rhythm_env from HF Space (gets latest meta-RL code from main)
28
+ 2. Generate dataset (continuous profiles, hint_fraction=0.15)
29
+ 3. Train Qwen 2.5-3B + LoRA rank 8 via GRPO (1500 steps)
30
+ 4. Run eval on all 3 conditions (discrete, in-dist, OOD)
31
+ 5. Generate all 5 plots from log_history
32
+ 6. Upload trained model + plots + eval JSON to a new HF Hub model repo
33
+
34
+ Override defaults via env vars:
35
+ MAX_STEPS, NUM_EPISODES, LORA_RANK, BETA, MODEL_REPO
36
+
37
+ Estimated cost on a10g-large at $1.50/hr: ~$3 for 1500 steps (~2h).
38
+ """
39
+
40
+ import json
41
+ import os
42
+ import shutil
43
+ import subprocess
44
+ import sys
45
+ from pathlib import Path
46
+
47
# ---------------------------------------------------------------------------
# Config (overridable via env vars)
# ---------------------------------------------------------------------------
# Where to get the code and where to put artifacts on the job machine.
REPO_URL = os.environ.get("REPO_URL", "https://huggingface.co/spaces/InosLihka/rhythm_env")
WORK_DIR = "/tmp/rhythm_env"
OUTPUT_DIR = "/tmp/rhythm_env/outputs/rhythmenv_meta_trained"
PLOTS_DIR = "/tmp/rhythm_env/plots"

# Training hyperparameters; each env var overrides the documented default.
MAX_STEPS = int(os.environ.get("MAX_STEPS", "1500"))
NUM_EPISODES = int(os.environ.get("NUM_EPISODES", "300"))
LORA_RANK = int(os.environ.get("LORA_RANK", "8"))
BETA = float(os.environ.get("BETA", "0.1"))

# Destination Hub repo for the trained model + eval JSON + plots.
MODEL_REPO = os.environ.get("MODEL_REPO", "InosLihka/rhythm-env-meta-trained")
61
+
62
+
63
def run(cmd: list[str], **kw):
    """Run a subprocess, echoing the command first; raises on non-zero exit."""
    shown = " ".join(cmd) if isinstance(cmd, list) else cmd
    print(f"\n>>> {shown}", flush=True)
    subprocess.run(cmd, check=True, **kw)
67
+
68
+
69
def main():
    """End-to-end HF Jobs pipeline: clone -> train -> eval -> plot -> upload."""
    _clone_and_verify()
    _train()
    _evaluate()
    _generate_plots()
    _upload_artifacts()


def _clone_and_verify():
    """Step 1: clone rhythm_env, chdir into it, and confirm meta-RL code exists."""
    if Path(WORK_DIR).exists():
        shutil.rmtree(WORK_DIR)
    run(["git", "clone", REPO_URL, WORK_DIR])
    os.chdir(WORK_DIR)
    sys.path.insert(0, WORK_DIR)
    sys.path.insert(0, os.path.join(WORK_DIR, "training"))

    # Explicit check instead of `assert`: asserts are stripped under `python -O`,
    # and this guard must never be skipped on a paid GPU job.
    dataset_py = Path("training/dataset.py").read_text()
    if "profile_mode" not in dataset_py:
        raise RuntimeError("Cloned repo doesn't have meta-RL code")
    print("OK: meta-RL code present in cloned repo")


def _train():
    """Step 2: GRPO training via training/train.py (env vars override defaults)."""
    run([
        "python", "training/train.py",
        "--max_steps", str(MAX_STEPS),
        "--num_episodes", str(NUM_EPISODES),
        "--lora_rank", str(LORA_RANK),
        "--beta", str(BETA),
        "--output_dir", OUTPUT_DIR,
    ])


def _evaluate():
    """Step 3: eval on all 3 conditions (discrete-3 / in-dist / OOD)."""
    run([
        "python", "training/inference_eval.py",
        "--model_path", OUTPUT_DIR,
        "--num_episodes", "5",
        "--output_file", "eval_results.json",
    ])


def _generate_plots():
    """Step 4: regenerate plots from the saved log_history.json, if present."""
    # parents=True so a missing intermediate directory can't fail the job.
    Path(PLOTS_DIR).mkdir(parents=True, exist_ok=True)
    log_path = os.path.join(OUTPUT_DIR, "log_history.json")
    if Path(log_path).exists():
        run(["python", "scripts/plot_from_log.py", "--log", log_path, "--out", PLOTS_DIR])
    else:
        print(f"WARNING: log_history.json not found at {log_path}")


def _upload_artifacts():
    """Step 5: upload model, eval JSON, and plots to the Hub.

    Skipped entirely (with a warning) when HF_TOKEN is unset, so local dry
    runs still finish cleanly.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        print("WARNING: HF_TOKEN not set, skipping upload")
        print(f"Outputs in: {OUTPUT_DIR}")
        return

    from huggingface_hub import HfApi, login
    login(token=token)
    api = HfApi()
    api.create_repo(MODEL_REPO, exist_ok=True, repo_type="model")

    # Upload trained model + config + log_history
    api.upload_folder(
        folder_path=OUTPUT_DIR,
        repo_id=MODEL_REPO,
        repo_type="model",
        commit_message=f"Trained {MAX_STEPS}-step GRPO meta-RL agent",
    )

    # Upload eval JSON. Guarded so a missing file degrades to a warning
    # instead of losing the already-uploaded model to an exception.
    if Path("eval_results.json").exists():
        api.upload_file(
            path_or_fileobj="eval_results.json",
            path_in_repo="eval_results.json",
            repo_id=MODEL_REPO,
            repo_type="model",
        )
    else:
        print("WARNING: eval_results.json not found, skipping its upload")

    # Upload plots if generated
    if Path(PLOTS_DIR).exists() and any(Path(PLOTS_DIR).iterdir()):
        api.upload_folder(
            folder_path=PLOTS_DIR,
            path_in_repo="plots",
            repo_id=MODEL_REPO,
            repo_type="model",
        )

    print()
    print("=" * 60)
    print("DONE")
    print(f" Trained model: https://huggingface.co/{MODEL_REPO}")
    print(f" Eval JSON: https://huggingface.co/{MODEL_REPO}/blob/main/eval_results.json")
    print(f" Plots: https://huggingface.co/{MODEL_REPO}/tree/main/plots")
    print("=" * 60)


if __name__ == "__main__":
    main()
training/train.py CHANGED
@@ -204,8 +204,14 @@ def main():
204
  with open(config_path, "w") as f:
205
  json.dump(vars(args), f, indent=2)
206
 
 
 
 
 
 
207
  print(f"Model saved to: {args.output_dir}")
208
  print(f"Training config saved to: {config_path}")
 
209
  print("\nNext: run inference_eval.py to compare baseline vs trained")
210
  print(" python training/inference_eval.py --model_path " + args.output_dir)
211
 
 
204
  with open(config_path, "w") as f:
205
  json.dump(vars(args), f, indent=2)
206
 
207
+ # Save log_history for offline plotting (job runs don't have a notebook to inspect trainer.state)
208
+ log_path = os.path.join(args.output_dir, "log_history.json")
209
+ with open(log_path, "w") as f:
210
+ json.dump(trainer.state.log_history, f, indent=2)
211
+
212
  print(f"Model saved to: {args.output_dir}")
213
  print(f"Training config saved to: {config_path}")
214
+ print(f"Log history saved to: {log_path}")
215
  print("\nNext: run inference_eval.py to compare baseline vs trained")
216
  print(" python training/inference_eval.py --model_path " + args.output_dir)
217