notebook: add belief-accuracy + reward-components plots
The original plot cell only generated training_loss.png and reward_curve.png.
For the meta-RL submission, the most important plot was missing: how the agent's
belief_accuracy reward evolved over training. That curve directly supports
the meta-learning thesis (the agent learns to model the user from observation).
Added two plots:
- plots/reward_components.png: all 4 reward functions overlaid across training
(format_valid, action_legal, env_reward, belief_accuracy) so you can see
which signals were actually providing gradient
- plots/belief_accuracy.png: focused belief reward with rolling mean and
neutral-baseline reference line
Plot generation is defensive: it discovers all log_history keys and tries
multiple TRL key conventions (rewards/X/mean, rewards/X, X), since these
vary by TRL version. It prints "Available log keys" so the user can debug
if any series isn't found; the fallback pattern is sketched after this message.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
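
For reference, the key-fallback at the heart of the new cell reduces to the
sketch below. It mirrors the series() helper added in the diff, shown here with
log_history passed explicitly rather than closed over. The key names are the
conventions the cell probes (rewards/X/mean, rewards/X, bare function name),
not a guaranteed TRL API; adjust them to whatever "Available log keys" prints.

    def series(log_history, *keys):
        """Return (steps, values, matched_key) for the first key found in any entry."""
        for k in keys:
            steps, vals = [], []
            for entry in log_history:
                if k in entry:
                    steps.append(entry.get("step", len(steps)))  # fall back to index
                    vals.append(entry[k])
            if vals:
                return steps, vals, k  # first convention that matched wins
        return [], [], None

    # Probe conventions most-specific-first:
    steps, vals, key = series(trainer.state.log_history,
                              "rewards/belief_accuracy/mean",  # newer TRL
                              "rewards/belief_accuracy",       # older TRL
                              "belief_accuracy_reward")        # bare function name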
@@ -328,77 +328,132 @@
     "metadata": {},
     "outputs": [],
     "source": [
+     "# Generate all training plots from trainer log_history\n",
+     "# Saves: training_loss.png, reward_curve.png, reward_components.png, belief_accuracy.png\n",
+     "import os, json\n",
+     "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
-     "import json\n",
-     "import os\n",
      "\n",
-     "# Extract training logs from trainer\n",
      "log_history = trainer.state.log_history\n",
+     "os.makedirs(\"plots\", exist_ok=True)\n",
      "\n",
-     "…
-     "…
-     "…
-     "reward_stds = []\n",
+     "# Save raw log first (always, even if plotting fails)\n",
+     "with open(\"plots/training_log.json\", \"w\") as f:\n",
+     "    json.dump(log_history, f, indent=2)\n",
      "\n",
+     "# Helper: extract a series across all log entries that have a key\n",
+     "def series(*keys):\n",
+     "    \"\"\"Extract (steps, values, key) for the first matching key across log entries.\"\"\"\n",
+     "    for k in keys:\n",
+     "        steps, vals = [], []\n",
+     "        for entry in log_history:\n",
+     "            if k in entry:\n",
+     "                steps.append(entry.get(\"step\", len(steps)))\n",
+     "                vals.append(entry[k])\n",
+     "        if vals:\n",
+     "            return steps, vals, k\n",
+     "    return [], [], None\n",
+     "\n",
+     "# Discover all log keys to help debug missing plots\n",
+     "all_keys = set()\n",
      "for entry in log_history:\n",
-     "…
-     "…
-     "        losses.append(entry[\"loss\"])\n",
-     "    if \"reward\" in entry:\n",
-     "        rewards.append(entry[\"reward\"])\n",
-     "    if \"reward_std\" in entry:\n",
-     "        reward_stds.append(entry[\"reward_std\"])\n",
-     "\n",
-     "# Also try rewards/mean key used by some TRL versions\n",
-     "if not rewards:\n",
-     "    for entry in log_history:\n",
-     "        if \"rewards/mean\" in entry:\n",
-     "            rewards.append(entry[\"rewards/mean\"])\n",
-     "        if \"rewards/std\" in entry:\n",
-     "            reward_stds.append(entry[\"rewards/std\"])\n",
-     "\n",
-     "os.makedirs(\"plots\", exist_ok=True)\n",
+     "    all_keys.update(entry.keys())\n",
+     "print(f\"Available log keys: {sorted(all_keys)}\")\n",
      "\n",
      "# --- Plot 1: Training Loss ---\n",
-     "…
-     "…
-     "ax.…
-     "ax.…
-     "ax.…
-     "ax.…
-     "…
-     "…
-     "plt.…
-     "…
+     "steps, losses, _ = series(\"loss\", \"train/loss\")\n",
+     "if losses:\n",
+     "    fig, ax = plt.subplots(figsize=(10, 5))\n",
+     "    ax.plot(steps, losses, color=\"#2563eb\", linewidth=1.5, alpha=0.8)\n",
+     "    ax.set_xlabel(\"Training Step\")\n",
+     "    ax.set_ylabel(\"Loss\")\n",
+     "    ax.set_title(\"GRPO Training Loss \u2014 RhythmEnv Meta-RL\")\n",
+     "    ax.grid(True, alpha=0.3)\n",
+     "    plt.tight_layout()\n",
+     "    plt.savefig(\"plots/training_loss.png\", dpi=150)\n",
+     "    plt.show()\n",
+     "    print(f\"Saved: plots/training_loss.png ({len(losses)} points)\")\n",
      "\n",
-     "# --- Plot 2: Mean Reward ---\n",
-     "…
+     "# --- Plot 2: Mean Reward (overall) ---\n",
+     "rsteps, rvals, rkey = series(\"reward\", \"rewards/mean\", \"rewards/total/mean\")\n",
+     "ssteps, svals, _ = series(\"reward_std\", \"rewards/std\", \"rewards/total/std\")\n",
+     "if rvals:\n",
      "    fig, ax = plt.subplots(figsize=(10, 5))\n",
-     "…
-     "…
-     "…
-     "…
-     "…
-     "…
-     "…
-     "    ax.set_xlabel(\"Training Step\", fontsize=12)\n",
-     "    ax.set_ylabel(\"Mean Reward\", fontsize=12)\n",
-     "    ax.set_title(\"GRPO Mean Reward \u2014 RhythmEnv Life Simulator\", fontsize=14)\n",
+     "    ax.plot(rsteps, rvals, color=\"#16a34a\", linewidth=1.5, label=f\"Mean Reward ({rkey})\")\n",
+     "    if svals and len(svals) == len(rvals):\n",
+     "        r, s = np.array(rvals), np.array(svals)\n",
+     "        ax.fill_between(rsteps, r - s, r + s, color=\"#16a34a\", alpha=0.15, label=\"\u00b11 std\")\n",
+     "    ax.set_xlabel(\"Training Step\")\n",
+     "    ax.set_ylabel(\"Mean Total Reward\")\n",
+     "    ax.set_title(\"GRPO Mean Reward over Training \u2014 RhythmEnv Meta-RL\")\n",
      "    ax.legend()\n",
      "    ax.grid(True, alpha=0.3)\n",
      "    plt.tight_layout()\n",
      "    plt.savefig(\"plots/reward_curve.png\", dpi=150)\n",
      "    plt.show()\n",
-     "    print(\"Saved: plots/reward_curve.png\")\n",
+     "    print(f\"Saved: plots/reward_curve.png ({len(rvals)} points)\")\n",
+     "\n",
+     "# --- Plot 3: Per-Reward-Function Components (the 4-layer reward stack) ---\n",
+     "# TRL logs these as rewards/<func_name>/mean in newer versions.\n",
+     "components = [\n",
+     "    (\"format_valid\", [\"rewards/format_valid/mean\", \"rewards/format_valid\", \"format_valid_reward\"]),\n",
+     "    (\"action_legal\", [\"rewards/action_legal/mean\", \"rewards/action_legal\", \"action_legal_reward\"]),\n",
+     "    (\"env_reward\", [\"rewards/env_reward/mean\", \"rewards/env_reward\", \"env_reward_reward\"]),\n",
+     "    (\"belief_accuracy\", [\"rewards/belief_accuracy/mean\", \"rewards/belief_accuracy\", \"belief_accuracy_reward\"]),\n",
+     "]\n",
+     "found = []\n",
+     "for name, keys in components:\n",
+     "    s, v, k = series(*keys)\n",
+     "    if v:\n",
+     "        found.append((name, s, v))\n",
+     "        print(f\"  {name}: matched key '{k}'\")\n",
+     "    else:\n",
+     "        print(f\"  {name}: NOT FOUND (looked for {keys})\")\n",
+     "\n",
+     "if found:\n",
+     "    fig, ax = plt.subplots(figsize=(12, 6))\n",
+     "    colors = {\"format_valid\": \"#94a3b8\", \"action_legal\": \"#60a5fa\", \"env_reward\": \"#22c55e\", \"belief_accuracy\": \"#a855f7\"}\n",
+     "    for name, s, v in found:\n",
+     "        ax.plot(s, v, color=colors.get(name, \"#000\"), linewidth=1.5, alpha=0.85, label=name)\n",
+     "    ax.axhline(0, color=\"k\", linewidth=0.4)\n",
+     "    ax.set_xlabel(\"Training Step\")\n",
+     "    ax.set_ylabel(\"Mean Reward Component\")\n",
+     "    ax.set_title(\"4-Layer Reward Stack over Training (RhythmEnv Meta-RL)\")\n",
+     "    ax.legend(loc=\"best\")\n",
+     "    ax.grid(True, alpha=0.3)\n",
+     "    plt.tight_layout()\n",
+     "    plt.savefig(\"plots/reward_components.png\", dpi=150)\n",
+     "    plt.show()\n",
+     "    print(f\"Saved: plots/reward_components.png ({len(found)} components)\")\n",
+     "\n",
+     "# --- Plot 4: Belief-Accuracy Curve (THE meta-RL signal) ---\n",
+     "bsteps, bvals, bkey = series(\"rewards/belief_accuracy/mean\", \"rewards/belief_accuracy\", \"belief_accuracy_reward\")\n",
+     "if bvals:\n",
+     "    fig, ax = plt.subplots(figsize=(10, 5))\n",
+     "    ax.plot(bsteps, bvals, color=\"#a855f7\", linewidth=2.0, alpha=0.9, label=\"Belief reward\")\n",
+     "    # Smoothed line (rolling mean)\n",
+     "    if len(bvals) > 20:\n",
+     "        win = max(10, len(bvals) // 30)\n",
+     "        kernel = np.ones(win) / win\n",
+     "        smooth = np.convolve(bvals, kernel, mode=\"valid\")\n",
+     "        smooth_x = bsteps[win - 1:]\n",
+     "        ax.plot(smooth_x, smooth, color=\"#7e22ce\", linewidth=2.5, label=f\"Rolling mean ({win}-step)\")\n",
+     "    ax.axhline(0.0, color=\"k\", linewidth=0.5, linestyle=\"--\", alpha=0.5, label=\"neutral belief baseline (0.0)\")\n",
+     "    ax.set_xlabel(\"Training Step\")\n",
+     "    ax.set_ylabel(\"Mean belief_accuracy reward (\u22120.5 to +0.5)\")\n",
+     "    ax.set_title(\"Belief-Accuracy Reward over Training\\nProof the agent learned to model the user\")\n",
+     "    ax.legend(loc=\"best\")\n",
+     "    ax.grid(True, alpha=0.3)\n",
+     "    plt.tight_layout()\n",
+     "    plt.savefig(\"plots/belief_accuracy.png\", dpi=150)\n",
+     "    plt.show()\n",
+     "    print(f\"Saved: plots/belief_accuracy.png ({len(bvals)} points)\")\n",
      "else:\n",
-     "    print(\"…
-     "…
-     "…
+     "    print(\"WARNING: belief_accuracy series not found in log_history.\")\n",
+     "    print(\"  Check the 'Available log keys' line above to find the correct key name.\")\n",
+     "    print(\"  TRL key conventions vary by version; you may need to update the 'series(...)' calls.\")\n",
      "\n",
-     "…
-     "with open(\"plots/training_log.json\", \"w\") as f:\n",
-     "    json.dump(log_history, f, indent=2)\n",
-     "print(\"Saved: plots/training_log.json\")"
+     "print(\"\\nAll plots saved to plots/\")\n"
     ]
    },
    {
|