muskan singh committed on
Commit
9e29238
·
1 Parent(s): a35bcd0

training notebook

Files changed (1)
  1. training/grpo_orgos.ipynb +452 -336
training/grpo_orgos.ipynb CHANGED
@@ -1,54 +1,39 @@
  {
- "nbformat": 4,
- "nbformat_minor": 5,
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "name": "python",
- "version": "3.10.0"
- },
- "colab": {
- "gpuType": "T4",
- "provenance": []
- },
- "accelerator": "GPU"
- },
  "cells": [
  {
  "cell_type": "markdown",
  "id": "title",
  "metadata": {},
  "source": [
- "# OrgOS GRPO Training Notebook\n",
+ "# OrgOS GRPO Training\n",
  "\n",
  "**Environment:** OrgOS — Multi-App Enterprise RL Environment \n",
  "**Model:** `Qwen/Qwen2.5-3B-Instruct` (4-bit LoRA via Unsloth) \n",
  "**Algorithm:** GRPO (Group Relative Policy Optimization) via HuggingFace TRL \n",
- "**Hardware:** Colab T4 (free tier compatible) \n",
- "\n",
- "## What this notebook does\n",
- "1. Installs dependencies (Unsloth + TRL)\n",
- "2. Loads Qwen2.5-3B-Instruct with 4-bit LoRA\n",
- "3. Collects **baseline rollouts** (untrained model) on Workflows A & C\n",
- "4. Fine-tunes with **GRPOTrainer** using OrgOS dense rewards\n",
- "5. Collects **post-training rollouts** and computes score improvement\n",
- "6. Plots the **before/after reward curve** for the demo\n",
- "\n",
- "**Key training signal:** The schema drift mechanic creates a sharp signal gap —\n",
- "an untrained model uses stale canonical field names (−0.20 per step),\n",
- "while a GRPO-trained model learns to read `schema_hints` first (+reward).\n",
- "This produces a clear, visually compelling before/after improvement."
+ "**Target hardware:** HuggingFace compute (A10G / A100) \n",
+ "\n",
+ "## How this works\n",
+ "\n",
+ "GRPO is an **online** RL algorithm:\n",
+ "1. Each training step takes a batch of **prompts** (observations from the env)\n",
+ "2. The model generates **G candidate actions** per prompt (the group)\n",
+ "3. Each action is sent to the **live OrgOS env** to get a real reward\n",
+ "4. GRPO computes relative advantages within the group (which action did better than average?)\n",
+ "5. The model is updated to favour higher-reward actions\n",
+ "\n",
+ "**Key training signal:** Schema drift creates a sharp reward gap.\n",
+ "Using a stale field name (e.g. `priority` when schema says `severity`) → **−0.20**. \n",
+ "Using the correct drifted name → **+0.10** adaptation bonus. \n",
+ "The model learns to read `schema_hints` before constructing action args."
  ]
  },
  {
  "cell_type": "markdown",
  "id": "sec1",
  "metadata": {},
- "source": ["## 1. Install Dependencies"]
+ "source": [
+ "## 1. Install Dependencies"
+ ]
  },
  {
  "cell_type": "code",
@@ -57,67 +42,46 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "# Install Unsloth (optimised 4-bit LLM training) + TRL (GRPO)\n",
- "!pip install -q unsloth[colab-new] trl>=0.9.0 peft accelerate bitsandbytes\n",
- "!pip install -q fastapi uvicorn httpx openai pydantic\n",
- "!pip install -q matplotlib numpy\n",
- "\n",
- "# Clone / mount the OrgOS repo\n",
- "import os\n",
- "if not os.path.exists('/content/openEnv'):\n",
- " !git clone https://huggingface.co/spaces/YOUR_HF_USERNAME/orgos-openenv /content/openEnv\n",
- " # Alternatively: upload the repo zip and unzip it here\n",
- "\n",
- "os.chdir('/content/openEnv')\n",
- "print('Working directory:', os.getcwd())"
+ "!pip install -q \"unsloth[huggingface]\" \"trl>=0.12.0\" peft accelerate bitsandbytes\n",
+ "!pip install -q fastapi uvicorn httpx openai pydantic python-dotenv\n",
+ "!pip install -q matplotlib numpy datasets"
  ]
  },
  {
  "cell_type": "markdown",
  "id": "sec2",
  "metadata": {},
- "source": ["## 2. Load Model with Unsloth 4-bit LoRA"]
+ "source": [
+ "## 2. Clone the OrgOS Repo"
+ ]
  },
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "load_model",
+ "id": "clone_repo",
  "metadata": {},
  "outputs": [],
  "source": [
- "from unsloth import FastLanguageModel\n",
- "import torch\n",
+ "import os\n",
  "\n",
- "MAX_SEQ_LEN = 2048\n",
- "MODEL_NAME = 'Qwen/Qwen2.5-3B-Instruct'\n",
+ "REPO_URL = \"https://huggingface.co/spaces/tanvibisht/orgos-openenv\"\n",
+ "REPO_DIR = \"/home/user/orgos\"\n",
  "\n",
- "model, tokenizer = FastLanguageModel.from_pretrained(\n",
- " model_name = MODEL_NAME,\n",
- " max_seq_length = MAX_SEQ_LEN,\n",
- " dtype = None, # auto-detect\n",
- " load_in_4bit = True,\n",
- ")\n",
+ "if not os.path.exists(REPO_DIR):\n",
+ " !git clone {REPO_URL} {REPO_DIR}\n",
  "\n",
- "# Add LoRA adapters\n",
- "model = FastLanguageModel.get_peft_model(\n",
- " model,\n",
- " r = 16,\n",
- " target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj',\n",
- " 'gate_proj', 'up_proj', 'down_proj'],\n",
- " lora_alpha = 16,\n",
- " lora_dropout = 0,\n",
- " bias = 'none',\n",
- " use_gradient_checkpointing = 'unsloth',\n",
- " random_state = 42,\n",
- ")\n",
- "print(f'Model loaded — trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}')"
+ "os.chdir(REPO_DIR)\n",
+ "print(\"Working directory:\", os.getcwd())\n",
+ "!ls"
  ]
  },
  {
  "cell_type": "markdown",
  "id": "sec3",
  "metadata": {},
- "source": ["## 3. Start the OrgOS Environment Server (subprocess)"]
+ "source": [
+ "## 3. Start the OrgOS Environment Server"
+ ]
  },
  {
  "cell_type": "code",
@@ -129,203 +93,365 @@
  "import subprocess, time, httpx\n",
  "\n",
  "server_proc = subprocess.Popen(\n",
- " ['python', '-m', 'uvicorn', 'server.app:app', '--host', '0.0.0.0', '--port', '8000'],\n",
- " stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL\n",
+ " [\"python\", \"-m\", \"uvicorn\", \"server.app:app\", \"--host\", \"0.0.0.0\", \"--port\", \"8000\"],\n",
+ " stdout=subprocess.DEVNULL,\n",
+ " stderr=subprocess.DEVNULL,\n",
  ")\n",
- "time.sleep(3)\n",
+ "time.sleep(4)\n",
  "\n",
- "health = httpx.get('http://localhost:8000/health').json()\n",
- "assert health['status'] == 'healthy', f'Server not healthy: {health}'\n",
- "print('OrgOS server running — health:', health)"
+ "health = httpx.get(\"http://localhost:8000/health\").json()\n",
+ "assert health[\"status\"] == \"healthy\", f\"Server not healthy: {health}\"\n",
+ "print(\"OrgOS server running:\", health)"
  ]
  },
  {
  "cell_type": "markdown",
  "id": "sec4",
  "metadata": {},
- "source": ["## 4. Rollout Harness (collect trajectories)"]
+ "source": [
+ "## 4. Load Model with Unsloth 4-bit LoRA"
+ ]
  },
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "rollout_harness",
+ "id": "load_model",
  "metadata": {},
  "outputs": [],
  "source": [
- "import json, re, sys\n",
- "from typing import List, Dict, Tuple\n",
+ "from unsloth import FastLanguageModel\n",
+ "import torch\n",
  "\n",
- "SYSTEM_PROMPT = open('inference.py').read().split('SYSTEM_PROMPT = \\\"\\\"\\\"')[1].split('\\\"\\\"\\\"')[0]\n",
+ "MAX_SEQ_LEN = 2048\n",
+ "MODEL_NAME = \"Qwen/Qwen2.5-3B-Instruct\"\n",
  "\n",
- "def obs_to_text(obs: dict) -> str:\n",
- " \"\"\"Convert observation dict to text for the model.\"\"\"\n",
- " hints = obs.get('schema_hints', {})\n",
- " pending = obs.get('pending_steps', [])\n",
- " return (\n",
- " f\"current_score: {obs['current_score']}\\n\"\n",
- " f\"step_count: {obs['step_count']}\\n\"\n",
- " f\"workflow_id: {obs['workflow_id']}\\n\\n\"\n",
- " f\"=== WORKFLOW GOAL ===\\n{obs['workflow_goal']}\\n\\n\"\n",
- " f\"=== PENDING STEPS ===\\n\" + ('\\n'.join(f'- {s}' for s in pending) or '(done!)') + \"\\n\\n\"\n",
- " f\"=== SCHEMA HINTS ===\\n{json.dumps(hints, indent=2)}\\n\\n\"\n",
- " f\"=== ACTIVE RULES ===\\n{json.dumps(obs.get('active_rules', {}), indent=2)}\\n\\n\"\n",
- " f\"=== LAST MESSAGE ===\\n{obs['message']}\\n\"\n",
- " )\n",
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+ " model_name = MODEL_NAME,\n",
+ " max_seq_length = MAX_SEQ_LEN,\n",
+ " dtype = None,\n",
+ " load_in_4bit = True,\n",
+ ")\n",
  "\n",
- "def generate_action(prompt_messages: List[Dict], max_tokens=256) -> str:\n",
- " \"\"\"Run the model to produce an action JSON string.\"\"\"\n",
- " from transformers import GenerationConfig\n",
- " # Format as chat\n",
- " text = tokenizer.apply_chat_template(\n",
- " prompt_messages, tokenize=False, add_generation_prompt=True\n",
- " )\n",
- " inputs = tokenizer(text, return_tensors='pt').to(model.device)\n",
- " with torch.no_grad():\n",
- " out = model.generate(\n",
- " **inputs,\n",
- " max_new_tokens = max_tokens,\n",
- " temperature = 0.7,\n",
- " do_sample = True,\n",
- " pad_token_id = tokenizer.eos_token_id,\n",
- " )\n",
- " decoded = tokenizer.decode(out[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)\n",
- " return decoded.strip()\n",
- "\n",
- "def run_episode(workflow_id: str, max_steps: int = 15) -> Tuple[List[dict], float]:\n",
- " \"\"\"\n",
- " Run one episode. Returns (trajectory, final_score).\n",
- " trajectory = list of {'messages': [...], 'reward': float}\n",
- " \"\"\"\n",
- " resp = httpx.post('http://localhost:8000/reset', json={'workflow_id': workflow_id})\n",
- " obs = resp.json()['observation']\n",
- " history = []\n",
- " trajectory = []\n",
- " cumulative_reward = 0.0\n",
+ "model = FastLanguageModel.get_peft_model(\n",
+ " model,\n",
+ " r = 16,\n",
+ " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
+ " \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+ " lora_alpha = 16,\n",
+ " lora_dropout = 0,\n",
+ " bias = \"none\",\n",
+ " use_gradient_checkpointing = \"unsloth\",\n",
+ " random_state = 42,\n",
+ ")\n",
+ "trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
+ "print(f\"Model loaded — trainable params: {trainable:,}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "sec5",
+ "metadata": {},
+ "source": [
+ "## 5. Prompt Dataset\n",
  "\n",
- " for step_i in range(max_steps):\n",
- " if obs['done']:\n",
- " break\n",
+ "We collect **first-turn observations** from fresh episode resets as our prompt dataset.\n",
+ "These are the most important turns — they contain `schema_hints`, `active_rules`, and the\n",
+ "full workflow goal. The model must learn to read schema hints and produce a correct first action.\n",
  "\n",
- " obs_text = obs_to_text(obs)\n",
- " history.append({'role': 'user', 'content': obs_text})\n",
+ "During GRPO training, the reward function will reset the env and evaluate each generated action live."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "build_prompts",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "from datasets import Dataset\n",
  "\n",
- " msgs = [{'role': 'system', 'content': SYSTEM_PROMPT}] + history[-10:]\n",
- " action_str = generate_action(msgs)\n",
+ "SYSTEM_PROMPT = \"\"\"\\\n",
+ "You are OrgOS Agent — an enterprise workflow automation agent.\n",
+ "You operate across four SaaS applications: Jira, Zendesk, Salesforce, and Workday.\n",
+ "\n",
+ "Each turn you receive a JSON observation with:\n",
+ " - workflow_goal : the task you must complete\n",
+ " - pending_steps : remaining steps in the workflow\n",
+ " - app_states : current state of each app\n",
+ " - schema_hints : field renames in effect this episode (e.g. {\"jira.priority\": \"severity\"})\n",
+ " - active_rules : current SLA / approval thresholds\n",
+ " - message : feedback from the last action\n",
+ " - current_score : your cumulative score (0.001-0.999)\n",
+ "\n",
+ "Respond ONLY with a valid JSON object — no markdown, no explanation.\n",
+ "\n",
+ "Action format:\n",
+ " {\"app\": \"<app>\", \"operation\": \"<op>\", \"args\": {...}}\n",
+ "\n",
+ "Available apps and key operations:\n",
+ " jira: get_issue, create_issue, update_status, set_priority, assign_owner,\n",
+ " add_label, link_zendesk_ticket, close_issue, list_issues\n",
+ " zendesk: get_ticket, acknowledge_ticket, set_urgency, assign_agent,\n",
+ " escalate_to_jira, resolve_ticket, add_note, list_tickets,\n",
+ " create_agent_profile\n",
+ " salesforce: get_account, list_accounts, update_deal_stage, flag_churn_risk,\n",
+ " assign_account_owner, log_interaction, get_opportunity\n",
+ " workday: get_employee, list_employees, provision_access, log_sla_event,\n",
+ " request_budget_approval, create_onboarding_task, complete_task\n",
+ "\n",
+ "CRITICAL RULES:\n",
+ "1. Read schema_hints FIRST — if \"jira.priority\" -> \"severity\", use \"severity\" not \"priority\" in args.\n",
+ "2. Complete ALL pending_steps in order.\n",
+ "3. Do not repeat a successful action.\n",
+ "4. If an operation fails, read the message carefully and adapt.\n",
+ "5. Use list_* operations to discover record IDs when needed.\n",
+ "6. Stop when pending_steps is empty or done=true.\n",
+ "\"\"\"\n",
  "\n",
- " history.append({'role': 'assistant', 'content': action_str})\n",
  "\n",
- " # Parse action\n",
- " action = None\n",
- " try:\n",
- " action = json.loads(action_str)\n",
- " except:\n",
- " m = re.search(r'\\{.*\\}', action_str, re.DOTALL)\n",
- " if m:\n",
- " try: action = json.loads(m.group())\n",
- " except: pass\n",
+ "def obs_to_text(obs: dict) -> str:\n",
+ " hints = obs.get(\"schema_hints\", {})\n",
+ " pending = obs.get(\"pending_steps\", [])\n",
+ " lines = [\n",
+ " f\"current_score: {obs['current_score']}\",\n",
+ " f\"step_count: {obs['step_count']}\",\n",
+ " f\"workflow_id: {obs['workflow_id']}\",\n",
+ " \"\",\n",
+ " \"=== WORKFLOW GOAL ===\",\n",
+ " obs[\"workflow_goal\"],\n",
+ " \"\",\n",
+ " \"=== PENDING STEPS ===\",\n",
+ " \"\\n\".join(f\" - {s}\" for s in pending) or \" (all steps complete!)\",\n",
+ " \"\",\n",
+ " \"=== SCHEMA HINTS (use these field names) ===\",\n",
+ " json.dumps(hints, indent=2) if hints else \" (no drift — use canonical names)\",\n",
+ " \"\",\n",
+ " \"=== ACTIVE RULES ===\",\n",
+ " json.dumps(obs.get(\"active_rules\", {}), indent=2),\n",
+ " \"\",\n",
+ " \"=== LAST MESSAGE ===\",\n",
+ " obs[\"message\"],\n",
+ " \"\",\n",
+ " \"=== APP STATES ===\",\n",
+ " ]\n",
+ " for app_name, view in obs.get(\"app_states\", {}).items():\n",
+ " lines.append(f\" [{app_name.upper()}]\")\n",
+ " lines.append(f\" {view}\")\n",
+ " lines.append(\"\")\n",
+ " return \"\\n\".join(lines)\n",
+ "\n",
+ "\n",
+ "def build_prompt(obs_text: str) -> str:\n",
+ " \"\"\"Format as a chat prompt with system injected into first user message.\"\"\"\n",
+ " messages = [{\"role\": \"user\", \"content\": SYSTEM_PROMPT + \"\\n\\n---\\n\\n\" + obs_text}]\n",
+ " return tokenizer.apply_chat_template(\n",
+ " messages, tokenize=False, add_generation_prompt=True\n",
+ " )\n",
  "\n",
- " if action is None:\n",
- " cumulative_reward -= 0.05\n",
- " break\n",
  "\n",
- " result = httpx.post('http://localhost:8000/step', json=action).json()\n",
- " obs = result['observation']\n",
- " reward = result['reward']\n",
- " cumulative_reward += reward\n",
+ "# Collect first-turn observations across all 3 workflows, multiple episodes\n",
+ "# Each episode has a different schema version (seed varies) so we get diverse prompts\n",
+ "N_PROMPTS_PER_WORKFLOW = 20\n",
+ "prompt_rows = []\n",
  "\n",
- " # Store step for GRPO\n",
- " trajectory.append({\n",
- " 'messages': msgs + [{'role': 'assistant', 'content': action_str}],\n",
- " 'reward': reward,\n",
+ "print(\"Collecting prompts from env resets...\")\n",
+ "for wf in [\"A\", \"B\", \"C\"]:\n",
+ " for _ in range(N_PROMPTS_PER_WORKFLOW):\n",
+ " result = httpx.post(\"http://localhost:8000/reset\", json={\"workflow_id\": wf}).json()\n",
+ " obs = result[\"observation\"]\n",
+ " obs_text = obs_to_text(obs)\n",
+ " prompt_rows.append({\n",
+ " \"prompt\": build_prompt(obs_text),\n",
+ " \"workflow_id\": wf,\n",
+ " \"obs_text\": obs_text,\n",
  " })\n",
  "\n",
- " if obs['done']:\n",
- " break\n",
- "\n",
- " return trajectory, obs.get('current_score', 0.001)\n",
- "\n",
- "print('Rollout harness ready.')"
+ "prompt_dataset = Dataset.from_list(prompt_rows)\n",
+ "print(f\"Prompt dataset: {len(prompt_dataset)} examples across 3 workflows\")\n",
+ "print(\"Sample prompt (truncated):\\n\", prompt_rows[0][\"prompt\"][:600], \"...\")"
  ]
  },
  {
  "cell_type": "markdown",
- "id": "sec5",
+ "id": "sec6",
  "metadata": {},
- "source": ["## 5. Collect Baseline Rollouts (Pre-Training)"]
+ "source": [
+ "## 6. Reward Function\n",
+ "\n",
+ "Called by GRPOTrainer during training on each batch of generated completions.\n",
+ "For each completion:\n",
+ "1. Parse it as action JSON\n",
+ "2. Reset the env to a fresh episode for the right workflow\n",
+ "3. Send the action via `/step`\n",
+ "4. Return the reward\n",
+ "\n",
+ "This gives the model a live signal from the actual environment."
+ ]
  },
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "baseline_rollouts",
+ "id": "reward_fn",
  "metadata": {},
  "outputs": [],
  "source": [
- "import numpy as np\n",
+ "import re\n",
+ "from typing import List\n",
+ "\n",
+ "ENV_URL = \"http://localhost:8000\"\n",
+ "\n",
+ "\n",
+ "def parse_action(text: str):\n",
+ " \"\"\"Extract JSON action from model output.\"\"\"\n",
+ " text = text.strip()\n",
+ " # Strip markdown code fences if present\n",
+ " text = re.sub(r\"```(?:json)?\\s*\", \"\", text).strip()\n",
+ " try:\n",
+ " return json.loads(text)\n",
+ " except json.JSONDecodeError:\n",
+ " m = re.search(r\"\\{.*\\}\", text, re.DOTALL)\n",
+ " if m:\n",
+ " try:\n",
+ " return json.loads(m.group())\n",
+ " except Exception:\n",
+ " pass\n",
+ " return None\n",
+ "\n",
+ "\n",
+ "def orgos_reward_fn(completions: List[str], prompts: List[str], **kwargs) -> List[float]:\n",
+ " \"\"\"\n",
+ " GRPO reward function — called by GRPOTrainer each training step.\n",
  "\n",
- "N_BASELINE = 30 # 30 episodes pre-training (10 per workflow)\n",
+ " For each generated completion:\n",
+ " - Parse as action JSON\n",
+ " - Reset env to a fresh episode (workflow inferred from prompt)\n",
+ " - Step the env with the action\n",
+ " - Return the step reward\n",
  "\n",
- "baseline_scores = {'A': [], 'B': [], 'C': []}\n",
- "all_trajectories = []\n",
+ " Invalid JSON or failed actions return a -0.1 penalty.\n",
+ " \"\"\"\n",
+ " workflow_ids = kwargs.get(\"workflow_id\", [\"A\"] * len(completions))\n",
+ " rewards = []\n",
  "\n",
- "print('Collecting baseline rollouts...')\n",
- "for wf in ['A', 'B', 'C']:\n",
- " for ep in range(N_BASELINE // 3):\n",
- " traj, score = run_episode(wf)\n",
- " baseline_scores[wf].append(score)\n",
- " all_trajectories.extend(traj)\n",
- " print(f' Workflow {wf} ep {ep+1}: score={score:.4f}', end='\\r')\n",
- " print(f' Workflow {wf}: mean={np.mean(baseline_scores[wf]):.4f} ± {np.std(baseline_scores[wf]):.4f}')\n",
+ " for completion, wf_id in zip(completions, workflow_ids):\n",
+ " action = parse_action(completion)\n",
+ "\n",
+ " if action is None:\n",
+ " rewards.append(-0.1)\n",
+ " continue\n",
  "\n",
- "print(f'\\nTotal baseline episodes: {N_BASELINE}')\n",
- "print(f'Total trajectory steps: {len(all_trajectories)}')\n",
- "print(f'Overall baseline mean: {np.mean([s for v in baseline_scores.values() for s in v]):.4f}')"
+ " try:\n",
+ " # Fresh episode for this action evaluation\n",
+ " httpx.post(f\"{ENV_URL}/reset\", json={\"workflow_id\": wf_id}, timeout=10)\n",
+ " result = httpx.post(f\"{ENV_URL}/step\", json=action, timeout=10).json()\n",
+ " rewards.append(float(result[\"reward\"]))\n",
+ " except Exception:\n",
+ " rewards.append(-0.1)\n",
+ "\n",
+ " return rewards\n",
+ "\n",
+ "\n",
+ "print(\"Reward function defined.\")\n",
+ "print(\"Quick sanity check...\")\n",
+ "test_rewards = orgos_reward_fn(\n",
+ " completions = ['{\"app\": \"zendesk\", \"operation\": \"list_tickets\", \"args\": {\"state\": \"new\"}}',\n",
+ " 'this is not valid json'],\n",
+ " prompts = [\"\", \"\"],\n",
+ " workflow_id = [\"A\", \"A\"],\n",
+ ")\n",
+ "print(f\" Valid action reward: {test_rewards[0]:.4f}\")\n",
+ "print(f\" Invalid action reward: {test_rewards[1]:.4f}\")"
  ]
  },
  {
  "cell_type": "markdown",
- "id": "sec6",
+ "id": "sec7",
  "metadata": {},
- "source": ["## 6. Build GRPO Dataset from Trajectories"]
+ "source": [
+ "## 7. Collect Baseline Scores (Pre-Training)"
+ ]
  },
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "build_dataset",
+ "id": "baseline",
  "metadata": {},
  "outputs": [],
  "source": [
- "from datasets import Dataset\n",
+ "import numpy as np\n",
  "\n",
- "def trajectories_to_dataset(trajectories: List[dict]) -> Dataset:\n",
- " \"\"\"\n",
- " Convert trajectory steps into a GRPO-compatible dataset.\n",
- " Each row = one (prompt, completion, reward) triple.\n",
- " \"\"\"\n",
- " rows = []\n",
- " for step in trajectories:\n",
- " messages = step['messages']\n",
- " reward = step['reward']\n",
- " # Separate prompt (all but last assistant turn) from completion\n",
- " prompt_msgs = messages[:-1]\n",
- " completion = messages[-1]['content']\n",
- " prompt_text = tokenizer.apply_chat_template(\n",
- " prompt_msgs, tokenize=False, add_generation_prompt=True\n",
- " )\n",
- " rows.append({'prompt': prompt_text, 'completion': completion, 'reward': reward})\n",
- " return Dataset.from_list(rows)\n",
- "\n",
- "train_dataset = trajectories_to_dataset(all_trajectories)\n",
- "print(f'Training dataset: {len(train_dataset)} examples')\n",
- "print(f'Reward range: [{min(train_dataset[\"reward\"]):.4f}, {max(train_dataset[\"reward\"]):.4f}]')\n",
- "print(f'Mean reward: {np.mean(train_dataset[\"reward\"]):.4f}')\n",
- "train_dataset[0]"
+ "FastLanguageModel.for_inference(model)\n",
+ "\n",
+ "\n",
+ "def run_episode_with_model(workflow_id: str, max_steps: int = 15) -> float:\n",
+ " \"\"\"Run one full episode with the current model. Returns final score.\"\"\"\n",
+ " result = httpx.post(f\"{ENV_URL}/reset\", json={\"workflow_id\": workflow_id}).json()\n",
+ " obs = result[\"observation\"]\n",
+ " history = []\n",
+ "\n",
+ " for _ in range(max_steps):\n",
+ " if obs[\"done\"]:\n",
+ " break\n",
+ "\n",
+ " obs_text = obs_to_text(obs)\n",
+ " history.append({\"role\": \"user\", \"content\": obs_text})\n",
+ "\n",
+ " # Inject system prompt into first user message\n",
+ " messages = list(history)\n",
+ " messages[0] = {\"role\": \"user\", \"content\": SYSTEM_PROMPT + \"\\n\\n---\\n\\n\" + messages[0][\"content\"]}\n",
+ "\n",
+ " text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
+ " inputs = tokenizer(text, return_tensors=\"pt\").to(model.device)\n",
+ "\n",
+ " with torch.no_grad():\n",
+ " out = model.generate(\n",
+ " **inputs,\n",
+ " max_new_tokens = 256,\n",
+ " # greedy decoding for deterministic eval\n",
+ " do_sample = False,\n",
+ " pad_token_id = tokenizer.eos_token_id,\n",
+ " )\n",
+ " action_str = tokenizer.decode(\n",
+ " out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True\n",
+ " ).strip()\n",
+ "\n",
+ " history.append({\"role\": \"assistant\", \"content\": action_str})\n",
+ "\n",
+ " action = parse_action(action_str)\n",
+ " if action is None:\n",
+ " break\n",
+ "\n",
+ " result = httpx.post(f\"{ENV_URL}/step\", json=action).json()\n",
+ " obs = result[\"observation\"]\n",
+ " if obs[\"done\"]:\n",
+ " break\n",
+ "\n",
+ " return obs.get(\"current_score\", 0.001)\n",
+ "\n",
+ "\n",
+ "N_EVAL = 10 # episodes per workflow for evaluation\n",
+ "baseline_scores = {wf: [] for wf in [\"A\", \"B\", \"C\"]}\n",
+ "\n",
+ "print(\"Collecting pre-training baseline scores...\")\n",
+ "for wf in [\"A\", \"B\", \"C\"]:\n",
+ " for ep in range(N_EVAL):\n",
+ " score = run_episode_with_model(wf)\n",
+ " baseline_scores[wf].append(score)\n",
+ " print(f\" Workflow {wf} ep {ep+1}/{N_EVAL}: score={score:.4f}\", end=\"\\r\")\n",
+ " print(f\" Workflow {wf}: mean={np.mean(baseline_scores[wf]):.4f}\")\n",
+ "\n",
+ "baseline_mean = np.mean([s for v in baseline_scores.values() for s in v])\n",
+ "print(f\"\\nOverall baseline mean: {baseline_mean:.4f}\")"
  ]
  },
  {
  "cell_type": "markdown",
- "id": "sec7",
+ "id": "sec8",
  "metadata": {},
- "source": ["## 7. GRPO Training"]
+ "source": [
+ "## 8. GRPO Training"
+ ]
  },
  {
  "cell_type": "code",
@@ -336,164 +462,174 @@
  "source": [
  "from trl import GRPOConfig, GRPOTrainer\n",
  "\n",
- "# Reward function for GRPO: directly use the env's per-step reward\n",
- "def reward_fn(completions: List[str], prompts: List[str], **kwargs) -> List[float]:\n",
- " \"\"\"GRPO reward function — called on each group of completions.\"\"\"\n",
- " # In GRPO the rewards come from rollouts; we pre-compute them above.\n",
- " # This function returns the rewards already stored in the dataset.\n",
- " return kwargs.get('reward', [0.0] * len(completions))\n",
+ "# Switch back to training mode\n",
+ "model.train()\n",
  "\n",
  "grpo_config = GRPOConfig(\n",
- " output_dir = './orgos_grpo_ckpt',\n",
- " num_train_epochs = 3,\n",
- " per_device_train_batch_size = 2,\n",
- " gradient_accumulation_steps = 4,\n",
- " learning_rate = 5e-5,\n",
- " warmup_steps = 10,\n",
- " logging_steps = 5,\n",
- " save_steps = 50,\n",
- " fp16 = not torch.cuda.is_bf16_supported(),\n",
- " bf16 = torch.cuda.is_bf16_supported(),\n",
- " max_grad_norm = 1.0,\n",
+ " output_dir = \"./orgos_grpo_ckpt\",\n",
+ " num_train_epochs = 3,\n",
+ " per_device_train_batch_size = 4,\n",
+ " gradient_accumulation_steps = 2,\n",
+ " learning_rate = 5e-5,\n",
+ " warmup_steps = 10,\n",
+ " logging_steps = 5,\n",
+ " save_steps = 100,\n",
+ " bf16 = torch.cuda.is_bf16_supported(),\n",
+ " fp16 = not torch.cuda.is_bf16_supported(),\n",
+ " max_grad_norm = 1.0,\n",
  " # GRPO-specific\n",
- " num_generations = 4, # group size G\n",
- " max_new_tokens = 256,\n",
- " temperature = 0.7,\n",
- " beta = 0.04, # KL penalty\n",
- " report_to = 'none',\n",
- " seed = 42,\n",
+ " num_generations = 4, # G: candidate actions per prompt\n",
+ " max_completion_length = 256,\n",
+ " temperature = 0.8, # exploration during training\n",
+ " beta = 0.04, # KL penalty coefficient\n",
+ " report_to = \"none\",\n",
+ " seed = 42,\n",
  ")\n",
  "\n",
  "trainer = GRPOTrainer(\n",
  " model = model,\n",
  " args = grpo_config,\n",
- " reward_funcs = reward_fn,\n",
- " train_dataset = train_dataset,\n",
- " tokenizer = tokenizer,\n",
+ " reward_funcs = orgos_reward_fn,\n",
+ " train_dataset = prompt_dataset,\n",
+ " processing_class = tokenizer,\n",
  ")\n",
  "\n",
- "print('Starting GRPO training...')\n",
+ "print(\"Starting GRPO training...\")\n",
+ "print(f\" Prompts: {len(prompt_dataset)}\")\n",
+ "print(f\" Generations per prompt (G): {grpo_config.num_generations}\")\n",
+ "print(f\" Epochs: {grpo_config.num_train_epochs}\")\n",
+ "print(f\" Total env calls per epoch: ~{len(prompt_dataset) * grpo_config.num_generations}\")\n",
+ "print()\n",
+ "\n",
  "train_result = trainer.train()\n",
- "print('Training complete!')\n",
+ "print(\"\\nTraining complete!\")\n",
  "print(train_result.metrics)"
  ]
  },
  {
  "cell_type": "markdown",
- "id": "sec8",
+ "id": "sec9",
  "metadata": {},
- "source": ["## 8. Collect Post-Training Rollouts"]
+ "source": [
+ "## 9. Collect Post-Training Scores"
+ ]
  },
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "posttraining_rollouts",
+ "id": "post_training",
  "metadata": {},
  "outputs": [],
  "source": [
- "# Switch model to inference mode\n",
  "FastLanguageModel.for_inference(model)\n",
  "\n",
- "N_EVAL = 30\n",
- "post_scores = {'A': [], 'B': [], 'C': []}\n",
+ "post_scores = {wf: [] for wf in [\"A\", \"B\", \"C\"]}\n",
  "\n",
- "print('Collecting post-training rollouts...')\n",
- "for wf in ['A', 'B', 'C']:\n",
- " for ep in range(N_EVAL // 3):\n",
- " _, score = run_episode(wf)\n",
+ "print(\"Collecting post-training scores...\")\n",
+ "for wf in [\"A\", \"B\", \"C\"]:\n",
+ " for ep in range(N_EVAL):\n",
+ " score = run_episode_with_model(wf)\n",
  " post_scores[wf].append(score)\n",
- " print(f' Workflow {wf} ep {ep+1}: score={score:.4f}', end='\\r')\n",
- " print(f' Workflow {wf}: mean={np.mean(post_scores[wf]):.4f} ± {np.std(post_scores[wf]):.4f}')\n",
+ " print(f\" Workflow {wf} ep {ep+1}/{N_EVAL}: score={score:.4f}\", end=\"\\r\")\n",
+ " print(f\" Workflow {wf}: mean={np.mean(post_scores[wf]):.4f}\")\n",
  "\n",
- "print(f'\\nOverall post-training mean: {np.mean([s for v in post_scores.values() for s in v]):.4f}')"
+ "post_mean = np.mean([s for v in post_scores.values() for s in v])\n",
+ "print(f\"\\nOverall post-training mean: {post_mean:.4f}\")\n",
+ "print(f\"Improvement: {post_mean - baseline_mean:+.4f}\")"
  ]
  },
  {
  "cell_type": "markdown",
- "id": "sec9",
+ "id": "sec10",
  "metadata": {},
- "source": ["## 9. Plot Before/After Reward Curves"]
+ "source": [
+ "## 10. Plot Before / After"
+ ]
  },
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "plot_curves",
+ "id": "plot",
  "metadata": {},
  "outputs": [],
  "source": [
  "import matplotlib.pyplot as plt\n",
  "import matplotlib.gridspec as gridspec\n",
  "\n",
- "fig = plt.figure(figsize=(14, 8), facecolor='#0f172a')\n",
- "fig.suptitle('OrgOS: Before vs After GRPO Training', fontsize=15,\n",
- " color='white', fontweight='bold', y=0.98)\n",
+ "fig = plt.figure(figsize=(14, 8), facecolor=\"#0f172a\")\n",
+ "fig.suptitle(\"OrgOS: Before vs After GRPO Training\", fontsize=15,\n",
+ " color=\"white\", fontweight=\"bold\", y=0.98)\n",
  "\n",
  "gs = gridspec.GridSpec(2, 3, figure=fig, hspace=0.45, wspace=0.35)\n",
  "\n",
- "COLORS = {'before': '#f87171', 'after': '#34d399', 'bg': '#1e293b', 'grid': '#334155'}\n",
- "WF_LABELS = {'A': 'Workflow A\\nCustomer Bug Fix',\n",
- " 'B': 'Workflow B\\nEmployee Onboarding',\n",
- " 'C': 'Workflow C\\nChurn Risk Alert'}\n",
+ "COLORS = {\"before\": \"#f87171\", \"after\": \"#34d399\", \"bg\": \"#1e293b\", \"grid\": \"#334155\"}\n",
+ "WF_LABELS = {\n",
+ " \"A\": \"Workflow A\\nCustomer Bug Fix\",\n",
+ " \"B\": \"Workflow B\\nEmployee Onboarding\",\n",
+ " \"C\": \"Workflow C\\nChurn Risk Alert\",\n",
+ "}\n",
  "\n",
- "for col, wf in enumerate(['A', 'B', 'C']):\n",
+ "for col, wf in enumerate([\"A\", \"B\", \"C\"]):\n",
  " ax = fig.add_subplot(gs[0, col])\n",
- " ax.set_facecolor(COLORS['bg'])\n",
- " ax.grid(color=COLORS['grid'], linewidth=0.5, alpha=0.7)\n",
+ " ax.set_facecolor(COLORS[\"bg\"])\n",
+ " ax.grid(color=COLORS[\"grid\"], linewidth=0.5, alpha=0.7)\n",
  "\n",
  " before = baseline_scores[wf]\n",
  " after = post_scores[wf]\n",
+ " delta = np.mean(after) - np.mean(before)\n",
  "\n",
- " ax.plot(before, color=COLORS['before'], linewidth=1.5, alpha=0.8, label='Before GRPO')\n",
- " ax.plot(after, color=COLORS['after'], linewidth=1.5, alpha=0.8, label='After GRPO')\n",
- "\n",
- " ax.axhline(np.mean(before), color=COLORS['before'], linestyle='--', linewidth=1, alpha=0.5)\n",
- " ax.axhline(np.mean(after), color=COLORS['after'], linestyle='--', linewidth=1, alpha=0.5)\n",
+ " ax.plot(before, color=COLORS[\"before\"], linewidth=1.5, alpha=0.8, label=\"Before GRPO\")\n",
+ " ax.plot(after, color=COLORS[\"after\"], linewidth=1.5, alpha=0.8, label=\"After GRPO\")\n",
+ " ax.axhline(np.mean(before), color=COLORS[\"before\"], linestyle=\"--\", linewidth=1, alpha=0.5)\n",
+ " ax.axhline(np.mean(after), color=COLORS[\"after\"], linestyle=\"--\", linewidth=1, alpha=0.5)\n",
  "\n",
- " delta = np.mean(after) - np.mean(before)\n",
- " ax.set_title(WF_LABELS[wf] + f'\\n(Δ = {delta:+.4f})', color='white', fontsize=9)\n",
- " ax.set_xlabel('Episode', color='#94a3b8', fontsize=8)\n",
- " ax.set_ylabel('Final Score', color='#94a3b8', fontsize=8)\n",
- " ax.tick_params(colors='#64748b', labelsize=7)\n",
+ " ax.set_title(WF_LABELS[wf] + f\"\\n(Δ = {delta:+.4f})\", color=\"white\", fontsize=9)\n",
+ " ax.set_xlabel(\"Episode\", color=\"#94a3b8\", fontsize=8)\n",
+ " ax.set_ylabel(\"Final Score\", color=\"#94a3b8\", fontsize=8)\n",
+ " ax.tick_params(colors=\"#64748b\", labelsize=7)\n",
  " ax.set_ylim(0, 1)\n",
- " ax.legend(fontsize=7, facecolor='#1e293b', labelcolor='white',\n",
- " edgecolor='#475569', framealpha=0.8)\n",
+ " ax.legend(fontsize=7, facecolor=\"#1e293b\", labelcolor=\"white\",\n",
+ " edgecolor=\"#475569\", framealpha=0.8)\n",
  " for spine in ax.spines.values():\n",
- " spine.set_edgecolor('#334155')\n",
+ " spine.set_edgecolor(\"#334155\")\n",
  "\n",
- "# Bottom row: combined histogram\n",
  "ax_hist = fig.add_subplot(gs[1, :])\n",
- "ax_hist.set_facecolor(COLORS['bg'])\n",
- "ax_hist.grid(color=COLORS['grid'], linewidth=0.5, alpha=0.5, axis='x')\n",
+ "ax_hist.set_facecolor(COLORS[\"bg\"])\n",
+ "ax_hist.grid(color=COLORS[\"grid\"], linewidth=0.5, alpha=0.5, axis=\"x\")\n",
  "\n",
  "all_before = [s for v in baseline_scores.values() for s in v]\n",
  "all_after = [s for v in post_scores.values() for s in v]\n",
- "\n",
  "bins = np.linspace(0, 1, 25)\n",
- "ax_hist.hist(all_before, bins=bins, color=COLORS['before'], alpha=0.6, label=f'Before GRPO (mean={np.mean(all_before):.4f})', edgecolor='none')\n",
- "ax_hist.hist(all_after, bins=bins, color=COLORS['after'], alpha=0.6, label=f'After GRPO (mean={np.mean(all_after):.4f})', edgecolor='none')\n",
- "ax_hist.axvline(np.mean(all_before), color=COLORS['before'], linestyle='--', linewidth=1.5)\n",
- "ax_hist.axvline(np.mean(all_after), color=COLORS['after'], linestyle='--', linewidth=1.5)\n",
- "\n",
- "ax_hist.set_title('Score Distribution Across All Workflows', color='white', fontsize=10)\n",
- "ax_hist.set_xlabel('Final Score', color='#94a3b8', fontsize=9)\n",
- "ax_hist.set_ylabel('Count', color='#94a3b8', fontsize=9)\n",
- "ax_hist.tick_params(colors='#64748b', labelsize=8)\n",
- "ax_hist.legend(fontsize=9, facecolor='#1e293b', labelcolor='white',\n",
- " edgecolor='#475569', framealpha=0.9)\n",
+ "\n",
+ "ax_hist.hist(all_before, bins=bins, color=COLORS[\"before\"], alpha=0.6,\n",
+ " label=f\"Before GRPO (mean={np.mean(all_before):.4f})\", edgecolor=\"none\")\n",
+ "ax_hist.hist(all_after, bins=bins, color=COLORS[\"after\"], alpha=0.6,\n",
+ " label=f\"After GRPO (mean={np.mean(all_after):.4f})\", edgecolor=\"none\")\n",
+ "ax_hist.axvline(np.mean(all_before), color=COLORS[\"before\"], linestyle=\"--\", linewidth=1.5)\n",
+ "ax_hist.axvline(np.mean(all_after), color=COLORS[\"after\"], linestyle=\"--\", linewidth=1.5)\n",
+ "\n",
+ "ax_hist.set_title(\"Score Distribution Across All Workflows\", color=\"white\", fontsize=10)\n",
+ "ax_hist.set_xlabel(\"Final Score\", color=\"#94a3b8\", fontsize=9)\n",
+ "ax_hist.set_ylabel(\"Count\", color=\"#94a3b8\", fontsize=9)\n",
+ "ax_hist.tick_params(colors=\"#64748b\", labelsize=8)\n",
+ "ax_hist.legend(fontsize=9, facecolor=\"#1e293b\", labelcolor=\"white\",\n",
+ " edgecolor=\"#475569\", framealpha=0.9)\n",
  "for spine in ax_hist.spines.values():\n",
- " spine.set_edgecolor('#334155')\n",
+ " spine.set_edgecolor(\"#334155\")\n",
  "\n",
- "plt.savefig('before_after_curves.png', dpi=150, bbox_inches='tight',\n",
- " facecolor='#0f172a', edgecolor='none')\n",
+ "plt.savefig(\"before_after_curves.png\", dpi=150, bbox_inches=\"tight\",\n",
+ " facecolor=\"#0f172a\", edgecolor=\"none\")\n",
  "plt.show()\n",
- "print('Saved: before_after_curves.png')"
+ "print(\"Saved: before_after_curves.png\")"
  ]
  },
  {
  "cell_type": "markdown",
- "id": "sec10",
+ "id": "sec11",
  "metadata": {},
- "source": ["## 10. Save LoRA Adapter & Upload to HuggingFace"]
+ "source": [
+ "## 11. Save LoRA Adapter"
+ ]
  },
  {
  "cell_type": "code",
@@ -502,49 +638,29 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "# Save LoRA adapter locally\n",
- "model.save_pretrained('orgos_lora_adapter')\n",
- "tokenizer.save_pretrained('orgos_lora_adapter')\n",
- "print('LoRA adapter saved to ./orgos_lora_adapter')\n",
+ "model.save_pretrained(\"orgos_lora_adapter\")\n",
+ "tokenizer.save_pretrained(\"orgos_lora_adapter\")\n",
+ "print(\"LoRA adapter saved to ./orgos_lora_adapter\")\n",
  "\n",
- "# Optionally push to HuggingFace Hub\n",
+ "# Push to HuggingFace Hub\n",
  "# from huggingface_hub import login\n",
- "# login(token=os.environ['HF_TOKEN'])\n",
- "# model.push_to_hub('YOUR_HF_USERNAME/orgos-qwen25-3b-grpo-lora')\n",
- "# tokenizer.push_to_hub('YOUR_HF_USERNAME/orgos-qwen25-3b-grpo-lora')\n",
- "# print('Pushed to HuggingFace Hub!')"
+ "# login(token=\"YOUR_HF_TOKEN\")\n",
+ "# model.push_to_hub(\"YOUR_USERNAME/orgos-qwen25-3b-grpo\")\n",
+ "# tokenizer.push_to_hub(\"YOUR_USERNAME/orgos-qwen25-3b-grpo\")"
  ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
  },
- {
- "cell_type": "markdown",
- "id": "sec11",
- "metadata": {},
- "source": [
- "## 11. Summary\n",
- "\n",
- "```\n",
- "OrgOS GRPO Training Summary\n",
- "============================\n",
- "Model: Qwen2.5-3B-Instruct + 4-bit LoRA\n",
- "Algorithm: GRPO (Group Relative Policy Optimization)\n",
- "Epochs: 3\n",
- "Episodes: 30 baseline + 30 post-training\n",
- "\n",
- "Key result: The GRPO-trained model learns to:\n",
- " 1. Read schema_hints before constructing action args\n",
- " 2. Use drifted field names (e.g. 'severity' not 'priority')\n",
- " 3. Complete workflow steps in the correct order\n",
- " 4. Avoid RBAC violations by checking role constraints\n",
- "\n",
- "This produces a clear, measurable improvement visible in\n",
- "before_after_curves.png — the core evidence for judging.\n",
- "```\n",
- "\n",
- "**Artefacts produced:**\n",
- "- `before_after_curves.png` — the money chart for the pitch\n",
- "- `orgos_lora_adapter/` — the trained LoRA weights\n",
- "- `baseline_scores.json` — raw score data"
- ]
+ "language_info": {
+ "name": "python",
+ "version": "3.10.0"
  }
- ]
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
  }
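
Not shown in the commit: reloading the saved adapter later. A minimal sketch, assuming the same Unsloth stack as the notebook and the local `orgos_lora_adapter/` directory written by the save cell:

```python
from unsloth import FastLanguageModel

# Unsloth resolves the base model from the adapter's config, then attaches
# the LoRA weights written by model.save_pretrained("orgos_lora_adapter").
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "orgos_lora_adapter",
    max_seq_length = 2048,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)  # enable inference-optimized kernels
```
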