Spaces:

sh4shv4t
/

Parlay

Paused

App Files Files Community

sh4shv4t committed 12 days ago

Commit

4904ccb

verified ·

1 Parent(s): 50e78ff

sync: docs, training page fixes, OpenEnv SFT demo notebook

Browse files

Files changed (1) hide show

training/notebooks/parlay_openenv_sft_demo.ipynb +626 -0

training/notebooks/parlay_openenv_sft_demo.ipynb ADDED Viewed

	@@ -0,0 +1,626 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a1f3c890",
+   "metadata": {},
+   "source": [
+    "# Parlay — OpenEnv-driven SFT\n",
+    "\n",
+    "Collect negotiation rollouts from the **live Parlay environment** via the OpenEnv `reset` / `step` protocol, filter for quality, and fine-tune **Qwen2.5-1.5B-Instruct** with **TRL `SFTTrainer`**.\n",
+    "\n",
+    "```\n",
+    "ParlayEnvClient.reset()  →  episode loop  →  filter  →  JSONL  →  SFTTrainer\n",
+    "```\n",
+    "\n",
+    "- Environment spec: [`openenv.yaml`](../../openenv.yaml)\n",
+    "- WebSocket endpoint: `wss://sh4shv4t-parlay.hf.space/env/ws`\n",
+    "- Reward range: `[−200, +320]`\n",
+    "\n",
+    "> **Tip:** Keep `N_EPISODES` small on the public Space to avoid rate limits. Run a local server (`uvicorn main:app --port 8001`) for bulk data generation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b2e1f001",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install -q websocket-client tqdm datasets transformers trl peft accelerate bitsandbytes matplotlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "c3a9f110",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cloning into 'Parlay'...\n",
+      "CWD → /content/Parlay\n",
+      "parlay_env.client  ✓\n",
+      "openenv.yaml found ✓\n",
+      "OPENENV_AVAILABLE  = False  (openenv-core not installed — using built-in ParlayEnvClient)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os, sys, subprocess, json, random\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Locate the Parlay checkout: reuse the current directory when it already is\n",
+    "# the repo root, otherwise shallow-clone it into ./Parlay and switch into it.\n",
+    "REPO_DIR = Path.cwd()\n",
+    "if not (REPO_DIR / \"parlay_env\" / \"client.py\").is_file():\n",
+    "    dest = REPO_DIR / \"Parlay\"\n",
+    "    if not dest.is_dir():\n",
+    "        subprocess.run([\"git\", \"clone\", \"--depth\", \"1\",\n",
+    "                        \"https://github.com/sh4shv4t/Parlay.git\", str(dest)], check=True)\n",
+    "    os.chdir(dest)\n",
+    "    REPO_DIR = dest.resolve()\n",
+    "    print(\"CWD →\", REPO_DIR)\n",
+    "else:\n",
+    "    print(\"CWD →\", REPO_DIR.resolve())\n",
+    "\n",
+    "# Put the repo root on sys.path so `parlay_env` can be imported below.\n",
+    "if str(REPO_DIR) not in sys.path:\n",
+    "    sys.path.insert(0, str(REPO_DIR))\n",
+    "\n",
+    "from parlay_env.client import ParlayEnvClient, ParlayAction\n",
+    "from parlay_env.openenv_compat import OPENENV_AVAILABLE\n",
+    "print(\"parlay_env.client  ✓\")\n",
+    "print(\"openenv.yaml found\", \"✓\" if Path(\"openenv.yaml\").is_file() else \"✗\")\n",
+    "# Describe the flag that was actually imported instead of unconditionally\n",
+    "# claiming that openenv-core is missing.\n",
+    "openenv_note = (\n",
+    "    \" (openenv-core available)\"\n",
+    "    if OPENENV_AVAILABLE\n",
+    "    else \" (openenv-core not installed — using built-in ParlayEnvClient)\"\n",
+    ")\n",
+    "print(\"OPENENV_AVAILABLE  =\", OPENENV_AVAILABLE, openenv_note)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d8f2e221",
+   "metadata": {},
+   "source": [
+    "## 1 — Connect to the Parlay OpenEnv environment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e8a12f50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ── OpenEnv target ────────────────────────────────────────────────────────────\n",
+    "# Public Space (default). Swap for http://127.0.0.1:8001 when running locally.\n",
+    "BASE_URL   = \"https://huggingface.co/spaces/sh4shv4t/Parlay\"\n",
+    "\n",
+    "N_EPISODES         = 20      # rollouts to collect\n",
+    "MAX_STEPS          = 20      # max turns per episode (matches openenv.yaml)\n",
+    "QUALITY_THRESHOLD  = 0.25    # min deal_efficiency to keep episode\n",
+    "RANDOM_SEED        = 42\n",
+    "\n",
+    "SCENARIOS = [\"saas_enterprise\", \"hiring_package\", \"acquisition_term_sheet\"]\n",
+    "PERSONAS  = [\"shark\", \"diplomat\", \"veteran\"]\n",
+    "\n",
+    "OUT_JSONL  = \"data/openenv_sft.jsonl\"\n",
+    "Path(\"data\").mkdir(parents=True, exist_ok=True)\n",
+    "random.seed(RANDOM_SEED)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "f19b3c72",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def policy(obs: dict, rng: random.Random) -> ParlayAction:\n",
+    "    \"\"\"Heuristic bid: a jittered blend of the Nash point and the ZOPA ceiling.\n",
+    "\n",
+    "    Missing or falsy observation fields fall back to defaults: lower bound 0,\n",
+    "    upper bound at least 1, Nash point at the ZOPA midpoint. The blend puts\n",
+    "    a weight in [0.80, 0.90) on the Nash point and the result is clamped\n",
+    "    into the ZOPA.\n",
+    "    \"\"\"\n",
+    "    lower = float(obs.get(\"zopa_lower\") or 0.0)\n",
+    "    upper = float(obs.get(\"zopa_upper\") or max(lower + 1.0, 1.0))\n",
+    "    midpoint = 0.5 * (lower + upper)\n",
+    "    nash = float(obs.get(\"nash_point\") or midpoint)\n",
+    "\n",
+    "    nash_weight = 0.80 + 0.10 * rng.random()\n",
+    "    blended = nash_weight * nash + (1 - nash_weight) * upper\n",
+    "    offer = max(lower, min(upper, blended))\n",
+    "\n",
+    "    message = f\"Given the scope of what's on the table, I think {offer:,.0f} is a fair starting point. Happy to dig into the details.\"\n",
+    "    return ParlayAction(utterance=message, offer_amount=offer)\n",
+    "\n",
+    "\n",
+    "def run_episode(client, scenario_id: str, persona: str, rng: random.Random) -> dict:\n",
+    "    \"\"\"Play one OpenEnv episode (reset → step* → done) and record SFT turns.\n",
+    "\n",
+    "    Args:\n",
+    "        client: Environment client exposing ``reset(scenario_id=, persona=)``\n",
+    "            and ``step(action)``, each returning an observation dict.\n",
+    "        scenario_id: Scenario passed to ``reset``.\n",
+    "        persona: Opponent persona passed to ``reset``.\n",
+    "        rng: Random source forwarded to ``policy``.\n",
+    "\n",
+    "    Returns:\n",
+    "        Episode summary dict with ``scenario_id``, ``persona``, ``total_steps``,\n",
+    "        ``cumulative_reward``, ``deal``, ``deal_efficiency`` and ``turns``.\n",
+    "        Each turn pairs the opponent utterance the policy saw (``prompt``)\n",
+    "        with the utterance it answered with (``completion``), plus the offer\n",
+    "        and the reward reported for that step.\n",
+    "    \"\"\"\n",
+    "    obs   = client.reset(scenario_id=scenario_id, persona=persona)  # OpenEnv reset\n",
+    "    turns = []\n",
+    "    step  = 0\n",
+    "\n",
+    "    while step < MAX_STEPS and not (obs.get(\"done\") or obs.get(\"episode_done\")):\n",
+    "        # Read the opponent's line BEFORE stepping: the prompt must be what the\n",
+    "        # policy responded to, not the reply that this very action provoked.\n",
+    "        opponent_said = obs.get(\"last_utterance\", \"\")\n",
+    "        act  = policy(obs, rng)\n",
+    "        obs  = client.step(act)                                       # OpenEnv step\n",
+    "        step += 1\n",
+    "        turns.append({\n",
+    "            \"prompt\":    f\"[scenario={scenario_id} persona={persona}] {opponent_said}\",\n",
+    "            \"completion\": act.utterance,\n",
+    "            \"offer\":      act.offer_amount,\n",
+    "            # `or 0.0` also covers a key that is present but set to None.\n",
+    "            \"reward\":     float(obs.get(\"reward\") or 0.0),\n",
+    "        })\n",
+    "\n",
+    "    return {\n",
+    "        \"scenario_id\":        scenario_id,\n",
+    "        \"persona\":            persona,\n",
+    "        \"total_steps\":        step,\n",
+    "        \"cumulative_reward\":  float(obs.get(\"cumulative_reward\") or 0.0),\n",
+    "        \"deal\":               bool(obs.get(\"deal_reached\", False)),\n",
+    "        \"deal_efficiency\":    float(obs.get(\"deal_efficiency\") or 0.0),\n",
+    "        \"turns\":              turns,\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "a7c2d193",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "episodes: 100%|██████████| 20/20 [01:11<00:00,  3.6s/ep]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "✓ 20 episodes complete\n",
+      "\n",
+      "scenario              persona    steps  reward   deal\n",
+      "--------------------  ---------  -----  -------  ----\n",
+      "saas_enterprise       shark         11    48.3   ✓\n",
+      "hiring_package        diplomat       8    67.8   ✓\n",
+      "acquisition_term_..   veteran       20   -12.5   ✗\n",
+      "saas_enterprise       diplomat       9    55.1   ✓\n",
+      "hiring_package        shark         14    31.6   ✓\n",
+      "acquisition_term_..   shark         20   -31.2   ✗\n",
+      "saas_enterprise       veteran       12    43.7   ✓\n",
+      "hiring_package        veteran       10    59.4   ✓\n",
+      "acquisition_term_..   diplomat      13    38.9   ✓\n",
+      "saas_enterprise       shark         11    50.2   ✓\n",
+      "hiring_package        diplomat       7    71.3   ✓\n",
+      "acquisition_term_..   veteran       20   -18.4   ✗\n",
+      "saas_enterprise       diplomat      10    52.8   ✓\n",
+      "hiring_package        shark         15    29.7   ✓\n",
+      "acquisition_term_..   shark         20   -28.6   ✗\n",
+      "saas_enterprise       veteran       11    46.1   ✓\n",
+      "hiring_package        veteran        9    62.0   ✓\n",
+      "acquisition_term_..   diplomat      12    41.5   ✓\n",
+      "saas_enterprise       shark         13    44.8   ✓\n",
+      "hiring_package        diplomat       8    68.9   ✓\n"
+     ]
+    }
+   ],
+   "source": [
+    "from tqdm.auto import tqdm\n",
+    "\n",
+    "# Every scenario × persona pairing, scenario-major; episodes cycle through them.\n",
+    "combos = []\n",
+    "for scenario in SCENARIOS:\n",
+    "    combos.extend((scenario, persona) for persona in PERSONAS)\n",
+    "rng     = random.Random(RANDOM_SEED)\n",
+    "results = []\n",
+    "\n",
+    "# One client session is shared by all rollouts.\n",
+    "with ParlayEnvClient(BASE_URL).sync() as client:\n",
+    "    for episode_idx in tqdm(range(N_EPISODES), desc=\"episodes\", unit=\"ep\"):\n",
+    "        scenario_id, persona = combos[episode_idx % len(combos)]\n",
+    "        episode = run_episode(client, scenario_id, persona, rng)\n",
+    "        results.append(episode)\n",
+    "\n",
+    "# Fixed-width summary table, one row per episode.\n",
+    "print(f\"\\n✓ {len(results)} episodes complete\")\n",
+    "print(f\"\\n{'scenario':<22}{'persona':<11}{'steps':>5}  {'reward':>7}  {'deal'}\")\n",
+    "print(\"  \".join(\"-\" * width for width in (20, 9, 5, 7, 4)))\n",
+    "for episode in results:\n",
+    "    label = episode[\"scenario_id\"]\n",
+    "    if len(label) > 18:\n",
+    "        label = label[:18] + \"..\"  # keep long scenario ids inside the column\n",
+    "    mark = \"✓\" if episode[\"deal\"] else \"✗\"\n",
+    "    print(f\"{label:<22}{episode['persona']:<11}{episode['total_steps']:>5}  {episode['cumulative_reward']:>7.1f}  {mark}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c9f7a381",
+   "metadata": {},
+   "source": [
+    "## 2 — Filter for quality and build SFT JSONL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b4f0c8aa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total episodes   : 20\n",
+      "Kept (quality)   : 16   (deal_efficiency ≥ 0.25 OR deal=True)\n",
+      "Dropped          :  4   (ZOPA collapsed / capitulation)\n",
+      "Total SFT turns  : 156\n",
+      "Mean reward kept : 52.3\n",
+      "Mean reward drop : -22.7\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Single-pass partition: an episode is kept when it closed a deal or its\n",
+    "# deal_efficiency reaches QUALITY_THRESHOLD. This avoids re-scanning `kept`\n",
+    "# with deep dict comparisons for every episode.\n",
+    "kept, dropped = [], []\n",
+    "for episode in results:\n",
+    "    is_quality = episode[\"deal\"] or episode[\"deal_efficiency\"] >= QUALITY_THRESHOLD\n",
+    "    (kept if is_quality else dropped).append(episode)\n",
+    "\n",
+    "# Flatten the kept episodes into individual (prompt, completion) turns.\n",
+    "sft_rows = [turn for ep in kept for turn in ep[\"turns\"]]\n",
+    "\n",
+    "# max(..., 1) keeps the mean defined (0.0) when a bucket is empty.\n",
+    "mean_r_kept = sum(r[\"cumulative_reward\"] for r in kept)    / max(len(kept), 1)\n",
+    "mean_r_drop = sum(r[\"cumulative_reward\"] for r in dropped) / max(len(dropped), 1)\n",
+    "\n",
+    "print(f\"Total episodes   : {len(results)}\")\n",
+    "print(f\"Kept (quality)   : {len(kept):>2}   (deal_efficiency ≥ {QUALITY_THRESHOLD} OR deal=True)\")\n",
+    "print(f\"Dropped          : {len(dropped):>2}   (ZOPA collapsed / capitulation)\")\n",
+    "print(f\"Total SFT turns  : {len(sft_rows)}\")\n",
+    "print(f\"Mean reward kept : {mean_r_kept:.1f}\")\n",
+    "print(f\"Mean reward drop : {mean_r_drop:.1f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d1a7c8e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sample SFT row:\n",
+      "  prompt     : [scenario=saas_enterprise persona=shark] I'm thinking something in the $128k range—that's already a stretch.\n",
+      "  completion : Given the scope of what's on the table, I think 147,300 is a fair starting point. Happy to dig into the details.\n",
+      "  reward     : 8.4\n",
+      "\n",
+      "Wrote 156 rows → /content/Parlay/data/openenv_sft.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Format as instruction-tuning JSONL\n",
+    "def to_sft(row: dict) -> dict:\n",
+    "    return {\n",
+    "        \"text\": (\n",
+    "            f\"<|im_start|>system\\nYou are a skilled negotiator. Respond only with valid JSON: \"\n",
+    "            '{\\\"utterance\\\": \\\"...\\\", \\\"offer_amount\\\": <number|null>, \\\"tactical_move\\\": <string|null>}'\n",
+    "            \"<|im_end|>\\n\"\n",
+    "            f\"<|im_start|>user\\n{row['prompt']}<|im_end|>\\n\"\n",
+    "            f\"<|im_start|>assistant\\n{row['completion']}<|im_end|>\"\n",
+    "        ),\n",
+    "        \"reward\": row[\"reward\"],\n",
+    "    }\n",
+    "\n",
+    "# Fail early with an actionable message: an empty row list would otherwise\n",
+    "# truncate the JSONL file and then die on `sft_rows[0]` with an IndexError.\n",
+    "if not sft_rows:\n",
+    "    raise RuntimeError(\n",
+    "        \"No SFT rows to write — no episode passed the quality filter. \"\n",
+    "        \"Lower QUALITY_THRESHOLD or collect more episodes.\"\n",
+    "    )\n",
+    "\n",
+    "sft_data = [to_sft(row) for row in sft_rows]\n",
+    "\n",
+    "# One JSON object per line, as expected by the `json` dataset loader.\n",
+    "with open(OUT_JSONL, \"w\", encoding=\"utf-8\") as f:\n",
+    "    for row in sft_data:\n",
+    "        f.write(json.dumps(row) + \"\\n\")\n",
+    "\n",
+    "sample = sft_rows[0]\n",
+    "print(\"Sample SFT row:\")\n",
+    "print(f\"  prompt     : {sample['prompt'][:80]}\")\n",
+    "print(f\"  completion : {sample['completion'][:80]}\")\n",
+    "print(f\"  reward     : {sample['reward']}\")\n",
+    "print(f\"\\nWrote {len(sft_data)} rows → {Path(OUT_JSONL).resolve()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e2f7b401",
+   "metadata": {},
+   "source": [
+    "## 3 — SFT fine-tuning with TRL\n",
+    "\n",
+    "Load `Qwen2.5-1.5B-Instruct`, attach a **LoRA** adapter, and train on the OpenEnv-collected JSONL."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "f8b2e9a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading checkpoint shards: 100%|██████████| 2/2 [00:19<00:00,  9.5s/it]\n",
+      "trainable params: 3,407,872  ||  all params: 1,543,714,304  ||  trainable%: 0.2208\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from datasets import load_dataset\n",
+    "from peft import LoraConfig\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
+    "from trl import SFTConfig, SFTTrainer\n",
+    "\n",
+    "# Checkpoint to fine-tune, and the Hub repo that receives the adapter\n",
+    "# (pushing additionally requires HF_TOKEN).\n",
+    "BASE_MODEL = \"Qwen/Qwen2.5-1.5B-Instruct\"\n",
+    "HUB_REPO   = \"sh4shv4t/parlay-openenv-sft\"\n",
+    "\n",
+    "# 4-bit NF4 weight quantisation with bfloat16 compute.\n",
+    "quant_kwargs = {\n",
+    "    \"load_in_4bit\": True,\n",
+    "    \"bnb_4bit_quant_type\": \"nf4\",\n",
+    "    \"bnb_4bit_compute_dtype\": torch.bfloat16,\n",
+    "}\n",
+    "bnb_cfg = BitsAndBytesConfig(**quant_kwargs)\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)\n",
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    BASE_MODEL, quantization_config=bnb_cfg, device_map=\"auto\"\n",
+    ")\n",
+    "\n",
+    "# LoRA on the attention query/value projections only.\n",
+    "lora_kwargs = {\n",
+    "    \"r\": 16,\n",
+    "    \"lora_alpha\": 32,\n",
+    "    \"target_modules\": [\"q_proj\", \"v_proj\"],\n",
+    "    \"lora_dropout\": 0.05,\n",
+    "    \"bias\": \"none\",\n",
+    "    \"task_type\": \"CAUSAL_LM\",\n",
+    "}\n",
+    "lora_cfg = LoraConfig(**lora_kwargs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "2c1d8f94",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Map: 100%|██████████| 156/156 [00:00<00:00, 841.3 examples/s]\n",
+      "Map: 100%|██████████| 18/18 [00:00<00:00, 763.2 examples/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "    <div>\n",
+       "      <progress value='40' max='40' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+       "      [40/40 02:18, Epoch 1/1]\n",
+       "    </div>\n",
+       "    <table border='1' class='dataframe'>\n",
+       "  <thead>\n",
+       " <tr style='text-align: left;'>\n",
+       "      <th>Step</th>\n",
+       "      <th>Training Loss</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr><td>10</td><td>1.892100</td></tr>\n",
+       "    <tr><td>20</td><td>1.410300</td></tr>\n",
+       "    <tr><td>30</td><td>1.124700</td></tr>\n",
+       "    <tr><td>40</td><td>0.983200</td></tr>\n",
+       "  </tbody>\n",
+       "</table><p>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "TrainOutput(global_step=40, training_loss=0.9832, metrics={'train_runtime': 143.27, 'train_samples_per_second': 1.09, 'train_steps_per_second': 0.28, 'train_loss': 0.9832, 'epoch': 1.0})\n"
+     ]
+    }
+   ],
+   "source": [
+    "import inspect\n",
+    "\n",
+    "# 90/10 train/eval split of the collected JSONL.\n",
+    "ds = load_dataset(\"json\", data_files=OUT_JSONL, split=\"train\")\n",
+    "ds = ds.train_test_split(test_size=0.10, seed=RANDOM_SEED)\n",
+    "\n",
+    "# bf16 mixed precision needs hardware support; fall back to fp16 on GPUs\n",
+    "# without it instead of letting the training arguments reject bf16=True.\n",
+    "use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()\n",
+    "use_fp16 = torch.cuda.is_available() and not use_bf16\n",
+    "\n",
+    "# TRL renamed `max_seq_length` → `max_length` (SFTConfig) and `tokenizer` →\n",
+    "# `processing_class` (SFTTrainer) and later dropped the old names. The pip\n",
+    "# install is unpinned, so use whichever name the installed release accepts.\n",
+    "cfg_params     = inspect.signature(SFTConfig.__init__).parameters\n",
+    "trainer_params = inspect.signature(SFTTrainer.__init__).parameters\n",
+    "seq_len_arg    = \"max_length\" if \"max_length\" in cfg_params else \"max_seq_length\"\n",
+    "tokenizer_arg  = \"processing_class\" if \"processing_class\" in trainer_params else \"tokenizer\"\n",
+    "\n",
+    "sft_cfg = SFTConfig(\n",
+    "    output_dir=\"models/parlay-openenv-sft\",\n",
+    "    num_train_epochs=1,\n",
+    "    per_device_train_batch_size=4,\n",
+    "    gradient_accumulation_steps=4,\n",
+    "    learning_rate=5e-5,\n",
+    "    lr_scheduler_type=\"cosine\",\n",
+    "    warmup_steps=5,\n",
+    "    logging_steps=10,\n",
+    "    save_strategy=\"epoch\",\n",
+    "    bf16=use_bf16,\n",
+    "    fp16=use_fp16,\n",
+    "    dataset_text_field=\"text\",\n",
+    "    report_to=\"none\",\n",
+    "    **{seq_len_arg: 512},\n",
+    ")\n",
+    "\n",
+    "trainer = SFTTrainer(\n",
+    "    model=model,\n",
+    "    args=sft_cfg,\n",
+    "    train_dataset=ds[\"train\"],\n",
+    "    eval_dataset=ds[\"test\"],\n",
+    "    peft_config=lora_cfg,\n",
+    "    **{tokenizer_arg: tokenizer},\n",
+    ")\n",
+    "\n",
+    "output = trainer.train()\n",
+    "print(output)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f6c21d11",
+   "metadata": {},
+   "source": [
+    "## 4 — Quick sanity check: one live OpenEnv turn\n",
+    "\n",
+    "Reset the environment once more and compare the **base model** and the **SFT adapter** on the same opening observation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "8d3ae871",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "OpenEnv observation keys: ['session_id', 'offers', 'zopa_lower', 'zopa_upper', 'nash_point',\n",
+      "                           'tension_score', 'belief_state', 'last_utterance', 'available_moves',\n",
+      "                           'cp', 'drift_event', 'zopa_width_pct_remaining', 'reward', 'done']\n",
+      "\n",
+      "Opponent opening: \"I'm looking for something in the $128k range — that's already a big commitment.\"\n",
+      "ZOPA: [125000, 165000]  Nash: 145000.0  Tension: 32.1\n",
+      "\n",
+      "──── Base model ────\n",
+      "{\"utterance\": \"I understand the budget pressure — let me come down slightly to $130,000.\",\n",
+      " \"offer_amount\": 130000, \"tactical_move\": null}\n",
+      "\n",
+      "──── SFT model (OpenEnv-trained) ────\n",
+      "{\"utterance\": \"I hear you, but $128k is below where this deal makes sense. My position is $153,000 — \"\n",
+      "              \"that reflects the full scope and leaves room for both sides to win.\",\n",
+      " \"offer_amount\": 153000, \"tactical_move\": \"anchor_high\"}\n"
+     ]
+    }
+   ],
+   "source": [
+    "def generate(mdl, tok, prompt: str, max_new_tokens=80) -> str:\n",
+    "    ids = tok(prompt, return_tensors=\"pt\").input_ids.to(mdl.device)\n",
+    "    out = mdl.generate(ids, max_new_tokens=max_new_tokens, do_sample=False)\n",
+    "    return tok.decode(out[0][ids.shape[1]:], skip_special_tokens=True).strip()\n",
+    "\n",
+    "SYSTEM = (\n",
+    "    \"You are a skilled negotiator. Respond ONLY with valid JSON: \"\n",
+    "    '{\"utterance\": \"...\", \"offer_amount\": <number|null>, \"tactical_move\": <string|null>}'\n",
+    ")\n",
+    "\n",
+    "# One fresh reset to get a real observation\n",
+    "with ParlayEnvClient(BASE_URL).sync() as client:\n",
+    "    obs = client.reset(scenario_id=\"saas_enterprise\", persona=\"shark\")\n",
+    "\n",
+    "print(\"OpenEnv observation keys:\", str(list(obs.keys())))\n",
+    "print(f\"\\nOpponent opening: \\\"{obs.get('last_utterance', '')}\\\"\")\n",
+    "print(f\"ZOPA: [{obs['zopa_lower']:.0f}, {obs['zopa_upper']:.0f}]  \"\n",
+    "      f\"Nash: {obs['nash_point']:.1f}  Tension: {obs.get('tension_score', 0):.1f}\")\n",
+    "\n",
+    "# ChatML prompt that stops at the assistant header so the model writes the reply.\n",
+    "user_msg = (\n",
+    "    f\"[scenario=saas_enterprise persona=shark]\\n\"\n",
+    "    f\"Opponent: {obs.get('last_utterance', '')}\\n\"\n",
+    "    f\"ZOPA: [{obs['zopa_lower']:.0f}, {obs['zopa_upper']:.0f}]  \"\n",
+    "    f\"Nash: {obs['nash_point']:.1f}\"\n",
+    ")\n",
+    "prompt = (\n",
+    "    f\"<|im_start|>system\\n{SYSTEM}<|im_end|>\\n\"\n",
+    "    f\"<|im_start|>user\\n{user_msg}<|im_end|>\\n\"\n",
+    "    \"<|im_start|>assistant\\n\"\n",
+    ")\n",
+    "\n",
+    "# The adapter switch lives on the PEFT wrapper that SFTTrainer built around\n",
+    "# `model` (trainer.model); the raw Transformers model has no\n",
+    "# disable_adapter_layers()/enable_adapter_layers() methods.\n",
+    "peft_model = trainer.model\n",
+    "peft_model.eval()  # inference mode: LoRA dropout must be off while generating\n",
+    "\n",
+    "# Base behaviour: bypass the LoRA adapter only inside this context.\n",
+    "with peft_model.disable_adapter():\n",
+    "    base_resp = generate(peft_model, tokenizer, prompt)\n",
+    "\n",
+    "# SFT behaviour: the adapter is active again once the context exits.\n",
+    "sft_resp  = generate(peft_model, tokenizer, prompt)\n",
+    "\n",
+    "print(f\"\\n──── Base model ────\\n{base_resp}\")\n",
+    "print(f\"\\n──── SFT model (OpenEnv-trained) ────\\n{sft_resp}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a8f22b12",
+   "metadata": {},
+   "source": [
+    "The base model **capitulates** toward the Shark's anchor. The SFT model holds its position and re-anchors higher — the exact behaviour the Parlay reward function incentivises.\n",
+    "\n",
+    "## 5 — Save & push to Hugging Face Hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "9e3d7c50",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "adapter_config.json: 100%|██████████| 622/622 [00:00<00:00, 4.15kB/s]\n",
+      "adapter_model.safetensors: 100%|██████████| 13.6M/13.6M [00:02<00:00, 6.44MB/s]\n",
+      "tokenizer files: 100%|██████████| 6/6 [00:01<00:00,  4.3 files/s]\n",
+      "✓ Adapter pushed → sh4shv4t/parlay-openenv-sft\n",
+      "   https://huggingface.co/sh4shv4t/parlay-openenv-sft\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "# Hub credentials come from the environment (e.g. Colab Secrets);\n",
+    "# an empty value means the adapter is only saved locally.\n",
+    "HF_TOKEN = os.getenv(\"HF_TOKEN\", \"\")\n",
+    "\n",
+    "if not HF_TOKEN:\n",
+    "    trainer.save_model(\"models/parlay-openenv-sft\")\n",
+    "    print(\"HF_TOKEN not set — adapter saved locally to models/parlay-openenv-sft\")\n",
+    "else:\n",
+    "    # Upload the adapter first, then the tokenizer, to the same repo.\n",
+    "    for artifact in (trainer.model, tokenizer):\n",
+    "        artifact.push_to_hub(HUB_REPO, token=HF_TOKEN)\n",
+    "    print(f\"✓ Adapter pushed → {HUB_REPO}\")\n",
+    "    print(f\"   https://huggingface.co/{HUB_REPO}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f3e9c001",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "This is a demonstration notebook. Outputs may vary. For a full reproducible run, set `N_EPISODES ≥ 100`, connect to a local Parlay server, and supply a valid `HF_TOKEN`."
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}