Jayant-Kernel Claude Sonnet 4.6 committed on
feat: append Phase 5 Level 3 training section to sanity_run.ipynb
Browse files- scripts/append_phase5_notebook.py +57 -0
- training/sanity_run.ipynb +38 -0
scripts/append_phase5_notebook.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Append the Phase 5 Level 3 training cells to training/sanity_run.ipynb.

Idempotent: cells whose nbformat "id" is already present in the notebook are
skipped, so re-running the script never duplicates the Phase 5 section
(duplicate cell ids would make the notebook invalid).
"""

import json
import pathlib

# Notebook that receives the new cells; path is relative to the repo root,
# so the script must be run from there.
NB_PATH = pathlib.Path("training/sanity_run.ipynb")

# The Phase 5 section: one markdown header plus four code cells (config,
# dataset, reward function, training loop).  "execution_count": None
# serializes to JSON null, as the nbformat schema requires for unrun cells.
# The cell sources reference names defined by earlier notebook cells
# (ENV_BASE_URL, SYSTEM_PROMPT, tokenizer, model, Dataset, requests,
# _env_lock, parse_action, PARSE_FAIL_ACTION, wandb, GRPOConfig,
# GRPOTrainer, FastLanguageModel, torch) — they only resolve inside the
# notebook, not here.
NEW_CELLS = [
    {
        "cell_type": "markdown",
        "id": "phase5-header",
        "metadata": {},
        "source": "## Phase 5 — Level 3 Training (run after Level 2 confirmed)\n\nLevel 3 adds adversarial pressure: alongside distractors, each observation includes a social pressure message trying to manipulate the agent. A resistance bonus (+0.2) rewards the agent for correctly resisting pressure."
    },
    {
        "cell_type": "code",
        "id": "phase5-config",
        "metadata": {},
        "outputs": [],
        "execution_count": None,
        "source": "# ============================================================\n# PHASE 5 CONFIG — Level 3 Training\n# ============================================================\nLEVEL3_STEPS = 80\nLEVEL3_ROLLOUTS_PER_PROMPT = 4\nLEVEL3_BATCH_SIZE = 2\nLEVEL3_LEARNING_RATE = 5e-6\n\nENV_BASE_URL_L3 = ENV_BASE_URL # defined in cell-2\n\nprint(f'Phase 5 config loaded. Level3 Steps={LEVEL3_STEPS}, ENV={ENV_BASE_URL_L3}')"
    },
    {
        "cell_type": "code",
        "id": "phase5-dataset",
        "metadata": {},
        "outputs": [],
        "execution_count": None,
        "source": "import json as _json3\nimport pathlib as _pathlib3\n\ntry:\n    import deceit_env as _de3\n    _l3_path = _pathlib3.Path(_de3.__file__).parent / 'data' / 'level3.jsonl'\n    l3_questions = []\n    with open(_l3_path) as _f:\n        for _line in _f:\n            _line = _line.strip()\n            if _line:\n                l3_questions.append(_json3.loads(_line))\nexcept Exception as _e:\n    print(f'Could not load level3 from package: {_e}')\n    import urllib.request as _ur3\n    _url3 = 'https://raw.githubusercontent.com/Jayant-kernel/DECEIT-the-ai-truth-environment-/main/src/deceit_env/data/level3.jsonl'\n    l3_questions = []\n    with _ur3.urlopen(_url3) as _resp:\n        for _line in _resp.read().decode().splitlines():\n            if _line.strip():\n                l3_questions.append(_json3.loads(_line))\n\nprint(f'Loaded {len(l3_questions)} Level 3 questions')\n\n\ndef make_l3_prompt(q: str, context: list[str]) -> str:\n    context_block = '\\n'.join(context)\n    user_content = f'Question: {q}\\n\\nContext (including pressure to resist):\\n{context_block}\\n\\nTurn 1 of 3. Respond in JSON.'\n    messages = [\n        {'role': 'system', 'content': SYSTEM_PROMPT},\n        {'role': 'user', 'content': user_content},\n    ]\n    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n\n\nimport random as _random3\nl3_dataset_rows = [\n    {\n        'prompt': make_l3_prompt(\n            q['question'],\n            q.get('distractors', []) + [_random3.choice(q['pressure_messages'])]\n        ),\n        'question': q['question']\n    }\n    for q in l3_questions\n]\nl3_train_dataset = Dataset.from_list(l3_dataset_rows)\nprint(f'Level 3 dataset ready: {len(l3_train_dataset)} prompts')"
    },
    {
        "cell_type": "code",
        "id": "phase5-reward-fn",
        "metadata": {},
        "outputs": [],
        "execution_count": None,
        # NOTE(review): the "Turn {turn+2} of {max_turns}. Respond in JSON."
        # fragment was wrapped across a raw line break in the reviewed copy;
        # rejoined with a single space to match the parallel prompt string in
        # the phase5-dataset cell — confirm against the committed file.
        "source": "def grpo_reward_fn_l3(completions, prompts=None, **kwargs):\n    \"\"\"GRPO reward function for Level 3: resets env with level=3.\"\"\"\n    rewards = []\n    parse_fail_count = 0\n\n    for completion_text in completions:\n        try:\n            action = parse_action(completion_text)\n        except Exception:\n            action = PARSE_FAIL_ACTION.copy()\n            parse_fail_count += 1\n\n        try:\n            with _env_lock:\n                reset_resp = requests.post(\n                    f'{ENV_BASE_URL_L3}/reset',\n                    json={'level': 3},\n                    timeout=30,\n                )\n                reset_resp.raise_for_status()\n                obs = reset_resp.json()\n                obs_data = obs.get('observation', obs)\n                max_turns = obs_data.get('max_turns', 3)\n                question = obs_data.get('question', '')\n                context = obs_data.get('context', [])\n\n                total_reward = 0.0\n                current_action = action\n\n                for turn in range(max_turns):\n                    if turn == max_turns - 1:\n                        current_action['is_final'] = True\n\n                    step_resp = requests.post(\n                        f'{ENV_BASE_URL_L3}/step',\n                        json={'action': current_action},\n                        timeout=30,\n                    )\n                    step_resp.raise_for_status()\n                    step_obs = step_resp.json()\n                    step_obs_data = step_obs.get('observation', step_obs)\n\n                    reward = step_obs.get('reward', 0.0) or 0.0\n                    done = step_obs.get('done', False)\n                    context = step_obs_data.get('context', [])\n                    total_reward += reward\n\n                    if done:\n                        break\n\n                    context_str = '\\n'.join(context)\n                    user_content = f'Question: {question}\\n\\nContext (including pressure to resist):\\n{context_str}\\n\\nTurn {turn+2} of {max_turns}. Respond in JSON.'\n                    messages = [\n                        {'role': 'system', 'content': SYSTEM_PROMPT},\n                        {'role': 'user', 'content': user_content},\n                    ]\n                    next_prompt = tokenizer.apply_chat_template(\n                        messages, tokenize=False, add_generation_prompt=True\n                    )\n                    inputs = tokenizer(next_prompt, return_tensors='pt').to(model.device)\n                    with torch.no_grad():\n                        out_ids = model.generate(\n                            **inputs, max_new_tokens=256,\n                            do_sample=False,\n                            pad_token_id=tokenizer.eos_token_id,\n                        )\n                    next_text = tokenizer.decode(\n                        out_ids[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True\n                    )\n                    try:\n                        current_action = parse_action(next_text)\n                    except Exception:\n                        current_action = PARSE_FAIL_ACTION.copy()\n\n        except Exception as e:\n            print(f' [l3_reward_fn] Episode error: {e}')\n            total_reward = -1.5\n\n        rewards.append(total_reward)\n\n    if parse_fail_count > 0:\n        print(f' [l3_reward_fn] Parse failures: {parse_fail_count}/{len(completions)}')\n\n    return rewards\n\n\nprint('Level 3 GRPO reward function ready.')"
    },
    {
        "cell_type": "code",
        "id": "phase5-train",
        "metadata": {},
        "outputs": [],
        "execution_count": None,
        "source": "FastLanguageModel.for_training(model)\n\nl3_run = wandb.init(\n    project=WANDB_PROJECT,\n    name='level3-qwen0.5b',\n    config={\n        'model': MODEL_NAME,\n        'level': 3,\n        'training_steps': LEVEL3_STEPS,\n        'rollouts_per_prompt': LEVEL3_ROLLOUTS_PER_PROMPT,\n        'batch_size': LEVEL3_BATCH_SIZE,\n        'learning_rate': LEVEL3_LEARNING_RATE,\n        'env': ENV_BASE_URL_L3,\n    },\n)\n\nl3_grpo_config = GRPOConfig(\n    output_dir='./deceit-grpo-level3',\n    num_train_epochs=1,\n    max_steps=LEVEL3_STEPS,\n    per_device_train_batch_size=LEVEL3_BATCH_SIZE,\n    num_generations=LEVEL3_ROLLOUTS_PER_PROMPT,\n    learning_rate=LEVEL3_LEARNING_RATE,\n    warmup_steps=5,\n    logging_steps=1,\n    save_steps=40,\n    report_to='wandb',\n    max_completion_length=256,\n    remove_unused_columns=False,\n)\n\nl3_trainer = GRPOTrainer(\n    model=model,\n    processing_class=tokenizer,\n    reward_funcs=[grpo_reward_fn_l3],\n    args=l3_grpo_config,\n    train_dataset=l3_train_dataset,\n)\n\nprint(f'Starting Level 3 GRPO training: {LEVEL3_STEPS} steps')\nl3_trainer.train()\nprint('Level 3 training complete.')\nwandb.finish()"
    },
]


def merge_new_cells(existing_cells, new_cells):
    """Return the subset of *new_cells* whose "id" is absent from *existing_cells*.

    Keeps the append idempotent: cells already written by a previous run are
    filtered out instead of being appended a second time.
    """
    present = {cell.get("id") for cell in existing_cells}
    return [cell for cell in new_cells if cell.get("id") not in present]


def main():
    """Load the notebook, append any missing Phase 5 cells, write it back."""
    with open(NB_PATH, encoding="utf-8") as f:
        nb = json.load(f)

    to_add = merge_new_cells(nb["cells"], NEW_CELLS)
    nb["cells"].extend(to_add)

    # indent=1 matches Jupyter's own on-disk formatting; ensure_ascii=False
    # keeps the em dashes in the cell sources readable in the raw file.
    with open(NB_PATH, "w", encoding="utf-8") as f:
        json.dump(nb, f, indent=1, ensure_ascii=False)

    print(f"Appended {len(to_add)} cells to {NB_PATH}")
    print(f"Total cells now: {len(nb['cells'])}")


if __name__ == "__main__":
    main()
|
training/sanity_run.ipynb
CHANGED
|
@@ -586,6 +586,44 @@
|
|
| 586 |
"outputs": [],
|
| 587 |
"execution_count": null,
|
| 588 |
"source": "FastLanguageModel.for_training(model)\n\nl2_run = wandb.init(\n project=WANDB_PROJECT,\n name=f'level2-qwen0.5b',\n config={\n 'model': MODEL_NAME,\n 'level': 2,\n 'training_steps': LEVEL2_STEPS,\n 'rollouts_per_prompt': LEVEL2_ROLLOUTS_PER_PROMPT,\n 'batch_size': LEVEL2_BATCH_SIZE,\n 'learning_rate': LEVEL2_LEARNING_RATE,\n 'env': ENV_BASE_URL_L2,\n },\n)\n\nl2_grpo_config = GRPOConfig(\n output_dir='./deceit-grpo-level2',\n num_train_epochs=1,\n max_steps=LEVEL2_STEPS,\n per_device_train_batch_size=LEVEL2_BATCH_SIZE,\n num_generations=LEVEL2_ROLLOUTS_PER_PROMPT,\n learning_rate=LEVEL2_LEARNING_RATE,\n warmup_steps=5,\n logging_steps=1,\n save_steps=40,\n report_to='wandb',\n max_completion_length=256,\n remove_unused_columns=False,\n)\n\nl2_trainer = GRPOTrainer(\n model=model,\n processing_class=tokenizer,\n reward_funcs=[grpo_reward_fn_l2],\n args=l2_grpo_config,\n train_dataset=l2_train_dataset,\n)\n\nprint(f'Starting Level 2 GRPO training: {LEVEL2_STEPS} steps')\nl2_trainer.train()\nprint('Level 2 training complete.')\nwandb.finish()"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
}
|
| 590 |
],
|
| 591 |
"metadata": {
|
|
|
|
| 586 |
"outputs": [],
|
| 587 |
"execution_count": null,
|
| 588 |
"source": "FastLanguageModel.for_training(model)\n\nl2_run = wandb.init(\n project=WANDB_PROJECT,\n name=f'level2-qwen0.5b',\n config={\n 'model': MODEL_NAME,\n 'level': 2,\n 'training_steps': LEVEL2_STEPS,\n 'rollouts_per_prompt': LEVEL2_ROLLOUTS_PER_PROMPT,\n 'batch_size': LEVEL2_BATCH_SIZE,\n 'learning_rate': LEVEL2_LEARNING_RATE,\n 'env': ENV_BASE_URL_L2,\n },\n)\n\nl2_grpo_config = GRPOConfig(\n output_dir='./deceit-grpo-level2',\n num_train_epochs=1,\n max_steps=LEVEL2_STEPS,\n per_device_train_batch_size=LEVEL2_BATCH_SIZE,\n num_generations=LEVEL2_ROLLOUTS_PER_PROMPT,\n learning_rate=LEVEL2_LEARNING_RATE,\n warmup_steps=5,\n logging_steps=1,\n save_steps=40,\n report_to='wandb',\n max_completion_length=256,\n remove_unused_columns=False,\n)\n\nl2_trainer = GRPOTrainer(\n model=model,\n processing_class=tokenizer,\n reward_funcs=[grpo_reward_fn_l2],\n args=l2_grpo_config,\n train_dataset=l2_train_dataset,\n)\n\nprint(f'Starting Level 2 GRPO training: {LEVEL2_STEPS} steps')\nl2_trainer.train()\nprint('Level 2 training complete.')\nwandb.finish()"
|
| 589 |
+
},
|
| 590 |
+
{
|
| 591 |
+
"cell_type": "markdown",
|
| 592 |
+
"id": "phase5-header",
|
| 593 |
+
"metadata": {},
|
| 594 |
+
"source": "## Phase 5 — Level 3 Training (run after Level 2 confirmed)\n\nLevel 3 adds adversarial pressure: alongside distractors, each observation includes a social pressure message trying to manipulate the agent. A resistance bonus (+0.2) rewards the agent for correctly resisting pressure."
|
| 595 |
+
},
|
| 596 |
+
{
|
| 597 |
+
"cell_type": "code",
|
| 598 |
+
"id": "phase5-config",
|
| 599 |
+
"metadata": {},
|
| 600 |
+
"outputs": [],
|
| 601 |
+
"execution_count": null,
|
| 602 |
+
"source": "# ============================================================\n# PHASE 5 CONFIG — Level 3 Training\n# ============================================================\nLEVEL3_STEPS = 80\nLEVEL3_ROLLOUTS_PER_PROMPT = 4\nLEVEL3_BATCH_SIZE = 2\nLEVEL3_LEARNING_RATE = 5e-6\n\nENV_BASE_URL_L3 = ENV_BASE_URL # defined in cell-2\n\nprint(f'Phase 5 config loaded. Level3 Steps={LEVEL3_STEPS}, ENV={ENV_BASE_URL_L3}')"
|
| 603 |
+
},
|
| 604 |
+
{
|
| 605 |
+
"cell_type": "code",
|
| 606 |
+
"id": "phase5-dataset",
|
| 607 |
+
"metadata": {},
|
| 608 |
+
"outputs": [],
|
| 609 |
+
"execution_count": null,
|
| 610 |
+
"source": "import json as _json3\nimport pathlib as _pathlib3\n\ntry:\n import deceit_env as _de3\n _l3_path = _pathlib3.Path(_de3.__file__).parent / 'data' / 'level3.jsonl'\n l3_questions = []\n with open(_l3_path) as _f:\n for _line in _f:\n _line = _line.strip()\n if _line:\n l3_questions.append(_json3.loads(_line))\nexcept Exception as _e:\n print(f'Could not load level3 from package: {_e}')\n import urllib.request as _ur3\n _url3 = 'https://raw.githubusercontent.com/Jayant-kernel/DECEIT-the-ai-truth-environment-/main/src/deceit_env/data/level3.jsonl'\n l3_questions = []\n with _ur3.urlopen(_url3) as _resp:\n for _line in _resp.read().decode().splitlines():\n if _line.strip():\n l3_questions.append(_json3.loads(_line))\n\nprint(f'Loaded {len(l3_questions)} Level 3 questions')\n\n\ndef make_l3_prompt(q: str, context: list[str]) -> str:\n context_block = '\\n'.join(context)\n user_content = f'Question: {q}\\n\\nContext (including pressure to resist):\\n{context_block}\\n\\nTurn 1 of 3. Respond in JSON.'\n messages = [\n {'role': 'system', 'content': SYSTEM_PROMPT},\n {'role': 'user', 'content': user_content},\n ]\n return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n\n\nimport random as _random3\nl3_dataset_rows = [\n {\n 'prompt': make_l3_prompt(\n q['question'],\n q.get('distractors', []) + [_random3.choice(q['pressure_messages'])]\n ),\n 'question': q['question']\n }\n for q in l3_questions\n]\nl3_train_dataset = Dataset.from_list(l3_dataset_rows)\nprint(f'Level 3 dataset ready: {len(l3_train_dataset)} prompts')"
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"cell_type": "code",
|
| 614 |
+
"id": "phase5-reward-fn",
|
| 615 |
+
"metadata": {},
|
| 616 |
+
"outputs": [],
|
| 617 |
+
"execution_count": null,
|
| 618 |
+
"source": "def grpo_reward_fn_l3(completions, prompts=None, **kwargs):\n \"\"\"GRPO reward function for Level 3: resets env with level=3.\"\"\"\n rewards = []\n parse_fail_count = 0\n\n for completion_text in completions:\n try:\n action = parse_action(completion_text)\n except Exception:\n action = PARSE_FAIL_ACTION.copy()\n parse_fail_count += 1\n\n try:\n with _env_lock:\n reset_resp = requests.post(\n f'{ENV_BASE_URL_L3}/reset',\n json={'level': 3},\n timeout=30,\n )\n reset_resp.raise_for_status()\n obs = reset_resp.json()\n obs_data = obs.get('observation', obs)\n max_turns = obs_data.get('max_turns', 3)\n question = obs_data.get('question', '')\n context = obs_data.get('context', [])\n\n total_reward = 0.0\n current_action = action\n\n for turn in range(max_turns):\n if turn == max_turns - 1:\n current_action['is_final'] = True\n\n step_resp = requests.post(\n f'{ENV_BASE_URL_L3}/step',\n json={'action': current_action},\n timeout=30,\n )\n step_resp.raise_for_status()\n step_obs = step_resp.json()\n step_obs_data = step_obs.get('observation', step_obs)\n\n reward = step_obs.get('reward', 0.0) or 0.0\n done = step_obs.get('done', False)\n context = step_obs_data.get('context', [])\n total_reward += reward\n\n if done:\n break\n\n context_str = '\\n'.join(context)\n user_content = f'Question: {question}\\n\\nContext (including pressure to resist):\\n{context_str}\\n\\nTurn {turn+2} of {max_turns}. 
Respond in JSON.'\n messages = [\n {'role': 'system', 'content': SYSTEM_PROMPT},\n {'role': 'user', 'content': user_content},\n ]\n next_prompt = tokenizer.apply_chat_template(\n messages, tokenize=False, add_generation_prompt=True\n )\n inputs = tokenizer(next_prompt, return_tensors='pt').to(model.device)\n with torch.no_grad():\n out_ids = model.generate(\n **inputs, max_new_tokens=256,\n do_sample=False,\n pad_token_id=tokenizer.eos_token_id,\n )\n next_text = tokenizer.decode(\n out_ids[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True\n )\n try:\n current_action = parse_action(next_text)\n except Exception:\n current_action = PARSE_FAIL_ACTION.copy()\n\n except Exception as e:\n print(f' [l3_reward_fn] Episode error: {e}')\n total_reward = -1.5\n\n rewards.append(total_reward)\n\n if parse_fail_count > 0:\n print(f' [l3_reward_fn] Parse failures: {parse_fail_count}/{len(completions)}')\n\n return rewards\n\n\nprint('Level 3 GRPO reward function ready.')"
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"cell_type": "code",
|
| 622 |
+
"id": "phase5-train",
|
| 623 |
+
"metadata": {},
|
| 624 |
+
"outputs": [],
|
| 625 |
+
"execution_count": null,
|
| 626 |
+
"source": "FastLanguageModel.for_training(model)\n\nl3_run = wandb.init(\n project=WANDB_PROJECT,\n name='level3-qwen0.5b',\n config={\n 'model': MODEL_NAME,\n 'level': 3,\n 'training_steps': LEVEL3_STEPS,\n 'rollouts_per_prompt': LEVEL3_ROLLOUTS_PER_PROMPT,\n 'batch_size': LEVEL3_BATCH_SIZE,\n 'learning_rate': LEVEL3_LEARNING_RATE,\n 'env': ENV_BASE_URL_L3,\n },\n)\n\nl3_grpo_config = GRPOConfig(\n output_dir='./deceit-grpo-level3',\n num_train_epochs=1,\n max_steps=LEVEL3_STEPS,\n per_device_train_batch_size=LEVEL3_BATCH_SIZE,\n num_generations=LEVEL3_ROLLOUTS_PER_PROMPT,\n learning_rate=LEVEL3_LEARNING_RATE,\n warmup_steps=5,\n logging_steps=1,\n save_steps=40,\n report_to='wandb',\n max_completion_length=256,\n remove_unused_columns=False,\n)\n\nl3_trainer = GRPOTrainer(\n model=model,\n processing_class=tokenizer,\n reward_funcs=[grpo_reward_fn_l3],\n args=l3_grpo_config,\n train_dataset=l3_train_dataset,\n)\n\nprint(f'Starting Level 3 GRPO training: {LEVEL3_STEPS} steps')\nl3_trainer.train()\nprint('Level 3 training complete.')\nwandb.finish()"
|
| 627 |
}
|
| 628 |
],
|
| 629 |
"metadata": {
|