YUS200619 committed
Commit b28acab · verified · 1 parent: 9497e48

Upload folder using huggingface_hub

notebooks/training.ipynb CHANGED
@@ -1,293 +1,326 @@
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# SWEbench-IN GRPO Training Notebook\n",
- "\n",
- "This notebook trains a Qwen2.5-3B-Instruct model using GRPO (Group Relative Policy Optimization)\n",
- "to act as an Indian SWE fixing broken Linux systems while managing stakeholder communication.\n",
- "\n",
- "**Prerequisites:**\n",
- "- A running SWEbench-IN HuggingFace Space\n",
- "- A Weights & Biases account\n",
- "- Google Colab with GPU runtime (T4 or better)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Cell 1 Install Dependencies"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install unsloth trl transformers accelerate openenv-client wandb -q"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Cell 2 — Import and Configure"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import wandb\n",
- "import random\n",
- "import re\n",
- "import json\n",
- "from unsloth import FastLanguageModel\n",
- "from trl import GRPOTrainer, GRPOConfig\n",
- "from openenv.client import Environment as OpenEnvClient\n",
- "\n",
- "wandb.init(project=\"swebench-in\", name=\"grpo-run-1\")\n",
- "\n",
- "HF_SPACE_URL = \"YOUR_HF_SPACE_URL_HERE\" # Replace before running\n",
- "env = OpenEnvClient(HF_SPACE_URL)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Cell 3 Load Model (Qwen2.5-3B-Instruct, 4-bit QLoRA via Unsloth)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model, tokenizer = FastLanguageModel.from_pretrained(\n",
- "    model_name=\"Qwen/Qwen2.5-3B-Instruct\",\n",
- "    max_seq_length=2048,\n",
- "    dtype=None,\n",
- "    load_in_4bit=True,\n",
- ")\n",
- "model = FastLanguageModel.get_peft_model(\n",
- "    model,\n",
- "    r=16,\n",
- "    target_modules=[\"q_proj\", \"v_proj\"],\n",
- "    lora_alpha=16,\n",
- "    lora_dropout=0,\n",
- "    bias=\"none\",\n",
- "    use_gradient_checkpointing=True,\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Cell 4 — Define Rollout Function"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def parse_action(action_text: str) -> dict:\n",
- "    \"\"\"\n",
- "    Parse the model's generated text into an action dict.\n",
- "    Expected format: ACTION_TYPE: args\n",
- "    \"\"\"\n",
- "    action_text = action_text.strip()\n",
- "    # Try to find action pattern\n",
- "    match = re.search(r'(run_command|read_file|write_file|run_tests|check_server|reply_slack|reply_email|reply_hr|close_case)[:\\s]+(.*)', action_text, re.DOTALL)\n",
- "    if match:\n",
- "        return {\"type\": match.group(1), \"args\": match.group(2).strip()}\n",
- "    # Default: treat as run_command\n",
- "    return {\"type\": \"run_command\", \"args\": action_text}\n",
- "\n",
- "\n",
- "def rollout(prompt: str, task_id: int) -> tuple[list[str], float]:\n",
- "    \"\"\"\n",
- "    Run one episode. Return (action_sequence, total_reward).\n",
- "    Uses sampling with temperature 0.7.\n",
- "    \"\"\"\n",
- "    obs = env.reset(task_id=task_id)\n",
- "    actions = []\n",
- "    total_reward = 0.0\n",
- "    done = False\n",
- "\n",
- "    while not done:\n",
- "        inputs = tokenizer(f\"Observation: {obs}\\nAction:\", return_tensors=\"pt\")\n",
- "        output = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)\n",
- "        action_text = tokenizer.decode(output[0], skip_special_tokens=True)\n",
- "        action = parse_action(action_text)\n",
- "        obs, reward, done, info = env.step(action)\n",
- "        actions.append(action_text)\n",
- "        total_reward += reward\n",
- "\n",
- "    return actions, total_reward"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Cell 5 — Curriculum Training Loop\n",
- "\n",
- "Curriculum escalates when average reward over last 50 episodes crosses 0.6:\n",
- "- **Tier 1** (Steps 0–200): Tasks 1+2 only (easy, technical reward)\n",
- "- **Tier 2** (Steps 200–500): Add Tasks 3+4 (communication reward added)\n",
- "- **Tier 3** (Steps 500+): Add Task 5 (leave protection added)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Curriculum: tier 1 tasks first (1,2), then tier 2 (3,4), then tier 3 (5)\n",
- "CURRICULUM = {\n",
- "    \"tier1\": [1, 2],\n",
- "    \"tier2\": [3, 4],\n",
- "    \"tier3\": [5],\n",
- "}\n",
- "\n",
- "current_tier = \"tier1\"\n",
- "tier_rewards = []\n",
- "\n",
- "for step in range(700):\n",
- "    task_id = random.choice(CURRICULUM[current_tier])\n",
- "    actions, reward = rollout(\"\", task_id)\n",
- "\n",
- "    # Log to wandb\n",
- "    wandb.log({\n",
- "        \"reward/total\": reward,\n",
- "        \"training_step\": step,\n",
- "        \"task_id\": task_id,\n",
- "        \"current_tier\": current_tier,\n",
- "        \"num_actions\": len(actions),\n",
- "    })\n",
- "\n",
- "    tier_rewards.append(reward)\n",
- "\n",
- "    # Escalate curriculum\n",
- "    if len(tier_rewards) >= 50 and sum(tier_rewards[-50:]) / 50 >= 0.6:\n",
- "        if current_tier == \"tier1\":\n",
- "            current_tier = \"tier2\"\n",
- "            tier_rewards = []\n",
- "            print(f\"Step {step}: Escalating to tier 2\")\n",
- "        elif current_tier == \"tier2\":\n",
- "            current_tier = \"tier3\"\n",
- "            tier_rewards = []\n",
- "            print(f\"Step {step}: Escalating to tier 3\")\n",
- "\n",
- "    if step % 50 == 0:\n",
- "        avg = sum(tier_rewards[-50:]) / max(len(tier_rewards[-50:]), 1)\n",
- "        print(f\"Step {step} | Tier: {current_tier} | Avg reward (last 50): {avg:.3f}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Cell 6 — Save Model Correctly\n",
- "\n",
- "**CRITICAL:** Do NOT merge LoRA into 4-bit base model — this damages quality.\n",
- "Use `save_pretrained` with `method=\"lora\"`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# CRITICAL: Do NOT merge LoRA into 4-bit base. Use save_pretrained with method=\"lora\"\n",
- "model.save_pretrained(\"swebench-in-lora\")\n",
- "tokenizer.save_pretrained(\"swebench-in-lora\")\n",
- "# Push to hub\n",
- "model.push_to_hub(\"YOUR_HF_USERNAME/swebench-in-lora\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Cell 7 — Generate and Commit Training Plots\n",
- "\n",
- "Both plots must be committed as `.png` files to the repo.\n",
- "Wandb-only links do not count for the automated validation check."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "import os\n",
- "\n",
- "# Pull run history from wandb\n",
- "run = wandb.run\n",
- "history = run.history()\n",
- "\n",
- "os.makedirs(\"plots\", exist_ok=True)\n",
- "\n",
- "# --- Reward Curve ---\n",
- "fig, ax = plt.subplots(figsize=(10, 5))\n",
- "ax.plot(history[\"training_step\"], history[\"reward/total\"],\n",
- "        label=\"Trained Agent\", color=\"steelblue\")\n",
- "ax.axhline(y=-0.4, color=\"orange\", linestyle=\"--\",\n",
- "           label=\"Untrained Baseline (-0.4)\")\n",
- "ax.set_xlabel(\"Training Step\")\n",
- "ax.set_ylabel(\"Episode Reward\")\n",
- "ax.set_title(\"SWEbench-IN: Training Reward Curve\")\n",
- "ax.legend()\n",
- "ax.grid(True, alpha=0.3)\n",
- "plt.tight_layout()\n",
- "plt.savefig(\"plots/reward_curve.png\", dpi=150)\n",
- "plt.show()\n",
- "print(\"plots/reward_curve.png saved. Commit it to your repo now.\")\n",
- "\n",
- "# --- Loss Curve ---\n",
- "fig, ax = plt.subplots(figsize=(10, 5))\n",
- "if \"loss\" in history.columns:\n",
- "    ax.plot(history[\"training_step\"], history[\"loss\"],\n",
- "            label=\"Policy Loss\", color=\"crimson\")\n",
- "ax.set_xlabel(\"Training Step\")\n",
- "ax.set_ylabel(\"Loss\")\n",
- "ax.set_title(\"SWEbench-IN: Policy Loss Curve\")\n",
- "ax.legend()\n",
- "ax.grid(True, alpha=0.3)\n",
- "plt.tight_layout()\n",
- "plt.savefig(\"plots/loss_curve.png\", dpi=150)\n",
- "plt.show()\n",
- "print(\"plots/loss_curve.png saved. Commit it to your repo now.\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "name": "python",
- "version": "3.11.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
- }
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# SWEbench-IN \u2014 GRPO Training Notebook\n",
+ "\n",
+ "This notebook trains a Qwen2.5-3B-Instruct model using GRPO (Group Relative Policy Optimization)\n",
+ "to act as an Indian SWE \u2014 fixing broken Linux systems while managing stakeholder communication.\n",
+ "\n",
+ "**Prerequisites:**\n",
+ "- A running SWEbench-IN HuggingFace Space\n",
+ "- A Weights & Biases account\n",
+ "- Google Colab with GPU runtime (T4 or better)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Cell 1 \u2014 Install Dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[31mERROR: Ignored the following versions that require a different python version: 2025.3.4 Requires-Python <=3.12,>=3.9\u001b[0m\u001b[31m\n",
+ "\u001b[0m\u001b[31mERROR: Could not find a version that satisfies the requirement openenv-client (from versions: none)\u001b[0m\u001b[31m\n",
+ "\u001b[0m\u001b[31mERROR: No matching distribution found for openenv-client\u001b[0m\u001b[31m\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install unsloth trl transformers accelerate openenv-core[core]>=0.2.2 wandb -q"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Cell 2 \u2014 Import and Configure"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ModuleNotFoundError",
+ "evalue": "No module named 'unsloth'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/tmp/ipykernel_1776/4025435489.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0munsloth\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mFastLanguageModel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtrl\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mGRPOTrainer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGRPOConfig\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mopenenv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclient\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mEnvironment\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mOpenEnvClient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'unsloth'",
+ "",
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "import wandb\n",
+ "import random\n",
+ "import re\n",
+ "import json\n",
+ "from unsloth import FastLanguageModel\n",
+ "from trl import GRPOTrainer, GRPOConfig\n",
+ "from openenv.client import Environment as OpenEnvClient\n",
+ "\n",
+ "wandb.init(project=\"swebench-in\", name=\"grpo-run-1\")\n",
+ "\n",
+ "HF_SPACE_URL = \"YOUR_HF_SPACE_URL_HERE\" # Replace before running\n",
+ "env = OpenEnvClient(HF_SPACE_URL)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Cell 3 \u2014 Load Model (Qwen2.5-3B-Instruct, 4-bit QLoRA via Unsloth)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+ "    model_name=\"Qwen/Qwen2.5-3B-Instruct\",\n",
+ "    max_seq_length=2048,\n",
+ "    dtype=None,\n",
+ "    load_in_4bit=True,\n",
+ ")\n",
+ "model = FastLanguageModel.get_peft_model(\n",
+ "    model,\n",
+ "    r=16,\n",
+ "    target_modules=[\"q_proj\", \"v_proj\"],\n",
+ "    lora_alpha=16,\n",
+ "    lora_dropout=0,\n",
+ "    bias=\"none\",\n",
+ "    use_gradient_checkpointing=True,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Cell 4 \u2014 Define Rollout Function"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def parse_action(action_text: str) -> dict:\n",
+ "    \"\"\"\n",
+ "    Parse the model's generated text into an action dict.\n",
+ "    Expected format: ACTION_TYPE: args\n",
+ "    \"\"\"\n",
+ "    action_text = action_text.strip()\n",
+ "    # Try to find action pattern\n",
+ "    match = re.search(r'(run_command|read_file|write_file|run_tests|check_server|reply_slack|reply_email|reply_hr|close_case)[:\\s]+(.*)', action_text, re.DOTALL)\n",
+ "    if match:\n",
+ "        return {\"type\": match.group(1), \"args\": match.group(2).strip()}\n",
+ "    # Default: treat as run_command\n",
+ "    return {\"type\": \"run_command\", \"args\": action_text}\n",
+ "\n",
+ "\n",
+ "def rollout(prompt: str, task_id: int) -> tuple[list[str], float]:\n",
+ "    \"\"\"\n",
+ "    Run one episode. Return (action_sequence, total_reward).\n",
+ "    Uses sampling with temperature 0.7.\n",
+ "    \"\"\"\n",
+ "    obs = env.reset(task_id=task_id)\n",
+ "    actions = []\n",
+ "    total_reward = 0.0\n",
+ "    done = False\n",
+ "\n",
+ "    while not done:\n",
+ " inputs = tokenizer(f\"Observation: {obs}\\nAction:\", return_tensors=\"pt\")\n",
155
+ " output = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)\n",
156
+ " action_text = tokenizer.decode(output[0], skip_special_tokens=True)\n",
157
+ " action = parse_action(action_text)\n",
158
+ " obs, reward, done, info = env.step(action)\n",
159
+ " actions.append(action_text)\n",
160
+ " total_reward += reward\n",
161
+ "\n",
162
+ " return actions, total_reward"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "markdown",
167
+ "metadata": {},
168
+ "source": [
169
+ "## Cell 5 \u2014 Curriculum Training Loop\n",
170
+ "\n",
171
+ "Curriculum escalates when average reward over last 50 episodes crosses 0.6:\n",
172
+ "- **Tier 1** (Steps 0\u2013200): Tasks 1+2 only (easy, technical reward)\n",
173
+ "- **Tier 2** (Steps 200\u2013500): Add Tasks 3+4 (communication reward added)\n",
174
+ "- **Tier 3** (Steps 500+): Add Task 5 (leave protection added)"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": null,
180
+ "metadata": {},
181
+ "outputs": [],
182
+ "source": [
183
+ "# Curriculum: tier 1 tasks first (1,2), then tier 2 (3,4), then tier 3 (5)\n",
184
+ "CURRICULUM = {\n",
185
+ " \"tier1\": [1, 2],\n",
186
+ " \"tier2\": [3, 4],\n",
187
+ " \"tier3\": [5],\n",
188
+ "}\n",
189
+ "\n",
190
+ "current_tier = \"tier1\"\n",
191
+ "tier_rewards = []\n",
192
+ "\n",
193
+ "for step in range(700):\n",
194
+ " task_id = random.choice(CURRICULUM[current_tier])\n",
195
+ " actions, reward = rollout(\"\", task_id)\n",
196
+ "\n",
197
+ " # Log to wandb\n",
198
+ " wandb.log({\n",
199
+ " \"reward/total\": reward,\n",
200
+ " \"training_step\": step,\n",
201
+ " \"task_id\": task_id,\n",
202
+ " \"current_tier\": current_tier,\n",
203
+ " \"num_actions\": len(actions),\n",
204
+ " })\n",
205
+ "\n",
206
+ " tier_rewards.append(reward)\n",
207
+ "\n",
208
+ " # Escalate curriculum\n",
209
+ " if len(tier_rewards) >= 50 and sum(tier_rewards[-50:]) / 50 >= 0.6:\n",
210
+ " if current_tier == \"tier1\":\n",
211
+ " current_tier = \"tier2\"\n",
212
+ " tier_rewards = []\n",
213
+ " print(f\"Step {step}: Escalating to tier 2\")\n",
214
+ " elif current_tier == \"tier2\":\n",
215
+ " current_tier = \"tier3\"\n",
216
+ " tier_rewards = []\n",
217
+ " print(f\"Step {step}: Escalating to tier 3\")\n",
218
+ "\n",
219
+ " if step % 50 == 0:\n",
220
+ " avg = sum(tier_rewards[-50:]) / max(len(tier_rewards[-50:]), 1)\n",
221
+ " print(f\"Step {step} | Tier: {current_tier} | Avg reward (last 50): {avg:.3f}\")"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "markdown",
226
+ "metadata": {},
227
+ "source": [
228
+ "## Cell 6 \u2014 Save Model Correctly\n",
229
+ "\n",
230
+ "**CRITICAL:** Do NOT merge LoRA into 4-bit base model \u2014 this damages quality.\n",
231
+ "Use `save_pretrained` with `method=\"lora\"`."
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "metadata": {},
238
+ "outputs": [],
239
+ "source": [
240
+ "# CRITICAL: Do NOT merge LoRA into 4-bit base. Use save_pretrained with method=\"lora\"\n",
241
+ "model.save_pretrained(\"swebench-in-lora\")\n",
242
+ "tokenizer.save_pretrained(\"swebench-in-lora\")\n",
243
+ "# Push to hub\n",
244
+ "model.push_to_hub(\"YOUR_HF_USERNAME/swebench-in-lora\")"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "markdown",
249
+ "metadata": {},
250
+ "source": [
251
+ "## Cell 7 \u2014 Generate and Commit Training Plots\n",
252
+ "\n",
253
+ "Both plots must be committed as `.png` files to the repo.\n",
254
+ "Wandb-only links do not count for the automated validation check."
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": null,
260
+ "metadata": {},
261
+ "outputs": [],
262
+ "source": [
263
+ "import matplotlib.pyplot as plt\n",
264
+ "import os\n",
265
+ "\n",
266
+ "# Pull run history from wandb\n",
267
+ "run = wandb.run\n",
268
+ "history = run.history()\n",
269
+ "\n",
270
+ "os.makedirs(\"plots\", exist_ok=True)\n",
271
+ "\n",
272
+ "# --- Reward Curve ---\n",
273
+ "fig, ax = plt.subplots(figsize=(10, 5))\n",
274
+ "ax.plot(history[\"training_step\"], history[\"reward/total\"],\n",
275
+ " label=\"Trained Agent\", color=\"steelblue\")\n",
276
+ "ax.axhline(y=-0.4, color=\"orange\", linestyle=\"--\",\n",
277
+ " label=\"Untrained Baseline (-0.4)\")\n",
278
+ "ax.set_xlabel(\"Training Step\")\n",
279
+ "ax.set_ylabel(\"Episode Reward\")\n",
280
+ "ax.set_title(\"SWEbench-IN: Training Reward Curve\")\n",
281
+ "ax.legend()\n",
282
+ "ax.grid(True, alpha=0.3)\n",
283
+ "plt.tight_layout()\n",
284
+ "plt.savefig(\"plots/reward_curve.png\", dpi=150)\n",
285
+ "plt.show()\n",
286
+ "print(\"plots/reward_curve.png saved. Commit it to your repo now.\")\n",
287
+ "\n",
288
+ "# --- Loss Curve ---\n",
289
+ "fig, ax = plt.subplots(figsize=(10, 5))\n",
290
+ "if \"loss\" in history.columns:\n",
291
+ " ax.plot(history[\"training_step\"], history[\"loss\"],\n",
292
+ " label=\"Policy Loss\", color=\"crimson\")\n",
293
+ "ax.set_xlabel(\"Training Step\")\n",
294
+ "ax.set_ylabel(\"Loss\")\n",
295
+ "ax.set_title(\"SWEbench-IN: Policy Loss Curve\")\n",
296
+ "ax.legend()\n",
297
+ "ax.grid(True, alpha=0.3)\n",
298
+ "plt.tight_layout()\n",
299
+ "plt.savefig(\"plots/loss_curve.png\", dpi=150)\n",
300
+ "plt.show()\n",
301
+ "print(\"plots/loss_curve.png saved. Commit it to your repo now.\")"
302
+ ]
303
+ }
304
+ ],
305
+ "metadata": {
306
+ "kernelspec": {
307
+ "display_name": "Python 3 (ipykernel)",
308
+ "language": "python",
309
+ "name": "python3"
310
+ },
311
+ "language_info": {
312
+ "codemirror_mode": {
313
+ "name": "ipython",
314
+ "version": 3
315
+ },
316
+ "file_extension": ".py",
317
+ "mimetype": "text/x-python",
318
+ "name": "python",
319
+ "nbconvert_exporter": "python",
320
+ "pygments_lexer": "ipython3",
321
+ "version": "3.12.13"
322
+ }
323
+ },
324
+ "nbformat": 4,
325
+ "nbformat_minor": 4
326
+ }
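
The notebook imports `GRPOTrainer` and `GRPOConfig`, but the loop in Cell 5 only samples rollouts and logs rewards; no policy update ever runs. Below is a minimal sketch of how the environment reward could be wired into TRL's `GRPOTrainer`, assuming TRL's standard GRPO API; the `reward_fn`, the prompt dataset, and the single-step scoring are illustrative additions, not part of this commit.

```python
# Sketch only (not in this commit): connect the environment reward to TRL's
# GRPOTrainer. Assumes `env`, `model`, `tokenizer`, and `parse_action` from
# the notebook cells above, and TRL's GRPO API with callable reward functions.
from datasets import Dataset
from trl import GRPOConfig, GRPOTrainer

def reward_fn(prompts, completions, task_id, **kwargs):
    """Score each sampled completion by replaying its first parsed action.

    TRL passes extra dataset columns (here `task_id`) to reward functions
    as keyword arguments, one list entry per completion.
    """
    rewards = []
    for completion, tid in zip(completions, task_id):
        env.reset(task_id=tid)
        _, reward, _, _ = env.step(parse_action(completion))
        rewards.append(float(reward))
    return rewards

# One prompt per tier-1 task; GRPO samples `num_generations` completions
# per prompt and normalizes rewards within each group.
train_dataset = Dataset.from_list(
    [{"prompt": f"Observation: task {tid} start\nAction:", "task_id": tid}
     for tid in (1, 2)]
)

trainer = GRPOTrainer(
    model=model,
    reward_funcs=reward_fn,
    args=GRPOConfig(
        output_dir="grpo-ckpts",
        num_generations=4,            # group size for the relative advantage
        per_device_train_batch_size=4,
        max_completion_length=100,
        logging_steps=10,
    ),
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
trainer.train()
```

This scores only the first action of each completion; proper credit assignment over a full episode would require folding the multi-step rollout loop from Cell 4 into the reward function.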
server/app.py CHANGED
@@ -26,12 +26,12 @@ except Exception as e: # pragma: no cover
         "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
     ) from e
 
-try:
-    from ..models import SWEbenchINAction, SWEbenchINObservation
-    from .swebench_in_environment import SWEbenchINEnvironment
-except ModuleNotFoundError:
-    from models import SWEbenchINAction, SWEbenchINObservation
-    from server.swebench_in_environment import SWEbenchINEnvironment
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from models import SWEbenchINAction, SWEbenchINObservation
+from server.swebench_in_environment import SWEbenchINEnvironment
 
 
 # Create the app with web interface and README integration
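
The change replaces the fragile try/except dual-import fallback with an explicit `sys.path` prepend of the repo root, so `models` and `server.*` resolve the same way whether `app.py` runs as a script or inside the Space container. An equivalent formulation using `pathlib` is sketched below for comparison; it is illustrative only and not what the commit ships.

```python
# Equivalent sketch of the sys.path fix using pathlib (illustrative only).
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent  # server/ -> repo root
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from models import SWEbenchINAction, SWEbenchINObservation
from server.swebench_in_environment import SWEbenchINEnvironment
```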
server/swebench_in_environment.py CHANGED
@@ -10,10 +10,7 @@ from uuid import uuid4
 from openenv.core.env_server.interfaces import Environment
 from openenv.core.env_server.types import State
 
-try:
-    from ..models import SWEbenchINAction, SWEbenchINObservation
-except ImportError:
-    from models import SWEbenchINAction, SWEbenchINObservation
+from models import SWEbenchINAction, SWEbenchINObservation
 
 import sys
 import os
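
With both server modules importing `models` absolutely, the environment can be exercised end to end from the notebook's client. A quick smoke test against a deployed Space, reusing the same client import and call shapes the notebook uses (the URL is a placeholder to replace):

```python
# Smoke test (sketch): one reset/step round trip against the Space,
# using the same client import and call shapes as the notebook.
from openenv.client import Environment as OpenEnvClient

env = OpenEnvClient("YOUR_HF_SPACE_URL_HERE")  # replace with your Space URL

obs = env.reset(task_id=1)
print("initial observation:", obs)

# The notebook's parse_action default: bare text becomes a run_command action.
obs, reward, done, info = env.step({"type": "run_command", "args": "uname -a"})
print("reward:", reward, "done:", done)
```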