YUS200619 committed on
Commit
ad89aed
·
1 Parent(s): 054a3c3

changes made

Files changed (2)
  1. environment.py +13 -9
  2. notebooks/training.ipynb +325 -516
environment.py CHANGED
@@ -12,7 +12,7 @@ from dataclasses import dataclass, field
12
 
13
  from tasks import TASKS, Task
14
  from simulator import Simulator
15
- from rewards import compute_reward, RewardBreakdown
16
 
17
 
18
  @dataclass
@@ -54,27 +54,31 @@ class SWEbenchINEnvironment:
54
  def _start_container(self) -> str:
55
  """Start a new Docker container from the swebench-in image."""
56
  try:
57
- result = subprocess.run(
58
  ["docker", "run", "-d", "--name", "swebench-in-env",
59
  "-p", "8080:8080", "swebench-in"],
60
  capture_output=True,
61
  text=True,
62
  timeout=30,
63
  )
64
- container_id = result.stdout.strip()
65
- if not container_id:
66
- # If container already exists, use it
67
- result = subprocess.run(
 
 
68
  ["docker", "start", "swebench-in-env"],
69
  capture_output=True,
70
  text=True,
71
  timeout=10,
72
  )
 
73
  return "swebench-in-env"
74
- return container_id
75
  except (subprocess.TimeoutExpired, FileNotFoundError):
76
- # Fallback: return a placeholder for demo/testing without Docker
77
- return "swebench-in-env"
 
 
78
 
79
  def reset(self, task_id: int = None) -> str:
80
  """
 
12
 
13
  from tasks import TASKS, Task
14
  from simulator import Simulator
15
+ from rewards import compute_reward
16
 
17
 
18
  @dataclass
 
54
  def _start_container(self) -> str:
55
  """Start a new Docker container from the swebench-in image."""
56
  try:
57
+ run_result = subprocess.run(
58
  ["docker", "run", "-d", "--name", "swebench-in-env",
59
  "-p", "8080:8080", "swebench-in"],
60
  capture_output=True,
61
  text=True,
62
  timeout=30,
63
  )
64
+ container_id = run_result.stdout.strip()
65
+ if run_result.returncode == 0 and container_id:
66
+ return container_id
67
+
68
+ # If container already exists (or run failed), try starting it.
69
+ start_result = subprocess.run(
70
  ["docker", "start", "swebench-in-env"],
71
  capture_output=True,
72
  text=True,
73
  timeout=10,
74
  )
75
+ if start_result.returncode == 0:
76
  return "swebench-in-env"
 
77
  except (subprocess.TimeoutExpired, FileNotFoundError):
78
+ pass
79
+
80
+ # Fallback: return a placeholder for demo/testing without Docker.
81
+ return "swebench-in-env"
82
 
83
  def reset(self, task_id: int = None) -> str:
84
  """
notebooks/training.ipynb CHANGED
@@ -1,158 +1,84 @@
1
  {
2
  "cells": [
3
  {
4
- "cell_type": "markdown",
5
- "metadata": {},
 
 
 
 
 
 
 
 
 
 
6
  "source": [
7
- "# SWEbench-IN β€” GRPO Training Notebook\n",
8
- "\n",
9
- "This notebook trains a Qwen2.5-3B-Instruct model using GRPO (Group Relative Policy Optimization)\n",
10
- "to act as an Indian SWE β€” fixing broken Linux systems while managing stakeholder communication.\n",
11
- "\n",
12
- "**Prerequisites:**\n",
13
- "- A running SWEbench-IN HuggingFace Space\n",
14
- "- A Weights & Biases account\n",
15
- "- Google Colab with GPU runtime (T4 or better)"
16
  ]
17
  },
18
  {
19
- "cell_type": "markdown",
20
- "metadata": {},
 
 
 
 
 
 
 
 
 
 
21
  "source": [
22
- "## Cell 1 β€” Install Dependencies"
 
 
23
  ]
24
  },
25
  {
26
  "cell_type": "code",
27
  "execution_count": null,
28
- "metadata": {},
 
 
 
 
 
 
 
 
29
  "outputs": [],
30
  "source": [
 
 
 
 
31
  "\n",
32
- "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\" -q\n",
33
- "!pip install trl transformers accelerate datasets wandb openenv-core -q\n",
34
- "!pip install matplotlib pandas -q"
35
- ]
36
- },
37
- {
38
- "cell_type": "markdown",
39
- "metadata": {},
40
- "source": [
41
- "## Cell 2 β€” Import and Configure"
42
  ]
43
  },
44
  {
45
  "cell_type": "code",
46
- "execution_count": 1,
47
- "metadata": {},
48
- "outputs": [
49
- {
50
- "name": "stderr",
51
- "output_type": "stream",
52
- "text": [
53
- "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m If you're specifying your api key in code, ensure this code is not shared publicly.\n",
54
- "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m Consider setting the WANDB_API_KEY environment variable, or running `wandb login` from the command line.\n",
55
- "\u001b[34m\u001b[1mwandb\u001b[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.\n",
56
- "\u001b[34m\u001b[1mwandb\u001b[0m: No netrc file found, creating one.\n",
57
- "\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n",
58
- "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33myusufindian09\u001b[0m (\u001b[33myusufindian09-aaa\u001b[0m) to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
59
- ]
60
- },
61
- {
62
- "name": "stdout",
63
- "output_type": "stream",
64
- "text": [
65
- "πŸ¦₯ Unsloth: Will patch your computer to enable 2x faster free finetuning.\n",
66
- "πŸ¦₯ Unsloth Zoo will now patch everything to make training faster!\n"
67
- ]
68
- },
69
- {
70
- "data": {
71
- "text/html": [],
72
- "text/plain": [
73
- "<IPython.core.display.HTML object>"
74
- ]
75
- },
76
- "metadata": {},
77
- "output_type": "display_data"
78
- },
79
- {
80
- "data": {
81
- "text/html": [
82
- "Tracking run with wandb version 0.25.1"
83
- ],
84
- "text/plain": [
85
- "<IPython.core.display.HTML object>"
86
- ]
87
- },
88
- "metadata": {},
89
- "output_type": "display_data"
90
- },
91
- {
92
- "data": {
93
- "text/html": [
94
- "Run data is saved locally in <code>/content/wandb/run-20260425_092404-bjgtcv44</code>"
95
- ],
96
- "text/plain": [
97
- "<IPython.core.display.HTML object>"
98
- ]
99
- },
100
- "metadata": {},
101
- "output_type": "display_data"
102
- },
103
- {
104
- "data": {
105
- "text/html": [
106
- "Syncing run <strong><a href='https://wandb.ai/yusufindian09-aaa/swebench-in/runs/bjgtcv44' target=\"_blank\">grpo-run-1</a></strong> to <a href='https://wandb.ai/yusufindian09-aaa/swebench-in' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
107
- ],
108
- "text/plain": [
109
- "<IPython.core.display.HTML object>"
110
- ]
111
- },
112
- "metadata": {},
113
- "output_type": "display_data"
114
- },
115
- {
116
- "data": {
117
- "text/html": [
118
- " View project at <a href='https://wandb.ai/yusufindian09-aaa/swebench-in' target=\"_blank\">https://wandb.ai/yusufindian09-aaa/swebench-in</a>"
119
- ],
120
- "text/plain": [
121
- "<IPython.core.display.HTML object>"
122
- ]
123
- },
124
- "metadata": {},
125
- "output_type": "display_data"
126
- },
127
- {
128
- "data": {
129
- "text/html": [
130
- " View run at <a href='https://wandb.ai/yusufindian09-aaa/swebench-in/runs/bjgtcv44' target=\"_blank\">https://wandb.ai/yusufindian09-aaa/swebench-in/runs/bjgtcv44</a>"
131
- ],
132
- "text/plain": [
133
- "<IPython.core.display.HTML object>"
134
- ]
135
- },
136
- "metadata": {},
137
- "output_type": "display_data"
138
- },
139
- {
140
- "name": "stderr",
141
- "output_type": "stream",
142
- "text": [
143
- "wandb: Detected [huggingface_hub.inference, mcp, openai] in use.\n",
144
- "wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.\n",
145
- "wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/\n"
146
- ]
147
  },
148
- {
149
- "name": "stdout",
150
- "output_type": "stream",
151
- "text": [
152
- "Setup complete\n"
153
- ]
154
- }
155
- ],
156
  "source": [
157
  "import os\n",
158
  "import json\n",
@@ -162,40 +88,69 @@
162
  "import matplotlib.pyplot as plt\n",
163
  "import pandas as pd\n",
164
  "import wandb\n",
165
- "wandb.login(key=\"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\")\n",
 
166
  "from datasets import Dataset\n",
167
  "from unsloth import FastLanguageModel\n",
168
  "from trl import GRPOTrainer, GRPOConfig\n",
169
  "\n",
170
- "HF_SPACE_URL = \"https://huggingface.co/spaces/YUS200619/swebench-ind\"\n",
 
 
171
  "WANDB_PROJECT = \"swebench-in\"\n",
172
  "MODEL_NAME = \"unsloth/Qwen2.5-3B-Instruct\"\n",
173
  "MAX_SEQ_LEN = 2048\n",
174
  "MAX_STEPS = 15\n",
175
  "BASELINE_EPISODES = 20\n",
 
176
  "\n",
 
177
  "wandb.init(project=WANDB_PROJECT, name=\"grpo-run-1\")\n",
178
- "print(\"Setup complete\")"
179
  ]
180
  },
181
  {
182
- "cell_type": "markdown",
183
- "metadata": {},
 
 
 
 
 
 
 
 
 
 
184
  "source": [
185
- "## Cell 3 β€” Load Model (Qwen2.5-3B-Instruct, 4-bit QLoRA via Unsloth)"
 
 
 
 
 
 
186
  ]
187
  },
188
  {
189
  "cell_type": "code",
190
  "execution_count": null,
191
- "metadata": {},
 
 
 
 
 
 
 
 
192
  "outputs": [],
193
  "source": [
194
  "model, tokenizer = FastLanguageModel.from_pretrained(\n",
195
  " model_name=MODEL_NAME,\n",
196
  " max_seq_length=MAX_SEQ_LEN,\n",
197
  " dtype=None,\n",
198
- " load_in_4bit=True, # critical for Colab GPU memory\n",
199
  ")\n",
200
  "\n",
201
  "model = FastLanguageModel.get_peft_model(\n",
@@ -205,40 +160,32 @@
205
  " lora_alpha=16,\n",
206
  " lora_dropout=0,\n",
207
  " bias=\"none\",\n",
208
- " use_gradient_checkpointing=\"unsloth\", # saves even more memory\n",
209
  " random_state=42,\n",
210
  ")\n",
211
  "\n",
212
- "print(f\"Model loaded: {MODEL_NAME}\")\n",
213
- "print(f\"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}\")"
214
- ]
215
- },
216
- {
217
- "cell_type": "markdown",
218
- "metadata": {},
219
- "source": [
220
- "## Cell 4 β€” Define Rollout Function"
221
  ]
222
  },
223
  {
224
  "cell_type": "code",
225
  "execution_count": null,
226
- "metadata": {},
 
 
 
 
 
 
 
 
227
  "outputs": [],
228
  "source": [
229
- "import requests\n",
230
- "\n",
231
  "class SWEBenchINClient:\n",
232
- " \"\"\"\n",
233
- " Simple HTTP client for your HF Space environment.\n",
234
- " Calls the OpenEnv-compliant API endpoints.\n",
235
- " \"\"\"\n",
236
  " def __init__(self, base_url: str):\n",
237
  " self.base_url = base_url.rstrip(\"/\")\n",
238
  " self.session = requests.Session()\n",
239
- " # Test connection\n",
240
- " resp = self.session.get(f\"{self.base_url}/health\", timeout=10)\n",
241
- " print(f\"Environment connected: {resp.json()}\")\n",
242
  "\n",
243
  " def reset(self, task_id: int = None) -> dict:\n",
244
  " payload = {\"task_id\": task_id} if task_id else {}\n",
@@ -266,33 +213,39 @@
266
  " )\n",
267
  "\n",
268
  " def state(self) -> dict:\n",
269
- " resp = self.session.get(f\"{self.base_url}/state\", timeout=10)\n",
 
 
 
270
  " return resp.json()\n",
271
  "\n",
272
  " def grade(self) -> dict:\n",
273
- " resp = self.session.post(f\"{self.base_url}/grade\", timeout=30)\n",
 
 
 
274
  " return resp.json()\n",
275
  "\n",
 
276
  "\n",
277
- "env = SWEBenchINClient(https://huggingface.co/spaces/YUS200619/swebench-ind)"
278
- ]
279
- },
280
- {
281
- "cell_type": "markdown",
282
- "metadata": {},
283
- "source": [
284
- "## Cell 5 β€” Curriculum Training Loop\n",
285
- "\n",
286
- "Curriculum escalates when average reward over last 50 episodes crosses 0.6:\n",
287
- "- **Tier 1** (Steps 0–200): Tasks 1+2 only (easy, technical reward)\n",
288
- "- **Tier 2** (Steps 200–500): Add Tasks 3+4 (communication reward added)\n",
289
- "- **Tier 3** (Steps 500+): Add Task 5 (leave protection added)"
290
  ]
291
  },
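The tier schedule in the removed cell above is descriptive only; the code keeps static task weights (see generate_prompt_dataset later in the diff). A hypothetical escalation gate matching the description, stepping up one tier once the mean reward over the last 50 episodes crosses 0.6 (names are illustrative, not from this commit):

TIERS = [[1, 2], [1, 2, 3, 4], [1, 2, 3, 4, 5]]  # Tier 1 / 2 / 3 task pools

def curriculum_pool(reward_history, tier_idx, window=50, threshold=0.6):
    # Escalate one tier when the rolling mean clears the threshold.
    recent = reward_history[-window:]
    if len(recent) == window and sum(recent) / window > threshold:
        tier_idx = min(tier_idx + 1, len(TIERS) - 1)
    return TIERS[tier_idx], tier_idx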
292
  {
293
  "cell_type": "code",
294
  "execution_count": null,
295
- "metadata": {},
 
 
 
 
 
 
 
 
296
  "outputs": [],
297
  "source": [
298
  "SYSTEM_PROMPT = \"\"\"You are an Indian software engineer working on a Linux server.\n",
@@ -301,7 +254,7 @@
301
  "\n",
302
  "RULES:\n",
303
  "- Never use sudo\n",
304
- "- Never use rm -rf \n",
305
  "- Fix the server/code issue first\n",
306
  "- Then reply to manager/client if messages exist\n",
307
  "- Be efficient β€” fewer actions is better\n",
@@ -313,25 +266,18 @@
313
  "{\"type\": \"run_tests\", \"args\": \"\"}\n",
314
  "{\"type\": \"check_server\", \"args\": \"\"}\n",
315
  "{\"type\": \"reply_slack\", \"args\": \"Server is back up. Fixed the missing dependency.\"}\n",
316
- "{\"type\": \"reply_email\", \"args\": \"Apologies for the downtime. Issue resolved at 2:15 PM.\"}\n",
317
  "{\"type\": \"close_case\", \"args\": \"\"}\n",
318
  "\n",
319
  "Output ONLY valid JSON. Nothing else.\"\"\"\n",
320
  "\n",
321
  "\n",
322
  "def parse_action(text: str) -> dict:\n",
323
- " \"\"\"\n",
324
- " Parse model output into action dict.\n",
325
- " Tries JSON first, then regex, then safe default.\n",
326
- " \"\"\"\n",
327
  " text = text.strip()\n",
328
- "\n",
329
- " # Strip markdown code blocks if present\n",
330
  " text = re.sub(r\"```json\\s*\", \"\", text)\n",
331
  " text = re.sub(r\"```\\s*\", \"\", text)\n",
332
  " text = text.strip()\n",
333
  "\n",
334
- " # Try JSON parse\n",
335
  " try:\n",
336
  " action = json.loads(text)\n",
337
  " if \"type\" in action:\n",
@@ -339,21 +285,20 @@
339
  " except json.JSONDecodeError:\n",
340
  " pass\n",
341
  "\n",
342
- " # Try regex for known action types\n",
343
  " pattern = r'\"?type\"?\\s*[:=]\\s*\"?(\\w+)\"?'\n",
344
  " match = re.search(pattern, text)\n",
345
  " if match:\n",
346
  " action_type = match.group(1)\n",
347
- " args_match = re.search(r'\"?args\"?\\s*[:=]\\s*\"?([^\"}\\n]+)\"?', text)\n",
 
 
348
  " args = args_match.group(1).strip() if args_match else \"\"\n",
349
  " return {\"type\": action_type, \"args\": args}\n",
350
  "\n",
351
- " # Safe fallback\n",
352
  " return {\"type\": \"close_case\", \"args\": \"\"}\n",
353
  "\n",
354
  "\n",
355
  "def format_prompt(observation: dict) -> str:\n",
356
- " \"\"\"Format the environment observation into a prompt.\"\"\"\n",
357
  " return f\"\"\"CURRENT SITUATION:\n",
358
  "{json.dumps(observation, indent=2)}\n",
359
  "\n",
@@ -361,10 +306,6 @@
361
  "\n",
362
  "\n",
363
  "def run_episode(task_id: int = None, temperature: float = 0.7) -> tuple:\n",
364
- " \"\"\"\n",
365
- " Run one full episode.\n",
366
- " Returns (list_of_actions, total_reward, final_grade)\n",
367
- " \"\"\"\n",
368
  " observation = env.reset(task_id=task_id)\n",
369
  " actions_taken = []\n",
370
  " total_reward = 0.0\n",
@@ -376,7 +317,6 @@
376
  " ]\n",
377
  "\n",
378
  " for step_num in range(MAX_STEPS):\n",
379
- " # Format input for model\n",
380
  " input_text = tokenizer.apply_chat_template(\n",
381
  " messages,\n",
382
  " tokenize=False,\n",
@@ -390,128 +330,82 @@
390
  " max_length=MAX_SEQ_LEN\n",
391
  " ).to(model.device)\n",
392
  "\n",
393
- " # Generate action\n",
394
- " with model.disable_adapter() if temperature == 0 else __import__('contextlib').nullcontext():\n",
395
- " outputs = model.generate(\n",
396
- " **inputs,\n",
397
- " max_new_tokens=150,\n",
398
- " temperature=temperature,\n",
399
- " do_sample=(temperature > 0),\n",
400
- " pad_token_id=tokenizer.eos_token_id,\n",
401
- " )\n",
402
  "\n",
403
- " # Decode only new tokens\n",
404
  " new_tokens = outputs[0][inputs[\"input_ids\"].shape[1]:]\n",
405
- " action_text = tokenizer.decode(new_tokens, skip_special_tokens=True)\n",
 
 
 
406
  "\n",
407
- " # Parse action\n",
408
  " action = parse_action(action_text)\n",
409
  " actions_taken.append({\n",
410
  " \"step\": step_num,\n",
411
- " \"raw_output\": action_text[:200],\n",
412
  " \"parsed\": action\n",
413
  " })\n",
414
  "\n",
415
- " # Step environment\n",
416
  " try:\n",
417
  " observation, reward, done, info = env.step(action)\n",
418
  " total_reward += reward\n",
419
  " except Exception as e:\n",
420
- " print(f\" Step {step_num} error: {e}\")\n",
421
  " total_reward -= 0.1\n",
422
  " done = True\n",
423
  " break\n",
424
  "\n",
425
- " # Add to conversation history\n",
426
  " messages.append({\"role\": \"assistant\", \"content\": action_text})\n",
427
  " messages.append({\n",
428
  " \"role\": \"user\",\n",
429
- " \"content\": f\"Result: {json.dumps(observation, indent=2)}\\n\\nNext action?\"\n",
430
  " })\n",
431
  "\n",
432
  " if done:\n",
433
  " break\n",
434
  "\n",
435
- " # Get final grade\n",
436
  " try:\n",
437
  " final_grade = env.grade()\n",
438
  " except:\n",
439
  " final_grade = {\"total\": total_reward}\n",
440
  "\n",
441
- " return actions_taken, total_reward, final_grade"
442
- ]
443
- },
444
- {
445
- "cell_type": "markdown",
446
- "metadata": {},
447
- "source": [
448
- "## Cell 6 β€” Save Model Correctly\n",
449
  "\n",
450
- "**CRITICAL:** Do NOT merge LoRA into 4-bit base model β€” this damages quality.\n",
451
- "Use `save_pretrained` with `method=\"lora\"`."
452
- ]
453
- },
454
- {
455
- "cell_type": "code",
456
- "execution_count": null,
457
- "metadata": {},
458
- "outputs": [],
459
- "source": [
460
- "print(\"=\" * 50)\n",
461
- "print(\"MEASURING UNTRAINED BASELINE\")\n",
462
- "print(\"=\" * 50)\n",
463
- "\n",
464
- "baseline_rewards = []\n",
465
- "baseline_task_rewards = {1: [], 2: [], 3: [], 4: [], 5: []}\n",
466
  "\n",
467
- "for i in range(BASELINE_EPISODES):\n",
468
- " task_id = random.choice([1, 2, 3, 4, 5])\n",
469
- " _, reward, grade = run_episode(task_id=task_id, temperature=0.0)\n",
470
- " baseline_rewards.append(reward)\n",
471
- " baseline_task_rewards[task_id].append(reward)\n",
472
- " print(f\" Episode {i+1:02d}/20 | task={task_id} | reward={reward:.3f}\")\n",
473
- "\n",
474
- "REAL_BASELINE_AVG = np.mean(baseline_rewards)\n",
475
- "print(f\"\\nReal baseline average: {REAL_BASELINE_AVG:.3f}\")\n",
476
- "print(f\"Baseline std: {np.std(baseline_rewards):.3f}\")\n",
477
- "print(\"Save this number. You need it for the final plot.\")"
478
- ]
479
- },
480
- {
481
- "cell_type": "markdown",
482
- "metadata": {},
483
- "source": [
484
- "## Cell 7 β€” Generate and Commit Training Plots\n",
485
  "\n",
486
- "Both plots must be committed as `.png` files to the repo.\n",
487
- "Wandb-only links do not count for the automated validation check."
 
 
 
 
488
  ]
489
  },
490
  {
491
  "cell_type": "code",
492
  "execution_count": null,
493
- "metadata": {},
 
 
 
 
 
 
 
 
494
  "outputs": [],
495
  "source": [
496
- "\"\"\"\n",
497
- "GRPO needs a dataset of prompts to generate completions from.\n",
498
- "Each prompt = one episode starting state.\n",
499
- "We generate 200 prompts across all tasks with curriculum weighting.\n",
500
- "\"\"\"\n",
501
- "\n",
502
  "def generate_prompt_dataset(n_prompts: int = 200) -> Dataset:\n",
503
  " prompts = []\n",
504
- "\n",
505
- " # Curriculum weighting β€” more easy tasks early\n",
506
- " task_weights = {\n",
507
- " 1: 0.30, # easy\n",
508
- " 2: 0.30, # easy\n",
509
- " 3: 0.20, # medium\n",
510
- " 4: 0.15, # medium\n",
511
- " 5: 0.05, # hard\n",
512
- " }\n",
513
- "\n",
514
  " task_pool = []\n",
 
515
  " for task_id, weight in task_weights.items():\n",
516
  " count = int(n_prompts * weight)\n",
517
  " task_pool.extend([task_id] * count)\n",
@@ -519,153 +413,113 @@
519
  " random.shuffle(task_pool)\n",
520
  "\n",
521
  " for task_id in task_pool:\n",
522
- " obs = env.reset(task_id=task_id)\n",
523
- " prompt = tokenizer.apply_chat_template(\n",
524
- " [\n",
525
- " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
526
- " {\"role\": \"user\", \"content\": format_prompt(obs)}\n",
527
- " ],\n",
528
- " tokenize=False,\n",
529
- " add_generation_prompt=True\n",
530
- " )\n",
531
- " prompts.append({\n",
532
- " \"prompt\": prompt,\n",
533
- " \"task_id\": task_id\n",
534
- " })\n",
 
535
  "\n",
 
536
  " return Dataset.from_list(prompts)\n",
537
  "\n",
538
  "\n",
539
- "print(\"Generating training prompts...\")\n",
540
  "train_dataset = generate_prompt_dataset(n_prompts=200)\n",
541
- "print(f\"Dataset ready: {len(train_dataset)} prompts\")\n"
542
- ]
543
- },
544
- {
545
- "cell_type": "markdown",
546
- "metadata": {},
547
- "source": [
548
- "code 8"
549
  ]
550
  },
551
  {
552
  "cell_type": "code",
553
  "execution_count": null,
554
- "metadata": {},
 
 
 
 
 
 
 
 
555
  "outputs": [],
556
  "source": [
557
- "\"\"\"\n",
558
- "This is what GRPO calls to score each completion.\n",
559
- "GRPO generates multiple completions per prompt, scores them,\n",
560
- "and updates weights toward higher-scoring completions.\n",
561
- "\"\"\"\n",
562
- "\n",
563
- "episode_log = [] # track all episodes during training\n",
564
- "\n",
565
- "def grpo_reward_function(completions: list, prompts: list, **kwargs) -> list[float]:\n",
566
- " \"\"\"\n",
567
- " Called by GRPOTrainer after generating completions.\n",
568
- " Returns a reward score for each completion.\n",
569
- " \"\"\"\n",
570
  " rewards = []\n",
571
  "\n",
572
- " for i, completion in enumerate(completions):\n",
573
  " try:\n",
574
- " # Parse the action\n",
575
  " action = parse_action(completion)\n",
576
- "\n",
577
- " # Reset env and take the action\n",
578
- " task_id = random.choice([1, 2, 3, 4]) # avoid task 5 early\n",
579
- " obs = env.reset(task_id=task_id)\n",
580
  " _, reward, done, _ = env.step(action)\n",
581
  "\n",
582
- " # Continue episode for up to 3 more steps\n",
583
- " # (gives richer signal than single-step reward)\n",
584
- " for _ in range(3):\n",
585
- " if done:\n",
586
- " break\n",
587
- " # Simple follow-up: try to close case\n",
588
- " followup = {\"type\": \"check_server\", \"args\": \"\"}\n",
589
- " obs, step_reward, done, _ = env.step(followup)\n",
590
- " reward += step_reward * 0.5 # discount follow-up rewards\n",
591
- "\n",
592
- " # Penalty for bad actions\n",
593
  " if action[\"type\"] == \"close_case\" and reward < 0.1:\n",
594
- " reward -= 0.3 # penalize giving up immediately\n",
595
  "\n",
596
  " rewards.append(float(reward))\n",
597
  "\n",
598
- " except Exception as e:\n",
599
- " # Environment error β€” penalize\n",
600
  " rewards.append(-0.5)\n",
601
  "\n",
602
- " # Log to wandb\n",
603
  " wandb.log({\n",
604
  " \"reward/batch_mean\": np.mean(rewards),\n",
605
- " \"reward/batch_std\": np.std(rewards),\n",
606
- " \"reward/batch_max\": np.max(rewards),\n",
607
  " })\n",
608
  "\n",
609
  " episode_log.extend(rewards)\n",
610
- " return rewards\n"
611
- ]
612
- },
613
- {
614
- "cell_type": "markdown",
615
- "metadata": {},
616
- "source": [
617
- "code 9\n"
618
- ]
619
- },
620
- {
621
- "cell_type": "code",
622
- "execution_count": null,
623
- "metadata": {},
624
- "outputs": [],
625
- "source": [
626
- "training_args = GRPOConfig(\n",
627
- " # Output\n",
628
- " output_dir=\"./swebench-in-checkpoints\",\n",
629
  "\n",
630
- " # Training duration\n",
631
- " num_train_epochs=3,\n",
632
- " max_steps=500, # hard cap β€” enough for hackathon\n",
633
  "\n",
634
- " # Batch sizes β€” small to fit T4 GPU\n",
 
 
 
635
  " per_device_train_batch_size=2,\n",
636
- " gradient_accumulation_steps=8, # effective batch = 16\n",
637
- "\n",
638
- " # GRPO specific\n",
639
- " num_generations=4, # completions per prompt GRPO compares\n",
640
- " max_completion_length=200, # max tokens per action\n",
641
- "\n",
642
- " # Optimizer\n",
643
- " learning_rate=5e-6, # low LR for RL stability\n",
644
  " warmup_steps=20,\n",
645
  " weight_decay=0.01,\n",
646
- "\n",
647
- " # Logging\n",
648
  " logging_steps=10,\n",
649
  " save_steps=100,\n",
650
  " report_to=\"wandb\",\n",
651
- "\n",
652
- " # Memory\n",
653
  " gradient_checkpointing=True,\n",
654
- " fp16=True, # use fp16 on T4\n",
655
- ")"
656
- ]
657
- },
658
- {
659
- "cell_type": "markdown",
660
- "metadata": {},
661
- "source": [
662
- "code 10\n"
663
  ]
664
  },
665
  {
666
  "cell_type": "code",
667
  "execution_count": null,
668
- "metadata": {},
 
 
 
 
 
 
 
 
669
  "outputs": [],
670
  "source": [
671
  "trainer = GRPOTrainer(\n",
@@ -677,27 +531,28 @@
677
  ")\n",
678
  "\n",
679
  "print(\"Starting GRPO training...\")\n",
680
- "print(f\"Dataset: {len(train_dataset)} prompts\")\n",
681
- "print(f\"Max steps: {training_args.max_steps}\")\n",
682
  "print(f\"Generations per prompt: {training_args.num_generations}\")\n",
 
683
  "print(\"=\" * 50)\n",
684
  "\n",
685
  "trainer.train()\n",
686
  "\n",
687
- "print(\"Training complete!\")\n"
688
- ]
689
- },
690
- {
691
- "cell_type": "markdown",
692
- "metadata": {},
693
- "source": [
694
- "code 11"
695
  ]
696
  },
697
  {
698
  "cell_type": "code",
699
  "execution_count": null,
700
- "metadata": {},
 
 
 
 
 
 
 
 
701
  "outputs": [],
702
  "source": [
703
  "print(\"=\" * 50)\n",
@@ -708,212 +563,166 @@
708
  "\n",
709
  "for i in range(BASELINE_EPISODES):\n",
710
  " task_id = random.choice([1, 2, 3, 4, 5])\n",
711
- " _, reward, grade = run_episode(task_id=task_id, temperature=0.0)\n",
712
  " trained_rewards.append(reward)\n",
713
- " print(f\" Episode {i+1:02d}/20 | task={task_id} | reward={reward:.3f}\")\n",
 
714
  "\n",
715
  "TRAINED_AVG = np.mean(trained_rewards)\n",
716
- "print(f\"\\nTrained average: {TRAINED_AVG:.3f}\")\n",
717
  "print(f\"Baseline average: {REAL_BASELINE_AVG:.3f}\")\n",
718
- "print(f\"Improvement: {TRAINED_AVG - REAL_BASELINE_AVG:.3f}\")"
719
- ]
720
- },
721
- {
722
- "cell_type": "markdown",
723
- "metadata": {},
724
- "source": [
725
- "code 12"
726
  ]
727
  },
728
  {
729
  "cell_type": "code",
730
  "execution_count": null,
731
- "metadata": {},
 
 
 
 
 
 
 
 
732
  "outputs": [],
733
  "source": [
734
- "import os\n",
735
  "os.makedirs(\"plots\", exist_ok=True)\n",
736
  "\n",
737
- "# Pull training history from wandb\n",
738
  "history_df = wandb.run.history(\n",
739
- " keys=[\"reward/batch_mean\", \"_step\", \"loss\"]\n",
740
  ")\n",
741
  "history_df = history_df.dropna(subset=[\"reward/batch_mean\"])\n",
742
  "\n",
743
- "# ── Plot 1: Reward Curve ──────────────────────────────────\n",
744
  "fig, ax = plt.subplots(figsize=(12, 5))\n",
745
- "\n",
746
- "# Training reward\n",
747
  "ax.plot(\n",
748
  " history_df[\"_step\"],\n",
749
  " history_df[\"reward/batch_mean\"],\n",
750
- " color=\"steelblue\",\n",
751
- " alpha=0.6,\n",
752
- " linewidth=1,\n",
753
- " label=\"Training reward (per batch)\"\n",
754
  ")\n",
755
- "\n",
756
- "# Smoothed training reward\n",
757
  "if len(history_df) > 10:\n",
758
  " smoothed = pd.Series(\n",
759
  " history_df[\"reward/batch_mean\"].values\n",
760
  " ).rolling(window=20, min_periods=1).mean()\n",
761
  " ax.plot(\n",
762
- " history_df[\"_step\"],\n",
763
- " smoothed,\n",
764
- " color=\"steelblue\",\n",
765
- " linewidth=2.5,\n",
766
- " label=\"Training reward (smoothed)\"\n",
767
  " )\n",
768
- "\n",
769
- "# Baseline and trained horizontal lines\n",
770
  "ax.axhline(\n",
771
- " y=REAL_BASELINE_AVG,\n",
772
- " color=\"red\",\n",
773
- " linestyle=\"--\",\n",
774
- " linewidth=2,\n",
775
  " label=f\"Untrained baseline ({REAL_BASELINE_AVG:.2f})\"\n",
776
  ")\n",
777
  "ax.axhline(\n",
778
- " y=TRAINED_AVG,\n",
779
- " color=\"green\",\n",
780
- " linestyle=\"--\",\n",
781
- " linewidth=2,\n",
782
  " label=f\"Trained model ({TRAINED_AVG:.2f})\"\n",
783
  ")\n",
784
- "\n",
785
  "ax.set_xlabel(\"Training Step\", fontsize=12)\n",
786
  "ax.set_ylabel(\"Episode Reward\", fontsize=12)\n",
787
- "ax.set_title(\"SWEbench-IN: GRPO Training Reward Curve\", fontsize=14)\n",
 
788
  "ax.legend(fontsize=10)\n",
789
  "ax.grid(True, alpha=0.3)\n",
790
  "plt.tight_layout()\n",
791
- "plt.savefig(\"plots/reward_curve.png\", dpi=150, bbox_inches=\"tight\")\n",
 
792
  "plt.show()\n",
793
  "print(\"Saved: plots/reward_curve.png\")\n",
794
  "\n",
795
- "# ── Plot 2: Loss Curve ────────────────────────────────────\n",
796
  "fig, ax = plt.subplots(figsize=(12, 5))\n",
797
- "\n",
798
- "# Try trainer log history first (more reliable than wandb)\n",
799
  "log_history = trainer.state.log_history\n",
800
  "losses = [x[\"loss\"] for x in log_history if \"loss\" in x]\n",
801
  "steps = [x[\"step\"] for x in log_history if \"loss\" in x]\n",
802
  "\n",
803
  "if losses:\n",
804
- " ax.plot(steps, losses, color=\"crimson\", linewidth=1.5, label=\"Policy Loss\")\n",
805
- "elif \"loss\" in history_df.columns:\n",
806
- " loss_df = history_df.dropna(subset=[\"loss\"])\n",
807
- " ax.plot(loss_df[\"_step\"], loss_df[\"loss\"],\n",
808
- " color=\"crimson\", linewidth=1.5, label=\"Policy Loss\")\n",
809
- "else:\n",
810
- " ax.text(0.5, 0.5, \"Loss not logged\", transform=ax.transAxes,\n",
811
- " ha=\"center\", fontsize=14)\n",
812
  "\n",
813
- "ax.set_xlabel(\"Training Step\", fontsize=12)\n",
814
- "ax.set_ylabel(\"Loss\", fontsize=12)\n",
815
- "ax.set_title(\"SWEbench-IN: Policy Loss Curve\", fontsize=14)\n",
816
- "ax.legend(fontsize=10)\n",
817
- "ax.grid(True, alpha=0.3)\n",
818
  "plt.tight_layout()\n",
819
- "plt.savefig(\"plots/loss_curve.png\", dpi=150, bbox_inches=\"tight\")\n",
 
820
  "plt.show()\n",
821
  "print(\"Saved: plots/loss_curve.png\")\n",
822
  "\n",
823
- "# ── Plot 3: Before vs After Comparison ───────────────────\n",
824
- "fig, ax = plt.subplots(figsize=(10, 5))\n",
825
- "\n",
826
  "episodes = list(range(1, BASELINE_EPISODES + 1))\n",
827
  "ax.plot(episodes, baseline_rewards,\n",
828
- " color=\"red\", marker=\"o\", linewidth=1.5,\n",
 
829
  " label=f\"Untrained (avg={REAL_BASELINE_AVG:.2f})\")\n",
830
  "ax.plot(episodes, trained_rewards,\n",
831
- " color=\"green\", marker=\"s\", linewidth=1.5,\n",
 
832
  " label=f\"Trained (avg={TRAINED_AVG:.2f})\")\n",
833
- "ax.axhline(y=REAL_BASELINE_AVG, color=\"red\",\n",
834
- " linestyle=\"--\", alpha=0.4)\n",
835
- "ax.axhline(y=TRAINED_AVG, color=\"green\",\n",
836
- " linestyle=\"--\", alpha=0.4)\n",
837
- "ax.fill_between(episodes, baseline_rewards, trained_rewards,\n",
838
- " alpha=0.1, color=\"green\",\n",
839
- " label=f\"Improvement: +{TRAINED_AVG - REAL_BASELINE_AVG:.2f}\")\n",
840
- "\n",
841
  "ax.set_xlabel(\"Episode\", fontsize=12)\n",
842
  "ax.set_ylabel(\"Reward\", fontsize=12)\n",
843
- "ax.set_title(\"SWEbench-IN: Before vs After GRPO Training\", fontsize=14)\n",
 
844
  "ax.legend(fontsize=10)\n",
845
  "ax.grid(True, alpha=0.3)\n",
846
  "plt.tight_layout()\n",
847
- "plt.savefig(\"plots/before_after.png\", dpi=150, bbox_inches=\"tight\")\n",
 
848
  "plt.show()\n",
849
- "print(\"Saved: plots/before_after.png\")\n"
850
- ]
851
- },
852
- {
853
- "cell_type": "markdown",
854
- "metadata": {},
855
- "source": [
856
- "cell 13"
857
  ]
858
  },
859
  {
860
  "cell_type": "code",
861
  "execution_count": null,
862
- "metadata": {},
 
 
 
 
 
 
 
 
863
  "outputs": [],
864
  "source": [
865
- "\"\"\"\n",
866
- "CRITICAL: Do NOT merge LoRA into 4-bit base.\n",
867
- "Save adapters only using method=\"lora\"\n",
868
- "\"\"\"\n",
869
  "model.save_pretrained_merged(\n",
870
  " \"swebench-in-lora\",\n",
871
  " tokenizer=tokenizer,\n",
872
  " save_method=\"lora\"\n",
873
  ")\n",
874
- "print(\"Model saved to swebench-in-lora/\")\n",
875
- "\n",
876
- "\n"
877
- ]
878
- },
879
- {
880
- "cell_type": "markdown",
881
- "metadata": {},
882
- "source": [
883
- "cell 14"
884
- ]
885
- },
886
- {
887
- "cell_type": "code",
888
- "execution_count": null,
889
- "metadata": {},
890
- "outputs": [],
891
- "source": [
892
- "# ============================================================\n",
893
- "\"\"\"\n",
894
- "AFTER downloading plots from Colab Files panel:\n",
895
- "\n",
896
- "cd your-local-repo\n",
897
- "cp ~/Downloads/reward_curve.png plots/\n",
898
- "cp ~/Downloads/loss_curve.png plots/\n",
899
- "cp ~/Downloads/before_after.png plots/\n",
900
- "\n",
901
- "git add plots/\n",
902
- "git commit -m \"Add training evidence: reward curve, loss curve, before/after\"\n",
903
- "git push origin main\n",
904
- "\n",
905
- "Then verify from logged-out browser that plots appear in README.\n",
906
- "\"\"\"\n",
907
- "print(\"Download plots from Colab Files panel β†’ commit to repo\")\n",
908
- "print(\"plots/reward_curve.png\")\n",
909
- "print(\"plots/loss_curve.png\")\n",
910
- "print(\"plots/before_after.png\")"
911
  ]
912
  }
913
  ],
914
  "metadata": {
 
 
 
 
 
 
 
 
 
915
  "kernelspec": {
916
- "display_name": "Python 3 (ipykernel)",
917
  "language": "python",
918
  "name": "python3"
919
  },
@@ -927,7 +736,7 @@
927
  "name": "python",
928
  "nbconvert_exporter": "python",
929
  "pygments_lexer": "ipython3",
930
- "version": "3.12.13"
931
  }
932
  },
933
  "nbformat": 4,
 
1
  {
2
  "cells": [
3
  {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "_cell_guid": "04149943-a415-4b34-9912-b0f8c528bb41",
8
+ "_uuid": "8062a470-1b78-44c2-8d72-c8d7b56a6646",
9
+ "collapsed": false,
10
+ "jupyter": {
11
+ "outputs_hidden": false
12
+ },
13
+ "trusted": true
14
+ },
15
+ "outputs": [],
16
  "source": [
17
+ "!pip install -q \\\n",
18
+ " \"huggingface_hub>=0.24.0\" \\\n",
19
+ " \"unsloth\" \\\n",
20
+ " \"unsloth_zoo\"\n",
21
+ "!pip install \"unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git\" -q\n",
22
+ "!pip install trl transformers accelerate datasets wandb requests matplotlib pandas -q"
 
 
 
23
  ]
24
  },
25
  {
26
+ "cell_type": "code",
27
+ "execution_count": null,
28
+ "metadata": {
29
+ "_cell_guid": "a3cb3b13-01ca-4099-89e0-a4bf48071b56",
30
+ "_uuid": "96b2917f-1eff-408f-a4c6-f24b2b2dca11",
31
+ "collapsed": false,
32
+ "jupyter": {
33
+ "outputs_hidden": false
34
+ },
35
+ "trusted": true
36
+ },
37
+ "outputs": [],
38
  "source": [
39
+ "# Run this cell immediately after install finishes\n",
40
+ "#import os\n",
41
+ "#os.kill(os.getpid(), 9)"
42
  ]
43
  },
44
  {
45
  "cell_type": "code",
46
  "execution_count": null,
47
+ "metadata": {
48
+ "_cell_guid": "439072a1-3197-4823-9132-995b64208462",
49
+ "_uuid": "07eb3e2d-de23-4107-867e-651bf08a6915",
50
+ "collapsed": false,
51
+ "jupyter": {
52
+ "outputs_hidden": false
53
+ },
54
+ "trusted": true
55
+ },
56
  "outputs": [],
57
  "source": [
58
+ "import torch\n",
59
+ "print(\"torch:\", torch.__version__)\n",
60
+ "print(\"CUDA:\", torch.cuda.is_available())\n",
61
+ "print(\"GPUs:\", torch.cuda.device_count())\n",
62
  "\n",
63
+ "import unsloth\n",
64
+ "print(\"unsloth:\", unsloth.__version__)\n",
65
+ "\n",
66
+ "print(\"All good - ready to train!\")"
 
 
 
 
 
 
67
  ]
68
  },
69
  {
70
  "cell_type": "code",
71
+ "execution_count": null,
72
+ "metadata": {
73
+ "_cell_guid": "a967579b-40a3-4938-aa62-0cafb31ae8d6",
74
+ "_uuid": "6879acb6-3850-4ae8-ad82-fcac574fa422",
75
+ "collapsed": false,
76
+ "jupyter": {
77
+ "outputs_hidden": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  },
79
+ "trusted": true
80
+ },
81
+ "outputs": [],
 
 
 
 
 
82
  "source": [
83
  "import os\n",
84
  "import json\n",
 
88
  "import matplotlib.pyplot as plt\n",
89
  "import pandas as pd\n",
90
  "import wandb\n",
91
+ "import requests\n",
92
+ "\n",
93
  "from datasets import Dataset\n",
94
  "from unsloth import FastLanguageModel\n",
95
  "from trl import GRPOTrainer, GRPOConfig\n",
96
  "\n",
97
+ "# ── YOUR CONFIG ──────────────────────────\n",
98
+ "HF_SPACE_URL = \"https://YUS200619-swebench-ind.hf.space\"\n",
99
+ "WANDB_API_KEY = \"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\"\n",
100
  "WANDB_PROJECT = \"swebench-in\"\n",
101
  "MODEL_NAME = \"unsloth/Qwen2.5-3B-Instruct\"\n",
102
  "MAX_SEQ_LEN = 2048\n",
103
  "MAX_STEPS = 15\n",
104
  "BASELINE_EPISODES = 20\n",
105
+ "# ─────────────────────────────────────────\n",
106
  "\n",
107
+ "os.environ[\"WANDB_API_KEY\"] = WANDB_API_KEY\n",
108
  "wandb.init(project=WANDB_PROJECT, name=\"grpo-run-1\")\n",
109
+ "print(\"Wandb initialized\")"
110
  ]
111
  },
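The removed wandb output earlier in this diff already warned against putting the API key in code; on Kaggle (per the notebook metadata at the bottom of this file) a secret store is available. A sketch, assuming a secret saved under the hypothetical label "WANDB_API_KEY":

import os
import wandb
from kaggle_secrets import UserSecretsClient  # available inside Kaggle kernels

# Read the key from Kaggle's secret store instead of hardcoding it.
os.environ["WANDB_API_KEY"] = UserSecretsClient().get_secret("WANDB_API_KEY")
wandb.init(project="swebench-in", name="grpo-run-1")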
112
  {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "metadata": {
116
+ "_cell_guid": "2a5bf235-85f4-4b82-84b8-25793d6d109d",
117
+ "_uuid": "96cc953e-eba7-4272-bb8d-f712f69fa805",
118
+ "collapsed": false,
119
+ "jupyter": {
120
+ "outputs_hidden": false
121
+ },
122
+ "trusted": true
123
+ },
124
+ "outputs": [],
125
  "source": [
126
+ "try:\n",
127
+ " resp = requests.get(f\"{HF_SPACE_URL}/health\", timeout=15)\n",
128
+ " print(\"Environment status:\", resp.json())\n",
129
+ " print(\"Environment is READY\")\n",
130
+ "except Exception as e:\n",
131
+ " print(\"ENVIRONMENT NOT REACHABLE:\", e)\n",
132
+ " print(\"STOP HERE. Fix your HF Space before continuing.\")"
133
  ]
134
  },
135
  {
136
  "cell_type": "code",
137
  "execution_count": null,
138
+ "metadata": {
139
+ "_cell_guid": "e9410e62-1f4c-4ac7-9155-15db5dabcffd",
140
+ "_uuid": "feb7ae57-5df5-43f0-a057-bb8bd2e9cd8a",
141
+ "collapsed": false,
142
+ "jupyter": {
143
+ "outputs_hidden": false
144
+ },
145
+ "trusted": true
146
+ },
147
  "outputs": [],
148
  "source": [
149
  "model, tokenizer = FastLanguageModel.from_pretrained(\n",
150
  " model_name=MODEL_NAME,\n",
151
  " max_seq_length=MAX_SEQ_LEN,\n",
152
  " dtype=None,\n",
153
+ " load_in_4bit=True,\n",
154
  ")\n",
155
  "\n",
156
  "model = FastLanguageModel.get_peft_model(\n",
 
160
  " lora_alpha=16,\n",
161
  " lora_dropout=0,\n",
162
  " bias=\"none\",\n",
163
+ " use_gradient_checkpointing=\"unsloth\",\n",
164
  " random_state=42,\n",
165
  ")\n",
166
  "\n",
167
+ "print(\"Model loaded successfully\")\n",
168
+ "print(f\"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}\")"
 
 
 
 
 
 
 
169
  ]
170
  },
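Rough weight-memory arithmetic behind load_in_4bit on a 16 GB T4 (back-of-envelope estimates, not measurements from this commit):

# ~3B parameters: 2 bytes each in fp16 vs ~0.5 bytes each in 4-bit.
params = 3.1e9
print(f"fp16 : {params * 2 / 1e9:.1f} GB")    # ~6.2 GB of weights alone
print(f"4-bit: {params * 0.5 / 1e9:.1f} GB")  # ~1.6 GB, leaving headroom for
                                              # activations and LoRA gradients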
171
  {
172
  "cell_type": "code",
173
  "execution_count": null,
174
+ "metadata": {
175
+ "_cell_guid": "2a55e605-0401-4f0f-87af-9c56bc85c562",
176
+ "_uuid": "10a8c794-e27a-4351-adf6-b25cb61c0b84",
177
+ "collapsed": false,
178
+ "jupyter": {
179
+ "outputs_hidden": false
180
+ },
181
+ "trusted": true
182
+ },
183
  "outputs": [],
184
  "source": [
 
 
185
  "class SWEBenchINClient:\n",
 
 
 
 
186
  " def __init__(self, base_url: str):\n",
187
  " self.base_url = base_url.rstrip(\"/\")\n",
188
  " self.session = requests.Session()\n",
 
 
 
189
  "\n",
190
  " def reset(self, task_id: int = None) -> dict:\n",
191
  " payload = {\"task_id\": task_id} if task_id else {}\n",
 
213
  " )\n",
214
  "\n",
215
  " def state(self) -> dict:\n",
216
+ " resp = self.session.get(\n",
217
+ " f\"{self.base_url}/state\",\n",
218
+ " timeout=10\n",
219
+ " )\n",
220
  " return resp.json()\n",
221
  "\n",
222
  " def grade(self) -> dict:\n",
223
+ " resp = self.session.post(\n",
224
+ " f\"{self.base_url}/grade\",\n",
225
+ " timeout=30\n",
226
+ " )\n",
227
  " return resp.json()\n",
228
  "\n",
229
+ "env = SWEBenchINClient(HF_SPACE_URL)\n",
230
  "\n",
231
+ "# Quick test\n",
232
+ "obs = env.reset(task_id=1)\n",
233
+ "print(\"Reset works:\", type(obs))\n",
234
+ "print(\"Observation keys:\", list(obs.keys()))"
 
 
 
 
 
 
 
 
 
235
  ]
236
  },
237
  {
238
  "cell_type": "code",
239
  "execution_count": null,
240
+ "metadata": {
241
+ "_cell_guid": "2725c6ef-a52f-4f6f-9c02-42f92cab4b58",
242
+ "_uuid": "be8a2067-3dc6-41cb-94c5-99d7854b7a5c",
243
+ "collapsed": false,
244
+ "jupyter": {
245
+ "outputs_hidden": false
246
+ },
247
+ "trusted": true
248
+ },
249
  "outputs": [],
250
  "source": [
251
  "SYSTEM_PROMPT = \"\"\"You are an Indian software engineer working on a Linux server.\n",
 
254
  "\n",
255
  "RULES:\n",
256
  "- Never use sudo\n",
257
+ "- Never use rm -rf\n",
258
  "- Fix the server/code issue first\n",
259
  "- Then reply to manager/client if messages exist\n",
260
  "- Be efficient β€” fewer actions is better\n",
 
266
  "{\"type\": \"run_tests\", \"args\": \"\"}\n",
267
  "{\"type\": \"check_server\", \"args\": \"\"}\n",
268
  "{\"type\": \"reply_slack\", \"args\": \"Server is back up. Fixed the missing dependency.\"}\n",
269
+ "{\"type\": \"reply_email\", \"args\": \"Apologies for the downtime. Issue resolved.\"}\n",
270
  "{\"type\": \"close_case\", \"args\": \"\"}\n",
271
  "\n",
272
  "Output ONLY valid JSON. Nothing else.\"\"\"\n",
273
  "\n",
274
  "\n",
275
  "def parse_action(text: str) -> dict:\n",
 
 
 
 
276
  " text = text.strip()\n",
 
 
277
  " text = re.sub(r\"```json\\s*\", \"\", text)\n",
278
  " text = re.sub(r\"```\\s*\", \"\", text)\n",
279
  " text = text.strip()\n",
280
  "\n",
 
281
  " try:\n",
282
  " action = json.loads(text)\n",
283
  " if \"type\" in action:\n",
 
285
  " except json.JSONDecodeError:\n",
286
  " pass\n",
287
  "\n",
 
288
  " pattern = r'\"?type\"?\\s*[:=]\\s*\"?(\\w+)\"?'\n",
289
  " match = re.search(pattern, text)\n",
290
  " if match:\n",
291
  " action_type = match.group(1)\n",
292
+ " args_match = re.search(\n",
293
+ " r'\"?args\"?\\s*[:=]\\s*\"?([^\"}\\n]+)\"?', text\n",
294
+ " )\n",
295
  " args = args_match.group(1).strip() if args_match else \"\"\n",
296
  " return {\"type\": action_type, \"args\": args}\n",
297
  "\n",
 
298
  " return {\"type\": \"close_case\", \"args\": \"\"}\n",
299
  "\n",
300
  "\n",
301
  "def format_prompt(observation: dict) -> str:\n",
 
302
  " return f\"\"\"CURRENT SITUATION:\n",
303
  "{json.dumps(observation, indent=2)}\n",
304
  "\n",
 
306
  "\n",
307
  "\n",
308
  "def run_episode(task_id: int = None, temperature: float = 0.7) -> tuple:\n",
 
 
 
 
309
  " observation = env.reset(task_id=task_id)\n",
310
  " actions_taken = []\n",
311
  " total_reward = 0.0\n",
 
317
  " ]\n",
318
  "\n",
319
  " for step_num in range(MAX_STEPS):\n",
 
320
  " input_text = tokenizer.apply_chat_template(\n",
321
  " messages,\n",
322
  " tokenize=False,\n",
 
330
  " max_length=MAX_SEQ_LEN\n",
331
  " ).to(model.device)\n",
332
  "\n",
333
+ " outputs = model.generate(\n",
334
+ " **inputs,\n",
335
+ " max_new_tokens=150,\n",
336
+ " temperature=max(temperature, 0.01),\n",
337
+ " do_sample=True,\n",
338
+ " pad_token_id=tokenizer.eos_token_id,\n",
339
+ " )\n",
 
 
340
  "\n",
 
341
  " new_tokens = outputs[0][inputs[\"input_ids\"].shape[1]:]\n",
342
+ " action_text = tokenizer.decode(\n",
343
+ " new_tokens,\n",
344
+ " skip_special_tokens=True\n",
345
+ " )\n",
346
  "\n",
 
347
  " action = parse_action(action_text)\n",
348
  " actions_taken.append({\n",
349
  " \"step\": step_num,\n",
350
+ " \"raw\": action_text[:200],\n",
351
  " \"parsed\": action\n",
352
  " })\n",
353
  "\n",
 
354
  " try:\n",
355
  " observation, reward, done, info = env.step(action)\n",
356
  " total_reward += reward\n",
357
  " except Exception as e:\n",
358
+ " print(f\" Step error: {e}\")\n",
359
  " total_reward -= 0.1\n",
360
  " done = True\n",
361
  " break\n",
362
  "\n",
 
363
  " messages.append({\"role\": \"assistant\", \"content\": action_text})\n",
364
  " messages.append({\n",
365
  " \"role\": \"user\",\n",
366
+ " \"content\": f\"Result:\\n{json.dumps(observation, indent=2)}\\n\\nNext action?\"\n",
367
  " })\n",
368
  "\n",
369
  " if done:\n",
370
  " break\n",
371
  "\n",
 
372
  " try:\n",
373
  " final_grade = env.grade()\n",
374
  " except:\n",
375
  " final_grade = {\"total\": total_reward}\n",
376
  "\n",
377
+ " return actions_taken, total_reward, final_grade\n",
 
 
 
 
 
 
 
378
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
  "\n",
380
+ "print(\"System prompt and rollout function ready\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  "\n",
382
+ "# Quick sanity check β€” run one episode\n",
383
+ "print(\"\\nRunning one test episode...\")\n",
384
+ "actions, reward, grade = run_episode(task_id=1, temperature=0.0)\n",
385
+ "print(f\"Test episode reward: {reward:.3f}\")\n",
386
+ "print(f\"Actions taken: {len(actions)}\")\n",
387
+ "print(f\"Grade: {grade}\")"
388
  ]
389
  },
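A quick check of the three fallback paths in parse_action above (JSON first, regex second, close_case default), runnable once the cell has executed:

# Fenced JSON is stripped, then parsed directly.
print(parse_action('```json\n{"type": "run_tests", "args": ""}\n```'))
# -> {'type': 'run_tests', 'args': ''}

# Malformed output still yields an action via the regex path.
print(parse_action('type: edit_file, args: fix the import'))
# -> {'type': 'edit_file', 'args': 'fix the import'}

# Anything unrecognizable falls back to the safe default.
print(parse_action('not sure what to do'))
# -> {'type': 'close_case', 'args': ''}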
390
  {
391
  "cell_type": "code",
392
  "execution_count": null,
393
+ "metadata": {
394
+ "_cell_guid": "2eb94693-a98f-49d9-a81d-75c8cf645b35",
395
+ "_uuid": "b111970f-7c82-4137-9223-f4da7bd12916",
396
+ "collapsed": false,
397
+ "jupyter": {
398
+ "outputs_hidden": false
399
+ },
400
+ "trusted": true
401
+ },
402
  "outputs": [],
403
  "source": [
 
 
 
 
 
 
404
  "def generate_prompt_dataset(n_prompts: int = 200) -> Dataset:\n",
405
  " prompts = []\n",
406
+ " task_weights = {1: 0.30, 2: 0.30, 3: 0.20, 4: 0.15, 5: 0.05}\n",
 
 
 
 
 
 
 
 
 
407
  " task_pool = []\n",
408
+ "\n",
409
  " for task_id, weight in task_weights.items():\n",
410
  " count = int(n_prompts * weight)\n",
411
  " task_pool.extend([task_id] * count)\n",
 
413
  " random.shuffle(task_pool)\n",
414
  "\n",
415
  " for task_id in task_pool:\n",
416
+ " try:\n",
417
+ " obs = env.reset(task_id=task_id)\n",
418
+ " prompt = tokenizer.apply_chat_template(\n",
419
+ " [\n",
420
+ " {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
421
+ " {\"role\": \"user\", \"content\": format_prompt(obs)}\n",
422
+ " ],\n",
423
+ " tokenize=False,\n",
424
+ " add_generation_prompt=True\n",
425
+ " )\n",
426
+ " prompts.append({\"prompt\": prompt, \"task_id\": task_id})\n",
427
+ " except Exception as e:\n",
428
+ " print(f\"Skipping task {task_id}: {e}\")\n",
429
+ " continue\n",
430
  "\n",
431
+ " print(f\"Generated {len(prompts)} prompts\")\n",
432
  " return Dataset.from_list(prompts)\n",
433
  "\n",
434
  "\n",
 
435
  "train_dataset = generate_prompt_dataset(n_prompts=200)\n",
436
+ "print(f\"Dataset ready: {len(train_dataset)} prompts\")\n",
437
+ "print(f\"Sample prompt length: {len(train_dataset[0]['prompt'])} chars\")"
 
 
 
 
 
 
438
  ]
439
  },
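One thing to watch in the pool expansion above: int() truncates, so totals where n_prompts * weight is not integral silently drop prompts. A quick check:

weights = {1: 0.30, 2: 0.30, 3: 0.20, 4: 0.15, 5: 0.05}
for n in (200, 150):
    counts = {t: int(n * w) for t, w in weights.items()}
    print(n, counts, "total =", sum(counts.values()))
# 200 {1: 60, 2: 60, 3: 40, 4: 30, 5: 10} total = 200
# 150 {1: 45, 2: 45, 3: 30, 4: 22, 5: 7} total = 149  (one prompt short)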
440
  {
441
  "cell_type": "code",
442
  "execution_count": null,
443
+ "metadata": {
444
+ "_cell_guid": "7bd8e434-930b-4391-93f4-41a8ee3c9971",
445
+ "_uuid": "689da08e-d727-4e7b-85fb-009b366e3f50",
446
+ "collapsed": false,
447
+ "jupyter": {
448
+ "outputs_hidden": false
449
+ },
450
+ "trusted": true
451
+ },
452
  "outputs": [],
453
  "source": [
454
+ "episode_log = []\n",
455
+ "\n",
456
+ "def grpo_reward_function(\n",
457
+ " completions: list,\n",
458
+ " prompts: list,\n",
459
+ " **kwargs\n",
460
+ ") -> list:\n",
461
+ "\n",
 
 
 
 
 
462
  " rewards = []\n",
463
  "\n",
464
+ " for completion in completions:\n",
465
  " try:\n",
 
466
  " action = parse_action(completion)\n",
467
+ " task_id = random.choice([1, 2, 3, 4])\n",
468
+ " env.reset(task_id=task_id)\n",
 
 
469
  " _, reward, done, _ = env.step(action)\n",
470
  "\n",
 
 
 
 
 
 
 
 
 
 
 
471
  " if action[\"type\"] == \"close_case\" and reward < 0.1:\n",
472
+ " reward -= 0.3\n",
473
  "\n",
474
  " rewards.append(float(reward))\n",
475
  "\n",
476
+ " except Exception:\n",
 
477
  " rewards.append(-0.5)\n",
478
  "\n",
 
479
  " wandb.log({\n",
480
  " \"reward/batch_mean\": np.mean(rewards),\n",
481
+ " \"reward/batch_max\": np.max(rewards),\n",
482
+ " \"reward/batch_std\": np.std(rewards),\n",
483
  " })\n",
484
  "\n",
485
  " episode_log.extend(rewards)\n",
486
+ " return rewards\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
  "\n",
 
 
 
488
  "\n",
489
+ "training_args = GRPOConfig(\n",
490
+ " output_dir=\"./swebench-checkpoints\",\n",
491
+ " num_train_epochs=3,\n",
492
+ " max_steps=500,\n",
493
  " per_device_train_batch_size=2,\n",
494
+ " gradient_accumulation_steps=8,\n",
495
+ " num_generations=4,\n",
496
+ " max_completion_length=200,\n",
497
+ " learning_rate=5e-6,\n",
 
 
 
 
498
  " warmup_steps=20,\n",
499
  " weight_decay=0.01,\n",
 
 
500
  " logging_steps=10,\n",
501
  " save_steps=100,\n",
502
  " report_to=\"wandb\",\n",
 
 
503
  " gradient_checkpointing=True,\n",
504
+ " bf16=False,\n",
505
+ " fp16=True,\n",
506
+ ")\n",
507
+ "\n",
508
+ "print(\"Reward function and training config ready\")"
 
 
 
 
509
  ]
510
  },
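For intuition on how GRPO uses the scores returned by grpo_reward_function: it compares the num_generations completions of the same prompt against each other and normalizes within the group. A toy illustration of the standard group-relative advantage (not code from this commit):

import numpy as np

# One prompt, num_generations=4 completions, one reward each.
rewards = np.array([0.2, 0.8, 0.5, 0.5])
advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
print(advantages.round(2))  # [-1.41  1.41  0.    0.  ]
# Completions above the group mean are reinforced, those below are pushed down.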
511
  {
512
  "cell_type": "code",
513
  "execution_count": null,
514
+ "metadata": {
515
+ "_cell_guid": "dd675d60-2347-40ce-b612-aac21ac7314f",
516
+ "_uuid": "21373bdf-191a-4493-b04c-2eade2dd5954",
517
+ "collapsed": false,
518
+ "jupyter": {
519
+ "outputs_hidden": false
520
+ },
521
+ "trusted": true
522
+ },
523
  "outputs": [],
524
  "source": [
525
  "trainer = GRPOTrainer(\n",
 
531
  ")\n",
532
  "\n",
533
  "print(\"Starting GRPO training...\")\n",
534
+ "print(f\"Steps: {training_args.max_steps}\")\n",
 
535
  "print(f\"Generations per prompt: {training_args.num_generations}\")\n",
536
+ "print(\"This takes 60-90 minutes. Watch wandb dashboard.\")\n",
537
  "print(\"=\" * 50)\n",
538
  "\n",
539
  "trainer.train()\n",
540
  "\n",
541
+ "print(\"Training complete!\")"
 
 
 
 
 
 
 
542
  ]
543
  },
544
  {
545
  "cell_type": "code",
546
  "execution_count": null,
547
+ "metadata": {
548
+ "_cell_guid": "7b5a47d9-7abc-4824-982b-adb2ab684961",
549
+ "_uuid": "9baa6267-f3ea-49d5-a6b8-de129657fcf9",
550
+ "collapsed": false,
551
+ "jupyter": {
552
+ "outputs_hidden": false
553
+ },
554
+ "trusted": true
555
+ },
556
  "outputs": [],
557
  "source": [
558
  "print(\"=\" * 50)\n",
 
563
  "\n",
564
  "for i in range(BASELINE_EPISODES):\n",
565
  " task_id = random.choice([1, 2, 3, 4, 5])\n",
566
+ " _, reward, _ = run_episode(task_id=task_id, temperature=0.0)\n",
567
  " trained_rewards.append(reward)\n",
568
+ " print(f\"Episode {i+1:02d}/{BASELINE_EPISODES} \"\n",
569
+ " f\"| task={task_id} | reward={reward:.3f}\")\n",
570
  "\n",
571
  "TRAINED_AVG = np.mean(trained_rewards)\n",
572
+ "print(f\"\\nTrained average: {TRAINED_AVG:.3f}\")\n",
573
  "print(f\"Baseline average: {REAL_BASELINE_AVG:.3f}\")\n",
574
+ "print(f\"Improvement: +{TRAINED_AVG - REAL_BASELINE_AVG:.3f}\")"
 
 
 
 
 
 
 
575
  ]
576
  },
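With only 20 evaluation episodes over randomly drawn tasks, the averaged improvement printed above is noisy. A quick standard-error check (illustrative, not part of the commit; assumes baseline_rewards from the earlier baseline cell is still in scope):

import numpy as np

for name, rs in [("baseline", baseline_rewards), ("trained", trained_rewards)]:
    rs = np.asarray(rs)
    sem = rs.std(ddof=1) / len(rs) ** 0.5  # standard error of the mean
    print(f"{name}: {rs.mean():.3f} +/- {sem:.3f}")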
577
  {
578
  "cell_type": "code",
579
  "execution_count": null,
580
+ "metadata": {
581
+ "_cell_guid": "089714bd-53e6-40e7-a041-3b18dbff9f16",
582
+ "_uuid": "452ccb4d-6b90-4049-a62d-841a054e9bcd",
583
+ "collapsed": false,
584
+ "jupyter": {
585
+ "outputs_hidden": false
586
+ },
587
+ "trusted": true
588
+ },
589
  "outputs": [],
590
  "source": [
 
591
  "os.makedirs(\"plots\", exist_ok=True)\n",
592
  "\n",
593
+ "# ── Plot 1: Reward Curve ──────────────────\n",
594
  "history_df = wandb.run.history(\n",
595
+ " keys=[\"reward/batch_mean\", \"_step\"]\n",
596
  ")\n",
597
  "history_df = history_df.dropna(subset=[\"reward/batch_mean\"])\n",
598
  "\n",
 
599
  "fig, ax = plt.subplots(figsize=(12, 5))\n",
 
 
600
  "ax.plot(\n",
601
  " history_df[\"_step\"],\n",
602
  " history_df[\"reward/batch_mean\"],\n",
603
+ " color=\"steelblue\", alpha=0.4,\n",
604
+ " linewidth=1, label=\"Per batch reward\"\n",
 
 
605
  ")\n",
 
 
606
  "if len(history_df) > 10:\n",
607
  " smoothed = pd.Series(\n",
608
  " history_df[\"reward/batch_mean\"].values\n",
609
  " ).rolling(window=20, min_periods=1).mean()\n",
610
  " ax.plot(\n",
611
+ " history_df[\"_step\"], smoothed,\n",
612
+ " color=\"steelblue\", linewidth=2.5,\n",
613
+ " label=\"Smoothed reward\"\n",
 
 
614
  " )\n",
 
 
615
  "ax.axhline(\n",
616
+ " y=REAL_BASELINE_AVG, color=\"red\",\n",
617
+ " linestyle=\"--\", linewidth=2,\n",
 
 
618
  " label=f\"Untrained baseline ({REAL_BASELINE_AVG:.2f})\"\n",
619
  ")\n",
620
  "ax.axhline(\n",
621
+ " y=TRAINED_AVG, color=\"green\",\n",
622
+ " linestyle=\"--\", linewidth=2,\n",
 
 
623
  " label=f\"Trained model ({TRAINED_AVG:.2f})\"\n",
624
  ")\n",
 
625
  "ax.set_xlabel(\"Training Step\", fontsize=12)\n",
626
  "ax.set_ylabel(\"Episode Reward\", fontsize=12)\n",
627
+ "ax.set_title(\"SWEbench-IN: GRPO Training Reward Curve\",\n",
628
+ " fontsize=14)\n",
629
  "ax.legend(fontsize=10)\n",
630
  "ax.grid(True, alpha=0.3)\n",
631
  "plt.tight_layout()\n",
632
+ "plt.savefig(\"plots/reward_curve.png\", dpi=150,\n",
633
+ " bbox_inches=\"tight\")\n",
634
  "plt.show()\n",
635
  "print(\"Saved: plots/reward_curve.png\")\n",
636
  "\n",
637
+ "# ── Plot 2: Loss Curve ────────────────────\n",
638
  "fig, ax = plt.subplots(figsize=(12, 5))\n",
 
 
639
  "log_history = trainer.state.log_history\n",
640
  "losses = [x[\"loss\"] for x in log_history if \"loss\" in x]\n",
641
  "steps = [x[\"step\"] for x in log_history if \"loss\" in x]\n",
642
  "\n",
643
  "if losses:\n",
644
+ " ax.plot(steps, losses, color=\"crimson\",\n",
645
+ " linewidth=1.5, label=\"Policy Loss\")\n",
646
+ " ax.set_xlabel(\"Training Step\", fontsize=12)\n",
647
+ " ax.set_ylabel(\"Loss\", fontsize=12)\n",
648
+ " ax.set_title(\"SWEbench-IN: Policy Loss Curve\",\n",
649
+ " fontsize=14)\n",
650
+ " ax.legend(fontsize=10)\n",
651
+ " ax.grid(True, alpha=0.3)\n",
652
  "\n",
 
 
 
 
 
653
  "plt.tight_layout()\n",
654
+ "plt.savefig(\"plots/loss_curve.png\", dpi=150,\n",
655
+ " bbox_inches=\"tight\")\n",
656
  "plt.show()\n",
657
  "print(\"Saved: plots/loss_curve.png\")\n",
658
  "\n",
659
+ "# ── Plot 3: Before vs After ───────────────\n",
660
+ "fig, ax = plt.subplots(figsize=(12, 5))\n",
 
661
  "episodes = list(range(1, BASELINE_EPISODES + 1))\n",
662
  "ax.plot(episodes, baseline_rewards,\n",
663
+ " color=\"red\", marker=\"o\",\n",
664
+ " linewidth=1.5,\n",
665
  " label=f\"Untrained (avg={REAL_BASELINE_AVG:.2f})\")\n",
666
  "ax.plot(episodes, trained_rewards,\n",
667
+ " color=\"green\", marker=\"s\",\n",
668
+ " linewidth=1.5,\n",
669
  " label=f\"Trained (avg={TRAINED_AVG:.2f})\")\n",
670
+ "ax.fill_between(\n",
671
+ " episodes, baseline_rewards, trained_rewards,\n",
672
+ " alpha=0.1, color=\"green\",\n",
673
+ " label=f\"Improvement: +{TRAINED_AVG - REAL_BASELINE_AVG:.2f}\"\n",
674
+ ")\n",
 
 
 
675
  "ax.set_xlabel(\"Episode\", fontsize=12)\n",
676
  "ax.set_ylabel(\"Reward\", fontsize=12)\n",
677
+ "ax.set_title(\"SWEbench-IN: Before vs After GRPO Training\",\n",
678
+ " fontsize=14)\n",
679
  "ax.legend(fontsize=10)\n",
680
  "ax.grid(True, alpha=0.3)\n",
681
  "plt.tight_layout()\n",
682
+ "plt.savefig(\"plots/before_after.png\", dpi=150,\n",
683
+ " bbox_inches=\"tight\")\n",
684
  "plt.show()\n",
685
+ "print(\"Saved: plots/before_after.png\")\n",
686
+ "\n",
687
+ "print(\"\\nAll plots saved.\")\n",
688
+ "print(\"Download from Kaggle Files panel β†’ commit to repo\")"
 
 
 
 
689
  ]
690
  },
691
  {
692
  "cell_type": "code",
693
  "execution_count": null,
694
+ "metadata": {
695
+ "_cell_guid": "16534aa6-897c-453e-9a01-23da3fc20acc",
696
+ "_uuid": "24c0f45c-3e8b-4205-8ae4-3a503f3624e1",
697
+ "collapsed": false,
698
+ "jupyter": {
699
+ "outputs_hidden": false
700
+ },
701
+ "trusted": true
702
+ },
703
  "outputs": [],
704
  "source": [
 
 
 
 
705
  "model.save_pretrained_merged(\n",
706
  " \"swebench-in-lora\",\n",
707
  " tokenizer=tokenizer,\n",
708
  " save_method=\"lora\"\n",
709
  ")\n",
710
+ "print(\"Model saved\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  ]
712
  }
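To reuse the adapters saved above in a later session, they can be attached to the base model with standard peft/transformers APIs (a sketch, assuming the "swebench-in-lora" directory contains the LoRA adapters plus tokenizer; this is not code from the commit):

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Reload the 4-bit base model, then attach the saved LoRA adapters.
base = AutoModelForCausalLM.from_pretrained(
    "unsloth/Qwen2.5-3B-Instruct",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)
model = PeftModel.from_pretrained(base, "swebench-in-lora")
tokenizer = AutoTokenizer.from_pretrained("swebench-in-lora")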
713
  ],
714
  "metadata": {
715
+ "kaggle": {
716
+ "accelerator": "nvidiaTeslaT4",
717
+ "dataSources": [],
718
+ "dockerImageVersionId": 31329,
719
+ "isGpuEnabled": true,
720
+ "isInternetEnabled": true,
721
+ "language": "python",
722
+ "sourceType": "notebook"
723
+ },
724
  "kernelspec": {
725
+ "display_name": "Python 3",
726
  "language": "python",
727
  "name": "python3"
728
  },
 
736
  "name": "python",
737
  "nbconvert_exporter": "python",
738
  "pygments_lexer": "ipython3",
739
+ "version": "3.12.12"
740
  }
741
  },
742
  "nbformat": 4,