{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "04149943-a415-4b34-9912-b0f8c528bb41",
    "_uuid": "8062a470-1b78-44c2-8d72-c8d7b56a6646",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "!pip install -q \\\n",
    "  \"huggingface_hub>=0.24.0\" \\\n",
    "  \"unsloth\" \\\n",
    "  \"unsloth_zoo\"\n",
    "!pip install \"unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git\" -q\n",
    "!pip install trl transformers accelerate datasets wandb requests matplotlib pandas -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "a3cb3b13-01ca-4099-89e0-a4bf48071b56",
    "_uuid": "96b2917f-1eff-408f-a4c6-f24b2b2dca11",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# Run this cell immediately after install finishes\n",
    "#import os\n",
    "#os.kill(os.getpid(), 9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "439072a1-3197-4823-9132-995b64208462",
    "_uuid": "07eb3e2d-de23-4107-867e-651bf08a6915",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "print(\"torch:\", torch.__version__)\n",
    "print(\"CUDA:\", torch.cuda.is_available())\n",
    "print(\"GPUs:\", torch.cuda.device_count())\n",
    "\n",
    "import unsloth\n",
    "print(\"unsloth:\", unsloth.__version__)\n",
    "\n",
    "print(\"All good - ready to train!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "a967579b-40a3-4938-aa62-0cafb31ae8d6",
    "_uuid": "6879acb6-3850-4ae8-ad82-fcac574fa422",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import re\n",
    "import random\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import wandb\n",
    "import requests\n",
    "\n",
    "from datasets import Dataset\n",
    "from unsloth import FastLanguageModel\n",
    "from trl import GRPOTrainer, GRPOConfig\n",
    "\n",
    "# ── YOUR CONFIG ──────────────────────────\n",
    "HF_SPACE_URL      = \"https://YUS200619-swebench-ind.hf.space\"\n",
    "WANDB_API_KEY     = \"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\"\n",
    "WANDB_PROJECT     = \"swebench-in\"\n",
    "MODEL_NAME        = \"unsloth/Qwen2.5-3B-Instruct\"\n",
    "MAX_SEQ_LEN       = 2048\n",
    "MAX_STEPS         = 15\n",
    "BASELINE_EPISODES = 20\n",
    "# ─────────────────────────────────────────\n",
    "\n",
    "os.environ[\"WANDB_API_KEY\"] = WANDB_API_KEY\n",
    "wandb.init(project=WANDB_PROJECT, name=\"grpo-run-1\")\n",
    "print(\"Wandb initialized\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "2a5bf235-85f4-4b82-84b8-25793d6d109d",
    "_uuid": "96cc953e-eba7-4272-bb8d-f712f69fa805",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "try:\n",
    "    resp = requests.get(f\"{HF_SPACE_URL}/health\", timeout=15)\n",
    "    print(\"Environment status:\", resp.json())\n",
    "    print(\"Environment is READY\")\n",
    "except Exception as e:\n",
    "    print(\"ENVIRONMENT NOT REACHABLE:\", e)\n",
    "    print(\"STOP HERE. Fix your HF Space before continuing.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "e9410e62-1f4c-4ac7-9155-15db5dabcffd",
    "_uuid": "feb7ae57-5df5-43f0-a057-bb8bd2e9cd8a",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name=MODEL_NAME,\n",
    "    max_seq_length=MAX_SEQ_LEN,\n",
    "    dtype=None,\n",
    "    load_in_4bit=True,\n",
    ")\n",
    "\n",
    "model = FastLanguageModel.get_peft_model(\n",
    "    model,\n",
    "    r=16,\n",
    "    target_modules=[\"q_proj\", \"v_proj\"],\n",
    "    lora_alpha=16,\n",
    "    lora_dropout=0,\n",
    "    bias=\"none\",\n",
    "    use_gradient_checkpointing=\"unsloth\",\n",
    "    random_state=42,\n",
    ")\n",
    "\n",
    "print(\"Model loaded successfully\")\n",
    "print(f\"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "2a55e605-0401-4f0f-87af-9c56bc85c562",
    "_uuid": "10a8c794-e27a-4351-adf6-b25cb61c0b84",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "class SWEBenchINClient:\n",
    "    def __init__(self, base_url: str):\n",
    "        self.base_url = base_url.rstrip(\"/\")\n",
    "        self.session = requests.Session()\n",
    "\n",
    "    def reset(self, task_id: int = None) -> dict:\n",
    "        payload = {\"task_id\": task_id} if task_id else {}\n",
    "        resp = self.session.post(\n",
    "            f\"{self.base_url}/reset\",\n",
    "            json=payload,\n",
    "            timeout=30\n",
    "        )\n",
    "        resp.raise_for_status()\n",
    "        return resp.json()\n",
    "\n",
    "    def step(self, action: dict) -> tuple:\n",
    "        resp = self.session.post(\n",
    "            f\"{self.base_url}/step\",\n",
    "            json={\"action\": action},\n",
    "            timeout=30\n",
    "        )\n",
    "        resp.raise_for_status()\n",
    "        data = resp.json()\n",
    "        return (\n",
    "            data[\"observation\"],\n",
    "            float(data[\"reward\"]),\n",
    "            bool(data[\"done\"]),\n",
    "            data.get(\"info\", {})\n",
    "        )\n",
    "\n",
    "    def state(self) -> dict:\n",
    "        resp = self.session.get(\n",
    "            f\"{self.base_url}/state\",\n",
    "            timeout=10\n",
    "        )\n",
    "        return resp.json()\n",
    "\n",
    "    def grade(self) -> dict:\n",
    "        resp = self.session.post(\n",
    "            f\"{self.base_url}/grade\",\n",
    "            timeout=30\n",
    "        )\n",
    "        return resp.json()\n",
    "\n",
    "env = SWEBenchINClient(HF_SPACE_URL)\n",
    "\n",
    "# Quick test\n",
    "obs = env.reset(task_id=1)\n",
    "print(\"Reset works:\", type(obs))\n",
    "print(\"Observation keys:\", list(obs.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "2725c6ef-a52f-4f6f-9c02-42f92cab4b58",
    "_uuid": "be8a2067-3dc6-41cb-94c5-99d7854b7a5c",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "SYSTEM_PROMPT = \"\"\"You are an Indian software engineer working on a Linux server.\n",
    "You will receive a broken environment description.\n",
    "Your job: fix the technical issue AND handle any human communication.\n",
    "\n",
    "RULES:\n",
    "- Never use sudo\n",
    "- Never use rm -rf\n",
    "- Fix the server/code issue first\n",
    "- Then reply to manager/client if messages exist\n",
    "- Be efficient — fewer actions is better\n",
    "\n",
    "OUTPUT FORMAT (pick one per turn):\n",
    "{\"type\": \"run_command\", \"args\": \"pip install flask\"}\n",
    "{\"type\": \"read_file\", \"args\": \"/home/user2/app.py\"}\n",
    "{\"type\": \"write_file\", \"args\": \"/home/user2/app.py\", \"content\": \"fixed code here\"}\n",
    "{\"type\": \"run_tests\", \"args\": \"\"}\n",
    "{\"type\": \"check_server\", \"args\": \"\"}\n",
    "{\"type\": \"reply_slack\", \"args\": \"Server is back up. Fixed the missing dependency.\"}\n",
    "{\"type\": \"reply_email\", \"args\": \"Apologies for the downtime. Issue resolved.\"}\n",
    "{\"type\": \"close_case\", \"args\": \"\"}\n",
    "\n",
    "Output ONLY valid JSON. Nothing else.\"\"\"\n",
    "\n",
    "\n",
    "def parse_action(text: str) -> dict:\n",
    "    text = text.strip()\n",
    "    text = re.sub(r\"```json\\s*\", \"\", text)\n",
    "    text = re.sub(r\"```\\s*\", \"\", text)\n",
    "    text = text.strip()\n",
    "\n",
    "    try:\n",
    "        action = json.loads(text)\n",
    "        if \"type\" in action:\n",
    "            return action\n",
    "    except json.JSONDecodeError:\n",
    "        pass\n",
    "\n",
    "    pattern = r'\"?type\"?\\s*[:=]\\s*\"?(\\w+)\"?'\n",
    "    match = re.search(pattern, text)\n",
    "    if match:\n",
    "        action_type = match.group(1)\n",
    "        args_match = re.search(\n",
    "            r'\"?args\"?\\s*[:=]\\s*\"?([^\"}\\n]+)\"?', text\n",
    "        )\n",
    "        args = args_match.group(1).strip() if args_match else \"\"\n",
    "        return {\"type\": action_type, \"args\": args}\n",
    "\n",
    "    return {\"type\": \"close_case\", \"args\": \"\"}\n",
    "\n",
    "\n",
    "def format_prompt(observation: dict) -> str:\n",
    "    return f\"\"\"CURRENT SITUATION:\n",
    "{json.dumps(observation, indent=2)}\n",
    "\n",
    "What is your next action? Output valid JSON only.\"\"\"\n",
    "\n",
    "\n",
    "def run_episode(task_id: int = None, temperature: float = 0.7) -> tuple:\n",
    "    observation = env.reset(task_id=task_id)\n",
    "    actions_taken = []\n",
    "    total_reward = 0.0\n",
    "    done = False\n",
    "\n",
    "    messages = [\n",
    "        {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
    "        {\"role\": \"user\", \"content\": format_prompt(observation)}\n",
    "    ]\n",
    "\n",
    "    for step_num in range(MAX_STEPS):\n",
    "        input_text = tokenizer.apply_chat_template(\n",
    "            messages,\n",
    "            tokenize=False,\n",
    "            add_generation_prompt=True\n",
    "        )\n",
    "\n",
    "        inputs = tokenizer(\n",
    "            input_text,\n",
    "            return_tensors=\"pt\",\n",
    "            truncation=True,\n",
    "            max_length=MAX_SEQ_LEN\n",
    "        ).to(model.device)\n",
    "\n",
    "        outputs = model.generate(\n",
    "            **inputs,\n",
    "            max_new_tokens=150,\n",
    "            temperature=max(temperature, 0.01),\n",
    "            do_sample=True,\n",
    "            pad_token_id=tokenizer.eos_token_id,\n",
    "        )\n",
    "\n",
    "        new_tokens = outputs[0][inputs[\"input_ids\"].shape[1]:]\n",
    "        action_text = tokenizer.decode(\n",
    "            new_tokens,\n",
    "            skip_special_tokens=True\n",
    "        )\n",
    "\n",
    "        action = parse_action(action_text)\n",
    "        actions_taken.append({\n",
    "            \"step\": step_num,\n",
    "            \"raw\": action_text[:200],\n",
    "            \"parsed\": action\n",
    "        })\n",
    "\n",
    "        try:\n",
    "            observation, reward, done, info = env.step(action)\n",
    "            total_reward += reward\n",
    "        except Exception as e:\n",
    "            print(f\"  Step error: {e}\")\n",
    "            total_reward -= 0.1\n",
    "            done = True\n",
    "            break\n",
    "\n",
    "        messages.append({\"role\": \"assistant\", \"content\": action_text})\n",
    "        messages.append({\n",
    "            \"role\": \"user\",\n",
    "            \"content\": f\"Result:\\n{json.dumps(observation, indent=2)}\\n\\nNext action?\"\n",
    "        })\n",
    "\n",
    "        if done:\n",
    "            break\n",
    "\n",
    "    try:\n",
    "        final_grade = env.grade()\n",
    "    except:\n",
    "        final_grade = {\"total\": total_reward}\n",
    "\n",
    "    return actions_taken, total_reward, final_grade\n",
    "\n",
    "\n",
    "print(\"System prompt and rollout function ready\")\n",
    "\n",
    "# Quick sanity check — run one episode\n",
    "print(\"\\nRunning one test episode...\")\n",
    "actions, reward, grade = run_episode(task_id=1, temperature=0.0)\n",
    "print(f\"Test episode reward: {reward:.3f}\")\n",
    "print(f\"Actions taken: {len(actions)}\")\n",
    "print(f\"Grade: {grade}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "2eb94693-a98f-49d9-a81d-75c8cf645b35",
    "_uuid": "b111970f-7c82-4137-9223-f4da7bd12916",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "def generate_prompt_dataset(n_prompts: int = 200) -> Dataset:\n",
    "    prompts = []\n",
    "    task_weights = {1: 0.30, 2: 0.30, 3: 0.20, 4: 0.15, 5: 0.05}\n",
    "    task_pool = []\n",
    "\n",
    "    for task_id, weight in task_weights.items():\n",
    "        count = int(n_prompts * weight)\n",
    "        task_pool.extend([task_id] * count)\n",
    "\n",
    "    random.shuffle(task_pool)\n",
    "\n",
    "    for task_id in task_pool:\n",
    "        try:\n",
    "            obs = env.reset(task_id=task_id)\n",
    "            prompt = tokenizer.apply_chat_template(\n",
    "                [\n",
    "                    {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
    "                    {\"role\": \"user\", \"content\": format_prompt(obs)}\n",
    "                ],\n",
    "                tokenize=False,\n",
    "                add_generation_prompt=True\n",
    "            )\n",
    "            prompts.append({\"prompt\": prompt, \"task_id\": task_id})\n",
    "        except Exception as e:\n",
    "            print(f\"Skipping task {task_id}: {e}\")\n",
    "            continue\n",
    "\n",
    "    print(f\"Generated {len(prompts)} prompts\")\n",
    "    return Dataset.from_list(prompts)\n",
    "\n",
    "\n",
    "train_dataset = generate_prompt_dataset(n_prompts=200)\n",
    "print(f\"Dataset ready: {len(train_dataset)} prompts\")\n",
    "print(f\"Sample prompt length: {len(train_dataset[0]['prompt'])} chars\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "7bd8e434-930b-4391-93f4-41a8ee3c9971",
    "_uuid": "689da08e-d727-4e7b-85fb-009b366e3f50",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "episode_log = []\n",
    "\n",
    "def grpo_reward_function(\n",
    "    completions: list,\n",
    "    prompts: list,\n",
    "    **kwargs\n",
    ") -> list:\n",
    "\n",
    "    rewards = []\n",
    "\n",
    "    for completion in completions:\n",
    "        try:\n",
    "            action = parse_action(completion)\n",
    "            task_id = random.choice([1, 2, 3, 4])\n",
    "            env.reset(task_id=task_id)\n",
    "            _, reward, done, _ = env.step(action)\n",
    "\n",
    "            if action[\"type\"] == \"close_case\" and reward < 0.1:\n",
    "                reward -= 0.3\n",
    "\n",
    "            rewards.append(float(reward))\n",
    "\n",
    "        except Exception:\n",
    "            rewards.append(-0.5)\n",
    "\n",
    "    wandb.log({\n",
    "        \"reward/batch_mean\": np.mean(rewards),\n",
    "        \"reward/batch_max\":  np.max(rewards),\n",
    "        \"reward/batch_std\":  np.std(rewards),\n",
    "    })\n",
    "\n",
    "    episode_log.extend(rewards)\n",
    "    return rewards\n",
    "\n",
    "\n",
    "training_args = GRPOConfig(\n",
    "    output_dir=\"./swebench-checkpoints\",\n",
    "    num_train_epochs=3,\n",
    "    max_steps=500,\n",
    "    per_device_train_batch_size=2,\n",
    "    gradient_accumulation_steps=8,\n",
    "    num_generations=4,\n",
    "    max_completion_length=200,\n",
    "    learning_rate=5e-6,\n",
    "    warmup_steps=20,\n",
    "    weight_decay=0.01,\n",
    "    logging_steps=10,\n",
    "    save_steps=100,\n",
    "    report_to=\"wandb\",\n",
    "    gradient_checkpointing=True,\n",
    "    bf16=False,\n",
    "    fp16=True,\n",
    ")\n",
    "\n",
    "print(\"Reward function and training config ready\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "dd675d60-2347-40ce-b612-aac21ac7314f",
    "_uuid": "21373bdf-191a-4493-b04c-2eade2dd5954",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "trainer = GRPOTrainer(\n",
    "    model=model,\n",
    "    reward_funcs=grpo_reward_function,\n",
    "    args=training_args,\n",
    "    train_dataset=train_dataset,\n",
    "    tokenizer=tokenizer,\n",
    ")\n",
    "\n",
    "print(\"Starting GRPO training...\")\n",
    "print(f\"Steps: {training_args.max_steps}\")\n",
    "print(f\"Generations per prompt: {training_args.num_generations}\")\n",
    "print(\"This takes 60-90 minutes. Watch wandb dashboard.\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "trainer.train()\n",
    "\n",
    "print(\"Training complete!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "7b5a47d9-7abc-4824-982b-adb2ab684961",
    "_uuid": "9baa6267-f3ea-49d5-a6b8-de129657fcf9",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "print(\"=\" * 50)\n",
    "print(\"EVALUATING TRAINED MODEL\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "trained_rewards = []\n",
    "\n",
    "for i in range(BASELINE_EPISODES):\n",
    "    task_id = random.choice([1, 2, 3, 4, 5])\n",
    "    _, reward, _ = run_episode(task_id=task_id, temperature=0.0)\n",
    "    trained_rewards.append(reward)\n",
    "    print(f\"Episode {i+1:02d}/{BASELINE_EPISODES} \"\n",
    "          f\"| task={task_id} | reward={reward:.3f}\")\n",
    "\n",
    "TRAINED_AVG = np.mean(trained_rewards)\n",
    "print(f\"\\nTrained average:  {TRAINED_AVG:.3f}\")\n",
    "print(f\"Baseline average: {REAL_BASELINE_AVG:.3f}\")\n",
    "print(f\"Improvement:      +{TRAINED_AVG - REAL_BASELINE_AVG:.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "089714bd-53e6-40e7-a041-3b18dbff9f16",
    "_uuid": "452ccb4d-6b90-4049-a62d-841a054e9bcd",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "os.makedirs(\"plots\", exist_ok=True)\n",
    "\n",
    "# ── Plot 1: Reward Curve ──────────────────\n",
    "history_df = wandb.run.history(\n",
    "    keys=[\"reward/batch_mean\", \"_step\"]\n",
    ")\n",
    "history_df = history_df.dropna(subset=[\"reward/batch_mean\"])\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 5))\n",
    "ax.plot(\n",
    "    history_df[\"_step\"],\n",
    "    history_df[\"reward/batch_mean\"],\n",
    "    color=\"steelblue\", alpha=0.4,\n",
    "    linewidth=1, label=\"Per batch reward\"\n",
    ")\n",
    "if len(history_df) > 10:\n",
    "    smoothed = pd.Series(\n",
    "        history_df[\"reward/batch_mean\"].values\n",
    "    ).rolling(window=20, min_periods=1).mean()\n",
    "    ax.plot(\n",
    "        history_df[\"_step\"], smoothed,\n",
    "        color=\"steelblue\", linewidth=2.5,\n",
    "        label=\"Smoothed reward\"\n",
    "    )\n",
    "ax.axhline(\n",
    "    y=REAL_BASELINE_AVG, color=\"red\",\n",
    "    linestyle=\"--\", linewidth=2,\n",
    "    label=f\"Untrained baseline ({REAL_BASELINE_AVG:.2f})\"\n",
    ")\n",
    "ax.axhline(\n",
    "    y=TRAINED_AVG, color=\"green\",\n",
    "    linestyle=\"--\", linewidth=2,\n",
    "    label=f\"Trained model ({TRAINED_AVG:.2f})\"\n",
    ")\n",
    "ax.set_xlabel(\"Training Step\", fontsize=12)\n",
    "ax.set_ylabel(\"Episode Reward\", fontsize=12)\n",
    "ax.set_title(\"SWEbench-IN: GRPO Training Reward Curve\",\n",
    "             fontsize=14)\n",
    "ax.legend(fontsize=10)\n",
    "ax.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"plots/reward_curve.png\", dpi=150,\n",
    "            bbox_inches=\"tight\")\n",
    "plt.show()\n",
    "print(\"Saved: plots/reward_curve.png\")\n",
    "\n",
    "# ── Plot 2: Loss Curve ────────────────────\n",
    "fig, ax = plt.subplots(figsize=(12, 5))\n",
    "log_history = trainer.state.log_history\n",
    "losses = [x[\"loss\"] for x in log_history if \"loss\" in x]\n",
    "steps  = [x[\"step\"] for x in log_history if \"loss\" in x]\n",
    "\n",
    "if losses:\n",
    "    ax.plot(steps, losses, color=\"crimson\",\n",
    "            linewidth=1.5, label=\"Policy Loss\")\n",
    "    ax.set_xlabel(\"Training Step\", fontsize=12)\n",
    "    ax.set_ylabel(\"Loss\", fontsize=12)\n",
    "    ax.set_title(\"SWEbench-IN: Policy Loss Curve\",\n",
    "                 fontsize=14)\n",
    "    ax.legend(fontsize=10)\n",
    "    ax.grid(True, alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"plots/loss_curve.png\", dpi=150,\n",
    "            bbox_inches=\"tight\")\n",
    "plt.show()\n",
    "print(\"Saved: plots/loss_curve.png\")\n",
    "\n",
    "# ── Plot 3: Before vs After ───────────────\n",
    "fig, ax = plt.subplots(figsize=(12, 5))\n",
    "episodes = list(range(1, BASELINE_EPISODES + 1))\n",
    "ax.plot(episodes, baseline_rewards,\n",
    "        color=\"red\", marker=\"o\",\n",
    "        linewidth=1.5,\n",
    "        label=f\"Untrained (avg={REAL_BASELINE_AVG:.2f})\")\n",
    "ax.plot(episodes, trained_rewards,\n",
    "        color=\"green\", marker=\"s\",\n",
    "        linewidth=1.5,\n",
    "        label=f\"Trained (avg={TRAINED_AVG:.2f})\")\n",
    "ax.fill_between(\n",
    "    episodes, baseline_rewards, trained_rewards,\n",
    "    alpha=0.1, color=\"green\",\n",
    "    label=f\"Improvement: +{TRAINED_AVG - REAL_BASELINE_AVG:.2f}\"\n",
    ")\n",
    "ax.set_xlabel(\"Episode\", fontsize=12)\n",
    "ax.set_ylabel(\"Reward\", fontsize=12)\n",
    "ax.set_title(\"SWEbench-IN: Before vs After GRPO Training\",\n",
    "             fontsize=14)\n",
    "ax.legend(fontsize=10)\n",
    "ax.grid(True, alpha=0.3)\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"plots/before_after.png\", dpi=150,\n",
    "            bbox_inches=\"tight\")\n",
    "plt.show()\n",
    "print(\"Saved: plots/before_after.png\")\n",
    "\n",
    "print(\"\\nAll plots saved.\")\n",
    "print(\"Download from Kaggle Files panel → commit to repo\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "16534aa6-897c-453e-9a01-23da3fc20acc",
    "_uuid": "24c0f45c-3e8b-4205-8ae4-3a503f3624e1",
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "model.save_pretrained_merged(\n",
    "    \"swebench-in-lora\",\n",
    "    tokenizer=tokenizer,\n",
    "    save_method=\"lora\"\n",
    ")\n",
    "print(\"Model saved\")"
   ]
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "nvidiaTeslaT4",
   "dataSources": [],
   "dockerImageVersionId": 31329,
   "isGpuEnabled": true,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook"
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}