{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ECHO Training Notebook\n",
"Trains Qwen2.5-7B to predict its own correctness using GRPO + OpenEnv"
],
"id": "e67d4af1"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Install dependencies\n",
"!pip install -q \"trl>=0.8.0\" \"peft\" \"transformers\" \"datasets\" \"huggingface_hub\"\n",
"!pip install -q \"openenv-core[core]>=0.2.0\" || pip install -q git+https://github.com/meta-pytorch/OpenEnv.git\n",
"!pip install -q \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\""
],
"execution_count": null,
"outputs": [],
"id": "04648bc5"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"import os\n",
"import requests\n",
"import json\n",
"import numpy as np\n",
"from huggingface_hub import login\n",
"\n",
"# Authenticate\n",
"HF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\") # Set in Colab secrets\n",
"if HF_TOKEN:\n",
" login(HF_TOKEN)\n",
"\n",
"# Connect to live ECHO environment on HuggingFace Spaces\n",
"ECHO_SPACE_URL = \"https://vikaspandey582003-echo-ultimate.hf.space\"\n",
"\n",
"# Test connection\n",
"resp = requests.get(f\"{ECHO_SPACE_URL}/health\", timeout=10)\n",
"print(f\"Space status: {resp.json()}\")"
],
"execution_count": null,
"outputs": [],
"id": "b1aee9a5"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Simple HTTP client for the ECHO environment\n",
"class EchoEnvClient:\n",
" def __init__(self, base_url):\n",
" self.base_url = base_url.rstrip(\"/\")\n",
" \n",
" def reset(self):\n",
" r = requests.post(f\"{self.base_url}/reset\", timeout=30)\n",
" r.raise_for_status()\n",
" return r.json()\n",
" \n",
" def step(self, response_text: str):\n",
" # OpenEnv servers may accept either {\"response\": ...} or {\"action\": {\"response\": ...}}\n",
" payloads = [\n",
" {\"response\": response_text},\n",
" {\"action\": {\"response\": response_text}},\n",
" ]\n",
" last_error = None\n",
" for payload in payloads:\n",
" try:\n",
" r = requests.post(f\"{self.base_url}/step\", json=payload, timeout=30)\n",
" r.raise_for_status()\n",
" return r.json()\n",
" except Exception as e:\n",
" last_error = e\n",
" raise RuntimeError(f\"Step request failed for all payload formats: {last_error}\")\n",
" \n",
" def get_metrics(self):\n",
" r = requests.get(f\"{self.base_url}/metrics\", timeout=10)\n",
" r.raise_for_status()\n",
" return r.json()\n",
"\n",
"env = EchoEnvClient(ECHO_SPACE_URL)\n",
"\n",
"# Test: reset and take a step\n",
"obs = env.reset()\n",
"print(\"Question:\", obs.get(\"question\", \"\"))\n",
"result = env.step(\"70test answer\")\n",
"print(\"Step response keys:\", list(result.keys()))"
],
"execution_count": null,
"outputs": [],
"id": "dbf22129"
},
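{
"cell_type": "markdown",
"metadata": {},
"source": [
"HF Spaces can cold-start or drop requests mid-run, which would otherwise abort a long training loop. Below is a minimal retry sketch around `env.reset()`; the retry count and backoff are illustrative assumptions, not tuned values."
],
"id": "3f9a2b71"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Optional: retry wrapper for Space cold starts and transient network errors.\n",
"# A minimal sketch; retries and backoff are illustrative assumptions.\n",
"import time\n",
"\n",
"def reset_with_retry(client, retries=3, backoff=5.0):\n",
"    \"\"\"Call client.reset(), retrying transient failures with a fixed backoff.\"\"\"\n",
"    for attempt in range(retries):\n",
"        try:\n",
"            return client.reset()\n",
"        except Exception as e:\n",
"            if attempt == retries - 1:\n",
"                raise\n",
"            print(f\"reset failed ({e}); retrying in {backoff:.0f}s...\")\n",
"            time.sleep(backoff)\n",
"\n",
"obs = reset_with_retry(env)\n",
"print(\"Question:\", obs.get(\"question\", \"\"))"
],
"execution_count": null,
"outputs": [],
"id": "7d2c4e85"
},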
{
"cell_type": "code",
"metadata": {},
"source": [
"# Load model with Unsloth\n",
"from unsloth import FastLanguageModel\n",
"import torch\n",
"\n",
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
" model_name=\"unsloth/Qwen2.5-7B-Instruct\",\n",
" max_seq_length=2048,\n",
" dtype=None,\n",
" load_in_4bit=True,\n",
")\n",
"\n",
"model = FastLanguageModel.get_peft_model(\n",
" model,\n",
" r=16,\n",
" target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
" \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
" lora_alpha=16,\n",
" lora_dropout=0,\n",
" bias=\"none\",\n",
" use_gradient_checkpointing=\"unsloth\",\n",
" random_state=42,\n",
")"
],
"execution_count": null,
"outputs": [],
"id": "e58fc972"
},
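{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before training, confirm the LoRA wrapping worked: `print_trainable_parameters()` is a standard `peft` method on the wrapped model, and an r=16 adapter over a 7B base should report well under 1% of parameters as trainable."
],
"id": "5c8d0e42"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Confirm only the LoRA adapter weights are trainable.\n",
"# For r=16 on a 7B model, expect well under 1% trainable parameters.\n",
"model.print_trainable_parameters()"
],
"execution_count": null,
"outputs": [],
"id": "8e3b1f96"
},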
{
"cell_type": "code",
"metadata": {},
"source": [
"from trl import GRPOConfig, GRPOTrainer\n",
"from datasets import Dataset\n",
"\n",
"SYSTEM_PROMPT = \"\"\"You are a calibrated AI assistant. For every question:\n",
"1. Think step-by-step (optional: use ... tags)\n",
"2. Output your confidence as an integer 0-100: INTEGER\n",
"3. Output your answer: YOUR ANSWER\n",
"\n",
"Be honest about uncertainty. Overconfidence is penalized heavily.\n",
"\n",
"CRITICAL: You MUST use and tags. Responses without these tags score -0.5 reward automatically. Example of correct format:\n",
"The capital of France is Paris, I am very sure.\n",
"95\n",
"Paris\"\"\"\n",
"\n",
"# Build dataset from ECHO environment\n",
"def build_training_dataset(n_samples=500):\n",
" samples = []\n",
" for _ in range(n_samples):\n",
" obs = env.reset()\n",
" question = obs.get(\"question\", \"\")\n",
" samples.append({\n",
" \"prompt\": f\"{SYSTEM_PROMPT}\\n\\nQuestion: {question}\",\n",
" \"question\": question,\n",
" })\n",
" return Dataset.from_list(samples)\n",
"\n",
"print(\"Building training dataset from live environment...\")\n",
"dataset = build_training_dataset(500)\n",
"print(f\"Dataset size: {len(dataset)}\")"
],
"execution_count": null,
"outputs": [],
"id": "bf6efbc1"
},
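{
"cell_type": "markdown",
"metadata": {},
"source": [
"The environment grades completions against the tag contract in `SYSTEM_PROMPT`. To make that contract concrete, here is an illustrative local parser; the authoritative parsing and scoring happen server-side in the ECHO Space, so the regexes and the `None` fallback below are assumptions about the format, not the Space's actual code."
],
"id": "9b4f6a23"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Illustrative parser for the <confidence>/<answer> format.\n",
"# Assumption: mirrors the server-side contract; the real scoring lives in the Space.\n",
"import re\n",
"\n",
"def parse_completion(text):\n",
"    conf_m = re.search(r\"<confidence>\\s*(\\d{1,3})\\s*</confidence>\", text)\n",
"    ans_m = re.search(r\"<answer>(.*?)</answer>\", text, re.DOTALL)\n",
"    if conf_m is None or ans_m is None:\n",
"        return None  # malformed: the environment scores this -0.5\n",
"    return {\n",
"        \"confidence\": min(100, int(conf_m.group(1))),\n",
"        \"answer\": ans_m.group(1).strip(),\n",
"    }\n",
"\n",
"print(parse_completion(\"<think>hmm</think><confidence>60</confidence><answer>Paris</answer>\"))\n",
"print(parse_completion(\"no tags at all\"))  # -> None"
],
"execution_count": null,
"outputs": [],
"id": "4a6d9c17"
},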
{
"cell_type": "code",
"metadata": {},
"source": [
"# GRPO reward function — calls live OpenEnv environment\n",
"ece_history = []\n",
"reward_history = []\n",
"confidence_eval_history = []\n",
"outcome_history = []\n",
"\n",
"def _extract_step_values(result: dict):\n",
" # Supports both flat and OpenEnv-shaped responses.\n",
" obs = result.get(\"observation\") or result.get(\"obs\") or result.get(\"state\") or {}\n",
" info = result.get(\"info\") or {}\n",
"\n",
" reward = result.get(\"reward\", info.get(\"reward\", obs.get(\"reward\", 0.0)))\n",
" ece = result.get(\"ece\", info.get(\"ece\", obs.get(\"ece\", 0.5)))\n",
" conf = result.get(\"confidence\", obs.get(\"confidence\", None))\n",
" is_correct = result.get(\"is_correct\", obs.get(\"is_correct\", info.get(\"was_correct\", None)))\n",
"\n",
" return float(reward), float(ece), conf, is_correct\n",
"\n",
"def echo_reward_function(completions, prompts=None, **kwargs):\n",
" \"\"\"\n",
" Reward function that evaluates each completion against the live ECHO environment.\n",
" This is the core of GRPO training — the environment provides the reward signal.\n",
" \"\"\"\n",
" rewards = []\n",
" for i, completion in enumerate(completions):\n",
" try:\n",
" # Reset for each completion so reward is grounded to a fresh environment question.\n",
" env.reset()\n",
"\n",
" # Each completion is evaluated by the running OpenEnv Space.\n",
" result = env.step(completion)\n",
" reward, ece, conf, is_correct = _extract_step_values(result)\n",
"\n",
" ece_history.append(ece)\n",
" reward_history.append(reward)\n",
" if conf is not None:\n",
" confidence_eval_history.append(float(conf) / 100.0)\n",
" if is_correct is not None:\n",
" outcome_history.append(1.0 if bool(is_correct) else 0.0)\n",
" rewards.append(reward)\n",
"\n",
" except Exception as e:\n",
" print(f\"Env step failed: {e}\")\n",
" rewards.append(-0.5) # penalty for failed step\n",
"\n",
" return rewards\n",
"\n",
"# Alias used by the sanity check cell below\n",
"echo_reward = echo_reward_function"
],
"execution_count": null,
"outputs": [],
"id": "bbd4c2d9"
},
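{
"cell_type": "markdown",
"metadata": {},
"source": [
"`ece_history` logs the Expected Calibration Error reported by the environment. For reference, the standard binned definition is ECE = sum_b (|B_b| / N) * |acc(B_b) - conf(B_b)|, where the B_b are confidence bins. The sketch below implements that definition with 10 equal-width bins; the Space's exact binning may differ, so treat it as a reference, not the environment's code."
],
"id": "2e7c5d18"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Reference implementation of binned Expected Calibration Error (ECE).\n",
"# Assumption: the ECHO Space computes ECE server-side; this sketch only\n",
"# illustrates the standard definition with 10 equal-width bins.\n",
"import numpy as np\n",
"\n",
"def expected_calibration_error(confidences, outcomes, n_bins=10):\n",
"    \"\"\"ECE = sum over bins of (bin weight) * |accuracy - mean confidence|.\"\"\"\n",
"    conf = np.asarray(confidences, dtype=float)  # confidences in [0, 1]\n",
"    out = np.asarray(outcomes, dtype=float)      # 1.0 = correct, 0.0 = wrong\n",
"    bins = np.linspace(0.0, 1.0, n_bins + 1)\n",
"    ece = 0.0\n",
"    for i in range(n_bins):\n",
"        in_bin = (conf >= bins[i]) & (conf < bins[i + 1])\n",
"        if i == n_bins - 1:  # include conf == 1.0 in the last bin\n",
"            in_bin = (conf >= bins[i]) & (conf <= bins[i + 1])\n",
"        if in_bin.any():\n",
"            ece += in_bin.mean() * abs(out[in_bin].mean() - conf[in_bin].mean())\n",
"    return ece\n",
"\n",
"# Toy example: uniformly overconfident predictions give a large ECE.\n",
"print(expected_calibration_error([0.9, 0.9, 0.9, 0.9], [1, 0, 0, 0]))  # 0.65"
],
"execution_count": null,
"outputs": [],
"id": "6a1f8c3b"
},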
{
"cell_type": "code",
"metadata": {},
"source": [
"# ============================================================\n",
"# PRE-TRAINING SANITY CHECK — run this before starting training\n",
"# If all 3 checks pass, training will work. If any fail, fix first.\n",
"# ============================================================\n",
"print(\"=== PRE-TRAINING SANITY CHECK ===\\n\")\n",
"\n",
"# 1. Test environment connection\n",
"obs = env.reset()\n",
"assert \"question\" in obs, \"❌ /reset broken — check Space is running\"\n",
"print(f\"✅ Environment connected: {obs['question'][:70]}...\")\n",
"\n",
"# 2. Test reward function with a known good response\n",
"good_response = \"Let me think carefully about this.75Paris\"\n",
"result = env.step(good_response)\n",
"assert \"reward\" in result or \"state\" in result, \"❌ /step broken — check Space is running\"\n",
"reward_val = result.get(\"reward\", result.get(\"state\", {}).get(\"reward\", \"?\"))\n",
"ece_val = result.get(\"ece\", result.get(\"state\", {}).get(\"ece\", \"?\"))\n",
"print(f\"✅ /step working: reward={reward_val}, ece={ece_val}\")\n",
"\n",
"# 3. Test reward function returns sensible values\n",
"test_responses = [\n",
" \"8042\", # good format\n",
" \"hmm60Paris\", # good format with think\n",
" \"I think the answer is Paris, I am sure about this.\", # BAD format — no tags\n",
"]\n",
"rewards = echo_reward(test_responses)\n",
"print(f\"✅ Reward function outputs: {[round(r, 3) for r in rewards]}\")\n",
"print(f\" good_format_1={rewards[0]:.3f} good_format_2={rewards[1]:.3f} bad_format={rewards[2]:.3f}\")\n",
"\n",
"assert rewards[2] < max(rewards[0], rewards[1]), (\n",
" f\"❌ Bad format not being penalized! rewards={rewards}. \"\n",
" \"Check echo_reward_function — parser may not be filtering correctly.\"\n",
")\n",
"\n",
"print()\n",
"print(\"=\" * 50)\n",
"print(\"✅ ALL CHECKS PASSED — safe to start training!\")\n",
"print(f\" Good format reward: {rewards[0]:.3f}\")\n",
"print(f\" Bad format reward: {rewards[2]:.3f}\")\n",
"print(f\" Penalty gap: {rewards[0] - rewards[2]:.3f}\")\n",
"print()\n",
"print(\"⚠️ WATCH for these in first 30 training steps:\")\n",
"print(\" GOOD: rewards between -0.5 and +0.8 (mixed)\")\n",
"print(\" BAD : all rewards exactly -0.5 → stop & report\")\n",
"print(\"=\" * 50)"
],
"execution_count": null,
"outputs": [],
"id": "081d73fd"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Configure GRPO training — OPTIMIZED for A10G small (~2.5 hrs, ~$3-4 cost)\n",
"# Hardware: A10G small ($1.05/hr) — 3x faster than T4 for 7B models\n",
"# max_completion_length=256: enough for reasoning, 2x faster than 512\n",
"\n",
"# Rebuild dataset for A10G run\n",
"dataset_a10g = build_training_dataset(300)\n",
"print(f\"Dataset: {len(dataset_a10g)} samples\")\n",
"\n",
"training_args = GRPOConfig(\n",
" output_dir=\"echo_grpo_output\",\n",
" num_train_epochs=1,\n",
" per_device_train_batch_size=1,\n",
" gradient_accumulation_steps=8, # effective batch = 8, keep for GRPO stability\n",
" learning_rate=2e-5,\n",
" warmup_steps=20,\n",
" logging_steps=5,\n",
" save_steps=50,\n",
" bf16=True, # A10G supports bfloat16 — better than fp16\n",
" fp16=False,\n",
" report_to=\"none\",\n",
" max_completion_length=256, # 256 = enough reasoning space, 2x faster than 512\n",
" num_generations=4, # GRPO group size — do NOT reduce\n",
" temperature=0.8,\n",
")\n",
"\n",
"trainer = GRPOTrainer(\n",
" model=model,\n",
" args=training_args,\n",
" reward_funcs=[echo_reward_function],\n",
" train_dataset=dataset_a10g,\n",
" tokenizer=tokenizer,\n",
")\n",
"\n",
"print(\"=\" * 55)\n",
"print(\"🚀 ECHO GRPO Training — A10G small + 256 tokens\")\n",
"print(\" 300 samples | 1 epoch | grad_accum=8\")\n",
"print(\" Estimated: ~2.5 hrs | Cost: ~$3-4\")\n",
"print(\"=\" * 55)\n",
"print()\n",
"print(\"Watch step output — after step 5 you should see:\")\n",
"print(\" GOOD: rewards mixed between -0.5 and +0.8\")\n",
"print(\" BAD : all rewards exactly -0.5 → stop & report\")\n",
"print()\n",
"trainer.train()\n",
"print(\"\\n✅ Training complete!\")"
],
"execution_count": null,
"outputs": [],
"id": "7258d2c1"
},
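{
"cell_type": "markdown",
"metadata": {},
"source": [
"Why `num_generations=4` must not be reduced: GRPO scores each completion relative to the other completions sampled for the same prompt. Below is a minimal sketch of the group-normalized advantage (mean-centered, std-scaled) from the GRPO formulation; TRL computes this internally, and the reward numbers are purely illustrative."
],
"id": "d4e2a9f7"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# How GRPO turns a group of rewards into advantages (TRL does this internally).\n",
"# Each prompt is sampled num_generations times; the advantage of completion i is\n",
"# A_i = (r_i - mean(r)) / (std(r) + eps), i.e. quality relative to its group.\n",
"# With a group size of 1, every advantage is zero and there is no learning signal.\n",
"import numpy as np\n",
"\n",
"group_rewards = np.array([0.8, -0.5, 0.3, -0.5])  # 4 completions of one prompt\n",
"advantages = (group_rewards - group_rewards.mean()) / (group_rewards.std() + 1e-4)\n",
"print(advantages)  # positive above the group mean, negative below"
],
"execution_count": null,
"outputs": [],
"id": "1f7b3d64"
},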
{
"cell_type": "code",
"metadata": {},
"source": [
"# Plot ECE curve, reward curve, and reliability diagram\n",
"import matplotlib.pyplot as plt\n",
"\n",
"fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))\n",
"\n",
"# ECE curve\n",
"if ece_history:\n",
" window = 50\n",
" smoothed = [np.mean(ece_history[max(0, i - window):i + 1]) for i in range(len(ece_history))]\n",
" ax1.plot(ece_history, alpha=0.3, color='blue', label='Raw ECE')\n",
" ax1.plot(smoothed, color='blue', linewidth=2, label='Smoothed ECE')\n",
" ax1.axhline(y=0.15, color='green', linestyle='--', label='Good threshold (0.15)')\n",
" ax1.axhline(y=0.20, color='orange', linestyle='--', label='Acceptable (0.20)')\n",
" ax1.set_xlabel('Training Steps')\n",
" ax1.set_ylabel('ECE (lower = better)')\n",
" ax1.set_title('ECHO: ECE During GRPO Training')\n",
" ax1.legend()\n",
" ax1.grid(True, alpha=0.3)\n",
"\n",
"# Reward curve\n",
"if reward_history:\n",
" window = 50\n",
" smoothed_r = [np.mean(reward_history[max(0, i - window):i + 1]) for i in range(len(reward_history))]\n",
" ax2.plot(reward_history, alpha=0.3, color='green', label='Raw Reward')\n",
" ax2.plot(smoothed_r, color='green', linewidth=2, label='Smoothed Reward')\n",
" ax2.set_xlabel('Training Steps')\n",
" ax2.set_ylabel('Reward')\n",
" ax2.set_title('ECHO: Reward During GRPO Training')\n",
" ax2.legend()\n",
" ax2.grid(True, alpha=0.3)\n",
"\n",
"# Reliability diagram\n",
"if confidence_eval_history and outcome_history and len(confidence_eval_history) == len(outcome_history):\n",
" n_bins = 10\n",
" bins = np.linspace(0.0, 1.0, n_bins + 1)\n",
" bin_centers = (bins[:-1] + bins[1:]) / 2\n",
" accs = []\n",
" confs = []\n",
"\n",
" conf_arr = np.array(confidence_eval_history)\n",
" out_arr = np.array(outcome_history)\n",
"\n",
" for i in range(n_bins):\n",
" mask = (conf_arr >= bins[i]) & (conf_arr < bins[i + 1])\n",
" if i == n_bins - 1:\n",
" mask = (conf_arr >= bins[i]) & (conf_arr <= bins[i + 1])\n",
" if np.any(mask):\n",
" accs.append(float(np.mean(out_arr[mask])))\n",
" confs.append(float(np.mean(conf_arr[mask])))\n",
" else:\n",
" accs.append(np.nan)\n",
" confs.append(np.nan)\n",
"\n",
" ax3.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect calibration')\n",
" ax3.plot(bin_centers, accs, marker='o', linewidth=2, color='purple', label='Model')\n",
" ax3.set_xlabel('Predicted confidence')\n",
" ax3.set_ylabel('Empirical accuracy')\n",
" ax3.set_title('Reliability Diagram')\n",
" ax3.set_xlim(0, 1)\n",
" ax3.set_ylim(0, 1)\n",
" ax3.grid(True, alpha=0.3)\n",
" ax3.legend()\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig(\"echo_training_curves.png\", dpi=150, bbox_inches='tight')\n",
"plt.show()\n",
"print(f\"Final ECE: {ece_history[-1]:.4f}\" if ece_history else \"No ECE data\")"
],
"execution_count": null,
"outputs": [],
"id": "e548b198"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Save and push adapter to HF Hub\n",
"model.save_pretrained(\"echo_lora_adapter\")\n",
"tokenizer.save_pretrained(\"echo_lora_adapter\")\n",
"\n",
"from huggingface_hub import HfApi\n",
"api = HfApi()\n",
"api.upload_folder(\n",
" folder_path=\"echo_lora_adapter\",\n",
" repo_id=\"Vikaspandey582003/echo-calibration-adapter\",\n",
" repo_type=\"model\",\n",
" commit_message=\"ECHO GRPO-trained calibration adapter - Hackathon submission\",\n",
")\n",
"print(\"Adapter pushed to HF Hub!\")\n",
"print(\"Model: https://huggingface.co/Vikaspandey582003/echo-calibration-adapter\")"
],
"execution_count": null,
"outputs": [],
"id": "091afb04"
}
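,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To sanity-check the pushed adapter, reload it on top of the base model with plain `transformers` + `peft`. A minimal sketch: it assumes the upload above succeeded, that you are authenticated, and that `unsloth/Qwen2.5-7B-Instruct` resolves to the same base weights the adapter was trained on."
],
"id": "f2a8c6d9"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Reload the pushed adapter for a quick inference check.\n",
"# Assumptions: the repo above exists, HF auth is set up, and enough GPU/CPU\n",
"# memory is available to hold the 7B base model.\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"from peft import PeftModel\n",
"\n",
"ADAPTER_REPO = \"Vikaspandey582003/echo-calibration-adapter\"\n",
"base = AutoModelForCausalLM.from_pretrained(\n",
"    \"unsloth/Qwen2.5-7B-Instruct\", torch_dtype=\"auto\", device_map=\"auto\"\n",
")\n",
"tok = AutoTokenizer.from_pretrained(ADAPTER_REPO)\n",
"model_inf = PeftModel.from_pretrained(base, ADAPTER_REPO)\n",
"\n",
"prompt = f\"{SYSTEM_PROMPT}\\n\\nQuestion: What is the capital of France?\"\n",
"inputs = tok(prompt, return_tensors=\"pt\").to(base.device)\n",
"out = model_inf.generate(**inputs, max_new_tokens=128)\n",
"print(tok.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True))"
],
"execution_count": null,
"outputs": [],
"id": "b9e4d7a2"
}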
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}