{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ECHO Training Notebook\n", "Trains Qwen2.5-7B to predict its own correctness using GRPO + OpenEnv" ], "id": "e67d4af1" }, { "cell_type": "code", "metadata": {}, "source": [ "# Install dependencies\n", "!pip install -q \"trl>=0.8.0\" \"peft\" \"transformers\" \"datasets\" \"huggingface_hub\"\n", "!pip install -q \"openenv-core[core]>=0.2.0\" || pip install -q git+https://github.com/meta-pytorch/OpenEnv.git\n", "!pip install -q \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\"" ], "execution_count": null, "outputs": [], "id": "04648bc5" }, { "cell_type": "code", "metadata": {}, "source": [ "import os\n", "import requests\n", "import json\n", "import numpy as np\n", "from huggingface_hub import login\n", "\n", "# Authenticate\n", "HF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\") # Set in Colab secrets\n", "if HF_TOKEN:\n", " login(HF_TOKEN)\n", "\n", "# Connect to live ECHO environment on HuggingFace Spaces\n", "ECHO_SPACE_URL = \"https://vikaspandey582003-echo-ultimate.hf.space\"\n", "\n", "# Test connection\n", "resp = requests.get(f\"{ECHO_SPACE_URL}/health\", timeout=10)\n", "print(f\"Space status: {resp.json()}\")" ], "execution_count": null, "outputs": [], "id": "b1aee9a5" }, { "cell_type": "code", "metadata": {}, "source": [ "# Simple HTTP client for the ECHO environment\n", "class EchoEnvClient:\n", " def __init__(self, base_url):\n", " self.base_url = base_url.rstrip(\"/\")\n", " \n", " def reset(self):\n", " r = requests.post(f\"{self.base_url}/reset\", timeout=30)\n", " r.raise_for_status()\n", " return r.json()\n", " \n", " def step(self, response_text: str):\n", " # OpenEnv servers may accept either {\"response\": ...} or {\"action\": {\"response\": ...}}\n", " payloads = [\n", " {\"response\": response_text},\n", " {\"action\": {\"response\": response_text}},\n", " ]\n", " last_error = None\n", " for payload in payloads:\n", " try:\n", " r = requests.post(f\"{self.base_url}/step\", json=payload, timeout=30)\n", " r.raise_for_status()\n", " return r.json()\n", " except Exception as e:\n", " last_error = e\n", " raise RuntimeError(f\"Step request failed for all payload formats: {last_error}\")\n", " \n", " def get_metrics(self):\n", " r = requests.get(f\"{self.base_url}/metrics\", timeout=10)\n", " r.raise_for_status()\n", " return r.json()\n", "\n", "env = EchoEnvClient(ECHO_SPACE_URL)\n", "\n", "# Test: reset and take a step\n", "obs = env.reset()\n", "print(\"Question:\", obs.get(\"question\", \"\"))\n", "result = env.step(\"70test answer\")\n", "print(\"Step response keys:\", list(result.keys()))" ], "execution_count": null, "outputs": [], "id": "dbf22129" }, { "cell_type": "code", "metadata": {}, "source": [ "# Load model with Unsloth\n", "from unsloth import FastLanguageModel\n", "import torch\n", "\n", "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name=\"unsloth/Qwen2.5-7B-Instruct\",\n", " max_seq_length=2048,\n", " dtype=None,\n", " load_in_4bit=True,\n", ")\n", "\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " r=16,\n", " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", " \"gate_proj\", \"up_proj\", \"down_proj\"],\n", " lora_alpha=16,\n", " lora_dropout=0,\n", " bias=\"none\",\n", " use_gradient_checkpointing=\"unsloth\",\n", " random_state=42,\n", ")" ], "execution_count": null, "outputs": [], "id": "e58fc972" }, { "cell_type": "code", "metadata": {}, "source": [ "from trl import GRPOConfig, GRPOTrainer\n", "from datasets 
import Dataset\n", "\n", "SYSTEM_PROMPT = \"\"\"You are a calibrated AI assistant. For every question:\n", "1. Think step-by-step (optional: use ... tags)\n", "2. Output your confidence as an integer 0-100: INTEGER\n", "3. Output your answer: YOUR ANSWER\n", "\n", "Be honest about uncertainty. Overconfidence is penalized heavily.\n", "\n", "CRITICAL: You MUST use and tags. Responses without these tags score -0.5 reward automatically. Example of correct format:\n", "The capital of France is Paris, I am very sure.\n", "95\n", "Paris\"\"\"\n", "\n", "# Build dataset from ECHO environment\n", "def build_training_dataset(n_samples=500):\n", " samples = []\n", " for _ in range(n_samples):\n", " obs = env.reset()\n", " question = obs.get(\"question\", \"\")\n", " samples.append({\n", " \"prompt\": f\"{SYSTEM_PROMPT}\\n\\nQuestion: {question}\",\n", " \"question\": question,\n", " })\n", " return Dataset.from_list(samples)\n", "\n", "print(\"Building training dataset from live environment...\")\n", "dataset = build_training_dataset(500)\n", "print(f\"Dataset size: {len(dataset)}\")" ], "execution_count": null, "outputs": [], "id": "bf6efbc1" }, { "cell_type": "code", "metadata": {}, "source": [ "# GRPO reward function — calls live OpenEnv environment\n", "ece_history = []\n", "reward_history = []\n", "confidence_eval_history = []\n", "outcome_history = []\n", "\n", "def _extract_step_values(result: dict):\n", " # Supports both flat and OpenEnv-shaped responses.\n", " obs = result.get(\"observation\") or result.get(\"obs\") or result.get(\"state\") or {}\n", " info = result.get(\"info\") or {}\n", "\n", " reward = result.get(\"reward\", info.get(\"reward\", obs.get(\"reward\", 0.0)))\n", " ece = result.get(\"ece\", info.get(\"ece\", obs.get(\"ece\", 0.5)))\n", " conf = result.get(\"confidence\", obs.get(\"confidence\", None))\n", " is_correct = result.get(\"is_correct\", obs.get(\"is_correct\", info.get(\"was_correct\", None)))\n", "\n", " return float(reward), float(ece), conf, is_correct\n", "\n", "def echo_reward_function(completions, prompts=None, **kwargs):\n", " \"\"\"\n", " Reward function that evaluates each completion against the live ECHO environment.\n", " This is the core of GRPO training — the environment provides the reward signal.\n", " \"\"\"\n", " rewards = []\n", " for i, completion in enumerate(completions):\n", " try:\n", " # Reset for each completion so reward is grounded to a fresh environment question.\n", " env.reset()\n", "\n", " # Each completion is evaluated by the running OpenEnv Space.\n", " result = env.step(completion)\n", " reward, ece, conf, is_correct = _extract_step_values(result)\n", "\n", " ece_history.append(ece)\n", " reward_history.append(reward)\n", " if conf is not None:\n", " confidence_eval_history.append(float(conf) / 100.0)\n", " if is_correct is not None:\n", " outcome_history.append(1.0 if bool(is_correct) else 0.0)\n", " rewards.append(reward)\n", "\n", " except Exception as e:\n", " print(f\"Env step failed: {e}\")\n", " rewards.append(-0.5) # penalty for failed step\n", "\n", " return rewards\n", "\n", "# Alias used by the sanity check cell below\n", "echo_reward = echo_reward_function" ], "execution_count": null, "outputs": [], "id": "bbd4c2d9" }, { "cell_type": "code", "metadata": {}, "source": [ "# ============================================================\n", "# PRE-TRAINING SANITY CHECK — run this before starting training\n", "# If all 3 checks pass, training will work. 
{ "cell_type": "code", "metadata": {}, "source": [ "# ============================================================\n", "# PRE-TRAINING SANITY CHECK — run this before starting training\n", "# If all 3 checks pass, training should work. If any fail, fix first.\n", "# ============================================================\n", "print(\"=== PRE-TRAINING SANITY CHECK ===\\n\")\n", "\n", "# 1. Test environment connection\n", "obs = env.reset()\n", "assert \"question\" in obs, \"❌ /reset broken — check Space is running\"\n", "print(f\"✅ Environment connected: {obs['question'][:70]}...\")\n", "\n", "# 2. Test /step with a known good response\n", "good_response = \"<think>Let me think carefully about this.</think><confidence>75</confidence><answer>Paris</answer>\"\n", "result = env.step(good_response)\n", "assert \"reward\" in result or \"state\" in result, \"❌ /step broken — check Space is running\"\n", "reward_val = result.get(\"reward\", result.get(\"state\", {}).get(\"reward\", \"?\"))\n", "ece_val = result.get(\"ece\", result.get(\"state\", {}).get(\"ece\", \"?\"))\n", "print(f\"✅ /step working: reward={reward_val}, ece={ece_val}\")\n", "\n", "# 3. Test reward function returns sensible values\n", "test_responses = [\n", "    \"<confidence>80</confidence><answer>42</answer>\",  # good format\n", "    \"<think>hmm</think><confidence>60</confidence><answer>Paris</answer>\",  # good format with think\n", "    \"I think the answer is Paris, I am sure about this.\",  # BAD format — no tags\n", "]\n", "rewards = echo_reward(test_responses)\n", "print(f\"✅ Reward function outputs: {[round(r, 3) for r in rewards]}\")\n", "print(f\"   good_format_1={rewards[0]:.3f} good_format_2={rewards[1]:.3f} bad_format={rewards[2]:.3f}\")\n", "\n", "assert rewards[2] < max(rewards[0], rewards[1]), (\n", "    f\"❌ Bad format not being penalized! rewards={rewards}. \"\n", "    \"Check echo_reward_function — parser may not be filtering correctly.\"\n", ")\n", "\n", "print()\n", "print(\"=\" * 50)\n", "print(\"✅ ALL CHECKS PASSED — safe to start training!\")\n", "print(f\"   Good format reward: {rewards[0]:.3f}\")\n", "print(f\"   Bad format reward: {rewards[2]:.3f}\")\n", "print(f\"   Penalty gap: {rewards[0] - rewards[2]:.3f}\")\n", "print()\n", "print(\"⚠️ WATCH for these in first 30 training steps:\")\n", "print(\"   GOOD: rewards between -0.5 and +0.8 (mixed)\")\n", "print(\"   BAD : all rewards exactly -0.5 → stop & report\")\n", "print(\"=\" * 50)" ], "execution_count": null, "outputs": [], "id": "081d73fd" }, { "cell_type": "code", "metadata": {}, "source": [ "# Configure GRPO training — tuned for A10G small (~2.5 hrs, ~$3-4 cost)\n", "# Hardware: A10G small ($1.05/hr) — roughly 3x faster than a T4 for 7B models\n", "# max_completion_length=256: enough for reasoning, ~2x faster than 512\n", "\n", "# Rebuild dataset for the A10G run\n", "dataset_a10g = build_training_dataset(300)\n", "print(f\"Dataset: {len(dataset_a10g)} samples\")\n", "\n", "training_args = GRPOConfig(\n", "    output_dir=\"echo_grpo_output\",\n", "    num_train_epochs=1,\n", "    per_device_train_batch_size=1,\n", "    gradient_accumulation_steps=8,  # effective batch = 8 (divisible by num_generations, as GRPO requires)\n", "    learning_rate=2e-5,\n", "    warmup_steps=20,\n", "    logging_steps=5,\n", "    save_steps=50,\n", "    bf16=True,  # A10G supports bfloat16 — better than fp16\n", "    fp16=False,\n", "    report_to=\"none\",\n", "    max_completion_length=256,  # enough reasoning space, ~2x faster than 512\n", "    num_generations=4,  # GRPO group size — do NOT reduce\n", "    temperature=0.8,\n", ")\n", "\n", "trainer = GRPOTrainer(\n", "    model=model,\n", "    args=training_args,\n", "    reward_funcs=[echo_reward_function],\n", "    train_dataset=dataset_a10g,\n", "    processing_class=tokenizer,  # TRL's GRPOTrainer takes processing_class, not tokenizer\n", ")\n", "\n", "print(\"=\" * 55)\n", "print(\"🚀 ECHO GRPO Training — A10G small + 256 tokens\")\n", "print(\"   300 samples | 1 epoch | grad_accum=8\")\n", "print(\"   Estimated: ~2.5 hrs | Cost: ~$3-4\")\n", "print(\"=\" * 55)\n", "print()\n", "print(\"Watch step output — after step 5 you should see:\")\n", "print(\"   GOOD: rewards mixed between -0.5 and +0.8\")\n", "print(\"   BAD : all rewards exactly -0.5 → stop & report\")\n", "print()\n", "trainer.train()\n", "print(\"\\n✅ Training complete!\")" ], "execution_count": null, "outputs": [], "id": "7258d2c1" },
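{ "cell_type": "markdown", "metadata": {}, "source": [ "The client's `get_metrics()` (defined above, wrapping the Space's `/metrics` endpoint) hasn't been used yet; after training it gives the server-side view of the run. The response schema is whatever the Space returns, so this just dumps the raw JSON rather than assuming field names." ], "id": "9e2b71a4" }, { "cell_type": "code", "metadata": {}, "source": [ "# Server-side aggregates from the Space's /metrics endpoint.\n", "# Schema is Space-defined, so dump raw JSON instead of assuming field names.\n", "try:\n", "    metrics = env.get_metrics()\n", "    print(json.dumps(metrics, indent=2))\n", "except Exception as e:\n", "    print(f\"/metrics unavailable: {e}\")" ], "execution_count": null, "outputs": [], "id": "c4d08f5b" },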
55)\n", "print()\n", "print(\"Watch step output — after step 5 you should see:\")\n", "print(\" GOOD: rewards mixed between -0.5 and +0.8\")\n", "print(\" BAD : all rewards exactly -0.5 → stop & report\")\n", "print()\n", "trainer.train()\n", "print(\"\\n✅ Training complete!\")" ], "execution_count": null, "outputs": [], "id": "7258d2c1" }, { "cell_type": "code", "metadata": {}, "source": [ "# Plot ECE curve, reward curve, and reliability diagram\n", "import matplotlib.pyplot as plt\n", "\n", "fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))\n", "\n", "# ECE curve\n", "if ece_history:\n", " window = 50\n", " smoothed = [np.mean(ece_history[max(0, i - window):i + 1]) for i in range(len(ece_history))]\n", " ax1.plot(ece_history, alpha=0.3, color='blue', label='Raw ECE')\n", " ax1.plot(smoothed, color='blue', linewidth=2, label='Smoothed ECE')\n", " ax1.axhline(y=0.15, color='green', linestyle='--', label='Good threshold (0.15)')\n", " ax1.axhline(y=0.20, color='orange', linestyle='--', label='Acceptable (0.20)')\n", " ax1.set_xlabel('Training Steps')\n", " ax1.set_ylabel('ECE (lower = better)')\n", " ax1.set_title('ECHO: ECE During GRPO Training')\n", " ax1.legend()\n", " ax1.grid(True, alpha=0.3)\n", "\n", "# Reward curve\n", "if reward_history:\n", " window = 50\n", " smoothed_r = [np.mean(reward_history[max(0, i - window):i + 1]) for i in range(len(reward_history))]\n", " ax2.plot(reward_history, alpha=0.3, color='green', label='Raw Reward')\n", " ax2.plot(smoothed_r, color='green', linewidth=2, label='Smoothed Reward')\n", " ax2.set_xlabel('Training Steps')\n", " ax2.set_ylabel('Reward')\n", " ax2.set_title('ECHO: Reward During GRPO Training')\n", " ax2.legend()\n", " ax2.grid(True, alpha=0.3)\n", "\n", "# Reliability diagram\n", "if confidence_eval_history and outcome_history and len(confidence_eval_history) == len(outcome_history):\n", " n_bins = 10\n", " bins = np.linspace(0.0, 1.0, n_bins + 1)\n", " bin_centers = (bins[:-1] + bins[1:]) / 2\n", " accs = []\n", " confs = []\n", "\n", " conf_arr = np.array(confidence_eval_history)\n", " out_arr = np.array(outcome_history)\n", "\n", " for i in range(n_bins):\n", " mask = (conf_arr >= bins[i]) & (conf_arr < bins[i + 1])\n", " if i == n_bins - 1:\n", " mask = (conf_arr >= bins[i]) & (conf_arr <= bins[i + 1])\n", " if np.any(mask):\n", " accs.append(float(np.mean(out_arr[mask])))\n", " confs.append(float(np.mean(conf_arr[mask])))\n", " else:\n", " accs.append(np.nan)\n", " confs.append(np.nan)\n", "\n", " ax3.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect calibration')\n", " ax3.plot(bin_centers, accs, marker='o', linewidth=2, color='purple', label='Model')\n", " ax3.set_xlabel('Predicted confidence')\n", " ax3.set_ylabel('Empirical accuracy')\n", " ax3.set_title('Reliability Diagram')\n", " ax3.set_xlim(0, 1)\n", " ax3.set_ylim(0, 1)\n", " ax3.grid(True, alpha=0.3)\n", " ax3.legend()\n", "\n", "plt.tight_layout()\n", "plt.savefig(\"echo_training_curves.png\", dpi=150, bbox_inches='tight')\n", "plt.show()\n", "print(f\"Final ECE: {ece_history[-1]:.4f}\" if ece_history else \"No ECE data\")" ], "execution_count": null, "outputs": [], "id": "e548b198" }, { "cell_type": "code", "metadata": {}, "source": [ "# Save and push adapter to HF Hub\n", "model.save_pretrained(\"echo_lora_adapter\")\n", "tokenizer.save_pretrained(\"echo_lora_adapter\")\n", "\n", "from huggingface_hub import HfApi\n", "api = HfApi()\n", "api.upload_folder(\n", " folder_path=\"echo_lora_adapter\",\n", " 
repo_id=\"Vikaspandey582003/echo-calibration-adapter\",\n", " repo_type=\"model\",\n", " commit_message=\"ECHO GRPO-trained calibration adapter - Hackathon submission\",\n", ")\n", "print(\"Adapter pushed to HF Hub!\")\n", "print(\"Model: https://huggingface.co/Vikaspandey582003/echo-calibration-adapter\")" ], "execution_count": null, "outputs": [], "id": "091afb04" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10" } }, "nbformat": 4, "nbformat_minor": 5 }