{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ECHO Training Notebook\n",
"Trains Qwen2.5-7B to predict its own correctness using GRPO + OpenEnv"
],
"id": "e67d4af1"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Install dependencies\n",
"!pip install -q \"trl>=0.8.0\" \"peft\" \"transformers\" \"datasets\" \"huggingface_hub\"\n",
"!pip install -q \"openenv-core[core]>=0.2.0\" || pip install -q git+https://github.com/meta-pytorch/OpenEnv.git\n",
"!pip install -q \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\""
],
"execution_count": null,
"outputs": [],
"id": "04648bc5"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"import os\n",
"import requests\n",
"import json\n",
"import numpy as np\n",
"from huggingface_hub import login\n",
"\n",
"# Authenticate\n",
"HF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\") # Set in Colab secrets\n",
"if HF_TOKEN:\n",
" login(HF_TOKEN)\n",
"\n",
"# Connect to live ECHO environment on HuggingFace Spaces\n",
"ECHO_SPACE_URL = \"https://vikaspandey582003-echo-ultimate.hf.space\"\n",
"\n",
"# Test connection\n",
"resp = requests.get(f\"{ECHO_SPACE_URL}/health\", timeout=10)\n",
"print(f\"Space status: {resp.json()}\")"
],
"execution_count": null,
"outputs": [],
"id": "b1aee9a5"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Simple HTTP client for the ECHO environment\n",
"class EchoEnvClient:\n",
" def __init__(self, base_url):\n",
" self.base_url = base_url.rstrip(\"/\")\n",
" \n",
" def reset(self):\n",
" r = requests.post(f\"{self.base_url}/reset\", timeout=30)\n",
" r.raise_for_status()\n",
" return r.json()\n",
" \n",
" def step(self, response_text: str):\n",
" # OpenEnv servers may accept either {\"response\": ...} or {\"action\": {\"response\": ...}}\n",
" payloads = [\n",
" {\"response\": response_text},\n",
" {\"action\": {\"response\": response_text}},\n",
" ]\n",
" last_error = None\n",
" for payload in payloads:\n",
" try:\n",
" r = requests.post(f\"{self.base_url}/step\", json=payload, timeout=30)\n",
" r.raise_for_status()\n",
" return r.json()\n",
" except Exception as e:\n",
" last_error = e\n",
" raise RuntimeError(f\"Step request failed for all payload formats: {last_error}\")\n",
" \n",
" def get_metrics(self):\n",
" r = requests.get(f\"{self.base_url}/metrics\", timeout=10)\n",
" r.raise_for_status()\n",
" return r.json()\n",
"\n",
"env = EchoEnvClient(ECHO_SPACE_URL)\n",
"\n",
"# Test: reset and take a step\n",
"obs = env.reset()\n",
"print(\"Question:\", obs.get(\"question\", \"\"))\n",
"result = env.step(\"70test answer\")\n",
"print(\"Step response keys:\", list(result.keys()))"
],
"execution_count": null,
"outputs": [],
"id": "dbf22129"
},
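{
"cell_type": "markdown",
"metadata": {},
"source": [
"HF Spaces can cold-start or drop requests mid-run, which would otherwise abort a long training loop. Below is a minimal retry sketch around `env.reset()`; the retry count and backoff are illustrative assumptions, not tuned values."
],
"id": "3f9a2b71"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Optional: retry wrapper for Space cold starts and transient network errors.\n",
"# A minimal sketch; retries and backoff are illustrative assumptions.\n",
"import time\n",
"\n",
"def reset_with_retry(client, retries=3, backoff=5.0):\n",
"    \"\"\"Call client.reset(), retrying transient failures with a fixed backoff.\"\"\"\n",
"    for attempt in range(retries):\n",
"        try:\n",
"            return client.reset()\n",
"        except Exception as e:\n",
"            if attempt == retries - 1:\n",
"                raise\n",
"            print(f\"reset failed ({e}); retrying in {backoff:.0f}s...\")\n",
"            time.sleep(backoff)\n",
"\n",
"obs = reset_with_retry(env)\n",
"print(\"Question:\", obs.get(\"question\", \"\"))"
],
"execution_count": null,
"outputs": [],
"id": "7d2c4e85"
},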
{
"cell_type": "code",
"metadata": {},
"source": [
"# Load model with Unsloth\n",
"from unsloth import FastLanguageModel\n",
"import torch\n",
"\n",
"model, tokenizer = FastLanguageModel.from_pretrained(\n",
" model_name=\"unsloth/Qwen2.5-7B-Instruct\",\n",
" max_seq_length=2048,\n",
" dtype=None,\n",
" load_in_4bit=True,\n",
")\n",
"\n",
"model = FastLanguageModel.get_peft_model(\n",
" model,\n",
" r=16,\n",
" target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
" \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
" lora_alpha=16,\n",
" lora_dropout=0,\n",
" bias=\"none\",\n",
" use_gradient_checkpointing=\"unsloth\",\n",
" random_state=42,\n",
")"
],
"execution_count": null,
"outputs": [],
"id": "e58fc972"
},
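{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before training, confirm the LoRA wrapping worked: `print_trainable_parameters()` is a standard `peft` method on the wrapped model, and an r=16 adapter over a 7B base should report well under 1% of parameters as trainable."
],
"id": "5c8d0e42"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Confirm only the LoRA adapter weights are trainable.\n",
"# For r=16 on a 7B model, expect well under 1% trainable parameters.\n",
"model.print_trainable_parameters()"
],
"execution_count": null,
"outputs": [],
"id": "8e3b1f96"
},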
{
"cell_type": "code",
"metadata": {},
"source": [
"from trl import GRPOConfig, GRPOTrainer\n",
"from datasets import Dataset\n",
"\n",
"SYSTEM_PROMPT = \"\"\"You are a calibrated AI assistant. For every question:\n",
"1. Think step-by-step (optional: use ... tags)\n",
"2. Output your confidence as an integer 0-100: INTEGER\n",
"3. Output your answer: YOUR ANSWER\n",
"\n",
"Be honest about uncertainty. Overconfidence is penalized heavily.\n",
"\n",
"CRITICAL: You MUST use and tags. Responses without these tags score -0.5 reward automatically. Example of correct format:\n",
"The capital of France is Paris, I am very sure.\n",
"95\n",
"Paris\"\"\"\n",
"\n",
"# Build dataset from ECHO environment\n",
"def build_training_dataset(n_samples=500):\n",
" samples = []\n",
" for _ in range(n_samples):\n",
" obs = env.reset()\n",
" question = obs.get(\"question\", \"\")\n",
" samples.append({\n",
" \"prompt\": f\"{SYSTEM_PROMPT}\\n\\nQuestion: {question}\",\n",
" \"question\": question,\n",
" })\n",
" return Dataset.from_list(samples)\n",
"\n",
"print(\"Building training dataset from live environment...\")\n",
"dataset = build_training_dataset(500)\n",
"print(f\"Dataset size: {len(dataset)}\")"
],
"execution_count": null,
"outputs": [],
"id": "bf6efbc1"
},
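{
"cell_type": "markdown",
"metadata": {},
"source": [
"The environment grades completions against the tag contract in `SYSTEM_PROMPT`. To make that contract concrete, here is an illustrative local parser; the authoritative parsing and scoring happen server-side in the ECHO Space, so the regexes and the `None` fallback below are assumptions about the format, not the Space's actual code."
],
"id": "9b4f6a23"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Illustrative parser for the <confidence>/<answer> format.\n",
"# Assumption: mirrors the server-side contract; the real scoring lives in the Space.\n",
"import re\n",
"\n",
"def parse_completion(text):\n",
"    conf_m = re.search(r\"<confidence>\\s*(\\d{1,3})\\s*</confidence>\", text)\n",
"    ans_m = re.search(r\"<answer>(.*?)</answer>\", text, re.DOTALL)\n",
"    if conf_m is None or ans_m is None:\n",
"        return None  # malformed: the environment scores this -0.5\n",
"    return {\n",
"        \"confidence\": min(100, int(conf_m.group(1))),\n",
"        \"answer\": ans_m.group(1).strip(),\n",
"    }\n",
"\n",
"print(parse_completion(\"<think>hmm</think><confidence>60</confidence><answer>Paris</answer>\"))\n",
"print(parse_completion(\"no tags at all\"))  # -> None"
],
"execution_count": null,
"outputs": [],
"id": "4a6d9c17"
},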
{
"cell_type": "code",
"metadata": {},
"source": [
"# GRPO reward function — calls live OpenEnv environment\n",
"ece_history = []\n",
"reward_history = []\n",
"confidence_eval_history = []\n",
"outcome_history = []\n",
"\n",
"def _extract_step_values(result: dict):\n",
" # Supports both flat and OpenEnv-shaped responses.\n",
" obs = result.get(\"observation\") or result.get(\"obs\") or result.get(\"state\") or {}\n",
" info = result.get(\"info\") or {}\n",
"\n",
" reward = result.get(\"reward\", info.get(\"reward\", obs.get(\"reward\", 0.0)))\n",
" ece = result.get(\"ece\", info.get(\"ece\", obs.get(\"ece\", 0.5)))\n",
" conf = result.get(\"confidence\", obs.get(\"confidence\", None))\n",
" is_correct = result.get(\"is_correct\", obs.get(\"is_correct\", info.get(\"was_correct\", None)))\n",
"\n",
" return float(reward), float(ece), conf, is_correct\n",
"\n",
"def echo_reward_function(completions, prompts=None, **kwargs):\n",
" \"\"\"\n",
" Reward function that evaluates each completion against the live ECHO environment.\n",
" This is the core of GRPO training — the environment provides the reward signal.\n",
" \"\"\"\n",
" rewards = []\n",
" for i, completion in enumerate(completions):\n",
" try:\n",
" # Reset for each completion so reward is grounded to a fresh environment question.\n",
" env.reset()\n",
"\n",
" # Each completion is evaluated by the running OpenEnv Space.\n",
" result = env.step(completion)\n",
" reward, ece, conf, is_correct = _extract_step_values(result)\n",
"\n",
" ece_history.append(ece)\n",
" reward_history.append(reward)\n",
" if conf is not None:\n",
" confidence_eval_history.append(float(conf) / 100.0)\n",
" if is_correct is not None:\n",
" outcome_history.append(1.0 if bool(is_correct) else 0.0)\n",
" rewards.append(reward)\n",
"\n",
" except Exception as e:\n",
" print(f\"Env step failed: {e}\")\n",
" rewards.append(-0.5) # penalty for failed step\n",
"\n",
" return rewards\n",
"\n",
"# Alias used by the sanity check cell below\n",
"echo_reward = echo_reward_function"
],
"execution_count": null,
"outputs": [],
"id": "bbd4c2d9"
},
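{
"cell_type": "markdown",
"metadata": {},
"source": [
"`ece_history` logs the Expected Calibration Error reported by the environment. For reference, the standard binned definition is ECE = sum_b (|B_b| / N) * |acc(B_b) - conf(B_b)|, where the B_b are confidence bins. The sketch below implements that definition with 10 equal-width bins; the Space's exact binning may differ, so treat it as a reference, not the environment's code."
],
"id": "2e7c5d18"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Reference implementation of binned Expected Calibration Error (ECE).\n",
"# Assumption: the ECHO Space computes ECE server-side; this sketch only\n",
"# illustrates the standard definition with 10 equal-width bins.\n",
"import numpy as np\n",
"\n",
"def expected_calibration_error(confidences, outcomes, n_bins=10):\n",
"    \"\"\"ECE = sum over bins of (bin weight) * |accuracy - mean confidence|.\"\"\"\n",
"    conf = np.asarray(confidences, dtype=float)  # confidences in [0, 1]\n",
"    out = np.asarray(outcomes, dtype=float)      # 1.0 = correct, 0.0 = wrong\n",
"    bins = np.linspace(0.0, 1.0, n_bins + 1)\n",
"    ece = 0.0\n",
"    for i in range(n_bins):\n",
"        in_bin = (conf >= bins[i]) & (conf < bins[i + 1])\n",
"        if i == n_bins - 1:  # include conf == 1.0 in the last bin\n",
"            in_bin = (conf >= bins[i]) & (conf <= bins[i + 1])\n",
"        if in_bin.any():\n",
"            ece += in_bin.mean() * abs(out[in_bin].mean() - conf[in_bin].mean())\n",
"    return ece\n",
"\n",
"# Toy example: uniformly overconfident predictions give a large ECE.\n",
"print(expected_calibration_error([0.9, 0.9, 0.9, 0.9], [1, 0, 0, 0]))  # 0.65"
],
"execution_count": null,
"outputs": [],
"id": "6a1f8c3b"
},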
{
"cell_type": "code",
"metadata": {},
"source": [
"# ============================================================\n",
"# PRE-TRAINING SANITY CHECK — run this before starting training\n",
"# If all 3 checks pass, training will work. If any fail, fix first.\n",
"# ============================================================\n",
"print(\"=== PRE-TRAINING SANITY CHECK ===\\n\")\n",
"\n",
"# 1. Test environment connection\n",
"obs = env.reset()\n",
"assert \"question\" in obs, \"❌ /reset broken — check Space is running\"\n",
"print(f\"✅ Environment connected: {obs['question'][:70]}...\")\n",
"\n",
"# 2. Test reward function with a known good response\n",
"good_response = \"Let me think carefully about this.75Paris\"\n",
"result = env.step(good_response)\n",
"assert \"reward\" in result or \"state\" in result, \"❌ /step broken — check Space is running\"\n",
"reward_val = result.get(\"reward\", result.get(\"state\", {}).get(\"reward\", \"?\"))\n",
"ece_val = result.get(\"ece\", result.get(\"state\", {}).get(\"ece\", \"?\"))\n",
"print(f\"✅ /step working: reward={reward_val}, ece={ece_val}\")\n",
"\n",
"# 3. Test reward function returns sensible values\n",
"test_responses = [\n",
" \"8042\", # good format\n",
" \"hmm60Paris\", # good format with think\n",
" \"I think the answer is Paris, I am sure about this.\", # BAD format — no tags\n",
"]\n",
"rewards = echo_reward(test_responses)\n",
"print(f\"✅ Reward function outputs: {[round(r, 3) for r in rewards]}\")\n",
"print(f\" good_format_1={rewards[0]:.3f} good_format_2={rewards[1]:.3f} bad_format={rewards[2]:.3f}\")\n",
"\n",
"assert rewards[2] < max(rewards[0], rewards[1]), (\n",
" f\"❌ Bad format not being penalized! rewards={rewards}. \"\n",
" \"Check echo_reward_function — parser may not be filtering correctly.\"\n",
")\n",
"\n",
"print()\n",
"print(\"=\" * 50)\n",
"print(\"✅ ALL CHECKS PASSED — safe to start training!\")\n",
"print(f\" Good format reward: {rewards[0]:.3f}\")\n",
"print(f\" Bad format reward: {rewards[2]:.3f}\")\n",
"print(f\" Penalty gap: {rewards[0] - rewards[2]:.3f}\")\n",
"print()\n",
"print(\"⚠️ WATCH for these in first 30 training steps:\")\n",
"print(\" GOOD: rewards between -0.5 and +0.8 (mixed)\")\n",
"print(\" BAD : all rewards exactly -0.5 → stop & report\")\n",
"print(\"=\" * 50)"
],
"execution_count": null,
"outputs": [],
"id": "081d73fd"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Configure GRPO training — OPTIMIZED for A10G small (~2.5 hrs, ~$3-4 cost)\n",
"# Hardware: A10G small ($1.05/hr) — 3x faster than T4 for 7B models\n",
"# max_completion_length=256: enough for reasoning, 2x faster than 512\n",
"\n",
"# Rebuild dataset for A10G run\n",
"dataset_a10g = build_training_dataset(300)\n",
"print(f\"Dataset: {len(dataset_a10g)} samples\")\n",
"\n",
"training_args = GRPOConfig(\n",
" output_dir=\"echo_grpo_output\",\n",
" num_train_epochs=1,\n",
" per_device_train_batch_size=1,\n",
" gradient_accumulation_steps=8, # effective batch = 8, keep for GRPO stability\n",
" learning_rate=2e-5,\n",
" warmup_steps=20,\n",
" logging_steps=5,\n",
" save_steps=50,\n",
" bf16=True, # A10G supports bfloat16 — better than fp16\n",
" fp16=False,\n",
" report_to=\"none\",\n",
" max_completion_length=256, # 256 = enough reasoning space, 2x faster than 512\n",
" num_generations=4, # GRPO group size — do NOT reduce\n",
" temperature=0.8,\n",
")\n",
"\n",
"trainer = GRPOTrainer(\n",
" model=model,\n",
" args=training_args,\n",
" reward_funcs=[echo_reward_function],\n",
" train_dataset=dataset_a10g,\n",
" tokenizer=tokenizer,\n",
")\n",
"\n",
"print(\"=\" * 55)\n",
"print(\"🚀 ECHO GRPO Training — A10G small + 256 tokens\")\n",
"print(\" 300 samples | 1 epoch | grad_accum=8\")\n",
"print(\" Estimated: ~2.5 hrs | Cost: ~$3-4\")\n",
"print(\"=\" * 55)\n",
"print()\n",
"print(\"Watch step output — after step 5 you should see:\")\n",
"print(\" GOOD: rewards mixed between -0.5 and +0.8\")\n",
"print(\" BAD : all rewards exactly -0.5 → stop & report\")\n",
"print()\n",
"trainer.train()\n",
"print(\"\\n✅ Training complete!\")"
],
"execution_count": null,
"outputs": [],
"id": "7258d2c1"
},
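{
"cell_type": "markdown",
"metadata": {},
"source": [
"Why `num_generations=4` must not be reduced: GRPO scores each completion relative to the other completions sampled for the same prompt. Below is a minimal sketch of the group-normalized advantage (mean-centered, std-scaled) from the GRPO formulation; TRL computes this internally, and the reward numbers are purely illustrative."
],
"id": "d4e2a9f7"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# How GRPO turns a group of rewards into advantages (TRL does this internally).\n",
"# Each prompt is sampled num_generations times; the advantage of completion i is\n",
"# A_i = (r_i - mean(r)) / (std(r) + eps), i.e. quality relative to its group.\n",
"# With a group size of 1, every advantage is zero and there is no learning signal.\n",
"import numpy as np\n",
"\n",
"group_rewards = np.array([0.8, -0.5, 0.3, -0.5])  # 4 completions of one prompt\n",
"advantages = (group_rewards - group_rewards.mean()) / (group_rewards.std() + 1e-4)\n",
"print(advantages)  # positive above the group mean, negative below"
],
"execution_count": null,
"outputs": [],
"id": "1f7b3d64"
},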
{
"cell_type": "code",
"metadata": {},
"source": [
"# Plot ECE curve, reward curve, and reliability diagram\n",
"import matplotlib.pyplot as plt\n",
"\n",
"fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))\n",
"\n",
"# ECE curve\n",
"if ece_history:\n",
" window = 50\n",
" smoothed = [np.mean(ece_history[max(0, i - window):i + 1]) for i in range(len(ece_history))]\n",
" ax1.plot(ece_history, alpha=0.3, color='blue', label='Raw ECE')\n",
" ax1.plot(smoothed, color='blue', linewidth=2, label='Smoothed ECE')\n",
" ax1.axhline(y=0.15, color='green', linestyle='--', label='Good threshold (0.15)')\n",
" ax1.axhline(y=0.20, color='orange', linestyle='--', label='Acceptable (0.20)')\n",
" ax1.set_xlabel('Training Steps')\n",
" ax1.set_ylabel('ECE (lower = better)')\n",
" ax1.set_title('ECHO: ECE During GRPO Training')\n",
" ax1.legend()\n",
" ax1.grid(True, alpha=0.3)\n",
"\n",
"# Reward curve\n",
"if reward_history:\n",
" window = 50\n",
" smoothed_r = [np.mean(reward_history[max(0, i - window):i + 1]) for i in range(len(reward_history))]\n",
" ax2.plot(reward_history, alpha=0.3, color='green', label='Raw Reward')\n",
" ax2.plot(smoothed_r, color='green', linewidth=2, label='Smoothed Reward')\n",
" ax2.set_xlabel('Training Steps')\n",
" ax2.set_ylabel('Reward')\n",
" ax2.set_title('ECHO: Reward During GRPO Training')\n",
" ax2.legend()\n",
" ax2.grid(True, alpha=0.3)\n",
"\n",
"# Reliability diagram\n",
"if confidence_eval_history and outcome_history and len(confidence_eval_history) == len(outcome_history):\n",
" n_bins = 10\n",
" bins = np.linspace(0.0, 1.0, n_bins + 1)\n",
" bin_centers = (bins[:-1] + bins[1:]) / 2\n",
" accs = []\n",
" confs = []\n",
"\n",
" conf_arr = np.array(confidence_eval_history)\n",
" out_arr = np.array(outcome_history)\n",
"\n",
" for i in range(n_bins):\n",
" mask = (conf_arr >= bins[i]) & (conf_arr < bins[i + 1])\n",
" if i == n_bins - 1:\n",
" mask = (conf_arr >= bins[i]) & (conf_arr <= bins[i + 1])\n",
" if np.any(mask):\n",
" accs.append(float(np.mean(out_arr[mask])))\n",
" confs.append(float(np.mean(conf_arr[mask])))\n",
" else:\n",
" accs.append(np.nan)\n",
" confs.append(np.nan)\n",
"\n",
" ax3.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect calibration')\n",
" ax3.plot(bin_centers, accs, marker='o', linewidth=2, color='purple', label='Model')\n",
" ax3.set_xlabel('Predicted confidence')\n",
" ax3.set_ylabel('Empirical accuracy')\n",
" ax3.set_title('Reliability Diagram')\n",
" ax3.set_xlim(0, 1)\n",
" ax3.set_ylim(0, 1)\n",
" ax3.grid(True, alpha=0.3)\n",
" ax3.legend()\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig(\"echo_training_curves.png\", dpi=150, bbox_inches='tight')\n",
"plt.show()\n",
"print(f\"Final ECE: {ece_history[-1]:.4f}\" if ece_history else \"No ECE data\")"
],
"execution_count": null,
"outputs": [],
"id": "e548b198"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Save and push adapter to HF Hub\n",
"model.save_pretrained(\"echo_lora_adapter\")\n",
"tokenizer.save_pretrained(\"echo_lora_adapter\")\n",
"\n",
"from huggingface_hub import HfApi\n",
"api = HfApi()\n",
"api.upload_folder(\n",
" folder_path=\"echo_lora_adapter\",\n",
" repo_id=\"Vikaspandey582003/echo-calibration-adapter\",\n",
" repo_type=\"model\",\n",
" commit_message=\"ECHO GRPO-trained calibration adapter - Hackathon submission\",\n",
")\n",
"print(\"Adapter pushed to HF Hub!\")\n",
"print(\"Model: https://huggingface.co/Vikaspandey582003/echo-calibration-adapter\")"
],
"execution_count": null,
"outputs": [],
"id": "091afb04"
}
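,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To sanity-check the pushed adapter, reload it on top of the base model with plain `transformers` + `peft`. A minimal sketch: it assumes the upload above succeeded, that you are authenticated, and that `unsloth/Qwen2.5-7B-Instruct` resolves to the same base weights the adapter was trained on."
],
"id": "f2a8c6d9"
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Reload the pushed adapter for a quick inference check.\n",
"# Assumptions: the repo above exists, HF auth is set up, and enough GPU/CPU\n",
"# memory is available to hold the 7B base model.\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"from peft import PeftModel\n",
"\n",
"ADAPTER_REPO = \"Vikaspandey582003/echo-calibration-adapter\"\n",
"base = AutoModelForCausalLM.from_pretrained(\n",
"    \"unsloth/Qwen2.5-7B-Instruct\", torch_dtype=\"auto\", device_map=\"auto\"\n",
")\n",
"tok = AutoTokenizer.from_pretrained(ADAPTER_REPO)\n",
"model_inf = PeftModel.from_pretrained(base, ADAPTER_REPO)\n",
"\n",
"prompt = f\"{SYSTEM_PROMPT}\\n\\nQuestion: What is the capital of France?\"\n",
"inputs = tok(prompt, return_tensors=\"pt\").to(base.device)\n",
"out = model_inf.generate(**inputs, max_new_tokens=128)\n",
"print(tok.decode(out[0][inputs[\"input_ids\"].shape[1]:], skip_special_tokens=True))"
],
"execution_count": null,
"outputs": [],
"id": "b9e4d7a2"
}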
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}