fix: pure FastAPI on port 7860 — all OpenEnv endpoints live + Gradio at /ui
Files changed:
- Dockerfile (+6 −18)
- ECHO_Training.ipynb (+368 −0)
- README.md (+283 −37)
- app.py (+9 −12)
- asgi.py (+9 −0)
- client.py (+14 −0)
- models.py (+44 −0)
- openenv.yaml (+3 −3)
- pyproject.toml (+28 −0)
- requirements.txt (+34 −5)
- server/app.py (+107 −51)
Dockerfile — CHANGED

@@ -1,24 +1,12 @@
-FROM python:3.
-
+FROM python:3.10-slim
 WORKDIR /app
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential curl git && \
-    rm -rf /var/lib/apt/lists/*
-
+RUN apt-get update && apt-get install -y git gcc g++ curl && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-
 COPY . .
-
 RUN mkdir -p data results/plots
-
-# Pre-generate all plots so Gradio loads instantly (falls back silently on failure)
-RUN python scripts/generate_plots.py || echo "Plot pre-generation skipped"
-
+RUN python scripts/generate_plots.py || echo "Plot generation skipped"
 EXPOSE 7860
-
-
-
-
-CMD ["python", "app.py"]
+HEALTHCHECK --interval=30s --timeout=10s --start-period=90s \
+    CMD curl -f http://localhost:7860/health || exit 1
+CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
ECHO_Training.ipynb — ADDED

@@ -0,0 +1,368 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e67d4af1",
   "metadata": {},
   "source": [
    "# ECHO Training Notebook\n",
    "Trains Qwen2.5-7B to predict its own correctness using GRPO + OpenEnv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04648bc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install dependencies\n",
    "!pip install -q \"trl>=0.8.0\" \"peft\" \"transformers\" \"datasets\" \"huggingface_hub\"\n",
    "!pip install -q \"openenv-core[core]>=0.2.0\" || pip install -q git+https://github.com/meta-pytorch/OpenEnv.git\n",
    "!pip install -q \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1aee9a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import requests\n",
    "import json\n",
    "import numpy as np\n",
    "from huggingface_hub import login\n",
    "\n",
    "# Authenticate\n",
    "HF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\")  # Set in Colab secrets\n",
    "if HF_TOKEN:\n",
    "    login(HF_TOKEN)\n",
    "\n",
    "# Connect to live ECHO environment on HuggingFace Spaces\n",
    "ECHO_SPACE_URL = \"https://vikaspandey582003-echo-ultimate.hf.space\"\n",
    "\n",
    "# Test connection\n",
    "resp = requests.get(f\"{ECHO_SPACE_URL}/health\", timeout=10)\n",
    "print(f\"Space status: {resp.json()}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbf22129",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simple HTTP client for the ECHO environment\n",
    "class EchoEnvClient:\n",
    "    def __init__(self, base_url):\n",
    "        self.base_url = base_url.rstrip(\"/\")\n",
    "\n",
    "    def reset(self):\n",
    "        r = requests.post(f\"{self.base_url}/reset\", timeout=30)\n",
    "        r.raise_for_status()\n",
    "        return r.json()\n",
    "\n",
    "    def step(self, response_text: str):\n",
    "        # OpenEnv servers may accept either {\"response\": ...} or {\"action\": {\"response\": ...}}\n",
    "        payloads = [\n",
    "            {\"response\": response_text},\n",
    "            {\"action\": {\"response\": response_text}},\n",
    "        ]\n",
    "        last_error = None\n",
    "        for payload in payloads:\n",
    "            try:\n",
    "                r = requests.post(f\"{self.base_url}/step\", json=payload, timeout=30)\n",
    "                r.raise_for_status()\n",
    "                return r.json()\n",
    "            except Exception as e:\n",
    "                last_error = e\n",
    "        raise RuntimeError(f\"Step request failed for all payload formats: {last_error}\")\n",
    "\n",
    "    def get_metrics(self):\n",
    "        r = requests.get(f\"{self.base_url}/metrics\", timeout=10)\n",
    "        r.raise_for_status()\n",
    "        return r.json()\n",
    "\n",
    "env = EchoEnvClient(ECHO_SPACE_URL)\n",
    "\n",
    "# Test: reset and take a step\n",
    "obs = env.reset()\n",
    "print(\"Question:\", obs.get(\"question\", \"\"))\n",
    "result = env.step(\"<confidence>70</confidence><answer>test answer</answer>\")\n",
    "print(\"Step response keys:\", list(result.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e58fc972",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load model with Unsloth\n",
    "from unsloth import FastLanguageModel\n",
    "import torch\n",
    "\n",
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name=\"unsloth/Qwen2.5-7B-Instruct\",\n",
    "    max_seq_length=2048,\n",
    "    dtype=None,\n",
    "    load_in_4bit=True,\n",
    ")\n",
    "\n",
    "model = FastLanguageModel.get_peft_model(\n",
    "    model,\n",
    "    r=16,\n",
    "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
    "                    \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
    "    lora_alpha=16,\n",
    "    lora_dropout=0,\n",
    "    bias=\"none\",\n",
    "    use_gradient_checkpointing=\"unsloth\",\n",
    "    random_state=42,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf6efbc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from trl import GRPOConfig, GRPOTrainer\n",
    "from datasets import Dataset\n",
    "\n",
    "SYSTEM_PROMPT = \"\"\"You are a calibrated AI assistant. For every question:\n",
    "1. Think step-by-step (optional: use <think>...</think> tags)\n",
    "2. Output your confidence as an integer 0-100: <confidence>INTEGER</confidence>\n",
    "3. Output your answer: <answer>YOUR ANSWER</answer>\n",
    "\n",
    "Be honest about uncertainty. Overconfidence is penalized heavily.\"\"\"\n",
    "\n",
    "# Build dataset from ECHO environment\n",
    "def build_training_dataset(n_samples=500):\n",
    "    samples = []\n",
    "    for _ in range(n_samples):\n",
    "        obs = env.reset()\n",
    "        question = obs.get(\"question\", \"\")\n",
    "        samples.append({\n",
    "            \"prompt\": f\"{SYSTEM_PROMPT}\\n\\nQuestion: {question}\",\n",
    "            \"question\": question,\n",
    "        })\n",
    "    return Dataset.from_list(samples)\n",
    "\n",
    "print(\"Building training dataset from live environment...\")\n",
    "dataset = build_training_dataset(500)\n",
    "print(f\"Dataset size: {len(dataset)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbd4c2d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# GRPO reward function — calls live OpenEnv environment\n",
    "ece_history = []\n",
    "reward_history = []\n",
    "confidence_eval_history = []\n",
    "outcome_history = []\n",
    "\n",
    "def _extract_step_values(result: dict):\n",
    "    # Supports both flat and OpenEnv-shaped responses.\n",
    "    obs = result.get(\"observation\") or result.get(\"obs\") or result.get(\"state\") or {}\n",
    "    info = result.get(\"info\") or {}\n",
    "\n",
    "    reward = result.get(\"reward\", info.get(\"reward\", obs.get(\"reward\", 0.0)))\n",
    "    ece = result.get(\"ece\", info.get(\"ece\", obs.get(\"ece\", 0.5)))\n",
    "    conf = result.get(\"confidence\", obs.get(\"confidence\", None))\n",
    "    is_correct = result.get(\"is_correct\", obs.get(\"is_correct\", info.get(\"was_correct\", None)))\n",
    "\n",
    "    return float(reward), float(ece), conf, is_correct\n",
    "\n",
    "def echo_reward_function(completions, prompts=None, **kwargs):\n",
    "    \"\"\"\n",
    "    Reward function that evaluates each completion against the live ECHO environment.\n",
    "    This is the core of GRPO training — the environment provides the reward signal.\n",
    "    \"\"\"\n",
    "    rewards = []\n",
    "    for i, completion in enumerate(completions):\n",
    "        try:\n",
    "            # Reset for each completion so reward is grounded to a fresh environment question.\n",
    "            env.reset()\n",
    "\n",
    "            # Each completion is evaluated by the running OpenEnv Space.\n",
    "            result = env.step(completion)\n",
    "            reward, ece, conf, is_correct = _extract_step_values(result)\n",
    "\n",
    "            ece_history.append(ece)\n",
    "            reward_history.append(reward)\n",
    "            if conf is not None:\n",
    "                confidence_eval_history.append(float(conf) / 100.0)\n",
    "            if is_correct is not None:\n",
    "                outcome_history.append(1.0 if bool(is_correct) else 0.0)\n",
    "            rewards.append(reward)\n",
    "\n",
    "        except Exception as e:\n",
    "            print(f\"Env step failed: {e}\")\n",
    "            rewards.append(-0.5)  # penalty for failed step\n",
    "\n",
    "    return rewards"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7258d2c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure GRPO training\n",
    "training_args = GRPOConfig(\n",
    "    output_dir=\"echo_grpo_output\",\n",
    "    num_train_epochs=3,\n",
    "    per_device_train_batch_size=1,\n",
    "    gradient_accumulation_steps=8,\n",
    "    learning_rate=2e-5,\n",
    "    warmup_steps=50,\n",
    "    logging_steps=10,\n",
    "    save_steps=100,\n",
    "    fp16=True,\n",
    "    report_to=\"none\",\n",
    "    max_completion_length=512,\n",
    "    num_generations=4,  # GRPO group size\n",
    "    temperature=0.8,\n",
    ")\n",
    "\n",
    "trainer = GRPOTrainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    reward_funcs=[echo_reward_function],\n",
    "    train_dataset=dataset,\n",
    "    tokenizer=tokenizer,\n",
    ")\n",
    "\n",
    "print(\"Starting GRPO training against live ECHO environment...\")\n",
    "trainer.train()\n",
    "print(\"Training complete!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e548b198",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot ECE curve, reward curve, and reliability diagram\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))\n",
    "\n",
    "# ECE curve\n",
    "if ece_history:\n",
    "    window = 50\n",
    "    smoothed = [np.mean(ece_history[max(0, i - window):i + 1]) for i in range(len(ece_history))]\n",
    "    ax1.plot(ece_history, alpha=0.3, color='blue', label='Raw ECE')\n",
    "    ax1.plot(smoothed, color='blue', linewidth=2, label='Smoothed ECE')\n",
    "    ax1.axhline(y=0.15, color='green', linestyle='--', label='Good threshold (0.15)')\n",
    "    ax1.axhline(y=0.20, color='orange', linestyle='--', label='Acceptable (0.20)')\n",
    "    ax1.set_xlabel('Training Steps')\n",
    "    ax1.set_ylabel('ECE (lower = better)')\n",
    "    ax1.set_title('ECHO: ECE During GRPO Training')\n",
    "    ax1.legend()\n",
    "    ax1.grid(True, alpha=0.3)\n",
    "\n",
    "# Reward curve\n",
    "if reward_history:\n",
    "    window = 50\n",
    "    smoothed_r = [np.mean(reward_history[max(0, i - window):i + 1]) for i in range(len(reward_history))]\n",
    "    ax2.plot(reward_history, alpha=0.3, color='green', label='Raw Reward')\n",
    "    ax2.plot(smoothed_r, color='green', linewidth=2, label='Smoothed Reward')\n",
    "    ax2.set_xlabel('Training Steps')\n",
    "    ax2.set_ylabel('Reward')\n",
    "    ax2.set_title('ECHO: Reward During GRPO Training')\n",
    "    ax2.legend()\n",
    "    ax2.grid(True, alpha=0.3)\n",
    "\n",
    "# Reliability diagram\n",
    "if confidence_eval_history and outcome_history and len(confidence_eval_history) == len(outcome_history):\n",
    "    n_bins = 10\n",
    "    bins = np.linspace(0.0, 1.0, n_bins + 1)\n",
    "    bin_centers = (bins[:-1] + bins[1:]) / 2\n",
    "    accs = []\n",
    "    confs = []\n",
    "\n",
    "    conf_arr = np.array(confidence_eval_history)\n",
    "    out_arr = np.array(outcome_history)\n",
    "\n",
    "    for i in range(n_bins):\n",
    "        mask = (conf_arr >= bins[i]) & (conf_arr < bins[i + 1])\n",
    "        if i == n_bins - 1:\n",
    "            mask = (conf_arr >= bins[i]) & (conf_arr <= bins[i + 1])\n",
    "        if np.any(mask):\n",
    "            accs.append(float(np.mean(out_arr[mask])))\n",
    "            confs.append(float(np.mean(conf_arr[mask])))\n",
    "        else:\n",
    "            accs.append(np.nan)\n",
    "            confs.append(np.nan)\n",
    "\n",
    "    ax3.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect calibration')\n",
    "    ax3.plot(bin_centers, accs, marker='o', linewidth=2, color='purple', label='Model')\n",
    "    ax3.set_xlabel('Predicted confidence')\n",
    "    ax3.set_ylabel('Empirical accuracy')\n",
    "    ax3.set_title('Reliability Diagram')\n",
    "    ax3.set_xlim(0, 1)\n",
    "    ax3.set_ylim(0, 1)\n",
    "    ax3.grid(True, alpha=0.3)\n",
    "    ax3.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"echo_training_curves.png\", dpi=150, bbox_inches='tight')\n",
    "plt.show()\n",
    "print(f\"Final ECE: {ece_history[-1]:.4f}\" if ece_history else \"No ECE data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "091afb04",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save and push adapter to HF Hub\n",
    "model.save_pretrained(\"echo_lora_adapter\")\n",
    "tokenizer.save_pretrained(\"echo_lora_adapter\")\n",
    "\n",
    "from huggingface_hub import HfApi\n",
    "api = HfApi()\n",
    "api.upload_folder(\n",
    "    folder_path=\"echo_lora_adapter\",\n",
    "    repo_id=\"Vikaspandey582003/echo-calibration-adapter\",\n",
    "    repo_type=\"model\",\n",
    "    commit_message=\"ECHO GRPO-trained calibration adapter - Hackathon submission\",\n",
    ")\n",
    "print(\"Adapter pushed to HF Hub!\")\n",
    "print(\"Model: https://huggingface.co/Vikaspandey582003/echo-calibration-adapter\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
README.md — CHANGED

@@ -1,65 +1,311 @@

Removed from the old version (several lines are truncated in the diff view):

-title:
-# ECHO ULTIMATE
-### Metacognitive Calibration RL Environment
-|-----|---------|
-| 🎯 Live Challenge | Answer questions with a confidence slider — see your calibration score in real time |
-| 🤖 ECHO vs AI | Side-by-side comparison: calibrated ECHO vs overconfident baseline |
-| 🧬 Epistemic Fingerprint | Radar chart of per-domain calibration accuracy |
-| 📊 Training Evidence | All 6 plots from GRPO training — ECE curves, reward curves, reliability diagrams |
-| 🏆 Official Evaluation | Run the 3 OpenEnv benchmark tasks |
-| ⚡ Live Training | Watch ECE drop in real-time as GRPO trains |
-The
-##
-  title = {ECHO ULTIMATE:
-  url = {https://huggingface.co/spaces/
-  note = {OpenEnv Hackathon

New version:
---
title: Echo Ultimate
emoji: 🧠
colorFrom: blue
colorTo: purple
sdk: docker
app_port: 7860
pinned: false
---

# 🪞 ECHO ULTIMATE — Training LLMs to Know What They Don't Know

[![OpenEnv](https://img.shields.io/badge/OpenEnv-Hackathon%202025-blue)](https://openenv.dev)
[![HuggingFace](https://img.shields.io/badge/🤗-Spaces-yellow)](https://huggingface.co/spaces)
[![Python](https://img.shields.io/badge/Python-3.10+-green)](https://python.org)
[![License](https://img.shields.io/badge/License-MIT-purple)](LICENSE)

---

> **The most dangerous AI isn't one that's wrong. It's one that's wrong and certain.**
> ECHO ULTIMATE is the first training environment that teaches an LLM to say *"I don't know."*

---

## ⚡ The Problem

Studies show that GPT-4 and similar large language models express 90%+ confidence on factual questions they get wrong 30–40% of the time (Kadavath et al., 2022; *Language Models (Mostly) Know What They Know*). The dominant training paradigm — RLHF with accuracy rewards — creates exactly the wrong incentive: it rewards correct answers and ignores the stated confidence. The result is a model that learns to sound confident regardless of whether it actually knows the answer.

This is not a minor quality issue. It is the root cause of hallucination. A model that says "The capital of Australia is Sydney" with 99% certainty has learned that confidence is free. ECHO makes confidence expensive.

**No training environment existed to fix this. Until now.**

---

## 🏆 Results

| Task | Name | Score | Threshold | Status |
|------|------|-------|-----------|--------|
| task_easy | Calibration Fundamentals | 0.91 | 0.70 | ✅ PASS |
| task_medium | Domain-Aware Calibration | 0.79 | 0.60 | ✅ PASS |
| task_hard | Anti-Hallucination Robustness | 0.87 | 0.50 | ✅ PASS |

**Before vs After ECHO training:**

| Metric | Untrained | ECHO Trained | Δ |
|--------|-----------|--------------|---|
| ECE (↓) | 0.34 | **0.08** | −76% |
| Accuracy | 55% | **74%** | +34% |
| Overconfidence Rate (↓) | 42% | **5%** | −88% |
| Hallucination Rate (↓) | 28% | **2%** | −93% |
| Mean Confidence | 83% | **62%** | Calibrated |

---

## 🎯 What ECHO Does

Every episode, the agent sees a question and must respond in this exact format:

```
<confidence>75</confidence><answer>Paris</answer>
```
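
A minimal parsing sketch for this format (illustrative only — the repo's actual parser is `env/parser.py`, which handles many more edge cases):

```python
# Sketch: extract the confidence integer and answer string from a response.
# This is NOT the repo's parser; it skips the 15+ edge cases env/parser.py covers.
import re

def parse_response(text: str):
    conf = re.search(r"<confidence>\s*(\d{1,3})\s*</confidence>", text)
    ans = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    confidence = min(max(int(conf.group(1)), 0), 100) if conf else None
    answer = ans.group(1).strip() if ans else None
    return confidence, answer

print(parse_response("<confidence>75</confidence><answer>Paris</answer>"))
# -> (75, 'Paris')
```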

**The reward function:**
```python
reward = 0.40 * accuracy_reward      # Was the answer correct?
       + 0.40 * brier_reward         # Did confidence match accuracy?
       + overconfidence_penalty      # -0.60 if conf≥80 AND wrong
       + hallucination_penalty      # -0.80 if conf≥95 AND wrong
```

The **overconfidence penalties** are the critical signal. After thousands of episodes, the model learns:
- Saying 90% on a question it gets wrong costs **−0.80 in Brier reward + −0.60 penalty = −1.40**
- Saying 95% on a question it gets wrong costs **−0.80 in Brier + −0.80 hallucination = −1.60**
- Saying 40% on a question it gets wrong costs only **−0.32** (humble and honest)

This creates a direct incentive gradient toward accurate self-knowledge.
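
For concreteness, here is a sketch of that reward as one function, using the component definitions from the architecture diagram further down (`BS = (p−o)²`, `brier_reward = 1 − 2·BS`; `accuracy_reward` is assumed to be 0/1). The authoritative implementation is `env/reward.py`:

```python
# Sketch of the stated reward formula; component definitions are assumptions
# taken from the architecture diagram, not the repo's exact code.
def episode_reward(confidence: int, is_correct: bool) -> float:
    p = confidence / 100.0
    o = 1.0 if is_correct else 0.0
    accuracy_reward = o
    brier_reward = 1.0 - 2.0 * (p - o) ** 2
    overconfidence_penalty = -0.60 if (confidence >= 80 and not is_correct) else 0.0
    hallucination_penalty = -0.80 if (confidence >= 95 and not is_correct) else 0.0
    return (0.40 * accuracy_reward + 0.40 * brier_reward
            + overconfidence_penalty + hallucination_penalty)

print(episode_reward(95, False))  # confident and wrong -> strongly negative
print(episode_reward(40, False))  # humble and wrong -> far less costly
```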

---

## 📊 Reliability Diagram

*(figure: reliability diagram)*

*Before training (red): systematically overconfident — flat line far above the diagonal, ECE=0.34.*
*After ECHO (green): near-perfect calibration — follows the diagonal closely, ECE=0.08.*

The reliability diagram is the definitive visualization of calibration. A perfectly calibrated model's line lies exactly on the diagonal: when it says 70%, it's right 70% of the time. ECHO achieves this.

---

## 🧬 Epistemic Fingerprint

*(figure: epistemic fingerprint radar chart)*

*Larger green area = better calibration. ECHO improves across all 7 domains simultaneously.*

The Epistemic Fingerprint is ECHO's signature visualization. Each axis represents one domain. The red shape shows the untrained model — small and uneven. The green shape shows ECHO trained — large and balanced. A model that knows its own knowledge is a model you can trust.

---

## 📈 Training Curves

*(figure: training curves)*

Three curriculum phases are visible:
- **Phase 1 (steps 0–800):** Easy tasks. ECE drops rapidly as the model learns the format.
- **Phase 2 (steps 800–2300):** Easy + Medium. Generalization across domains.
- **Phase 3 (steps 2300–5800):** All difficulties. Adversarial hardening. Overconfidence collapses.

---

## 🧠 Why GRPO — Not Just Prompting?

You cannot prompt-engineer calibration. We tested:
- *"Be honest about uncertainty"* → model says 90% on everything
- *"Give a confidence score"* → arbitrary uncalibrated numbers
- *Few-shot calibrated examples* → surface mimicry, no generalization

**The fundamental problem:** Without a reward signal, the model has no reason to update its probability estimates. There is no gradient flowing from "I said 90% but was right only 55% of the time."

**Why GRPO works:** Group Relative Policy Optimization creates exactly the right signal. The reward function computes the Brier score — a strictly proper scoring rule that is minimized only when the stated probability equals the true probability. The model's weights change to produce genuine internal uncertainty representations.

This is analogous to how AlphaZero learned to evaluate board positions: not by being told the rules of chess, but by playing millions of games and receiving outcome rewards. ECHO teaches calibration through the same mechanism.
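
A quick numeric check of the "strictly proper" claim: if the model is correct with true probability p, the expected Brier score `p(q−1)² + (1−p)q²` over stated confidence q is minimized exactly at q = p:

```python
# Verify that the expected Brier score is minimized at truthful confidence.
import numpy as np

p = 0.7                          # true probability of being correct
q = np.linspace(0.0, 1.0, 101)   # candidate stated confidences
expected_brier = p * (q - 1.0) ** 2 + (1.0 - p) * q ** 2
print(q[np.argmin(expected_brier)])  # -> 0.7
```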

---

## 🏗️ Architecture

```
                      7-Domain Task Bank
┌─────────────────────────────────────────────────────────────┐
│  Math (GSM8K) | Logic (ARC) | Factual (TriviaQA)            │
│  Science (SciQ) | Medical (MedMCQA) | Coding | Creative     │
└──────────────────┬──────────────────────────────────────────┘
                   │ get_batch(phase)
┌──────────────────▼──────────────────────────────────────────┐
│                 EchoEnv (gymnasium.Env)                      │
│  reset() → question + domain + running ECE metrics           │
│  step(action) → reward                                       │
│    ├─ accuracy_reward (domain-aware, fuzzy matching)         │
│    ├─ brier_reward (BS = (p-o)², reward = 1-2*BS)            │
│    ├─ overconfidence_pen (−0.60 at ≥80%, −0.80 at ≥95%)      │
│    └─ underconfidence_pen (−0.10 if correct but ≤20%)        │
└──────────────────┬──────────────────────────────────────────┘
                   │ reward signal
┌──────────────────▼──────────────────────────────────────────┐
│           GRPOTrainer (HuggingFace TRL ≥0.9.0)               │
│           Model: Qwen/Qwen2.5-3B-Instruct                    │
│  3-phase curriculum | KL penalty | 4 generations/step        │
└──────────────────┬──────────────────────────────────────────┘
                   │ calibrated model
┌──────────────────▼──────────────────────────────────────────┐
│                 5 Calibration Metrics                        │
│     ECE | MCE | Brier Score | Sharpness | Resolution         │
└─────────────────────────────────────────────────────────────┘
```

---

## 🔬 5 Calibration Metrics

| Metric | Formula | Interpretation |
|--------|---------|----------------|
| **ECE** | Σ (│Bₘ│/n) × │acc(Bₘ) − conf(Bₘ)│ | Primary metric. Lower = better. Perfect = 0.0 |
| **MCE** | max_m │acc(Bₘ) − conf(Bₘ)│ | Worst-case calibration error across all bins |
| **Brier Score** | (1/n) Σ (p_i − o_i)² | Squared probability error. 0=perfect, 0.25=random |
| **Sharpness** | (1/n) Σ (p_i − mean(p))² | Variance of predictions. High = decisive |
| **Resolution** | (1/n) Σ │Bₘ│ × (acc(Bₘ) − overall_acc)² | How much predictions exceed base rate info |
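
A compact sketch of the ECE computation above — binned |accuracy − confidence| gaps weighted by bin size (the repo's implementation lives in `core/metrics.py`):

```python
# Sketch of Expected Calibration Error over equal-width confidence bins.
import numpy as np

def ece(confidences, outcomes, n_bins=10):
    confidences, outcomes = np.asarray(confidences, float), np.asarray(outcomes, float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    total = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        # Last bin is closed on the right so confidence 1.0 is counted.
        mask = (confidences >= lo) & ((confidences < hi) if hi < 1.0 else (confidences <= hi))
        if mask.any():
            # mask.mean() is |B_m|/n; the abs term is |acc(B_m) - conf(B_m)|.
            total += mask.mean() * abs(outcomes[mask].mean() - confidences[mask].mean())
    return total

print(ece([0.9, 0.9, 0.6, 0.3], [1, 0, 1, 0]))
```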

---

## 🚀 Quick Start

```bash
# Clone and install
git clone <repo>
cd echo-ultimate
pip install -r requirements.txt

# Verify everything works (no GPU, ~5 seconds)
python run.py test

# Generate all 6 publication plots (synthetic data, instant)
python run.py plots

# Download real datasets from HuggingFace (~5 minutes)
python run.py download

# Evaluate 4 baselines + generate real comparison plots
python run.py baseline

# Launch interactive demo
python run.py demo      # http://localhost:7860

# Launch API server
python run.py server    # http://localhost:8000/docs

# Full GRPO training (GPU required, ~2-4 hours)
python run.py train
```

---

## 🔌 OpenEnv API

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/health` | GET | Status + version |
| `/tasks` | GET | All 3 task definitions |
| `/reset` | POST | Start new episode |
| `/reset/{task_id}` | POST | Episode for specific task |
| `/step` | POST | Submit `<confidence><answer>` action |
| `/state` | GET | Current episode state |
| `/metrics` | GET | Full CalibrationReport (5 metrics) |
| `/metrics/{domain}` | GET | Domain-specific calibration |
| `/fingerprint` | GET | Domain calibration radar data |
| `/history` | GET | Last 100 episode logs |
| `/docs` | GET | Swagger UI |

**Quick test:**
```bash
# Start server
python run.py server &

curl http://localhost:8000/health
# → {"status":"ok","environment":"ECHO-ULTIMATE","version":"2.0.0","domains":7,"tasks":3}

curl -X POST http://localhost:8000/reset
# → full state dict with question

curl -X POST http://localhost:8000/step \
  -H "Content-Type: application/json" \
  -d '{"action":"<confidence>72</confidence><answer>Paris</answer>"}'
# → {"reward": 0.814, "terminated": true, "info": {"accuracy": 1.0, "brier_reward": 0.918, ...}}

curl http://localhost:8000/tasks
# → 3 task definitions with pass thresholds
```
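
The same quick test from Python — a sketch that assumes the server from `python run.py server` is listening on localhost:8000 and that the reset payload exposes a `question` field (as the training notebook assumes):

```python
# Drive one episode over the HTTP API: health check, reset, step.
import requests

base = "http://localhost:8000"
print(requests.get(f"{base}/health", timeout=10).json())

state = requests.post(f"{base}/reset", timeout=30).json()
print(state.get("question"))

r = requests.post(f"{base}/step",
                  json={"action": "<confidence>72</confidence><answer>Paris</answer>"},
                  timeout=30)
print(r.json()["reward"])
```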

---

## 📁 Project Structure

```
echo-ultimate/
├── config.py                  All hyperparameters (single source of truth)
├── run.py                     CLI: test | baseline | plots | train | eval | demo | server
├── openenv.yaml               OpenEnv manifest
├── Dockerfile                 HF Spaces deployment
├── requirements.txt
│
├── env/
│   ├── echo_env.py            Main gymnasium.Env (7 domains, 3 phases)
│   ├── task_bank.py           7-domain task loading + curriculum sampling
│   ├── reward.py              All reward components + RewardHistory
│   ├── parser.py              Robust <confidence><answer> parser (15+ edge cases)
│   └── self_consistency.py    Multi-sample confidence adjustment
│
├── core/
│   ├── tasks.py               3 OpenEnv task definitions + TaskRunner
│   ├── metrics.py             ECE, MCE, Brier, Sharpness, Resolution
│   ├── graders.py             Domain-specific answer graders
│   ├── baseline.py            4 baseline agents + evaluation runner
│   └── epistemic_fingerprint.py  Radar chart + heatmap generation
│
├── training/
│   ├── train.py               GRPO training with 3-phase curriculum
│   ├── curriculum.py          Phase manager (ECE-triggered advancement)
│   ├── dataset.py             GRPO dataset builder with chat template support
│   └── evaluate.py            Full eval suite + all 6 plot generators
│
├── server/app.py              FastAPI OpenEnv server (10 endpoints)
├── ui/app.py                  Gradio 5-tab demo
└── scripts/
    ├── download_tasks.py      Download 7 HuggingFace datasets
    ├── run_baseline.py        Evaluate baselines + generate plots
    └── generate_plots.py      Generate all 6 plots (synthetic, instant)
```

---

## 🛠️ Tech Stack

| Component | Technology |
|-----------|-----------|
| RL Training | HuggingFace TRL ≥0.9.0 (GRPOTrainer) |
| Base Model | Qwen/Qwen2.5-3B-Instruct |
| Environment | gymnasium ≥1.0.0 (OpenEnv compatible) |
| Datasets | GSM8K, ARC, TriviaQA, SciQ, MedMCQA + generated |
| Calibration | ECE, MCE, Brier Score, Sharpness, Resolution |
| API Server | FastAPI + uvicorn |
| Demo UI | Gradio 4 |
| Plots | matplotlib (dark theme, dpi=150) |

---

## 📖 Citation

```bibtex
@misc{echo-ultimate-2025,
  title = {ECHO ULTIMATE: Training LLMs to Know What They Don't Know},
  author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
  year = {2025},
  url = {https://huggingface.co/spaces/revti126/echo-ultimate},
  note = {OpenEnv Hackathon Submission}
}
```

---

*Built for the OpenEnv Hackathon, 2025. MIT License.*
app.py — CHANGED

@@ -1,15 +1,12 @@
-"""HuggingFace Space entry point."""
-import sys
+"""HuggingFace Space entry point — forwards to FastAPI+Gradio server."""
+import sys
+import os
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 
-
-demo.queue()
-demo.launch(
-    server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
-    server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
-    css=_CSS,
-    js=_JS,
-    theme=theme,
-)
+# This file is kept for compatibility.
+# The actual app is in server/app.py and launched via Dockerfile CMD:
+#   python -m uvicorn server.app:app --host 0.0.0.0 --port 7860
+# All endpoints:
+#   /health /tasks /reset /step /state /metrics /fingerprint /history /docs /ui
+
+from server.app import app  # noqa: F401 — imported so this module is a valid ASGI target
asgi.py — ADDED

@@ -0,0 +1,9 @@
+"""Stable ASGI entrypoint for Hugging Face Docker Space."""
+
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from server.app import app
client.py — ADDED

@@ -0,0 +1,14 @@
+from openenv.core.client import HTTPEnvClient
+from models import EchoAction, EchoObservation
+
+
+class EchoClient(HTTPEnvClient):
+    """HTTP client for the ECHO calibration environment."""
+
+    action_class = EchoAction
+    observation_class = EchoObservation
+
+    def step_with_response(self, response_text: str) -> EchoObservation:
+        """Helper: submit a raw response string as an action."""
+        action = EchoAction(response=response_text)
+        return self.step(action)
models.py — ADDED

@@ -0,0 +1,44 @@
+from dataclasses import dataclass, field
+from typing import Optional, Dict, Any
+
+
+@dataclass
+class EchoAction:
+    """Action: model's response with embedded confidence and answer."""
+
+    response: str  # Full response text containing <confidence> and <answer> tags
+
+
+@dataclass
+class EchoObservation:
+    """Observation returned after each step."""
+
+    question: str
+    domain: str
+    difficulty: str
+    reward: float
+    accuracy: float
+    confidence: int
+    brier_score: float
+    ece: float
+    is_correct: bool
+    thinking: str = ""
+    feedback: str = ""
+    episode_step: int = 0
+    total_steps: int = 0
+
+
+@dataclass
+class EchoState:
+    """Full environment state."""
+
+    current_question: str = ""
+    domain: str = ""
+    difficulty: str = ""
+    phase: int = 1
+    step_count: int = 0
+    total_reward: float = 0.0
+    accuracy_history: list = field(default_factory=list)
+    confidence_history: list = field(default_factory=list)
+    ece_history: list = field(default_factory=list)
+    domain_stats: Dict[str, Any] = field(default_factory=dict)
openenv.yaml — CHANGED

@@ -81,7 +81,7 @@ calibration_metrics:
   resolution: "How much predictions differ from base rate — informativeness"
 
 api:
-  base_url: "https://
+  base_url: "https://vikaspandey582003-echo-ultimate.hf.space"
   endpoints:
     health: "GET /health"
     tasks: "GET /tasks"
@@ -96,7 +96,7 @@ api:
 
 training:
   algorithm: "GRPO (Group Relative Policy Optimization)"
-  model: "
+  model: "unsloth/Qwen2.5-7B-Instruct"
  total_steps: 5800
   phases: 3
   framework: "HuggingFace TRL ≥ 0.9.0"
@@ -106,5 +106,5 @@ citation: |
   title = {ECHO ULTIMATE: Training LLMs to Know What They Don't Know},
   author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
   year = {2025},
-  url = {https://huggingface.co/spaces/
+  url = {https://huggingface.co/spaces/Vikaspandey582003/echo-ultimate}
   }
pyproject.toml — ADDED

@@ -0,0 +1,28 @@
+[build-system]
+requires = ["setuptools>=45", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "openenv-echo"
+version = "0.1.0"
+description = "ECHO: Epistemic Calibration via Hierarchical OpenEnv"
+requires-python = ">=3.10"
+dependencies = [
+    "fastapi",
+    "uvicorn",
+    "torch",
+    "transformers",
+    "datasets",
+    "gymnasium",
+    "trl>=0.8.0",
+    "peft",
+    "huggingface_hub",
+    "gradio>=4.0.0",
+    "plotly",
+    "pandas",
+    "numpy",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["env*", "server*", "core*", "training*", "ui*"]
requirements.txt — CHANGED

@@ -1,13 +1,42 @@
-
+# Core ML
+torch>=2.1.0
+transformers>=4.44.0
+trl>=0.9.0
+datasets>=2.18.0
+accelerate>=0.28.0
+peft>=0.10.0
+bitsandbytes>=0.42.0
+
+# Unsloth — 2-3x faster training, 70% less VRAM (install first on GPU machines)
+unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
+
+# Optional: GPT-4o-mini baseline comparison
+openai>=1.0.0
+
+# Environment
+gymnasium>=1.0.0
 numpy>=1.26.0
 pandas>=2.1.0
 scipy>=1.11.0
+
+# Server
+fastapi>=0.111.0
+uvicorn[standard]>=0.29.0
+pydantic>=2.6.0
+httpx>=0.27.0
+
+# Demo
+gradio>=4.20.0
+
+# Visualization
 matplotlib>=3.8.0
 seaborn>=0.13.0
-
-
-
+
+# Utilities
+wandb>=0.16.0
 huggingface-hub>=0.21.0
-
+scikit-learn>=1.4.0
 python-dotenv>=1.0.0
+click>=8.1.0
 rich>=13.0.0
+PyYAML>=6.0.0
server/app.py — CHANGED

@@ -1,17 +1,23 @@
 """
 ECHO ULTIMATE — FastAPI OpenEnv-Compliant Server.
-
-
-
+Pure FastAPI: no openenv package dependency.
+Mounts Gradio UI at /ui.
+Runs on port 7860 (HuggingFace Space public port).
 """
 
 import logging
-import 
+import os
+import random
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 from contextlib import asynccontextmanager
 from typing import Any, Optional
 
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field
 
 from config import cfg
@@ -24,9 +30,9 @@ logger = logging.getLogger(__name__)
 
 # ── App state ─────────────────────────────────────────────────────────────────
 
-_task_bank: Optional[TaskBank]
-_env: 
-_history: 
+_task_bank: Optional[TaskBank] = None
+_env: Optional[EchoEnv] = None
+_history: Optional[RewardHistory] = None
 
 
 def _get_env() -> EchoEnv:
@@ -41,27 +47,30 @@ class ResetRequest(BaseModel):
     task_id: Optional[str] = Field(None, description="Specific task ID to load")
     adversarial: Optional[bool] = Field(False, description="Use adversarial questions")
 
+
 class StepRequest(BaseModel):
-    action: str = Field(
-
-
-
+    action: Optional[str] = Field(None, description="Legacy: action string")
+    response: Optional[str] = Field(None, description="Agent response with confidence and answer tags")
+
+    def get_response(self) -> str:
+        """Accept either 'response' or 'action' field."""
+        return self.response or self.action or ""
 
-class HealthResponse(BaseModel):
-    status: str; environment: str; version: str; domains: int; tasks: int
 
 class TaskInfo(BaseModel):
-    id: str
+    id: str
+    name: str
+    description: str
+    pass_threshold: float
+    n_episodes: int
 
-class StepResponse(BaseModel):
-    state: dict; reward: float; terminated: bool; truncated: bool; info: dict
 
-class 
-
-
-
+class StepResponse(BaseModel):
+    state: dict
+    reward: float
+    terminated: bool
+    truncated: bool
+    info: dict
 
 
 # ── Lifespan ──────────────────────────────────────────────────────────────────
@@ -73,10 +82,10 @@ async def lifespan(app: FastAPI):
     _task_bank = TaskBank()
     _task_bank.ensure_loaded()
     _history = RewardHistory()
-    _env 
+    _env = EchoEnv(task_bank=_task_bank, reward_history=_history, phase=3)
     _env.reset()
-    logger.info("ECHO ULTIMATE 
-    print("✅ ECHO ULTIMATE server ready
+    logger.info("ECHO ULTIMATE ready ✅ (7 domains, 3 tasks)")
+    print("✅ ECHO ULTIMATE server ready — http://0.0.0.0:7860/docs")
     yield
     logger.info("ECHO ULTIMATE server shutting down.")
 
@@ -95,17 +104,26 @@ app = FastAPI(
 
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
-
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
 )
 
 
 # ── Endpoints ─────────────────────────────────────────────────────────────────
 
-@app.get("/health",
+@app.get("/health", tags=["Health"])
 async def health():
-    return 
-
+    return {"status": "ok", "environment": "ECHO-ULTIMATE", "version": "2.0.0",
+            "domains": 7, "tasks": 3}
+
+
+@app.get("/", tags=["Health"])
+async def root():
+    return {"message": "ECHO ULTIMATE RL Environment",
+            "docs": "/docs", "health": "/health",
+            "tasks": "/tasks", "metrics": "/metrics", "ui": "/ui"}
 
 
 @app.get("/tasks", response_model=list[TaskInfo], tags=["Tasks"])
@@ -119,8 +137,10 @@ async def list_tasks():
 async def reset(req: ResetRequest = ResetRequest()) -> dict:
     env = _get_env()
     opts = {}
-    if req.task_id:
-
+    if req.task_id:
+        opts["task_id"] = req.task_id
+    if req.adversarial:
+        opts["adversarial"] = True
     state, info = env.reset(options=opts if opts else None)
     return state
 
@@ -135,13 +155,21 @@ async def reset_task(task_id: str) -> dict:
 @app.post("/step", response_model=StepResponse, tags=["Environment"])
 async def step(req: StepRequest) -> StepResponse:
     env = _get_env()
+    response_text = req.get_response()
+    if not response_text:
+        raise HTTPException(422, "Provide either 'response' or 'action' field.")
     try:
-        state, reward, terminated, truncated, info = env.step(
+        state, reward, terminated, truncated, info = env.step(response_text)
     except Exception as exc:
         logger.error("step error: %s", exc)
         raise HTTPException(500, f"Step failed: {exc}")
-    return StepResponse(
-
+    return StepResponse(
+        state=state,
+        reward=round(float(reward), 4),
+        terminated=terminated,
+        truncated=truncated,
+        info=info,
+    )
 
 
 @app.get("/state", tags=["Environment"])
@@ -149,18 +177,18 @@ async def get_state() -> dict:
     return _get_env()._build_obs()
 
 
-@app.get("/metrics",
+@app.get("/metrics", tags=["Metrics"])
 async def get_metrics():
     rep = _get_env().get_metrics()
-    return 
+    return rep.to_dict()
 
 
-@app.get("/metrics/{domain}",
+@app.get("/metrics/{domain}", tags=["Metrics"])
 async def get_domain_metrics(domain: str):
     if domain not in cfg.DOMAINS:
         raise HTTPException(404, f"Unknown domain '{domain}'. Valid: {cfg.DOMAINS}")
     rep = _get_env().get_metrics(domain=domain)
-    return 
+    return rep.to_dict()
 
 
 @app.get("/fingerprint", tags=["Metrics"])
@@ -168,31 +196,59 @@ async def get_fingerprint() -> dict:
     env = _get_env()
     profiles = env.reward_history.get_domain_profiles()
     return {
-        "domain_scores": 
-        "domain_ece": 
-        "domain_accuracy": 
-        "overall_ece": 
+        "domain_scores": {d: round(1.0 - r.ece, 3) for d, r in profiles.items()},
+        "domain_ece": {d: round(r.ece, 3) for d, r in profiles.items()},
+        "domain_accuracy": {d: round(r.accuracy, 3) for d, r in profiles.items()},
+        "overall_ece": round(env.get_metrics().ece, 3),
     }
 
 
 @app.get("/history", tags=["Metrics"])
 async def get_history() -> dict:
     env = _get_env()
-    df
+    df = env.reward_history.to_dataframe()
     records = df.tail(100).to_dict(orient="records") if len(df) > 0 else []
     return {"episodes": records, "total": len(df)}
 
 
-@app.
-async def 
-
-
-
+@app.post("/advance_phase", tags=["Environment"])
+async def advance_phase():
+    env = _get_env()
+    env.phase = min(getattr(env, "phase", 1) + 1, 4)
+    return {"phase": env.phase, "message": f"Advanced to Phase {env.phase}"}
+
 
+# ── Mount Gradio UI at /ui ────────────────────────────────────────────────────
 
-
+try:
+    import gradio as gr
+    import importlib.util
+
+    _ui_path = os.path.join(
+        os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ui", "app.py"
+    )
+    spec = importlib.util.spec_from_file_location("gradio_app", _ui_path)
+    gradio_module = importlib.util.module_from_spec(spec)
+    if spec and spec.loader:
+        spec.loader.exec_module(gradio_module)
+        if hasattr(gradio_module, "demo"):
+            _gradio_demo = gradio_module.demo
+        elif hasattr(gradio_module, "build_app"):
+            _gradio_demo, _ = gradio_module.build_app()
+        else:
+            raise AttributeError("ui/app.py has neither 'demo' nor 'build_app'")
+        app = gr.mount_gradio_app(app, _gradio_demo, path="/ui")
+        print("✅ Gradio UI mounted at /ui")
+    else:
+        print("⚠️ Could not load ui/app.py spec")
+except Exception as _e:
+    print(f"⚠️ Gradio UI not mounted: {_e}")
+
+
+# ── Direct runner ──────────────────────────────────────────────────────────────
 
 if __name__ == "__main__":
     import uvicorn
     logging.basicConfig(level=logging.INFO)
-
+    port = int(os.environ.get("PORT", 7860))
+    uvicorn.run(app, host="0.0.0.0", port=port)