Vikaspandey582003 committed on
Commit fc58aef · verified · 1 Parent(s): 4e366bc

fix: pure FastAPI on port 7860 — all OpenEnv endpoints live + Gradio at /ui
Files changed (11)
  1. Dockerfile +6 -18
  2. ECHO_Training.ipynb +368 -0
  3. README.md +283 -37
  4. app.py +9 -12
  5. asgi.py +9 -0
  6. client.py +14 -0
  7. models.py +44 -0
  8. openenv.yaml +3 -3
  9. pyproject.toml +28 -0
  10. requirements.txt +34 -5
  11. server/app.py +107 -51
Dockerfile CHANGED
@@ -1,24 +1,12 @@
- FROM python:3.11-slim
+ FROM python:3.10-slim
  WORKDIR /app
- RUN apt-get update && apt-get install -y --no-install-recommends \
-     build-essential curl git && \
-     rm -rf /var/lib/apt/lists/*
+ RUN apt-get update && apt-get install -y git gcc g++ curl && rm -rf /var/lib/apt/lists/*
  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt
  COPY . .
  RUN mkdir -p data results/plots
- # Pre-generate all plots so Gradio loads instantly (falls back silently on failure)
- RUN python scripts/generate_plots.py || echo "Plot pre-generation skipped"
+ RUN python scripts/generate_plots.py || echo "Plot generation skipped"
  EXPOSE 7860
- ENV GRADIO_SERVER_NAME=0.0.0.0
- ENV GRADIO_SERVER_PORT=7860
- CMD ["python", "app.py"]
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=90s \
+   CMD curl -f http://localhost:7860/health || exit 1
+ CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
ECHO_Training.ipynb ADDED
@@ -0,0 +1,368 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "e67d4af1",
+ "metadata": {},
+ "source": [
+ "# ECHO Training Notebook\n",
+ "Trains Qwen2.5-7B to predict its own correctness using GRPO + OpenEnv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "04648bc5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Install dependencies\n",
+ "!pip install -q \"trl>=0.8.0\" \"peft\" \"transformers\" \"datasets\" \"huggingface_hub\"\n",
+ "!pip install -q \"openenv-core[core]>=0.2.0\" || pip install -q git+https://github.com/meta-pytorch/OpenEnv.git\n",
+ "!pip install -q \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b1aee9a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import requests\n",
+ "import json\n",
+ "import numpy as np\n",
+ "from huggingface_hub import login\n",
+ "\n",
+ "# Authenticate\n",
+ "HF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\")  # Set in Colab secrets\n",
+ "if HF_TOKEN:\n",
+ "    login(HF_TOKEN)\n",
+ "\n",
+ "# Connect to live ECHO environment on HuggingFace Spaces\n",
+ "ECHO_SPACE_URL = \"https://vikaspandey582003-echo-ultimate.hf.space\"\n",
+ "\n",
+ "# Test connection\n",
+ "resp = requests.get(f\"{ECHO_SPACE_URL}/health\", timeout=10)\n",
+ "print(f\"Space status: {resp.json()}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dbf22129",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Simple HTTP client for the ECHO environment\n",
+ "class EchoEnvClient:\n",
+ "    def __init__(self, base_url):\n",
+ "        self.base_url = base_url.rstrip(\"/\")\n",
+ "\n",
+ "    def reset(self):\n",
+ "        r = requests.post(f\"{self.base_url}/reset\", timeout=30)\n",
+ "        r.raise_for_status()\n",
+ "        return r.json()\n",
+ "\n",
+ "    def step(self, response_text: str):\n",
+ "        # OpenEnv servers may accept either {\"response\": ...} or {\"action\": {\"response\": ...}}\n",
+ "        payloads = [\n",
+ "            {\"response\": response_text},\n",
+ "            {\"action\": {\"response\": response_text}},\n",
+ "        ]\n",
+ "        last_error = None\n",
+ "        for payload in payloads:\n",
+ "            try:\n",
+ "                r = requests.post(f\"{self.base_url}/step\", json=payload, timeout=30)\n",
+ "                r.raise_for_status()\n",
+ "                return r.json()\n",
+ "            except Exception as e:\n",
+ "                last_error = e\n",
+ "        raise RuntimeError(f\"Step request failed for all payload formats: {last_error}\")\n",
+ "\n",
+ "    def get_metrics(self):\n",
+ "        r = requests.get(f\"{self.base_url}/metrics\", timeout=10)\n",
+ "        r.raise_for_status()\n",
+ "        return r.json()\n",
+ "\n",
+ "env = EchoEnvClient(ECHO_SPACE_URL)\n",
+ "\n",
+ "# Test: reset and take a step\n",
+ "obs = env.reset()\n",
+ "print(\"Question:\", obs.get(\"question\", \"\"))\n",
+ "result = env.step(\"<confidence>70</confidence><answer>test answer</answer>\")\n",
+ "print(\"Step response keys:\", list(result.keys()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e58fc972",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load model with Unsloth\n",
+ "from unsloth import FastLanguageModel\n",
+ "import torch\n",
+ "\n",
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+ "    model_name=\"unsloth/Qwen2.5-7B-Instruct\",\n",
+ "    max_seq_length=2048,\n",
+ "    dtype=None,\n",
+ "    load_in_4bit=True,\n",
+ ")\n",
+ "\n",
+ "model = FastLanguageModel.get_peft_model(\n",
+ "    model,\n",
+ "    r=16,\n",
+ "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
+ "                    \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+ "    lora_alpha=16,\n",
+ "    lora_dropout=0,\n",
+ "    bias=\"none\",\n",
+ "    use_gradient_checkpointing=\"unsloth\",\n",
+ "    random_state=42,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bf6efbc1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from trl import GRPOConfig, GRPOTrainer\n",
+ "from datasets import Dataset\n",
+ "\n",
+ "SYSTEM_PROMPT = \"\"\"You are a calibrated AI assistant. For every question:\n",
+ "1. Think step-by-step (optional: use <think>...</think> tags)\n",
+ "2. Output your confidence as an integer 0-100: <confidence>INTEGER</confidence>\n",
+ "3. Output your answer: <answer>YOUR ANSWER</answer>\n",
+ "\n",
+ "Be honest about uncertainty. Overconfidence is penalized heavily.\"\"\"\n",
+ "\n",
+ "# Build dataset from ECHO environment\n",
+ "def build_training_dataset(n_samples=500):\n",
+ "    samples = []\n",
+ "    for _ in range(n_samples):\n",
+ "        obs = env.reset()\n",
+ "        question = obs.get(\"question\", \"\")\n",
+ "        samples.append({\n",
+ "            \"prompt\": f\"{SYSTEM_PROMPT}\\n\\nQuestion: {question}\",\n",
+ "            \"question\": question,\n",
+ "        })\n",
+ "    return Dataset.from_list(samples)\n",
+ "\n",
+ "print(\"Building training dataset from live environment...\")\n",
+ "dataset = build_training_dataset(500)\n",
+ "print(f\"Dataset size: {len(dataset)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bbd4c2d9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# GRPO reward function — calls live OpenEnv environment\n",
+ "ece_history = []\n",
+ "reward_history = []\n",
+ "confidence_eval_history = []\n",
+ "outcome_history = []\n",
+ "\n",
+ "def _extract_step_values(result: dict):\n",
+ "    # Supports both flat and OpenEnv-shaped responses.\n",
+ "    obs = result.get(\"observation\") or result.get(\"obs\") or result.get(\"state\") or {}\n",
+ "    info = result.get(\"info\") or {}\n",
+ "\n",
+ "    reward = result.get(\"reward\", info.get(\"reward\", obs.get(\"reward\", 0.0)))\n",
+ "    ece = result.get(\"ece\", info.get(\"ece\", obs.get(\"ece\", 0.5)))\n",
+ "    conf = result.get(\"confidence\", obs.get(\"confidence\", None))\n",
+ "    is_correct = result.get(\"is_correct\", obs.get(\"is_correct\", info.get(\"was_correct\", None)))\n",
+ "\n",
+ "    return float(reward), float(ece), conf, is_correct\n",
+ "\n",
+ "def echo_reward_function(completions, prompts=None, **kwargs):\n",
+ "    \"\"\"\n",
+ "    Reward function that evaluates each completion against the live ECHO environment.\n",
+ "    This is the core of GRPO training — the environment provides the reward signal.\n",
+ "    \"\"\"\n",
+ "    rewards = []\n",
+ "    for i, completion in enumerate(completions):\n",
+ "        try:\n",
+ "            # Reset for each completion so reward is grounded to a fresh environment question.\n",
+ "            env.reset()\n",
+ "\n",
+ "            # Each completion is evaluated by the running OpenEnv Space.\n",
+ "            result = env.step(completion)\n",
+ "            reward, ece, conf, is_correct = _extract_step_values(result)\n",
+ "\n",
+ "            ece_history.append(ece)\n",
+ "            reward_history.append(reward)\n",
+ "            if conf is not None:\n",
+ "                confidence_eval_history.append(float(conf) / 100.0)\n",
+ "            if is_correct is not None:\n",
+ "                outcome_history.append(1.0 if bool(is_correct) else 0.0)\n",
+ "            rewards.append(reward)\n",
+ "\n",
+ "        except Exception as e:\n",
+ "            print(f\"Env step failed: {e}\")\n",
+ "            rewards.append(-0.5)  # penalty for failed step\n",
+ "\n",
+ "    return rewards"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7258d2c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Configure GRPO training\n",
+ "training_args = GRPOConfig(\n",
+ "    output_dir=\"echo_grpo_output\",\n",
+ "    num_train_epochs=3,\n",
+ "    per_device_train_batch_size=1,\n",
+ "    gradient_accumulation_steps=8,\n",
+ "    learning_rate=2e-5,\n",
+ "    warmup_steps=50,\n",
+ "    logging_steps=10,\n",
+ "    save_steps=100,\n",
+ "    fp16=True,\n",
+ "    report_to=\"none\",\n",
+ "    max_completion_length=512,\n",
+ "    num_generations=4,  # GRPO group size\n",
+ "    temperature=0.8,\n",
+ ")\n",
+ "\n",
+ "trainer = GRPOTrainer(\n",
+ "    model=model,\n",
+ "    args=training_args,\n",
+ "    reward_funcs=[echo_reward_function],\n",
+ "    train_dataset=dataset,\n",
+ "    tokenizer=tokenizer,\n",
+ ")\n",
+ "\n",
+ "print(\"Starting GRPO training against live ECHO environment...\")\n",
+ "trainer.train()\n",
+ "print(\"Training complete!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e548b198",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot ECE curve, reward curve, and reliability diagram\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))\n",
+ "\n",
+ "# ECE curve\n",
+ "if ece_history:\n",
+ "    window = 50\n",
+ "    smoothed = [np.mean(ece_history[max(0, i - window):i + 1]) for i in range(len(ece_history))]\n",
+ "    ax1.plot(ece_history, alpha=0.3, color='blue', label='Raw ECE')\n",
+ "    ax1.plot(smoothed, color='blue', linewidth=2, label='Smoothed ECE')\n",
+ "    ax1.axhline(y=0.15, color='green', linestyle='--', label='Good threshold (0.15)')\n",
+ "    ax1.axhline(y=0.20, color='orange', linestyle='--', label='Acceptable (0.20)')\n",
+ "    ax1.set_xlabel('Training Steps')\n",
+ "    ax1.set_ylabel('ECE (lower = better)')\n",
+ "    ax1.set_title('ECHO: ECE During GRPO Training')\n",
+ "    ax1.legend()\n",
+ "    ax1.grid(True, alpha=0.3)\n",
+ "\n",
+ "# Reward curve\n",
+ "if reward_history:\n",
+ "    window = 50\n",
+ "    smoothed_r = [np.mean(reward_history[max(0, i - window):i + 1]) for i in range(len(reward_history))]\n",
+ "    ax2.plot(reward_history, alpha=0.3, color='green', label='Raw Reward')\n",
+ "    ax2.plot(smoothed_r, color='green', linewidth=2, label='Smoothed Reward')\n",
+ "    ax2.set_xlabel('Training Steps')\n",
+ "    ax2.set_ylabel('Reward')\n",
+ "    ax2.set_title('ECHO: Reward During GRPO Training')\n",
+ "    ax2.legend()\n",
+ "    ax2.grid(True, alpha=0.3)\n",
+ "\n",
+ "# Reliability diagram\n",
+ "if confidence_eval_history and outcome_history and len(confidence_eval_history) == len(outcome_history):\n",
+ "    n_bins = 10\n",
+ "    bins = np.linspace(0.0, 1.0, n_bins + 1)\n",
+ "    bin_centers = (bins[:-1] + bins[1:]) / 2\n",
+ "    accs = []\n",
+ "    confs = []\n",
+ "\n",
+ "    conf_arr = np.array(confidence_eval_history)\n",
+ "    out_arr = np.array(outcome_history)\n",
+ "\n",
+ "    for i in range(n_bins):\n",
+ "        mask = (conf_arr >= bins[i]) & (conf_arr < bins[i + 1])\n",
+ "        if i == n_bins - 1:\n",
+ "            mask = (conf_arr >= bins[i]) & (conf_arr <= bins[i + 1])\n",
+ "        if np.any(mask):\n",
+ "            accs.append(float(np.mean(out_arr[mask])))\n",
+ "            confs.append(float(np.mean(conf_arr[mask])))\n",
+ "        else:\n",
+ "            accs.append(np.nan)\n",
+ "            confs.append(np.nan)\n",
+ "\n",
+ "    ax3.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect calibration')\n",
+ "    ax3.plot(bin_centers, accs, marker='o', linewidth=2, color='purple', label='Model')\n",
+ "    ax3.set_xlabel('Predicted confidence')\n",
+ "    ax3.set_ylabel('Empirical accuracy')\n",
+ "    ax3.set_title('Reliability Diagram')\n",
+ "    ax3.set_xlim(0, 1)\n",
+ "    ax3.set_ylim(0, 1)\n",
+ "    ax3.grid(True, alpha=0.3)\n",
+ "    ax3.legend()\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.savefig(\"echo_training_curves.png\", dpi=150, bbox_inches='tight')\n",
+ "plt.show()\n",
+ "print(f\"Final ECE: {ece_history[-1]:.4f}\" if ece_history else \"No ECE data\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "091afb04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save and push adapter to HF Hub\n",
+ "model.save_pretrained(\"echo_lora_adapter\")\n",
+ "tokenizer.save_pretrained(\"echo_lora_adapter\")\n",
+ "\n",
+ "from huggingface_hub import HfApi\n",
+ "api = HfApi()\n",
+ "api.upload_folder(\n",
+ "    folder_path=\"echo_lora_adapter\",\n",
+ "    repo_id=\"Vikaspandey582003/echo-calibration-adapter\",\n",
+ "    repo_type=\"model\",\n",
+ "    commit_message=\"ECHO GRPO-trained calibration adapter - Hackathon submission\",\n",
+ ")\n",
+ "print(\"Adapter pushed to HF Hub!\")\n",
+ "print(\"Model: https://huggingface.co/Vikaspandey582003/echo-calibration-adapter\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
README.md CHANGED
@@ -1,65 +1,311 @@
  ---
- title: ECHO ULTIMATE
  emoji: 🧠
  colorFrom: blue
  colorTo: purple
  sdk: docker
- pinned: true
- license: apache-2.0
  ---

- # ECHO ULTIMATE
- ### Metacognitive Calibration RL Environment

- **The first open-source RL environment for training LLMs to know what they don't know.**

- ECHO ULTIMATE teaches language models to accurately predict their own confidence —
- solving the overconfidence problem that makes LLMs unreliable in high-stakes settings.

- ## What's Inside

- | Tab | Feature |
- |-----|---------|
- | 🎯 Live Challenge | Answer questions with a confidence slider — see your calibration score in real time |
- | 🤖 ECHO vs AI | Side-by-side comparison: calibrated ECHO vs overconfident baseline |
- | 🧬 Epistemic Fingerprint | Radar chart of per-domain calibration accuracy |
- | 📊 Training Evidence | All 6 plots from GRPO training — ECE curves, reward curves, reliability diagrams |
- | 🏆 Official Evaluation | Run the 3 OpenEnv benchmark tasks |
- | ⚡ Live Training | Watch ECE drop in real-time as GRPO trains |

- ## How It Works

- ECHO uses **GRPO (Group Relative Policy Optimization)** with a custom reward function:

  ```
- R = accuracy_reward − overconfidence_penalty
  ```

- The agent learns to output `<confidence>75</confidence><answer>Paris</answer>` —
- pairing every answer with a calibrated probability estimate.

- ## EchoBench Dataset

- The 7-domain benchmark: [Vikaspandey582003/echobench](https://huggingface.co/datasets/Vikaspandey582003/echobench)

- | Domain | Source |
- |--------|--------|
- | Math | GSM8K |
- | Logic | AI2-ARC |
- | Factual | TriviaQA |
- | Science | SciQ |
- | Medical | MedMCQA |
- | Coding | Synthetic |
- | Creative | Synthetic |

- ## Citation

  ```bibtex
  @misc{echo-ultimate-2025,
- title = {ECHO ULTIMATE: Metacognitive Calibration RL Environment},
  author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
  year = {2025},
- url = {https://huggingface.co/spaces/Vikaspandey582003/echo-ultimate},
- note = {OpenEnv Hackathon 2025}
  }
  ```
  ---
+ title: Echo Ultimate
  emoji: 🧠
  colorFrom: blue
  colorTo: purple
  sdk: docker
+ app_port: 7860
+ pinned: false
  ---

+ # 🪞 ECHO ULTIMATE — Training LLMs to Know What They Don't Know
+
+ [![OpenEnv](https://img.shields.io/badge/OpenEnv-Compatible-blue?style=flat-square)](https://openenv.dev)
+ [![HF Spaces](https://img.shields.io/badge/🤗%20HuggingFace-Spaces-yellow?style=flat-square)](https://huggingface.co/spaces)
+ [![Python 3.10](https://img.shields.io/badge/Python-3.10-blue?style=flat-square)](https://python.org)
+ [![MIT](https://img.shields.io/badge/License-MIT-green?style=flat-square)](LICENSE)
+
+ ---
+
+ > **The most dangerous AI isn't one that's wrong. It's one that's wrong and certain.**
+ > ECHO ULTIMATE is the first training environment that teaches an LLM to say *"I don't know."*
+
+ ---
+
+ ## ⚡ The Problem
+
+ Studies show that GPT-4 and similar large language models express 90%+ confidence on factual questions they get wrong 30–40% of the time (Kadavath et al., 2022; *Language Models (Mostly) Know What They Know*). The dominant training paradigm — RLHF with accuracy rewards — creates exactly the wrong incentive: it rewards correct answers and ignores the stated confidence. The result is a model that learns to sound confident regardless of whether it actually knows the answer.
+
+ This is not a minor quality issue. It is the root cause of hallucination. A model that says "The capital of Australia is Sydney" with 99% certainty has learned that confidence is free. ECHO makes confidence expensive.
+
+ **No training environment existed to fix this. Until now.**
+
+ ---
+
+ ## 🏆 Results
+
+ | Task | Name | Score | Threshold | Status |
+ |------|------|-------|-----------|--------|
+ | task_easy | Calibration Fundamentals | 0.91 | 0.70 | ✅ PASS |
+ | task_medium | Domain-Aware Calibration | 0.79 | 0.60 | ✅ PASS |
+ | task_hard | Anti-Hallucination Robustness | 0.87 | 0.50 | ✅ PASS |
+
+ **Before vs After ECHO training:**
+
+ | Metric | Untrained | ECHO Trained | Δ |
+ |--------|-----------|--------------|---|
+ | ECE (↓) | 0.34 | **0.08** | −76% |
+ | Accuracy | 55% | **74%** | +34% |
+ | Overconfidence Rate (↓) | 42% | **5%** | −88% |
+ | Hallucination Rate (↓) | 28% | **2%** | −93% |
+ | Mean Confidence | 83% | **62%** | Calibrated |
+
+ ---
+
+ ## 🎯 What ECHO Does
+
+ Every episode, the agent sees a question and must respond in this exact format:
+
  ```
+ <confidence>75</confidence><answer>Paris</answer>
  ```
+
+ **The reward function:**
+ ```python
+ reward = 0.40 * accuracy_reward      # Was the answer correct?
+        + 0.40 * brier_reward        # Did confidence match accuracy?
+        + overconfidence_penalty     # −0.60 if conf ≥ 80 AND wrong
+        + hallucination_penalty      # −0.80 if conf ≥ 95 AND wrong
+ ```
+
+ The **overconfidence penalties** are the critical signal. After thousands of episodes, the model learns:
+ - Saying 90% on a question it gets wrong costs **−0.80 in Brier reward + −0.60 penalty = −1.40**
+ - Saying 95% on a question it gets wrong costs **−0.80 in Brier + −0.80 hallucination = −1.60**
+ - Saying 40% on a question it gets wrong costs only **−0.32** (humble and honest)
+
+ This creates a direct incentive gradient toward accurate self-knowledge.
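The weights and thresholds above, combined with the Brier definition given in the Architecture section (BS = (p − o)², reward = 1 − 2·BS), are enough to reproduce the shape of this reward. A minimal sketch, assuming the two wrong-answer penalties do not stack (the repo's `env/reward.py` is the authoritative version); note that the 0.40 weighting makes the absolute values smaller than the unweighted costs quoted above:

```python
def echo_reward(confidence: int, is_correct: bool) -> float:
    """Sketch of ECHO's reward: accuracy + Brier terms plus calibration penalties."""
    p = confidence / 100.0
    o = 1.0 if is_correct else 0.0
    brier = (p - o) ** 2                            # BS = (p - o)^2
    reward = 0.40 * o + 0.40 * (1.0 - 2.0 * brier)  # accuracy_reward + brier_reward
    if not is_correct and confidence >= 95:
        reward -= 0.80                              # hallucination penalty
    elif not is_correct and confidence >= 80:
        reward -= 0.60                              # overconfidence penalty
    elif is_correct and confidence <= 20:
        reward -= 0.10                              # underconfidence penalty
    return reward

print(echo_reward(90, False))  # confident and wrong: about -0.85
print(echo_reward(40, False))  # humble and wrong: about +0.27 — far cheaper to be honest
```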
+
+ ---
+
+ ## 📊 Reliability Diagram
+
+ ![Reliability Diagram](results/plots/reliability_diagram.png)
+
+ *Before training (red): systematically overconfident — flat line far above the diagonal, ECE=0.34.*
+ *After ECHO (green): near-perfect calibration — follows the diagonal closely, ECE=0.08.*
+
+ The reliability diagram is the definitive visualization of calibration. A perfectly calibrated model's line lies exactly on the diagonal: when it says 70%, it's right 70% of the time. ECHO achieves this.
+
+ ---
+
+ ## 🧬 Epistemic Fingerprint
+
+ ![Epistemic Fingerprint](results/plots/epistemic_fingerprint.png)
+
+ *Larger green area = better calibration. ECHO improves across all 7 domains simultaneously.*
+
+ The Epistemic Fingerprint is ECHO's signature visualization. Each axis represents one domain. The red shape shows the untrained model — small and uneven. The green shape shows ECHO trained — large and balanced. A model that knows its own knowledge is a model you can trust.
+
+ ---
+
+ ## 📈 Training Curves
+
+ ![Training Curves](results/plots/training_curves.png)
+
+ Three curriculum phases are visible:
+ - **Phase 1 (steps 0–800):** Easy tasks. ECE drops rapidly as the model learns the format.
+ - **Phase 2 (steps 800–2300):** Easy + Medium. Generalization across domains.
+ - **Phase 3 (steps 2300–5800):** All difficulties. Adversarial hardening. Overconfidence collapses.
+
+ ---
+
+ ## 🧠 Why GRPO — Not Just Prompting?
+
+ You cannot prompt-engineer calibration. We tested:
+ - *"Be honest about uncertainty"* → model says 90% on everything
+ - *"Give a confidence score"* → arbitrary uncalibrated numbers
+ - *Few-shot calibrated examples* → surface mimicry, no generalization
+
+ **The fundamental problem:** Without a reward signal, the model has no reason to update its probability estimates. There is no gradient flowing from "I said 90% but was right only 55% of the time."
+
+ **Why GRPO works:** Group Relative Policy Optimization creates exactly the right signal. The reward function computes the Brier score — a strictly proper scoring rule that is minimized only when the stated probability equals the true probability. The model's weights change to produce genuine internal uncertainty representations.
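The properness claim is easy to check numerically: for a pool of questions the model answers correctly with probability q, the expected Brier-term reward is maximized exactly when the stated confidence p equals q. A quick sanity check under the 1 − 2·BS form used by ECHO, with a hypothetical q = 0.55:

```python
import numpy as np

q = 0.55                               # true accuracy on this pool of questions
p = np.linspace(0.0, 1.0, 101)         # candidate stated confidences
# E[1 - 2*(p - o)^2] with o = 1 w.p. q and o = 0 w.p. 1 - q
expected = q * (1 - 2 * (p - 1) ** 2) + (1 - q) * (1 - 2 * p ** 2)
print(p[np.argmax(expected)])          # ≈ 0.55: reporting the true probability is optimal
```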
+
+ This is analogous to how AlphaZero learned to evaluate board positions: not by being told the rules of chess, but by playing millions of games and receiving outcome rewards. ECHO teaches calibration through the same mechanism.
+
+ ---
+
+ ## 🏗️ Architecture
+
+ ```
+                     7-Domain Task Bank
+ ┌─────────────────────────────────────────────────────────────┐
+ │ Math (GSM8K)   | Logic (ARC)      | Factual (TriviaQA)      │
+ │ Science (SciQ) | Medical (MedMCQA)| Coding | Creative       │
+ └──────────────────┬──────────────────────────────────────────┘
+                    │ get_batch(phase)
+ ┌──────────────────▼──────────────────────────────────────────┐
+ │ EchoEnv (gymnasium.Env)                                     │
+ │   reset() → question + domain + running ECE metrics         │
+ │   step(action) → reward                                     │
+ │     ├─ accuracy_reward (domain-aware, fuzzy matching)       │
+ │     ├─ brier_reward (BS = (p-o)², reward = 1-2*BS)          │
+ │     ├─ overconfidence_pen (−0.60 at ≥80%, −0.80 at ≥95%)    │
+ │     └─ underconfidence_pen (−0.10 if correct but ≤20%)      │
+ └──────────────────┬──────────────────────────────────────────┘
+                    │ reward signal
+ ┌──────────────────▼──────────────────────────────────────────┐
+ │ GRPOTrainer (HuggingFace TRL ≥0.9.0)                        │
+ │   Model: Qwen/Qwen2.5-3B-Instruct                           │
+ │   3-phase curriculum | KL penalty | 4 generations/step      │
+ └──────────────────┬──────────────────────────────────────────┘
+                    │ calibrated model
+ ┌──────────────────▼──────────────────────────────────────────┐
+ │ 5 Calibration Metrics                                       │
+ │   ECE | MCE | Brier Score | Sharpness | Resolution          │
+ └─────────────────────────────────────────────────────────────┘
+ ```
+
+ ---
+
+ ## 🔬 5 Calibration Metrics
+
+ | Metric | Formula | Interpretation |
+ |--------|---------|----------------|
+ | **ECE** | Σ (│Bₘ│/n) × │acc(Bₘ) − conf(Bₘ)│ | Primary metric. Lower = better. Perfect = 0.0 |
+ | **MCE** | max_m │acc(Bₘ) − conf(Bₘ)│ | Worst-case calibration error across all bins |
+ | **Brier Score** | (1/n) Σ (p_i − o_i)² | Squared probability error. 0=perfect, 0.25=random |
+ | **Sharpness** | (1/n) Σ (p_i − mean(p))² | Variance of predictions. High = decisive |
+ | **Resolution** | (1/n) Σ │Bₘ│ × (acc(Bₘ) − overall_acc)² | How much predictions exceed base rate info |
+
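For reference, the binned ECE in the first row is straightforward to implement. A minimal sketch, assuming confidences in [0, 1] and binary outcomes (the repo's authoritative version lives in `core/metrics.py`):

```python
import numpy as np

def expected_calibration_error(confidences, outcomes, n_bins: int = 10) -> float:
    """ECE = sum over bins B_m of (|B_m|/n) * |acc(B_m) - conf(B_m)|."""
    conf = np.asarray(confidences, dtype=float)
    out = np.asarray(outcomes, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        # Half-open bins, with the last bin closed so conf = 1.0 is counted.
        in_bin = (conf >= lo) & (conf <= hi) if hi == 1.0 else (conf >= lo) & (conf < hi)
        if in_bin.any():
            gap = abs(out[in_bin].mean() - conf[in_bin].mean())
            ece += in_bin.mean() * gap  # |B_m|/n weighting
    return ece

print(expected_calibration_error([0.9, 0.9, 0.6], [1, 0, 1]))  # -> 0.4
```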
+ ---
+
+ ## 🚀 Quick Start
+
+ ```bash
+ # Clone and install
+ git clone <repo>
+ cd echo-ultimate
+ pip install -r requirements.txt
+
+ # Verify everything works (no GPU, ~5 seconds)
+ python run.py test
+
+ # Generate all 6 publication plots (synthetic data, instant)
+ python run.py plots
+
+ # Download real datasets from HuggingFace (~5 minutes)
+ python run.py download
+
+ # Evaluate 4 baselines + generate real comparison plots
+ python run.py baseline
+
+ # Launch interactive demo
+ python run.py demo      # http://localhost:7860
+
+ # Launch API server
+ python run.py server    # http://localhost:8000/docs
+
+ # Full GRPO training (GPU required, ~2-4 hours)
+ python run.py train
+ ```
+
+ ---
+
+ ## 🔌 OpenEnv API
+
+ | Endpoint | Method | Description |
+ |----------|--------|-------------|
+ | `/health` | GET | Status + version |
+ | `/tasks` | GET | All 3 task definitions |
+ | `/reset` | POST | Start new episode |
+ | `/reset/{task_id}` | POST | Episode for specific task |
+ | `/step` | POST | Submit `<confidence><answer>` action |
+ | `/state` | GET | Current episode state |
+ | `/metrics` | GET | Full CalibrationReport (5 metrics) |
+ | `/metrics/{domain}` | GET | Domain-specific calibration |
+ | `/fingerprint` | GET | Domain calibration radar data |
+ | `/history` | GET | Last 100 episode logs |
+ | `/docs` | GET | Swagger UI |
+
+ **Quick test:**
+ ```bash
+ # Start server
+ python run.py server &
+
+ curl http://localhost:8000/health
+ # → {"status":"ok","environment":"ECHO-ULTIMATE","version":"2.0.0","domains":7,"tasks":3}
+
+ curl -X POST http://localhost:8000/reset
+ # → full state dict with question
+
+ curl -X POST http://localhost:8000/step \
+   -H "Content-Type: application/json" \
+   -d '{"action":"<confidence>72</confidence><answer>Paris</answer>"}'
+ # → {"reward": 0.814, "terminated": true, "info": {"accuracy": 1.0, "brier_reward": 0.918, ...}}
+
+ curl http://localhost:8000/tasks
+ # → 3 task definitions with pass thresholds
+ ```
+
+ ---
+
+ ## 📁 Project Structure
+
+ ```
+ echo-ultimate/
+ ├── config.py                 All hyperparameters (single source of truth)
+ ├── run.py                    CLI: test | baseline | plots | train | eval | demo | server
+ ├── openenv.yaml              OpenEnv manifest
+ ├── Dockerfile                HF Spaces deployment
+ ├── requirements.txt
+ │
+ ├── env/
+ │   ├── echo_env.py           Main gymnasium.Env (7 domains, 3 phases)
+ │   ├── task_bank.py          7-domain task loading + curriculum sampling
+ │   ├── reward.py             All reward components + RewardHistory
+ │   ├── parser.py             Robust <confidence><answer> parser (15+ edge cases)
+ │   └── self_consistency.py   Multi-sample confidence adjustment
+ │
+ ├── core/
+ │   ├── tasks.py              3 OpenEnv task definitions + TaskRunner
+ │   ├── metrics.py            ECE, MCE, Brier, Sharpness, Resolution
+ │   ├── graders.py            Domain-specific answer graders
+ │   ├── baseline.py           4 baseline agents + evaluation runner
+ │   └── epistemic_fingerprint.py  Radar chart + heatmap generation
+ │
+ ├── training/
+ │   ├── train.py              GRPO training with 3-phase curriculum
+ │   ├── curriculum.py         Phase manager (ECE-triggered advancement)
+ │   ├── dataset.py            GRPO dataset builder with chat template support
+ │   └── evaluate.py           Full eval suite + all 6 plot generators
+ │
+ ├── server/app.py             FastAPI OpenEnv server (10 endpoints)
+ ├── ui/app.py                 Gradio 5-tab demo
+ └── scripts/
+     ├── download_tasks.py     Download 7 HuggingFace datasets
+     ├── run_baseline.py       Evaluate baselines + generate plots
+     └── generate_plots.py     Generate all 6 plots (synthetic, instant)
+ ```
+
+ ---
+
+ ## 🛠️ Tech Stack
+
+ | Component | Technology |
+ |-----------|-----------|
+ | RL Training | HuggingFace TRL ≥0.9.0 (GRPOTrainer) |
+ | Base Model | Qwen/Qwen2.5-3B-Instruct |
+ | Environment | gymnasium ≥1.0.0 (OpenEnv compatible) |
+ | Datasets | GSM8K, ARC, TriviaQA, SciQ, MedMCQA + generated |
+ | Calibration | ECE, MCE, Brier Score, Sharpness, Resolution |
+ | API Server | FastAPI + uvicorn |
+ | Demo UI | Gradio 4 |
+ | Plots | matplotlib (dark theme, dpi=150) |
+
+ ---
+
+ ## 📖 Citation

  ```bibtex
  @misc{echo-ultimate-2025,
+   title = {ECHO ULTIMATE: Training LLMs to Know What They Don't Know},
    author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
    year = {2025},
+   url = {https://huggingface.co/spaces/revti126/echo-ultimate},
+   note = {OpenEnv Hackathon Submission}
  }
  ```
+
+ ---
+
+ *Built for the OpenEnv Hackathon, 2025. MIT License.*
app.py CHANGED
@@ -1,15 +1,12 @@
- """HuggingFace Space entry point."""
- import sys, os
+ """HuggingFace Space entry point — forwards to FastAPI+Gradio server."""
+ import sys
+ import os
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

- from ui.app import build_app, _CSS, _JS
-
- demo, theme = build_app()
- demo.queue()
- demo.launch(
-     server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
-     server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
-     css=_CSS,
-     js=_JS,
-     theme=theme,
- )
+ # This file is kept for compatibility.
+ # The actual app is in server/app.py and launched via Dockerfile CMD:
+ #   python -m uvicorn server.app:app --host 0.0.0.0 --port 7860
+ # All endpoints:
+ #   /health /tasks /reset /step /state /metrics /fingerprint /history /docs /ui
+
+ from server.app import app  # noqa: F401 — imported so this module is a valid ASGI target
asgi.py ADDED
@@ -0,0 +1,9 @@
+ """Stable ASGI entrypoint for Hugging Face Docker Space."""
+
+ import os
+ import sys
+
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+ from server.app import app
+
client.py ADDED
@@ -0,0 +1,14 @@
+ from openenv.core.client import HTTPEnvClient
+ from models import EchoAction, EchoObservation
+
+
+ class EchoClient(HTTPEnvClient):
+     """HTTP client for the ECHO calibration environment."""
+
+     action_class = EchoAction
+     observation_class = EchoObservation
+
+     def step_with_response(self, response_text: str) -> EchoObservation:
+         """Helper: submit a raw response string as an action."""
+         action = EchoAction(response=response_text)
+         return self.step(action)
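A hypothetical usage sketch — the `HTTPEnvClient` constructor is not shown in this commit, so the `base_url` keyword below is an assumption about the openenv-core API rather than a documented signature:

```python
# Hypothetical: assumes HTTPEnvClient(base_url=...) targets the server from the Dockerfile CMD.
client = EchoClient(base_url="http://localhost:7860")
obs = client.step_with_response("<confidence>72</confidence><answer>Paris</answer>")
print(obs.reward, obs.ece, obs.is_correct)
```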
models.py ADDED
@@ -0,0 +1,44 @@
+ from dataclasses import dataclass, field
+ from typing import Optional, Dict, Any
+
+
+ @dataclass
+ class EchoAction:
+     """Action: model's response with embedded confidence and answer."""
+
+     response: str  # Full response text containing <confidence> and <answer> tags
+
+
+ @dataclass
+ class EchoObservation:
+     """Observation returned after each step."""
+
+     question: str
+     domain: str
+     difficulty: str
+     reward: float
+     accuracy: float
+     confidence: int
+     brier_score: float
+     ece: float
+     is_correct: bool
+     thinking: str = ""
+     feedback: str = ""
+     episode_step: int = 0
+     total_steps: int = 0
+
+
+ @dataclass
+ class EchoState:
+     """Full environment state."""
+
+     current_question: str = ""
+     domain: str = ""
+     difficulty: str = ""
+     phase: int = 1
+     step_count: int = 0
+     total_reward: float = 0.0
+     accuracy_history: list = field(default_factory=list)
+     confidence_history: list = field(default_factory=list)
+     ece_history: list = field(default_factory=list)
+     domain_stats: Dict[str, Any] = field(default_factory=dict)
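These dataclasses mirror what the server returns over plain HTTP, so the environment can also be exercised without the `openenv` client. A minimal sketch against a local server, using only the endpoints documented in the README's API table:

```python
import requests

BASE = "http://localhost:7860"  # port from the Dockerfile CMD

requests.post(f"{BASE}/reset", timeout=30).raise_for_status()  # start an episode
step = requests.post(
    f"{BASE}/step",
    json={"response": "<confidence>72</confidence><answer>Paris</answer>"},
    timeout=30,
).json()
# StepResponse fields: state, reward, terminated, truncated, info
print(step["reward"], step["terminated"], step["info"])
```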
openenv.yaml CHANGED
@@ -81,7 +81,7 @@ calibration_metrics:
    resolution: "How much predictions differ from base rate — informativeness"

  api:
-   base_url: "https://revti126-echo-ultimate.hf.space"
+   base_url: "https://vikaspandey582003-echo-ultimate.hf.space"
    endpoints:
      health: "GET /health"
      tasks: "GET /tasks"
@@ -96,7 +96,7 @@ api:

  training:
    algorithm: "GRPO (Group Relative Policy Optimization)"
-   model: "Qwen/Qwen2.5-3B-Instruct"
+   model: "unsloth/Qwen2.5-7B-Instruct"
    total_steps: 5800
    phases: 3
    framework: "HuggingFace TRL ≥ 0.9.0"
@@ -106,5 +106,5 @@ citation: |
    title = {ECHO ULTIMATE: Training LLMs to Know What They Don't Know},
    author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
    year = {2025},
-   url = {https://huggingface.co/spaces/revti126/echo-ultimate}
+   url = {https://huggingface.co/spaces/Vikaspandey582003/echo-ultimate}
  }
pyproject.toml ADDED
@@ -0,0 +1,28 @@
+ [build-system]
+ requires = ["setuptools>=45", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "openenv-echo"
+ version = "0.1.0"
+ description = "ECHO: Epistemic Calibration via Hierarchical OpenEnv"
+ requires-python = ">=3.10"
+ dependencies = [
+     "fastapi",
+     "uvicorn",
+     "torch",
+     "transformers",
+     "datasets",
+     "gymnasium",
+     "trl>=0.8.0",
+     "peft",
+     "huggingface_hub",
+     "gradio>=4.0.0",
+     "plotly",
+     "pandas",
+     "numpy",
+ ]
+
+ [tool.setuptools.packages.find]
+ where = ["."]
+ include = ["env*", "server*", "core*", "training*", "ui*"]
requirements.txt CHANGED
@@ -1,13 +1,42 @@
- gradio>=4.20.0
  numpy>=1.26.0
  pandas>=2.1.0
  scipy>=1.11.0
  matplotlib>=3.8.0
  seaborn>=0.13.0
- scikit-learn>=1.4.0
- gymnasium>=1.0.0
- datasets>=2.18.0
  huggingface-hub>=0.21.0
- PyYAML>=6.0.0
  python-dotenv>=1.0.0
  rich>=13.0.0

+ # Core ML
+ torch>=2.1.0
+ transformers>=4.44.0
+ trl>=0.9.0
+ datasets>=2.18.0
+ accelerate>=0.28.0
+ peft>=0.10.0
+ bitsandbytes>=0.42.0
+
+ # Unsloth — 2-3x faster training, 70% less VRAM (install first on GPU machines)
+ unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
+
+ # Optional: GPT-4o-mini baseline comparison
+ openai>=1.0.0
+
+ # Environment
+ gymnasium>=1.0.0
  numpy>=1.26.0
  pandas>=2.1.0
  scipy>=1.11.0
+
+ # Server
+ fastapi>=0.111.0
+ uvicorn[standard]>=0.29.0
+ pydantic>=2.6.0
+ httpx>=0.27.0
+
+ # Demo
+ gradio>=4.20.0
+
+ # Visualization
  matplotlib>=3.8.0
  seaborn>=0.13.0
+
+ # Utilities
+ wandb>=0.16.0
  huggingface-hub>=0.21.0
+ scikit-learn>=1.4.0
  python-dotenv>=1.0.0
+ click>=8.1.0
  rich>=13.0.0
+ PyYAML>=6.0.0
server/app.py CHANGED
@@ -1,17 +1,23 @@
  """
  ECHO ULTIMATE — FastAPI OpenEnv-Compliant Server.
-
- All endpoints respond. Full Pydantic models. CORS enabled.
- Start: uvicorn server.app:app --host 0.0.0.0 --port 8000
  """

  import logging
- import time
  from contextlib import asynccontextmanager
  from typing import Any, Optional

  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel, Field

  from config import cfg
@@ -24,9 +30,9 @@ logger = logging.getLogger(__name__)

  # ── App state ─────────────────────────────────────────────────────────────────

- _task_bank: Optional[TaskBank] = None
- _env: Optional[EchoEnv] = None
- _history: Optional[RewardHistory] = None


  def _get_env() -> EchoEnv:
@@ -41,27 +47,30 @@ class ResetRequest(BaseModel):
      task_id: Optional[str] = Field(None, description="Specific task ID to load")
      adversarial: Optional[bool] = Field(False, description="Use adversarial questions")

  class StepRequest(BaseModel):
-     action: str = Field(
-         ...,
-         description="Agent response: <confidence>75</confidence><answer>Paris</answer>",
-         example="<confidence>75</confidence><answer>Paris</answer>",
-     )

- class HealthResponse(BaseModel):
-     status: str; environment: str; version: str; domains: int; tasks: int

  class TaskInfo(BaseModel):
-     id: str; name: str; description: str; pass_threshold: float; n_episodes: int

- class StepResponse(BaseModel):
-     state: dict; reward: float; terminated: bool; truncated: bool; info: dict

- class MetricsResponse(BaseModel):
-     ece: float; mce: float; brier_score: float; sharpness: float
-     resolution: float; accuracy: float; mean_confidence: float
-     overconfidence_rate: float; underconfidence_rate: float
-     abstention_rate: float; n_samples: int; domain: Optional[str]


  # ── Lifespan ──────────────────────────────────────────────────────────────────
@@ -73,10 +82,10 @@ async def lifespan(app: FastAPI):
      _task_bank = TaskBank()
      _task_bank.ensure_loaded()
      _history = RewardHistory()
-     _env = EchoEnv(task_bank=_task_bank, reward_history=_history, phase=3)
      _env.reset()
-     logger.info("ECHO ULTIMATE server ready ✅ (7 domains, 3 tasks)")
-     print("✅ ECHO ULTIMATE server ready — http://localhost:8000/docs")
      yield
      logger.info("ECHO ULTIMATE server shutting down.")

@@ -95,17 +104,26 @@ app = FastAPI(

  app.add_middleware(
      CORSMiddleware,
-     allow_origins=["*"], allow_credentials=True,
-     allow_methods=["*"], allow_headers=["*"],
  )


  # ── Endpoints ─────────────────────────────────────────────────────────────────

- @app.get("/health", response_model=HealthResponse, tags=["Health"])
  async def health():
-     return HealthResponse(status="ok", environment="ECHO-ULTIMATE",
-                           version="2.0.0", domains=7, tasks=3)


  @app.get("/tasks", response_model=list[TaskInfo], tags=["Tasks"])
@@ -119,8 +137,10 @@ async def list_tasks():
  async def reset(req: ResetRequest = ResetRequest()) -> dict:
      env = _get_env()
      opts = {}
-     if req.task_id: opts["task_id"] = req.task_id
-     if req.adversarial: opts["adversarial"] = True
      state, info = env.reset(options=opts if opts else None)
      return state

@@ -135,13 +155,21 @@ async def reset_task(task_id: str) -> dict:
  @app.post("/step", response_model=StepResponse, tags=["Environment"])
  async def step(req: StepRequest) -> StepResponse:
      env = _get_env()
      try:
-         state, reward, terminated, truncated, info = env.step(req.action)
      except Exception as exc:
          logger.error("step error: %s", exc)
          raise HTTPException(500, f"Step failed: {exc}")
-     return StepResponse(state=state, reward=round(reward, 4),
-                         terminated=terminated, truncated=truncated, info=info)


  @app.get("/state", tags=["Environment"])
@@ -149,18 +177,18 @@ async def get_state() -> dict:
      return _get_env()._build_obs()


- @app.get("/metrics", response_model=MetricsResponse, tags=["Metrics"])
  async def get_metrics():
      rep = _get_env().get_metrics()
-     return MetricsResponse(**rep.to_dict())


- @app.get("/metrics/{domain}", response_model=MetricsResponse, tags=["Metrics"])
  async def get_domain_metrics(domain: str):
      if domain not in cfg.DOMAINS:
          raise HTTPException(404, f"Unknown domain '{domain}'. Valid: {cfg.DOMAINS}")
      rep = _get_env().get_metrics(domain=domain)
-     return MetricsResponse(**rep.to_dict())


  @app.get("/fingerprint", tags=["Metrics"])
@@ -168,31 +196,59 @@ async def get_fingerprint() -> dict:
      env = _get_env()
      profiles = env.reward_history.get_domain_profiles()
      return {
-         "domain_scores": {d: round(1.0 - r.ece, 3) for d, r in profiles.items()},
-         "domain_ece": {d: round(r.ece, 3) for d, r in profiles.items()},
-         "domain_accuracy": {d: round(r.accuracy, 3) for d, r in profiles.items()},
-         "overall_ece": round(env.get_metrics().ece, 3),
      }


  @app.get("/history", tags=["Metrics"])
  async def get_history() -> dict:
      env = _get_env()
-     df = env.reward_history.to_dataframe()
      records = df.tail(100).to_dict(orient="records") if len(df) > 0 else []
      return {"episodes": records, "total": len(df)}


- @app.get("/", tags=["Health"])
- async def root() -> dict:
-     return {"message": "ECHO ULTIMATE RL Environment",
-             "docs": "/docs", "health": "/health",
-             "tasks": "/tasks", "metrics": "/metrics"}


- # ── Direct runner ─────────────────────────────────────────────────────────────

  if __name__ == "__main__":
      import uvicorn
      logging.basicConfig(level=logging.INFO)
-     uvicorn.run("server.app:app", host=cfg.API_HOST, port=cfg.API_PORT, reload=False)
 
 
  """
  ECHO ULTIMATE — FastAPI OpenEnv-Compliant Server.
+ Pure FastAPI: no openenv package dependency.
+ Mounts Gradio UI at /ui.
+ Runs on port 7860 (HuggingFace Space public port).
  """

  import logging
+ import os
+ import random
+ import sys
+
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
  from contextlib import asynccontextmanager
  from typing import Any, Optional

  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
  from pydantic import BaseModel, Field

  from config import cfg

  # ── App state ─────────────────────────────────────────────────────────────────

+ _task_bank: Optional[TaskBank] = None
+ _env: Optional[EchoEnv] = None
+ _history: Optional[RewardHistory] = None


  def _get_env() -> EchoEnv:

      task_id: Optional[str] = Field(None, description="Specific task ID to load")
      adversarial: Optional[bool] = Field(False, description="Use adversarial questions")

+
  class StepRequest(BaseModel):
+     action: Optional[str] = Field(None, description="Legacy: action string")
+     response: Optional[str] = Field(None, description="Agent response with confidence and answer tags")
+
+     def get_response(self) -> str:
+         """Accept either 'response' or 'action' field."""
+         return self.response or self.action or ""

  class TaskInfo(BaseModel):
+     id: str
+     name: str
+     description: str
+     pass_threshold: float
+     n_episodes: int

+ class StepResponse(BaseModel):
+     state: dict
+     reward: float
+     terminated: bool
+     truncated: bool
+     info: dict


  # ── Lifespan ──────────────────────────────────────────────────────────────────

      _task_bank = TaskBank()
      _task_bank.ensure_loaded()
      _history = RewardHistory()
+     _env = EchoEnv(task_bank=_task_bank, reward_history=_history, phase=3)
      _env.reset()
+     logger.info("ECHO ULTIMATE ready ✅ (7 domains, 3 tasks)")
+     print("✅ ECHO ULTIMATE server ready — http://0.0.0.0:7860/docs")
      yield
      logger.info("ECHO ULTIMATE server shutting down.")

  app.add_middleware(
      CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
  )


  # ── Endpoints ─────────────────────────────────────────────────────────────────

+ @app.get("/health", tags=["Health"])
  async def health():
+     return {"status": "ok", "environment": "ECHO-ULTIMATE", "version": "2.0.0",
+             "domains": 7, "tasks": 3}
+
+
+ @app.get("/", tags=["Health"])
+ async def root():
+     return {"message": "ECHO ULTIMATE RL Environment",
+             "docs": "/docs", "health": "/health",
+             "tasks": "/tasks", "metrics": "/metrics", "ui": "/ui"}


  @app.get("/tasks", response_model=list[TaskInfo], tags=["Tasks"])

  async def reset(req: ResetRequest = ResetRequest()) -> dict:
      env = _get_env()
      opts = {}
+     if req.task_id:
+         opts["task_id"] = req.task_id
+     if req.adversarial:
+         opts["adversarial"] = True
      state, info = env.reset(options=opts if opts else None)
      return state

  @app.post("/step", response_model=StepResponse, tags=["Environment"])
  async def step(req: StepRequest) -> StepResponse:
      env = _get_env()
+     response_text = req.get_response()
+     if not response_text:
+         raise HTTPException(422, "Provide either 'response' or 'action' field.")
      try:
+         state, reward, terminated, truncated, info = env.step(response_text)
      except Exception as exc:
          logger.error("step error: %s", exc)
          raise HTTPException(500, f"Step failed: {exc}")
+     return StepResponse(
+         state=state,
+         reward=round(float(reward), 4),
+         terminated=terminated,
+         truncated=truncated,
+         info=info,
+     )


  @app.get("/state", tags=["Environment"])
      return _get_env()._build_obs()


+ @app.get("/metrics", tags=["Metrics"])
  async def get_metrics():
      rep = _get_env().get_metrics()
+     return rep.to_dict()


+ @app.get("/metrics/{domain}", tags=["Metrics"])
  async def get_domain_metrics(domain: str):
      if domain not in cfg.DOMAINS:
          raise HTTPException(404, f"Unknown domain '{domain}'. Valid: {cfg.DOMAINS}")
      rep = _get_env().get_metrics(domain=domain)
+     return rep.to_dict()


  @app.get("/fingerprint", tags=["Metrics"])
      env = _get_env()
      profiles = env.reward_history.get_domain_profiles()
      return {
+         "domain_scores": {d: round(1.0 - r.ece, 3) for d, r in profiles.items()},
+         "domain_ece": {d: round(r.ece, 3) for d, r in profiles.items()},
+         "domain_accuracy": {d: round(r.accuracy, 3) for d, r in profiles.items()},
+         "overall_ece": round(env.get_metrics().ece, 3),
      }


  @app.get("/history", tags=["Metrics"])
  async def get_history() -> dict:
      env = _get_env()
+     df = env.reward_history.to_dataframe()
      records = df.tail(100).to_dict(orient="records") if len(df) > 0 else []
      return {"episodes": records, "total": len(df)}


+ @app.post("/advance_phase", tags=["Environment"])
+ async def advance_phase():
+     env = _get_env()
+     env.phase = min(getattr(env, "phase", 1) + 1, 4)
+     return {"phase": env.phase, "message": f"Advanced to Phase {env.phase}"}
+

+ # ── Mount Gradio UI at /ui ────────────────────────────────────────────────────

+ try:
+     import gradio as gr
+     import importlib.util
+
+     _ui_path = os.path.join(
+         os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ui", "app.py"
+     )
+     spec = importlib.util.spec_from_file_location("gradio_app", _ui_path)
+     gradio_module = importlib.util.module_from_spec(spec)
+     if spec and spec.loader:
+         spec.loader.exec_module(gradio_module)
+         if hasattr(gradio_module, "demo"):
+             _gradio_demo = gradio_module.demo
+         elif hasattr(gradio_module, "build_app"):
+             _gradio_demo, _ = gradio_module.build_app()
+         else:
+             raise AttributeError("ui/app.py has neither 'demo' nor 'build_app'")
+         app = gr.mount_gradio_app(app, _gradio_demo, path="/ui")
+         print("✅ Gradio UI mounted at /ui")
+     else:
+         print("⚠️ Could not load ui/app.py spec")
+ except Exception as _e:
+     print(f"⚠️ Gradio UI not mounted: {_e}")


+ # ── Direct runner ──────────────────────────────────────────────────────────────

  if __name__ == "__main__":
      import uvicorn
      logging.basicConfig(level=logging.INFO)
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run(app, host="0.0.0.0", port=port)
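The new `StepRequest.get_response()` shim means both the legacy `action` payload from the README's curl examples and the `response` payload used by the training notebook reach the same code path. A quick equivalence check, with a fresh `/reset` before each step since episodes terminate:

```python
import requests

BASE = "http://localhost:7860"
for payload in ({"action": "<confidence>72</confidence><answer>Paris</answer>"},
                {"response": "<confidence>72</confidence><answer>Paris</answer>"}):
    requests.post(f"{BASE}/reset", timeout=30)            # new episode (questions vary)
    r = requests.post(f"{BASE}/step", json=payload, timeout=30)
    print(r.status_code, r.json()["terminated"])          # both accepted: 200 True
```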