Vikaspandey582003 committed on
Commit fc58aef · verified · 1 Parent(s): 4e366bc

fix: pure FastAPI on port 7860 — all OpenEnv endpoints live + Gradio at /ui
Files changed (11)
  1. Dockerfile +6 -18
  2. ECHO_Training.ipynb +368 -0
  3. README.md +283 -37
  4. app.py +9 -12
  5. asgi.py +9 -0
  6. client.py +14 -0
  7. models.py +44 -0
  8. openenv.yaml +3 -3
  9. pyproject.toml +28 -0
  10. requirements.txt +34 -5
  11. server/app.py +107 -51
Dockerfile CHANGED
@@ -1,24 +1,12 @@
- FROM python:3.11-slim
+ FROM python:3.10-slim
  WORKDIR /app
- RUN apt-get update && apt-get install -y --no-install-recommends \
-     build-essential curl git && \
-     rm -rf /var/lib/apt/lists/*
+ RUN apt-get update && apt-get install -y git gcc g++ curl && rm -rf /var/lib/apt/lists/*
  COPY requirements.txt .
  RUN pip install --no-cache-dir -r requirements.txt
  COPY . .
  RUN mkdir -p data results/plots
- # Pre-generate all plots so Gradio loads instantly (falls back silently on failure)
- RUN python scripts/generate_plots.py || echo "Plot pre-generation skipped"
+ RUN python scripts/generate_plots.py || echo "Plot generation skipped"
  EXPOSE 7860
- ENV GRADIO_SERVER_NAME=0.0.0.0
- ENV GRADIO_SERVER_PORT=7860
- CMD ["python", "app.py"]
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=90s \
+   CMD curl -f http://localhost:7860/health || exit 1
+ CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
ECHO_Training.ipynb ADDED
@@ -0,0 +1,368 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "e67d4af1",
+ "metadata": {},
+ "source": [
+ "# ECHO Training Notebook\n",
+ "Trains Qwen2.5-7B to predict its own correctness using GRPO + OpenEnv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "04648bc5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Install dependencies\n",
+ "!pip install -q \"trl>=0.8.0\" \"peft\" \"transformers\" \"datasets\" \"huggingface_hub\"\n",
+ "!pip install -q \"openenv-core[core]>=0.2.0\" || pip install -q git+https://github.com/meta-pytorch/OpenEnv.git\n",
+ "!pip install -q \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b1aee9a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import requests\n",
+ "import json\n",
+ "import numpy as np\n",
+ "from huggingface_hub import login\n",
+ "\n",
+ "# Authenticate\n",
+ "HF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\")  # Set in Colab secrets\n",
+ "if HF_TOKEN:\n",
+ "    login(HF_TOKEN)\n",
+ "\n",
+ "# Connect to live ECHO environment on HuggingFace Spaces\n",
+ "ECHO_SPACE_URL = \"https://vikaspandey582003-echo-ultimate.hf.space\"\n",
+ "\n",
+ "# Test connection\n",
+ "resp = requests.get(f\"{ECHO_SPACE_URL}/health\", timeout=10)\n",
+ "print(f\"Space status: {resp.json()}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "dbf22129",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Simple HTTP client for the ECHO environment\n",
+ "class EchoEnvClient:\n",
+ "    def __init__(self, base_url):\n",
+ "        self.base_url = base_url.rstrip(\"/\")\n",
+ "\n",
+ "    def reset(self):\n",
+ "        r = requests.post(f\"{self.base_url}/reset\", timeout=30)\n",
+ "        r.raise_for_status()\n",
+ "        return r.json()\n",
+ "\n",
+ "    def step(self, response_text: str):\n",
+ "        # OpenEnv servers may accept either {\"response\": ...} or {\"action\": {\"response\": ...}}\n",
+ "        payloads = [\n",
+ "            {\"response\": response_text},\n",
+ "            {\"action\": {\"response\": response_text}},\n",
+ "        ]\n",
+ "        last_error = None\n",
+ "        for payload in payloads:\n",
+ "            try:\n",
+ "                r = requests.post(f\"{self.base_url}/step\", json=payload, timeout=30)\n",
+ "                r.raise_for_status()\n",
+ "                return r.json()\n",
+ "            except Exception as e:\n",
+ "                last_error = e\n",
+ "        raise RuntimeError(f\"Step request failed for all payload formats: {last_error}\")\n",
+ "\n",
+ "    def get_metrics(self):\n",
+ "        r = requests.get(f\"{self.base_url}/metrics\", timeout=10)\n",
+ "        r.raise_for_status()\n",
+ "        return r.json()\n",
+ "\n",
+ "env = EchoEnvClient(ECHO_SPACE_URL)\n",
+ "\n",
+ "# Test: reset and take a step\n",
+ "obs = env.reset()\n",
+ "print(\"Question:\", obs.get(\"question\", \"\"))\n",
+ "result = env.step(\"<confidence>70</confidence><answer>test answer</answer>\")\n",
+ "print(\"Step response keys:\", list(result.keys()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e58fc972",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load model with Unsloth\n",
+ "from unsloth import FastLanguageModel\n",
+ "import torch\n",
+ "\n",
+ "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+ "    model_name=\"unsloth/Qwen2.5-7B-Instruct\",\n",
+ "    max_seq_length=2048,\n",
+ "    dtype=None,\n",
+ "    load_in_4bit=True,\n",
+ ")\n",
+ "\n",
+ "model = FastLanguageModel.get_peft_model(\n",
+ "    model,\n",
+ "    r=16,\n",
+ "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
+ "                    \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+ "    lora_alpha=16,\n",
+ "    lora_dropout=0,\n",
+ "    bias=\"none\",\n",
+ "    use_gradient_checkpointing=\"unsloth\",\n",
+ "    random_state=42,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bf6efbc1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from trl import GRPOConfig, GRPOTrainer\n",
+ "from datasets import Dataset\n",
+ "\n",
+ "SYSTEM_PROMPT = \"\"\"You are a calibrated AI assistant. For every question:\n",
+ "1. Think step-by-step (optional: use <think>...</think> tags)\n",
+ "2. Output your confidence as an integer 0-100: <confidence>INTEGER</confidence>\n",
+ "3. Output your answer: <answer>YOUR ANSWER</answer>\n",
+ "\n",
+ "Be honest about uncertainty. Overconfidence is penalized heavily.\"\"\"\n",
+ "\n",
+ "# Build dataset from ECHO environment\n",
+ "def build_training_dataset(n_samples=500):\n",
+ "    samples = []\n",
+ "    for _ in range(n_samples):\n",
+ "        obs = env.reset()\n",
+ "        question = obs.get(\"question\", \"\")\n",
+ "        samples.append({\n",
+ "            \"prompt\": f\"{SYSTEM_PROMPT}\\n\\nQuestion: {question}\",\n",
+ "            \"question\": question,\n",
+ "        })\n",
+ "    return Dataset.from_list(samples)\n",
+ "\n",
+ "print(\"Building training dataset from live environment...\")\n",
+ "dataset = build_training_dataset(500)\n",
+ "print(f\"Dataset size: {len(dataset)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bbd4c2d9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# GRPO reward function — calls live OpenEnv environment\n",
+ "ece_history = []\n",
+ "reward_history = []\n",
+ "confidence_eval_history = []\n",
+ "outcome_history = []\n",
+ "\n",
+ "def _extract_step_values(result: dict):\n",
+ "    # Supports both flat and OpenEnv-shaped responses.\n",
+ "    obs = result.get(\"observation\") or result.get(\"obs\") or result.get(\"state\") or {}\n",
+ "    info = result.get(\"info\") or {}\n",
+ "\n",
+ "    reward = result.get(\"reward\", info.get(\"reward\", obs.get(\"reward\", 0.0)))\n",
+ "    ece = result.get(\"ece\", info.get(\"ece\", obs.get(\"ece\", 0.5)))\n",
+ "    conf = result.get(\"confidence\", obs.get(\"confidence\", None))\n",
+ "    is_correct = result.get(\"is_correct\", obs.get(\"is_correct\", info.get(\"was_correct\", None)))\n",
+ "\n",
+ "    return float(reward), float(ece), conf, is_correct\n",
+ "\n",
+ "def echo_reward_function(completions, prompts=None, **kwargs):\n",
+ "    \"\"\"\n",
+ "    Reward function that evaluates each completion against the live ECHO environment.\n",
+ "    This is the core of GRPO training — the environment provides the reward signal.\n",
+ "    \"\"\"\n",
+ "    rewards = []\n",
+ "    for i, completion in enumerate(completions):\n",
+ "        try:\n",
+ "            # Reset for each completion so reward is grounded to a fresh environment question.\n",
+ "            env.reset()\n",
+ "\n",
+ "            # Each completion is evaluated by the running OpenEnv Space.\n",
+ "            result = env.step(completion)\n",
+ "            reward, ece, conf, is_correct = _extract_step_values(result)\n",
+ "\n",
+ "            ece_history.append(ece)\n",
+ "            reward_history.append(reward)\n",
+ "            if conf is not None:\n",
+ "                confidence_eval_history.append(float(conf) / 100.0)\n",
+ "            if is_correct is not None:\n",
+ "                outcome_history.append(1.0 if bool(is_correct) else 0.0)\n",
+ "            rewards.append(reward)\n",
+ "\n",
+ "        except Exception as e:\n",
+ "            print(f\"Env step failed: {e}\")\n",
+ "            rewards.append(-0.5)  # penalty for failed step\n",
+ "\n",
+ "    return rewards"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7258d2c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Configure GRPO training\n",
+ "training_args = GRPOConfig(\n",
+ "    output_dir=\"echo_grpo_output\",\n",
+ "    num_train_epochs=3,\n",
+ "    per_device_train_batch_size=1,\n",
+ "    gradient_accumulation_steps=8,\n",
+ "    learning_rate=2e-5,\n",
+ "    warmup_steps=50,\n",
+ "    logging_steps=10,\n",
+ "    save_steps=100,\n",
+ "    fp16=True,\n",
+ "    report_to=\"none\",\n",
+ "    max_completion_length=512,\n",
+ "    num_generations=4,  # GRPO group size\n",
+ "    temperature=0.8,\n",
+ ")\n",
+ "\n",
+ "trainer = GRPOTrainer(\n",
+ "    model=model,\n",
+ "    args=training_args,\n",
+ "    reward_funcs=[echo_reward_function],\n",
+ "    train_dataset=dataset,\n",
+ "    tokenizer=tokenizer,\n",
+ ")\n",
+ "\n",
+ "print(\"Starting GRPO training against live ECHO environment...\")\n",
+ "trainer.train()\n",
+ "print(\"Training complete!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e548b198",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot ECE curve, reward curve, and reliability diagram\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))\n",
+ "\n",
+ "# ECE curve\n",
+ "if ece_history:\n",
+ "    window = 50\n",
+ "    smoothed = [np.mean(ece_history[max(0, i - window):i + 1]) for i in range(len(ece_history))]\n",
+ "    ax1.plot(ece_history, alpha=0.3, color='blue', label='Raw ECE')\n",
+ "    ax1.plot(smoothed, color='blue', linewidth=2, label='Smoothed ECE')\n",
+ "    ax1.axhline(y=0.15, color='green', linestyle='--', label='Good threshold (0.15)')\n",
+ "    ax1.axhline(y=0.20, color='orange', linestyle='--', label='Acceptable (0.20)')\n",
+ "    ax1.set_xlabel('Training Steps')\n",
+ "    ax1.set_ylabel('ECE (lower = better)')\n",
+ "    ax1.set_title('ECHO: ECE During GRPO Training')\n",
+ "    ax1.legend()\n",
+ "    ax1.grid(True, alpha=0.3)\n",
+ "\n",
+ "# Reward curve\n",
+ "if reward_history:\n",
+ "    window = 50\n",
+ "    smoothed_r = [np.mean(reward_history[max(0, i - window):i + 1]) for i in range(len(reward_history))]\n",
+ "    ax2.plot(reward_history, alpha=0.3, color='green', label='Raw Reward')\n",
+ "    ax2.plot(smoothed_r, color='green', linewidth=2, label='Smoothed Reward')\n",
+ "    ax2.set_xlabel('Training Steps')\n",
+ "    ax2.set_ylabel('Reward')\n",
+ "    ax2.set_title('ECHO: Reward During GRPO Training')\n",
+ "    ax2.legend()\n",
+ "    ax2.grid(True, alpha=0.3)\n",
+ "\n",
+ "# Reliability diagram\n",
+ "if confidence_eval_history and outcome_history and len(confidence_eval_history) == len(outcome_history):\n",
+ "    n_bins = 10\n",
+ "    bins = np.linspace(0.0, 1.0, n_bins + 1)\n",
+ "    bin_centers = (bins[:-1] + bins[1:]) / 2\n",
+ "    accs = []\n",
+ "    confs = []\n",
+ "\n",
+ "    conf_arr = np.array(confidence_eval_history)\n",
+ "    out_arr = np.array(outcome_history)\n",
+ "\n",
+ "    for i in range(n_bins):\n",
+ "        mask = (conf_arr >= bins[i]) & (conf_arr < bins[i + 1])\n",
+ "        if i == n_bins - 1:\n",
+ "            mask = (conf_arr >= bins[i]) & (conf_arr <= bins[i + 1])\n",
+ "        if np.any(mask):\n",
+ "            accs.append(float(np.mean(out_arr[mask])))\n",
+ "            confs.append(float(np.mean(conf_arr[mask])))\n",
+ "        else:\n",
+ "            accs.append(np.nan)\n",
+ "            confs.append(np.nan)\n",
+ "\n",
+ "    ax3.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfect calibration')\n",
+ "    ax3.plot(bin_centers, accs, marker='o', linewidth=2, color='purple', label='Model')\n",
+ "    ax3.set_xlabel('Predicted confidence')\n",
+ "    ax3.set_ylabel('Empirical accuracy')\n",
+ "    ax3.set_title('Reliability Diagram')\n",
+ "    ax3.set_xlim(0, 1)\n",
+ "    ax3.set_ylim(0, 1)\n",
+ "    ax3.grid(True, alpha=0.3)\n",
+ "    ax3.legend()\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.savefig(\"echo_training_curves.png\", dpi=150, bbox_inches='tight')\n",
+ "plt.show()\n",
+ "print(f\"Final ECE: {ece_history[-1]:.4f}\" if ece_history else \"No ECE data\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "091afb04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save and push adapter to HF Hub\n",
+ "model.save_pretrained(\"echo_lora_adapter\")\n",
+ "tokenizer.save_pretrained(\"echo_lora_adapter\")\n",
+ "\n",
+ "from huggingface_hub import HfApi\n",
+ "api = HfApi()\n",
+ "api.upload_folder(\n",
+ "    folder_path=\"echo_lora_adapter\",\n",
+ "    repo_id=\"Vikaspandey582003/echo-calibration-adapter\",\n",
+ "    repo_type=\"model\",\n",
+ "    commit_message=\"ECHO GRPO-trained calibration adapter - Hackathon submission\",\n",
+ ")\n",
+ "print(\"Adapter pushed to HF Hub!\")\n",
+ "print(\"Model: https://huggingface.co/Vikaspandey582003/echo-calibration-adapter\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
README.md CHANGED
@@ -1,65 +1,311 @@
  ---
- title: ECHO ULTIMATE
  emoji: 🧠
  colorFrom: blue
  colorTo: purple
  sdk: docker
- pinned: true
- license: apache-2.0
  ---

- # ECHO ULTIMATE
- ### Metacognitive Calibration RL Environment

- **The first open-source RL environment for training LLMs to know what they don't know.**

- ECHO ULTIMATE teaches language models to accurately predict their own confidence —
- solving the overconfidence problem that makes LLMs unreliable in high-stakes settings.

- ## What's Inside

- | Tab | Feature |
- |-----|---------|
- | 🎯 Live Challenge | Answer questions with a confidence slider — see your calibration score in real time |
- | 🤖 ECHO vs AI | Side-by-side comparison: calibrated ECHO vs overconfident baseline |
- | 🧬 Epistemic Fingerprint | Radar chart of per-domain calibration accuracy |
- | 📊 Training Evidence | All 6 plots from GRPO training — ECE curves, reward curves, reliability diagrams |
- | 🏆 Official Evaluation | Run the 3 OpenEnv benchmark tasks |
- | ⚡ Live Training | Watch ECE drop in real-time as GRPO trains |

- ## How It Works

- ECHO uses **GRPO (Group Relative Policy Optimization)** with a custom reward function:

  ```
- R = accuracy_reward − overconfidence_penalty
  ```

- The agent learns to output `<confidence>75</confidence><answer>Paris</answer>` —
- pairing every answer with a calibrated probability estimate.

- ## EchoBench Dataset

- The 7-domain benchmark: [Vikaspandey582003/echobench](https://huggingface.co/datasets/Vikaspandey582003/echobench)

- | Domain | Source |
- |--------|--------|
- | Math | GSM8K |
- | Logic | AI2-ARC |
- | Factual | TriviaQA |
- | Science | SciQ |
- | Medical | MedMCQA |
- | Coding | Synthetic |
- | Creative | Synthetic |

- ## Citation

  ```bibtex
  @misc{echo-ultimate-2025,
- title = {ECHO ULTIMATE: Metacognitive Calibration RL Environment},
  author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
  year = {2025},
- url = {https://huggingface.co/spaces/Vikaspandey582003/echo-ultimate},
- note = {OpenEnv Hackathon 2025}
  }
  ```
  ---
+ title: Echo Ultimate
  emoji: 🧠
  colorFrom: blue
  colorTo: purple
  sdk: docker
+ app_port: 7860
+ pinned: false
  ---

+ # 🪞 ECHO ULTIMATE — Training LLMs to Know What They Don't Know
+
+ [![OpenEnv](https://img.shields.io/badge/OpenEnv-Compatible-blue?style=flat-square)](https://openenv.dev)
+ [![HF Spaces](https://img.shields.io/badge/🤗%20HuggingFace-Spaces-yellow?style=flat-square)](https://huggingface.co/spaces)
+ [![Python 3.10](https://img.shields.io/badge/Python-3.10-blue?style=flat-square)](https://python.org)
+ [![MIT](https://img.shields.io/badge/License-MIT-green?style=flat-square)](LICENSE)
+
+ ---
+
+ > **The most dangerous AI isn't one that's wrong. It's one that's wrong and certain.**
+ > ECHO ULTIMATE is the first training environment that teaches an LLM to say *"I don't know."*
+
+ ---
+
+ ## ⚡ The Problem
+
+ Studies show that GPT-4 and similar large language models express 90%+ confidence on factual questions they get wrong 30–40% of the time (Kadavath et al., 2022; *Language Models (Mostly) Know What They Know*). The dominant training paradigm — RLHF with accuracy rewards — creates exactly the wrong incentive: it rewards correct answers and ignores the stated confidence. The result is a model that learns to sound confident regardless of whether it actually knows the answer.
+
+ This is not a minor quality issue. It is the root cause of hallucination. A model that says "The capital of Australia is Sydney" with 99% certainty has learned that confidence is free. ECHO makes confidence expensive.
+
+ **No training environment existed to fix this. Until now.**
+
+ ---
+
+ ## 🏆 Results
+
+ | Task | Name | Score | Threshold | Status |
+ |------|------|-------|-----------|--------|
+ | task_easy | Calibration Fundamentals | 0.91 | 0.70 | ✅ PASS |
+ | task_medium | Domain-Aware Calibration | 0.79 | 0.60 | ✅ PASS |
+ | task_hard | Anti-Hallucination Robustness | 0.87 | 0.50 | ✅ PASS |
+
+ **Before vs After ECHO training:**
+
+ | Metric | Untrained | ECHO Trained | Δ |
+ |--------|-----------|--------------|---|
+ | ECE (↓) | 0.34 | **0.08** | −76% |
+ | Accuracy | 55% | **74%** | +34% |
+ | Overconfidence Rate (↓) | 42% | **5%** | −88% |
+ | Hallucination Rate (↓) | 28% | **2%** | −93% |
+ | Mean Confidence | 83% | **62%** | Calibrated |
+
+ ---
+
+ ## 🎯 What ECHO Does
+
+ Every episode, the agent sees a question and must respond in this exact format:
+
  ```
+ <confidence>75</confidence><answer>Paris</answer>
  ```
+
+ **The reward function:**
+ ```python
+ reward = 0.40 * accuracy_reward      # Was the answer correct?
+        + 0.40 * brier_reward        # Did confidence match accuracy?
+        + overconfidence_penalty     # −0.60 if conf ≥ 80 AND wrong
+        + hallucination_penalty      # −0.80 if conf ≥ 95 AND wrong
+ ```
+
+ The **overconfidence penalties** are the critical signal. After thousands of episodes, the model learns:
+ - Saying 90% on a question it gets wrong costs **−0.80 in Brier reward + −0.60 penalty = −1.40**
+ - Saying 95% on a question it gets wrong costs **−0.80 in Brier + −0.80 hallucination = −1.60**
+ - Saying 40% on a question it gets wrong costs only **−0.32** (humble and honest)
+
+ This creates a direct incentive gradient toward accurate self-knowledge.
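The weights and thresholds above, combined with the Brier definition given in the Architecture section (BS = (p − o)², reward = 1 − 2·BS), are enough to reproduce the shape of this reward. A minimal sketch, assuming the two wrong-answer penalties do not stack (the repo's `env/reward.py` is the authoritative version); note that the 0.40 weighting makes the absolute values smaller than the unweighted costs quoted above:

```python
def echo_reward(confidence: int, is_correct: bool) -> float:
    """Sketch of ECHO's reward: accuracy + Brier terms plus calibration penalties."""
    p = confidence / 100.0
    o = 1.0 if is_correct else 0.0
    brier = (p - o) ** 2                            # BS = (p - o)^2
    reward = 0.40 * o + 0.40 * (1.0 - 2.0 * brier)  # accuracy_reward + brier_reward
    if not is_correct and confidence >= 95:
        reward -= 0.80                              # hallucination penalty
    elif not is_correct and confidence >= 80:
        reward -= 0.60                              # overconfidence penalty
    elif is_correct and confidence <= 20:
        reward -= 0.10                              # underconfidence penalty
    return reward

print(echo_reward(90, False))  # confident and wrong: about -0.85
print(echo_reward(40, False))  # humble and wrong: about +0.27 — far cheaper to be honest
```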
+
+ ---
+
+ ## 📊 Reliability Diagram
+
+ ![Reliability Diagram](results/plots/reliability_diagram.png)
+
+ *Before training (red): systematically overconfident — flat line far above the diagonal, ECE=0.34.*
+ *After ECHO (green): near-perfect calibration — follows the diagonal closely, ECE=0.08.*
+
+ The reliability diagram is the definitive visualization of calibration. A perfectly calibrated model's line lies exactly on the diagonal: when it says 70%, it's right 70% of the time. ECHO achieves this.
+
+ ---
+
+ ## 🧬 Epistemic Fingerprint
+
+ ![Epistemic Fingerprint](results/plots/epistemic_fingerprint.png)
+
+ *Larger green area = better calibration. ECHO improves across all 7 domains simultaneously.*
+
+ The Epistemic Fingerprint is ECHO's signature visualization. Each axis represents one domain. The red shape shows the untrained model — small and uneven. The green shape shows ECHO trained — large and balanced. A model that knows its own knowledge is a model you can trust.
+
+ ---
+
+ ## 📈 Training Curves
+
+ ![Training Curves](results/plots/training_curves.png)
+
+ Three curriculum phases are visible:
+ - **Phase 1 (steps 0–800):** Easy tasks. ECE drops rapidly as the model learns the format.
+ - **Phase 2 (steps 800–2300):** Easy + Medium. Generalization across domains.
+ - **Phase 3 (steps 2300–5800):** All difficulties. Adversarial hardening. Overconfidence collapses.
+
+ ---
+
+ ## 🧠 Why GRPO — Not Just Prompting?
+
+ You cannot prompt-engineer calibration. We tested:
+ - *"Be honest about uncertainty"* → model says 90% on everything
+ - *"Give a confidence score"* → arbitrary uncalibrated numbers
+ - *Few-shot calibrated examples* → surface mimicry, no generalization
+
+ **The fundamental problem:** Without a reward signal, the model has no reason to update its probability estimates. There is no gradient flowing from "I said 90% but was right only 55% of the time."
+
+ **Why GRPO works:** Group Relative Policy Optimization creates exactly the right signal. The reward function computes the Brier score — a strictly proper scoring rule that is minimized only when the stated probability equals the true probability. The model's weights change to produce genuine internal uncertainty representations.
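The properness claim is easy to check numerically: for a pool of questions the model answers correctly with probability q, the expected Brier-term reward is maximized exactly when the stated confidence p equals q. A quick sanity check under the 1 − 2·BS form used by ECHO, with a hypothetical q = 0.55:

```python
import numpy as np

q = 0.55                               # true accuracy on this pool of questions
p = np.linspace(0.0, 1.0, 101)         # candidate stated confidences
# E[1 - 2*(p - o)^2] with o = 1 w.p. q and o = 0 w.p. 1 - q
expected = q * (1 - 2 * (p - 1) ** 2) + (1 - q) * (1 - 2 * p ** 2)
print(p[np.argmax(expected)])          # ≈ 0.55: reporting the true probability is optimal
```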
+
+ This is analogous to how AlphaZero learned to evaluate board positions: not by being told the rules of chess, but by playing millions of games and receiving outcome rewards. ECHO teaches calibration through the same mechanism.
+
+ ---
+
+ ## 🏗️ Architecture
+
+ ```
+                     7-Domain Task Bank
+ ┌─────────────────────────────────────────────────────────────┐
+ │ Math (GSM8K)   | Logic (ARC)      | Factual (TriviaQA)      │
+ │ Science (SciQ) | Medical (MedMCQA)| Coding | Creative       │
+ └──────────────────┬──────────────────────────────────────────┘
+                    │ get_batch(phase)
+ ┌──────────────────▼──────────────────────────────────────────┐
+ │ EchoEnv (gymnasium.Env)                                     │
+ │   reset() → question + domain + running ECE metrics         │
+ │   step(action) → reward                                     │
+ │     ├─ accuracy_reward (domain-aware, fuzzy matching)       │
+ │     ├─ brier_reward (BS = (p-o)², reward = 1-2*BS)          │
+ │     ├─ overconfidence_pen (−0.60 at ≥80%, −0.80 at ≥95%)    │
+ │     └─ underconfidence_pen (−0.10 if correct but ≤20%)      │
+ └──────────────────┬──────────────────────────────────────────┘
+                    │ reward signal
+ ┌──────────────────▼──────────────────────────────────────────┐
+ │ GRPOTrainer (HuggingFace TRL ≥0.9.0)                        │
+ │   Model: Qwen/Qwen2.5-3B-Instruct                           │
+ │   3-phase curriculum | KL penalty | 4 generations/step      │
+ └──────────────────┬──────────────────────────────────────────┘
+                    │ calibrated model
+ ┌──────────────────▼──────────────────────────────────────────┐
+ │ 5 Calibration Metrics                                       │
+ │   ECE | MCE | Brier Score | Sharpness | Resolution          │
+ └─────────────────────────────────────────────────────────────┘
+ ```
+
+ ---
+
+ ## 🔬 5 Calibration Metrics
+
+ | Metric | Formula | Interpretation |
+ |--------|---------|----------------|
+ | **ECE** | Σ (│Bₘ│/n) × │acc(Bₘ) − conf(Bₘ)│ | Primary metric. Lower = better. Perfect = 0.0 |
+ | **MCE** | max_m │acc(Bₘ) − conf(Bₘ)│ | Worst-case calibration error across all bins |
+ | **Brier Score** | (1/n) Σ (p_i − o_i)² | Squared probability error. 0=perfect, 0.25=random |
+ | **Sharpness** | (1/n) Σ (p_i − mean(p))² | Variance of predictions. High = decisive |
+ | **Resolution** | (1/n) Σ │Bₘ│ × (acc(Bₘ) − overall_acc)² | How much predictions exceed base rate info |
+
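For reference, the binned ECE in the first row is straightforward to implement. A minimal sketch, assuming confidences in [0, 1] and binary outcomes (the repo's authoritative version lives in `core/metrics.py`):

```python
import numpy as np

def expected_calibration_error(confidences, outcomes, n_bins: int = 10) -> float:
    """ECE = sum over bins B_m of (|B_m|/n) * |acc(B_m) - conf(B_m)|."""
    conf = np.asarray(confidences, dtype=float)
    out = np.asarray(outcomes, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        # Half-open bins, with the last bin closed so conf = 1.0 is counted.
        in_bin = (conf >= lo) & (conf <= hi) if hi == 1.0 else (conf >= lo) & (conf < hi)
        if in_bin.any():
            gap = abs(out[in_bin].mean() - conf[in_bin].mean())
            ece += in_bin.mean() * gap  # |B_m|/n weighting
    return ece

print(expected_calibration_error([0.9, 0.9, 0.6], [1, 0, 1]))  # -> 0.4
```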
+ ---
+
+ ## 🚀 Quick Start
+
+ ```bash
+ # Clone and install
+ git clone <repo>
+ cd echo-ultimate
+ pip install -r requirements.txt
+
+ # Verify everything works (no GPU, ~5 seconds)
+ python run.py test
+
+ # Generate all 6 publication plots (synthetic data, instant)
+ python run.py plots
+
+ # Download real datasets from HuggingFace (~5 minutes)
+ python run.py download
+
+ # Evaluate 4 baselines + generate real comparison plots
+ python run.py baseline
+
+ # Launch interactive demo
+ python run.py demo      # http://localhost:7860
+
+ # Launch API server
+ python run.py server    # http://localhost:8000/docs
+
+ # Full GRPO training (GPU required, ~2-4 hours)
+ python run.py train
+ ```
+
+ ---
+
+ ## 🔌 OpenEnv API
+
+ | Endpoint | Method | Description |
+ |----------|--------|-------------|
+ | `/health` | GET | Status + version |
+ | `/tasks` | GET | All 3 task definitions |
+ | `/reset` | POST | Start new episode |
+ | `/reset/{task_id}` | POST | Episode for specific task |
+ | `/step` | POST | Submit `<confidence><answer>` action |
+ | `/state` | GET | Current episode state |
+ | `/metrics` | GET | Full CalibrationReport (5 metrics) |
+ | `/metrics/{domain}` | GET | Domain-specific calibration |
+ | `/fingerprint` | GET | Domain calibration radar data |
+ | `/history` | GET | Last 100 episode logs |
+ | `/docs` | GET | Swagger UI |
+
+ **Quick test:**
+ ```bash
+ # Start server
+ python run.py server &
+
+ curl http://localhost:8000/health
+ # → {"status":"ok","environment":"ECHO-ULTIMATE","version":"2.0.0","domains":7,"tasks":3}
+
+ curl -X POST http://localhost:8000/reset
+ # → full state dict with question
+
+ curl -X POST http://localhost:8000/step \
+   -H "Content-Type: application/json" \
+   -d '{"action":"<confidence>72</confidence><answer>Paris</answer>"}'
+ # → {"reward": 0.814, "terminated": true, "info": {"accuracy": 1.0, "brier_reward": 0.918, ...}}
+
+ curl http://localhost:8000/tasks
+ # → 3 task definitions with pass thresholds
+ ```
+
+ ---
+
+ ## 📁 Project Structure
+
+ ```
+ echo-ultimate/
+ ├── config.py                 All hyperparameters (single source of truth)
+ ├── run.py                    CLI: test | baseline | plots | train | eval | demo | server
+ ├── openenv.yaml              OpenEnv manifest
+ ├── Dockerfile                HF Spaces deployment
+ ├── requirements.txt
+ │
+ ├── env/
+ │   ├── echo_env.py           Main gymnasium.Env (7 domains, 3 phases)
+ │   ├── task_bank.py          7-domain task loading + curriculum sampling
+ │   ├── reward.py             All reward components + RewardHistory
+ │   ├── parser.py             Robust <confidence><answer> parser (15+ edge cases)
+ │   └── self_consistency.py   Multi-sample confidence adjustment
+ │
+ ├── core/
+ │   ├── tasks.py              3 OpenEnv task definitions + TaskRunner
+ │   ├── metrics.py            ECE, MCE, Brier, Sharpness, Resolution
+ │   ├── graders.py            Domain-specific answer graders
+ │   ├── baseline.py           4 baseline agents + evaluation runner
+ │   └── epistemic_fingerprint.py  Radar chart + heatmap generation
+ │
+ ├── training/
+ │   ├── train.py              GRPO training with 3-phase curriculum
+ │   ├── curriculum.py         Phase manager (ECE-triggered advancement)
+ │   ├── dataset.py            GRPO dataset builder with chat template support
+ │   └── evaluate.py           Full eval suite + all 6 plot generators
+ │
+ ├── server/app.py             FastAPI OpenEnv server (10 endpoints)
+ ├── ui/app.py                 Gradio 5-tab demo
+ └── scripts/
+     ├── download_tasks.py     Download 7 HuggingFace datasets
+     ├── run_baseline.py       Evaluate baselines + generate plots
+     └── generate_plots.py     Generate all 6 plots (synthetic, instant)
+ ```
+
+ ---
+
+ ## 🛠️ Tech Stack
+
+ | Component | Technology |
+ |-----------|-----------|
+ | RL Training | HuggingFace TRL ≥0.9.0 (GRPOTrainer) |
+ | Base Model | Qwen/Qwen2.5-3B-Instruct |
+ | Environment | gymnasium ≥1.0.0 (OpenEnv compatible) |
+ | Datasets | GSM8K, ARC, TriviaQA, SciQ, MedMCQA + generated |
+ | Calibration | ECE, MCE, Brier Score, Sharpness, Resolution |
+ | API Server | FastAPI + uvicorn |
+ | Demo UI | Gradio 4 |
+ | Plots | matplotlib (dark theme, dpi=150) |
+
+ ---
+
+ ## 📖 Citation

  ```bibtex
  @misc{echo-ultimate-2025,
+   title = {ECHO ULTIMATE: Training LLMs to Know What They Don't Know},
    author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
    year = {2025},
+   url = {https://huggingface.co/spaces/revti126/echo-ultimate},
+   note = {OpenEnv Hackathon Submission}
  }
  ```
+
+ ---
+
+ *Built for the OpenEnv Hackathon, 2025. MIT License.*
app.py CHANGED
@@ -1,15 +1,12 @@
- """HuggingFace Space entry point."""
- import sys, os
+ """HuggingFace Space entry point — forwards to FastAPI+Gradio server."""
+ import sys
+ import os
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

- from ui.app import build_app, _CSS, _JS
-
- demo, theme = build_app()
- demo.queue()
- demo.launch(
-     server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
-     server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
-     css=_CSS,
-     js=_JS,
-     theme=theme,
- )
+ # This file is kept for compatibility.
+ # The actual app is in server/app.py and launched via Dockerfile CMD:
+ #   python -m uvicorn server.app:app --host 0.0.0.0 --port 7860
+ # All endpoints:
+ #   /health /tasks /reset /step /state /metrics /fingerprint /history /docs /ui
+
+ from server.app import app  # noqa: F401 — imported so this module is a valid ASGI target
asgi.py ADDED
@@ -0,0 +1,9 @@
+ """Stable ASGI entrypoint for Hugging Face Docker Space."""
+
+ import os
+ import sys
+
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+ from server.app import app
+
client.py ADDED
@@ -0,0 +1,14 @@
+ from openenv.core.client import HTTPEnvClient
+ from models import EchoAction, EchoObservation
+
+
+ class EchoClient(HTTPEnvClient):
+     """HTTP client for the ECHO calibration environment."""
+
+     action_class = EchoAction
+     observation_class = EchoObservation
+
+     def step_with_response(self, response_text: str) -> EchoObservation:
+         """Helper: submit a raw response string as an action."""
+         action = EchoAction(response=response_text)
+         return self.step(action)
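A hypothetical usage sketch — the `HTTPEnvClient` constructor is not shown in this commit, so the `base_url` keyword below is an assumption about the openenv-core API rather than a documented signature:

```python
# Hypothetical: assumes HTTPEnvClient(base_url=...) targets the server from the Dockerfile CMD.
client = EchoClient(base_url="http://localhost:7860")
obs = client.step_with_response("<confidence>72</confidence><answer>Paris</answer>")
print(obs.reward, obs.ece, obs.is_correct)
```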
models.py ADDED
@@ -0,0 +1,44 @@
+ from dataclasses import dataclass, field
+ from typing import Optional, Dict, Any
+
+
+ @dataclass
+ class EchoAction:
+     """Action: model's response with embedded confidence and answer."""
+
+     response: str  # Full response text containing <confidence> and <answer> tags
+
+
+ @dataclass
+ class EchoObservation:
+     """Observation returned after each step."""
+
+     question: str
+     domain: str
+     difficulty: str
+     reward: float
+     accuracy: float
+     confidence: int
+     brier_score: float
+     ece: float
+     is_correct: bool
+     thinking: str = ""
+     feedback: str = ""
+     episode_step: int = 0
+     total_steps: int = 0
+
+
+ @dataclass
+ class EchoState:
+     """Full environment state."""
+
+     current_question: str = ""
+     domain: str = ""
+     difficulty: str = ""
+     phase: int = 1
+     step_count: int = 0
+     total_reward: float = 0.0
+     accuracy_history: list = field(default_factory=list)
+     confidence_history: list = field(default_factory=list)
+     ece_history: list = field(default_factory=list)
+     domain_stats: Dict[str, Any] = field(default_factory=dict)
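These dataclasses mirror what the server returns over plain HTTP, so the environment can also be exercised without the `openenv` client. A minimal sketch against a local server, using only the endpoints documented in the README's API table:

```python
import requests

BASE = "http://localhost:7860"  # port from the Dockerfile CMD

requests.post(f"{BASE}/reset", timeout=30).raise_for_status()  # start an episode
step = requests.post(
    f"{BASE}/step",
    json={"response": "<confidence>72</confidence><answer>Paris</answer>"},
    timeout=30,
).json()
# StepResponse fields: state, reward, terminated, truncated, info
print(step["reward"], step["terminated"], step["info"])
```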
openenv.yaml CHANGED
@@ -81,7 +81,7 @@ calibration_metrics:
    resolution: "How much predictions differ from base rate — informativeness"

  api:
-   base_url: "https://revti126-echo-ultimate.hf.space"
+   base_url: "https://vikaspandey582003-echo-ultimate.hf.space"
    endpoints:
      health: "GET /health"
      tasks: "GET /tasks"
@@ -96,7 +96,7 @@ api:

  training:
    algorithm: "GRPO (Group Relative Policy Optimization)"
-   model: "Qwen/Qwen2.5-3B-Instruct"
+   model: "unsloth/Qwen2.5-7B-Instruct"
    total_steps: 5800
    phases: 3
    framework: "HuggingFace TRL ≥ 0.9.0"
@@ -106,5 +106,5 @@ citation: |
    title = {ECHO ULTIMATE: Training LLMs to Know What They Don't Know},
    author = {Tripathi, Revtiraman and Pandey, Vikas Dev},
    year = {2025},
-   url = {https://huggingface.co/spaces/revti126/echo-ultimate}
+   url = {https://huggingface.co/spaces/Vikaspandey582003/echo-ultimate}
  }
pyproject.toml ADDED
@@ -0,0 +1,28 @@
+ [build-system]
+ requires = ["setuptools>=45", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "openenv-echo"
+ version = "0.1.0"
+ description = "ECHO: Epistemic Calibration via Hierarchical OpenEnv"
+ requires-python = ">=3.10"
+ dependencies = [
+     "fastapi",
+     "uvicorn",
+     "torch",
+     "transformers",
+     "datasets",
+     "gymnasium",
+     "trl>=0.8.0",
+     "peft",
+     "huggingface_hub",
+     "gradio>=4.0.0",
+     "plotly",
+     "pandas",
+     "numpy",
+ ]
+
+ [tool.setuptools.packages.find]
+ where = ["."]
+ include = ["env*", "server*", "core*", "training*", "ui*"]
requirements.txt CHANGED
@@ -1,13 +1,42 @@
- gradio>=4.20.0
  numpy>=1.26.0
  pandas>=2.1.0
  scipy>=1.11.0
  matplotlib>=3.8.0
  seaborn>=0.13.0
- scikit-learn>=1.4.0
- gymnasium>=1.0.0
- datasets>=2.18.0
  huggingface-hub>=0.21.0
- PyYAML>=6.0.0
  python-dotenv>=1.0.0
  rich>=13.0.0

+ # Core ML
+ torch>=2.1.0
+ transformers>=4.44.0
+ trl>=0.9.0
+ datasets>=2.18.0
+ accelerate>=0.28.0
+ peft>=0.10.0
+ bitsandbytes>=0.42.0
+
+ # Unsloth — 2-3x faster training, 70% less VRAM (install first on GPU machines)
+ unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
+
+ # Optional: GPT-4o-mini baseline comparison
+ openai>=1.0.0
+
+ # Environment
+ gymnasium>=1.0.0
  numpy>=1.26.0
  pandas>=2.1.0
  scipy>=1.11.0
+
+ # Server
+ fastapi>=0.111.0
+ uvicorn[standard]>=0.29.0
+ pydantic>=2.6.0
+ httpx>=0.27.0
+
+ # Demo
+ gradio>=4.20.0
+
+ # Visualization
  matplotlib>=3.8.0
  seaborn>=0.13.0
+
+ # Utilities
+ wandb>=0.16.0
  huggingface-hub>=0.21.0
+ scikit-learn>=1.4.0
  python-dotenv>=1.0.0
+ click>=8.1.0
  rich>=13.0.0
+ PyYAML>=6.0.0
server/app.py CHANGED
@@ -1,17 +1,23 @@
  """
  ECHO ULTIMATE — FastAPI OpenEnv-Compliant Server.
-
- All endpoints respond. Full Pydantic models. CORS enabled.
- Start: uvicorn server.app:app --host 0.0.0.0 --port 8000
  """

  import logging
- import time
  from contextlib import asynccontextmanager
  from typing import Any, Optional

  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel, Field

  from config import cfg
@@ -24,9 +30,9 @@ logger = logging.getLogger(__name__)

  # ── App state ─────────────────────────────────────────────────────────────────

- _task_bank: Optional[TaskBank] = None
- _env: Optional[EchoEnv] = None
- _history: Optional[RewardHistory] = None


  def _get_env() -> EchoEnv:
@@ -41,27 +47,30 @@ class ResetRequest(BaseModel):
      task_id: Optional[str] = Field(None, description="Specific task ID to load")
      adversarial: Optional[bool] = Field(False, description="Use adversarial questions")

  class StepRequest(BaseModel):
-     action: str = Field(
-         ...,
-         description="Agent response: <confidence>75</confidence><answer>Paris</answer>",
-         example="<confidence>75</confidence><answer>Paris</answer>",
-     )

- class HealthResponse(BaseModel):
-     status: str; environment: str; version: str; domains: int; tasks: int

  class TaskInfo(BaseModel):
-     id: str; name: str; description: str; pass_threshold: float; n_episodes: int

- class StepResponse(BaseModel):
-     state: dict; reward: float; terminated: bool; truncated: bool; info: dict

- class MetricsResponse(BaseModel):
-     ece: float; mce: float; brier_score: float; sharpness: float
-     resolution: float; accuracy: float; mean_confidence: float
-     overconfidence_rate: float; underconfidence_rate: float
-     abstention_rate: float; n_samples: int; domain: Optional[str]


  # ── Lifespan ──────────────────────────────────────────────────────────────────
@@ -73,10 +82,10 @@ async def lifespan(app: FastAPI):
      _task_bank = TaskBank()
      _task_bank.ensure_loaded()
      _history = RewardHistory()
-     _env = EchoEnv(task_bank=_task_bank, reward_history=_history, phase=3)
      _env.reset()
-     logger.info("ECHO ULTIMATE server ready ✅ (7 domains, 3 tasks)")
-     print("✅ ECHO ULTIMATE server ready — http://localhost:8000/docs")
      yield
      logger.info("ECHO ULTIMATE server shutting down.")

@@ -95,17 +104,26 @@ app = FastAPI(

  app.add_middleware(
      CORSMiddleware,
-     allow_origins=["*"], allow_credentials=True,
-     allow_methods=["*"], allow_headers=["*"],
  )


  # ── Endpoints ─────────────────────────────────────────────────────────────────

- @app.get("/health", response_model=HealthResponse, tags=["Health"])
  async def health():
-     return HealthResponse(status="ok", environment="ECHO-ULTIMATE",
-                           version="2.0.0", domains=7, tasks=3)


  @app.get("/tasks", response_model=list[TaskInfo], tags=["Tasks"])
@@ -119,8 +137,10 @@ async def list_tasks():
  async def reset(req: ResetRequest = ResetRequest()) -> dict:
      env = _get_env()
      opts = {}
-     if req.task_id: opts["task_id"] = req.task_id
-     if req.adversarial: opts["adversarial"] = True
      state, info = env.reset(options=opts if opts else None)
      return state

@@ -135,13 +155,21 @@ async def reset_task(task_id: str) -> dict:
  @app.post("/step", response_model=StepResponse, tags=["Environment"])
  async def step(req: StepRequest) -> StepResponse:
      env = _get_env()
      try:
-         state, reward, terminated, truncated, info = env.step(req.action)
      except Exception as exc:
          logger.error("step error: %s", exc)
          raise HTTPException(500, f"Step failed: {exc}")
-     return StepResponse(state=state, reward=round(reward, 4),
-                         terminated=terminated, truncated=truncated, info=info)


  @app.get("/state", tags=["Environment"])
@@ -149,18 +177,18 @@ async def get_state() -> dict:
      return _get_env()._build_obs()


- @app.get("/metrics", response_model=MetricsResponse, tags=["Metrics"])
  async def get_metrics():
      rep = _get_env().get_metrics()
-     return MetricsResponse(**rep.to_dict())


- @app.get("/metrics/{domain}", response_model=MetricsResponse, tags=["Metrics"])
  async def get_domain_metrics(domain: str):
      if domain not in cfg.DOMAINS:
          raise HTTPException(404, f"Unknown domain '{domain}'. Valid: {cfg.DOMAINS}")
      rep = _get_env().get_metrics(domain=domain)
-     return MetricsResponse(**rep.to_dict())


  @app.get("/fingerprint", tags=["Metrics"])
@@ -168,31 +196,59 @@ async def get_fingerprint() -> dict:
      env = _get_env()
      profiles = env.reward_history.get_domain_profiles()
      return {
-         "domain_scores": {d: round(1.0 - r.ece, 3) for d, r in profiles.items()},
-         "domain_ece": {d: round(r.ece, 3) for d, r in profiles.items()},
-         "domain_accuracy": {d: round(r.accuracy, 3) for d, r in profiles.items()},
-         "overall_ece": round(env.get_metrics().ece, 3),
      }


  @app.get("/history", tags=["Metrics"])
  async def get_history() -> dict:
      env = _get_env()
-     df = env.reward_history.to_dataframe()
      records = df.tail(100).to_dict(orient="records") if len(df) > 0 else []
      return {"episodes": records, "total": len(df)}


- @app.get("/", tags=["Health"])
- async def root() -> dict:
-     return {"message": "ECHO ULTIMATE RL Environment",
-             "docs": "/docs", "health": "/health",
-             "tasks": "/tasks", "metrics": "/metrics"}


- # ── Direct runner ─────────────────────────────────────────────────────────────

  if __name__ == "__main__":
      import uvicorn
      logging.basicConfig(level=logging.INFO)
-     uvicorn.run("server.app:app", host=cfg.API_HOST, port=cfg.API_PORT, reload=False)
 
 
  """
  ECHO ULTIMATE — FastAPI OpenEnv-Compliant Server.
+ Pure FastAPI: no openenv package dependency.
+ Mounts Gradio UI at /ui.
+ Runs on port 7860 (HuggingFace Space public port).
  """

  import logging
+ import os
+ import random
+ import sys
+
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
  from contextlib import asynccontextmanager
  from typing import Any, Optional

  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
  from pydantic import BaseModel, Field

  from config import cfg

  # ── App state ─────────────────────────────────────────────────────────────────

+ _task_bank: Optional[TaskBank] = None
+ _env: Optional[EchoEnv] = None
+ _history: Optional[RewardHistory] = None


  def _get_env() -> EchoEnv:

      task_id: Optional[str] = Field(None, description="Specific task ID to load")
      adversarial: Optional[bool] = Field(False, description="Use adversarial questions")

+
  class StepRequest(BaseModel):
+     action: Optional[str] = Field(None, description="Legacy: action string")
+     response: Optional[str] = Field(None, description="Agent response with confidence and answer tags")
+
+     def get_response(self) -> str:
+         """Accept either 'response' or 'action' field."""
+         return self.response or self.action or ""

  class TaskInfo(BaseModel):
+     id: str
+     name: str
+     description: str
+     pass_threshold: float
+     n_episodes: int

+ class StepResponse(BaseModel):
+     state: dict
+     reward: float
+     terminated: bool
+     truncated: bool
+     info: dict


  # ── Lifespan ──────────────────────────────────────────────────────────────────

      _task_bank = TaskBank()
      _task_bank.ensure_loaded()
      _history = RewardHistory()
+     _env = EchoEnv(task_bank=_task_bank, reward_history=_history, phase=3)
      _env.reset()
+     logger.info("ECHO ULTIMATE ready ✅ (7 domains, 3 tasks)")
+     print("✅ ECHO ULTIMATE server ready — http://0.0.0.0:7860/docs")
      yield
      logger.info("ECHO ULTIMATE server shutting down.")

  app.add_middleware(
      CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
  )


  # ── Endpoints ─────────────────────────────────────────────────────────────────

+ @app.get("/health", tags=["Health"])
  async def health():
+     return {"status": "ok", "environment": "ECHO-ULTIMATE", "version": "2.0.0",
+             "domains": 7, "tasks": 3}
+
+
+ @app.get("/", tags=["Health"])
+ async def root():
+     return {"message": "ECHO ULTIMATE RL Environment",
+             "docs": "/docs", "health": "/health",
+             "tasks": "/tasks", "metrics": "/metrics", "ui": "/ui"}


  @app.get("/tasks", response_model=list[TaskInfo], tags=["Tasks"])

  async def reset(req: ResetRequest = ResetRequest()) -> dict:
      env = _get_env()
      opts = {}
+     if req.task_id:
+         opts["task_id"] = req.task_id
+     if req.adversarial:
+         opts["adversarial"] = True
      state, info = env.reset(options=opts if opts else None)
      return state

  @app.post("/step", response_model=StepResponse, tags=["Environment"])
  async def step(req: StepRequest) -> StepResponse:
      env = _get_env()
+     response_text = req.get_response()
+     if not response_text:
+         raise HTTPException(422, "Provide either 'response' or 'action' field.")
      try:
+         state, reward, terminated, truncated, info = env.step(response_text)
      except Exception as exc:
          logger.error("step error: %s", exc)
          raise HTTPException(500, f"Step failed: {exc}")
+     return StepResponse(
+         state=state,
+         reward=round(float(reward), 4),
+         terminated=terminated,
+         truncated=truncated,
+         info=info,
+     )


  @app.get("/state", tags=["Environment"])
      return _get_env()._build_obs()


+ @app.get("/metrics", tags=["Metrics"])
  async def get_metrics():
      rep = _get_env().get_metrics()
+     return rep.to_dict()


+ @app.get("/metrics/{domain}", tags=["Metrics"])
  async def get_domain_metrics(domain: str):
      if domain not in cfg.DOMAINS:
          raise HTTPException(404, f"Unknown domain '{domain}'. Valid: {cfg.DOMAINS}")
      rep = _get_env().get_metrics(domain=domain)
+     return rep.to_dict()


  @app.get("/fingerprint", tags=["Metrics"])
      env = _get_env()
      profiles = env.reward_history.get_domain_profiles()
      return {
+         "domain_scores": {d: round(1.0 - r.ece, 3) for d, r in profiles.items()},
+         "domain_ece": {d: round(r.ece, 3) for d, r in profiles.items()},
+         "domain_accuracy": {d: round(r.accuracy, 3) for d, r in profiles.items()},
+         "overall_ece": round(env.get_metrics().ece, 3),
      }


  @app.get("/history", tags=["Metrics"])
  async def get_history() -> dict:
      env = _get_env()
+     df = env.reward_history.to_dataframe()
      records = df.tail(100).to_dict(orient="records") if len(df) > 0 else []
      return {"episodes": records, "total": len(df)}


+ @app.post("/advance_phase", tags=["Environment"])
+ async def advance_phase():
+     env = _get_env()
+     env.phase = min(getattr(env, "phase", 1) + 1, 4)
+     return {"phase": env.phase, "message": f"Advanced to Phase {env.phase}"}
+

+ # ── Mount Gradio UI at /ui ────────────────────────────────────────────────────

+ try:
+     import gradio as gr
+     import importlib.util
+
+     _ui_path = os.path.join(
+         os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ui", "app.py"
+     )
+     spec = importlib.util.spec_from_file_location("gradio_app", _ui_path)
+     gradio_module = importlib.util.module_from_spec(spec)
+     if spec and spec.loader:
+         spec.loader.exec_module(gradio_module)
+         if hasattr(gradio_module, "demo"):
+             _gradio_demo = gradio_module.demo
+         elif hasattr(gradio_module, "build_app"):
+             _gradio_demo, _ = gradio_module.build_app()
+         else:
+             raise AttributeError("ui/app.py has neither 'demo' nor 'build_app'")
+         app = gr.mount_gradio_app(app, _gradio_demo, path="/ui")
+         print("✅ Gradio UI mounted at /ui")
+     else:
+         print("⚠️ Could not load ui/app.py spec")
+ except Exception as _e:
+     print(f"⚠️ Gradio UI not mounted: {_e}")


+ # ── Direct runner ──────────────────────────────────────────────────────────────

  if __name__ == "__main__":
      import uvicorn
      logging.basicConfig(level=logging.INFO)
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run(app, host="0.0.0.0", port=port)
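The new `StepRequest.get_response()` shim means both the legacy `action` payload from the README's curl examples and the `response` payload used by the training notebook reach the same code path. A quick equivalence check, with a fresh `/reset` before each step since episodes terminate:

```python
import requests

BASE = "http://localhost:7860"
for payload in ({"action": "<confidence>72</confidence><answer>Paris</answer>"},
                {"response": "<confidence>72</confidence><answer>Paris</answer>"}):
    requests.post(f"{BASE}/reset", timeout=30)            # new episode (questions vary)
    r = requests.post(f"{BASE}/step", json=payload, timeout=30)
    print(r.status_code, r.json()["terminated"])          # both accepted: 200 True
```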