Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
app.py
CHANGED
|
@@ -2,20 +2,14 @@
|
|
| 2 |
import sys, os
|
| 3 |
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 4 |
|
| 5 |
-
import
|
| 6 |
-
from ui.app import build_app
|
| 7 |
-
|
| 8 |
-
demo = build_app()
|
| 9 |
-
from ui.app import _CSS
|
| 10 |
|
|
|
|
| 11 |
demo.queue()
|
| 12 |
demo.launch(
|
| 13 |
server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
|
| 14 |
server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
|
| 15 |
css=_CSS,
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
neutral_hue=gr.themes.colors.slate,
|
| 19 |
-
font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
|
| 20 |
-
),
|
| 21 |
)
|
|
|
|
| 2 |
import sys, os
|
| 3 |
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 4 |
|
| 5 |
+
from ui.app import build_app, _CSS, _JS
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
demo, theme = build_app()
|
| 8 |
demo.queue()
|
| 9 |
demo.launch(
|
| 10 |
server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
|
| 11 |
server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
|
| 12 |
css=_CSS,
|
| 13 |
+
js=_JS,
|
| 14 |
+
theme=theme,
|
|
|
|
|
|
|
|
|
|
| 15 |
)
|
ui/app.py
CHANGED
|
@@ -1,13 +1,4 @@
|
|
| 1 |
-
"""
|
| 2 |
-
ECHO ULTIMATE β Premium Gradio UI.
|
| 3 |
-
|
| 4 |
-
Tab 1: π― Live Challenge
|
| 5 |
-
Tab 2: β ECHO vs Overconfident AI
|
| 6 |
-
Tab 3: 𧬠Epistemic Fingerprint
|
| 7 |
-
Tab 4: π Training Evidence
|
| 8 |
-
Tab 5: π Official Evaluation
|
| 9 |
-
Tab 6: β‘ Live Training
|
| 10 |
-
"""
|
| 11 |
|
| 12 |
import json
|
| 13 |
import logging
|
|
@@ -26,482 +17,310 @@ from config import cfg
|
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 29 |
-
#
|
| 30 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
/* ββ Base ββ */
|
| 55 |
-
*, *::before, *::after { box-sizing: border-box; }
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
|
|
|
| 64 |
footer { display: none !important; }
|
|
|
|
| 65 |
|
| 66 |
-
/* ββ
|
| 67 |
-
.tab-nav {
|
| 68 |
-
background: var(--surface) !important;
|
| 69 |
-
border-bottom: 1px solid var(--border) !important;
|
| 70 |
-
padding: 0 8px !important;
|
| 71 |
-
border-radius: 0 !important;
|
| 72 |
-
gap: 4px !important;
|
| 73 |
-
}
|
| 74 |
.tab-nav button {
|
| 75 |
-
color:
|
| 76 |
-
font-size: 13px !important;
|
| 77 |
-
|
| 78 |
-
padding: 12px 20px !important;
|
| 79 |
-
border-radius: 0 !important;
|
| 80 |
-
border-bottom: 2px solid transparent !important;
|
| 81 |
-
transition: all 0.2s !important;
|
| 82 |
-
background: transparent !important;
|
| 83 |
-
letter-spacing: 0.02em !important;
|
| 84 |
-
}
|
| 85 |
-
.tab-nav button:hover {
|
| 86 |
-
color: var(--text) !important;
|
| 87 |
-
background: rgba(255,255,255,0.04) !important;
|
| 88 |
}
|
|
|
|
| 89 |
.tab-nav button.selected {
|
| 90 |
-
color:
|
| 91 |
-
border-bottom: 2px solid
|
| 92 |
-
background: rgba(0,255,163,
|
| 93 |
-
text-shadow: 0 0 12px rgba(0,255,163,0.5) !important;
|
| 94 |
-
}
|
| 95 |
-
|
| 96 |
-
/* ββ Blocks / panels ββ */
|
| 97 |
-
.block, .panel, .form {
|
| 98 |
-
background: var(--card) !important;
|
| 99 |
-
border: 1px solid var(--border) !important;
|
| 100 |
-
border-radius: 12px !important;
|
| 101 |
-
}
|
| 102 |
-
|
| 103 |
-
/* ββ Markdown text ββ */
|
| 104 |
-
.prose, .markdown, .prose p, .prose li, .prose td, .prose th {
|
| 105 |
-
color: var(--text) !important;
|
| 106 |
-
}
|
| 107 |
-
.prose h1, .prose h2, .prose h3, .prose h4 {
|
| 108 |
-
color: #fff !important;
|
| 109 |
-
letter-spacing: -0.02em !important;
|
| 110 |
-
}
|
| 111 |
-
.prose code {
|
| 112 |
-
background: rgba(68,136,255,0.12) !important;
|
| 113 |
-
color: var(--blue) !important;
|
| 114 |
-
border-radius: 4px !important;
|
| 115 |
-
padding: 1px 6px !important;
|
| 116 |
-
font-family: 'JetBrains Mono', monospace !important;
|
| 117 |
-
font-size: 0.88em !important;
|
| 118 |
-
}
|
| 119 |
-
.prose table { border-collapse: collapse !important; width: 100% !important; }
|
| 120 |
-
.prose thead tr { background: rgba(68,136,255,0.1) !important; }
|
| 121 |
-
.prose th {
|
| 122 |
-
color: var(--blue) !important;
|
| 123 |
-
font-weight: 600 !important;
|
| 124 |
-
text-transform: uppercase !important;
|
| 125 |
-
font-size: 11px !important;
|
| 126 |
-
letter-spacing: 0.08em !important;
|
| 127 |
-
padding: 10px 14px !important;
|
| 128 |
-
border-bottom: 1px solid var(--border) !important;
|
| 129 |
-
}
|
| 130 |
-
.prose td {
|
| 131 |
-
padding: 9px 14px !important;
|
| 132 |
-
border-bottom: 1px solid rgba(80,100,255,0.08) !important;
|
| 133 |
-
font-size: 14px !important;
|
| 134 |
-
}
|
| 135 |
-
.prose tr:last-child td { border-bottom: none !important; }
|
| 136 |
-
.prose blockquote {
|
| 137 |
-
border-left: 3px solid var(--green) !important;
|
| 138 |
-
background: rgba(0,255,163,0.05) !important;
|
| 139 |
-
padding: 10px 16px !important;
|
| 140 |
-
border-radius: 0 8px 8px 0 !important;
|
| 141 |
-
margin: 12px 0 !important;
|
| 142 |
}
|
| 143 |
|
| 144 |
-
/* ββ
|
| 145 |
-
button.lg,
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
color: #fff !important;
|
| 149 |
-
font-weight: 600 !important;
|
| 150 |
-
font-size: 14px !important;
|
| 151 |
-
border-radius: 8px !important;
|
| 152 |
-
letter-spacing: 0.01em !important;
|
| 153 |
-
box-shadow: 0 4px 20px rgba(68,136,255,0.3) !important;
|
| 154 |
-
transition: all 0.2s ease !important;
|
| 155 |
-
}
|
| 156 |
-
button.lg:hover, button.primary:hover {
|
| 157 |
-
transform: translateY(-2px) !important;
|
| 158 |
-
box-shadow: 0 8px 30px rgba(68,136,255,0.5) !important;
|
| 159 |
-
}
|
| 160 |
-
button.secondary {
|
| 161 |
-
background: rgba(255,255,255,0.05) !important;
|
| 162 |
-
border: 1px solid var(--border) !important;
|
| 163 |
-
color: var(--text) !important;
|
| 164 |
-
border-radius: 8px !important;
|
| 165 |
-
transition: all 0.2s !important;
|
| 166 |
}
|
| 167 |
-
button.
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
}
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
font-size:
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
input:focus, textarea:focus {
|
| 193 |
-
border-color: var(--blue) !important;
|
| 194 |
-
box-shadow: 0 0 0 3px rgba(68,136,255,0.15) !important;
|
| 195 |
-
outline: none !important;
|
| 196 |
}
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
font-size: 11px !important;
|
| 202 |
-
font-weight: 600 !important;
|
| 203 |
-
text-transform: uppercase !important;
|
| 204 |
-
letter-spacing: 0.08em !important;
|
| 205 |
-
}
|
| 206 |
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
-
|
| 212 |
-
.
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
}
|
| 217 |
-
.dropdown .item { color: var(--text) !important; }
|
| 218 |
-
.dropdown .item:hover { background: rgba(68,136,255,0.12) !important; }
|
| 219 |
-
|
| 220 |
-
/* ββ Code output ββ */
|
| 221 |
-
.code-wrap, pre, code {
|
| 222 |
-
background: var(--surface) !important;
|
| 223 |
-
color: var(--green) !important;
|
| 224 |
-
font-family: 'JetBrains Mono', monospace !important;
|
| 225 |
-
border: 1px solid var(--border) !important;
|
| 226 |
-
border-radius: 8px !important;
|
| 227 |
-
font-size: 12px !important;
|
| 228 |
-
}
|
| 229 |
|
| 230 |
-
|
| 231 |
-
img, .image-container img {
|
| 232 |
-
border-radius: 10px !important;
|
| 233 |
-
border: 1px solid var(--border) !important;
|
| 234 |
}
|
|
|
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
border: 1px solid var(--border) !important;
|
| 240 |
-
border-radius: 10px !important;
|
| 241 |
-
}
|
| 242 |
-
.accordion .label { color: var(--text) !important; font-weight: 500 !important; }
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
background: var(--surface) !important;
|
| 247 |
-
border: 1px solid var(--border) !important;
|
| 248 |
-
border-radius: 8px !important;
|
| 249 |
-
}
|
| 250 |
-
.textbox textarea { background: transparent !important; color: var(--text) !important; }
|
| 251 |
-
|
| 252 |
-
/* ββ Custom hero HTML ββ */
|
| 253 |
-
#echo-hero-html {
|
| 254 |
-
background: linear-gradient(135deg, #050515 0%, #080825 50%, #050515 100%) !important;
|
| 255 |
-
border: 1px solid rgba(68,136,255,0.25) !important;
|
| 256 |
-
border-radius: 16px !important;
|
| 257 |
-
overflow: hidden !important;
|
| 258 |
-
}
|
| 259 |
-
#echo-hero-html .block { background: transparent !important; border: none !important; }
|
| 260 |
|
| 261 |
-
|
| 262 |
-
|
| 263 |
|
| 264 |
-
|
| 265 |
-
::-
|
| 266 |
-
|
| 267 |
-
::-
|
| 268 |
-
::-webkit-scrollbar-thumb:hover { background: rgba(80,100,255,0.4); }
|
| 269 |
-
"""
|
| 270 |
|
| 271 |
-
|
| 272 |
-
# HTML helpers
|
| 273 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 274 |
|
| 275 |
-
_HERO_HTML = """
|
| 276 |
-
<div style="
|
| 277 |
-
background: linear-gradient(135deg, #04040e 0%, #080825 40%, #0a0520 100%);
|
| 278 |
-
padding: 40px 40px 32px;
|
| 279 |
-
position: relative;
|
| 280 |
-
overflow: hidden;
|
| 281 |
-
">
|
| 282 |
-
<!-- Grid overlay -->
|
| 283 |
-
<div style="
|
| 284 |
-
position: absolute; inset: 0;
|
| 285 |
-
background-image: linear-gradient(rgba(68,136,255,0.04) 1px, transparent 1px),
|
| 286 |
-
linear-gradient(90deg, rgba(68,136,255,0.04) 1px, transparent 1px);
|
| 287 |
-
background-size: 40px 40px;
|
| 288 |
-
pointer-events: none;
|
| 289 |
-
"></div>
|
| 290 |
-
|
| 291 |
-
<!-- Glow orbs -->
|
| 292 |
-
<div style="
|
| 293 |
-
position: absolute; top: -60px; right: -60px;
|
| 294 |
-
width: 300px; height: 300px;
|
| 295 |
-
background: radial-gradient(circle, rgba(68,136,255,0.12) 0%, transparent 70%);
|
| 296 |
-
pointer-events: none;
|
| 297 |
-
"></div>
|
| 298 |
-
<div style="
|
| 299 |
-
position: absolute; bottom: -80px; left: 100px;
|
| 300 |
-
width: 250px; height: 250px;
|
| 301 |
-
background: radial-gradient(circle, rgba(0,255,163,0.08) 0%, transparent 70%);
|
| 302 |
-
pointer-events: none;
|
| 303 |
-
"></div>
|
| 304 |
-
|
| 305 |
-
<div style="position: relative; z-index: 1;">
|
| 306 |
<!-- Badge -->
|
| 307 |
-
<div style="display:inline-flex;
|
| 308 |
-
background:
|
| 309 |
-
|
| 310 |
-
<span style="width:7px;height:7px;border-radius:50%;background:#00ffa3;
|
| 311 |
-
box-shadow:0 0 8px #00ffa3; display:inline-block;"></span>
|
| 312 |
-
<span style="color:#00ffa3; font-size:12px; font-weight:600; letter-spacing:0.1em;
|
| 313 |
-
font-family:'Inter',sans-serif;">OPENENV HACKATHON 2025</span>
|
| 314 |
</div>
|
| 315 |
|
| 316 |
<!-- Title -->
|
| 317 |
-
<h1 style="
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
max-width: 600px;
|
| 337 |
-
">Training LLMs to accurately predict their own confidence via GRPO</p>
|
| 338 |
-
|
| 339 |
-
<!-- Quote -->
|
| 340 |
-
<div style="
|
| 341 |
-
background: rgba(68,136,255,0.08);
|
| 342 |
-
border-left: 3px solid #4488ff;
|
| 343 |
-
border-radius: 0 8px 8px 0;
|
| 344 |
-
padding: 10px 16px;
|
| 345 |
-
margin-bottom: 32px;
|
| 346 |
-
max-width: 620px;
|
| 347 |
-
">
|
| 348 |
-
<p style="
|
| 349 |
-
margin: 0;
|
| 350 |
-
font-size: 14px;
|
| 351 |
-
color: #8899cc;
|
| 352 |
-
font-style: italic;
|
| 353 |
-
font-family: 'Inter', sans-serif;
|
| 354 |
-
">The most dangerous AI isn't one that's wrong β it's one that's wrong <strong style="color:#a0c0ff;">and certain.</strong></p>
|
| 355 |
-
</div>
|
| 356 |
-
|
| 357 |
-
<!-- Metric cards row -->
|
| 358 |
-
<div style="display:flex; gap:12px; flex-wrap:wrap;">
|
| 359 |
-
<div style="
|
| 360 |
-
background: linear-gradient(135deg, rgba(0,255,163,0.08), rgba(0,255,163,0.04));
|
| 361 |
-
border: 1px solid rgba(0,255,163,0.25);
|
| 362 |
-
border-radius: 12px; padding: 16px 22px; min-width: 130px;
|
| 363 |
-
">
|
| 364 |
-
<div style="font-size:28px;font-weight:800;color:#00ffa3;
|
| 365 |
-
font-family:'Inter',sans-serif;line-height:1;">0.080</div>
|
| 366 |
-
<div style="font-size:11px;color:#3d5a44;font-weight:600;
|
| 367 |
-
letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
|
| 368 |
-
font-family:'Inter',sans-serif;">Final ECE</div>
|
| 369 |
</div>
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
<div style="font-size:
|
| 376 |
-
font-family:'Inter',sans-serif;line-height:1;">76%</div>
|
| 377 |
-
<div style="font-size:11px;color:#3d4a6a;font-weight:600;
|
| 378 |
-
letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
|
| 379 |
-
font-family:'Inter',sans-serif;">ECE Reduction</div>
|
| 380 |
</div>
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
<div style="font-size:
|
| 387 |
-
font-family:'Inter',sans-serif;line-height:1;">7</div>
|
| 388 |
-
<div style="font-size:11px;color:#4a3a6a;font-weight:600;
|
| 389 |
-
letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
|
| 390 |
-
font-family:'Inter',sans-serif;">Domains</div>
|
| 391 |
</div>
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
<div style="font-size:
|
| 398 |
-
font-family:'Inter',sans-serif;line-height:1;">3,500</div>
|
| 399 |
-
<div style="font-size:11px;color:#5a5020;font-weight:600;
|
| 400 |
-
letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
|
| 401 |
-
font-family:'Inter',sans-serif;">GRPO Steps</div>
|
| 402 |
</div>
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
<div style="font-size:
|
| 409 |
-
font-family:'Inter',sans-serif;line-height:1;">5</div>
|
| 410 |
-
<div style="font-size:11px;color:#5a2030;font-weight:600;
|
| 411 |
-
letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
|
| 412 |
-
font-family:'Inter',sans-serif;">Metrics</div>
|
| 413 |
</div>
|
|
|
|
| 414 |
</div>
|
| 415 |
</div>
|
| 416 |
</div>
|
|
|
|
|
|
|
|
|
|
| 417 |
"""
|
| 418 |
|
| 419 |
|
| 420 |
-
def
|
| 421 |
return f"""
|
| 422 |
-
<div style="
|
| 423 |
-
background:
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
border-radius: 0 10px 10px 0;
|
| 427 |
-
padding: 14px 20px;
|
| 428 |
-
margin-bottom: 4px;
|
| 429 |
-
">
|
| 430 |
-
<div style="font-size:16px; font-weight:700; color:#fff;
|
| 431 |
-
font-family:'Inter',sans-serif; letter-spacing:-0.01em;">{title}</div>
|
| 432 |
-
{"" if not subtitle else f'<div style="font-size:13px; color:#4a5a8a; margin-top:3px; font-family:Inter,sans-serif;">{subtitle}</div>'}
|
| 433 |
</div>"""
|
| 434 |
|
| 435 |
|
| 436 |
-
def
|
| 437 |
-
return f
|
| 438 |
-
|
| 439 |
-
background: rgba(255,255,255,0.04); border: 1px solid rgba(80,100,255,0.2);
|
| 440 |
-
border-radius: 999px; padding: 4px 12px; margin: 3px;
|
| 441 |
-
font-family:'Inter',sans-serif; font-size:13px; color:#8899bb;
|
| 442 |
-
"><span style="color:{color}; font-weight:700;">{value}</span> {label}</span>"""
|
| 443 |
|
| 444 |
|
| 445 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 446 |
-
# Tab 6
|
| 447 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββββββββββββββββββ
|
| 448 |
|
| 449 |
_training_state: dict = {"running": False, "steps": [], "ece_values": [], "stop": False}
|
| 450 |
|
| 451 |
|
| 452 |
-
def
|
| 453 |
fig, ax = plt.subplots(figsize=(10, 4.5), facecolor="#04040e")
|
| 454 |
-
ax.set_facecolor("#
|
| 455 |
-
|
| 456 |
if steps:
|
| 457 |
-
xs = np.array(steps)
|
| 458 |
-
ax.fill_between(xs, ys, alpha=
|
| 459 |
-
ax.plot(xs, ys, color="#00ffa3",
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
ax.
|
| 471 |
-
|
| 472 |
-
ax.
|
| 473 |
-
label="Task 2 target ECE < 0.20", zorder=3)
|
| 474 |
-
|
| 475 |
-
ax.set_xlabel("Training Step", color="#4a5a8a", fontsize=11, labelpad=8)
|
| 476 |
-
ax.set_ylabel("ECE (β lower = better)", color="#4a5a8a", fontsize=11, labelpad=8)
|
| 477 |
-
ax.set_title("GRPO Calibration Training β Real-Time ECE",
|
| 478 |
-
color="#c0d0ff", fontsize=13, fontweight="bold", pad=14)
|
| 479 |
-
ax.tick_params(colors="#3a4a6a", labelsize=10)
|
| 480 |
-
ax.set_ylim(0, 0.50)
|
| 481 |
-
ax.set_xlim(-2, 105)
|
| 482 |
-
|
| 483 |
-
for spine in ax.spines.values():
|
| 484 |
-
spine.set_color("#1a1a3a")
|
| 485 |
-
|
| 486 |
-
ax.grid(True, linestyle="--", alpha=0.15, color="#2a2a4a")
|
| 487 |
-
ax.legend(facecolor="#080820", labelcolor="#8899bb",
|
| 488 |
-
edgecolor="#1a1a3a", fontsize=10, loc="upper right")
|
| 489 |
plt.tight_layout()
|
| 490 |
-
|
| 491 |
tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
|
| 492 |
-
plt.savefig(tmp.name, dpi=
|
| 493 |
plt.close(fig)
|
| 494 |
return tmp.name
|
| 495 |
|
| 496 |
|
| 497 |
-
def
|
| 498 |
import random
|
| 499 |
_training_state.update({"running": True, "steps": [], "ece_values": [], "stop": False})
|
| 500 |
ece = 0.42
|
| 501 |
for step in range(0, 101, 10):
|
| 502 |
-
if _training_state["stop"]:
|
| 503 |
-
|
| 504 |
-
ece = max(0.07, ece - random.uniform(0.02, 0.05) + random.uniform(-0.008, 0.008))
|
| 505 |
_training_state["steps"].append(step)
|
| 506 |
_training_state["ece_values"].append(round(ece, 4))
|
| 507 |
time.sleep(1.5)
|
|
@@ -509,28 +328,22 @@ def _run_live_training_thread():
|
|
| 509 |
|
| 510 |
|
| 511 |
def start_live_training():
|
| 512 |
-
|
| 513 |
-
t.start()
|
| 514 |
for _ in range(60):
|
| 515 |
time.sleep(1.5)
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
n = len(steps)
|
| 519 |
prog = round((n / 11) * 100)
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
status = f"Step {steps[-1]:>3}/100 β ECE {ece_v[-1]:.4f} β β{pct_drop:.1f}% from start"
|
| 524 |
else:
|
| 525 |
status = "Initializing GRPO trainerβ¦"
|
| 526 |
-
|
| 527 |
if not _training_state["running"] and n > 0:
|
| 528 |
-
status =
|
| 529 |
-
|
| 530 |
-
f"(β{(ece_v[0]-ece_v[-1])/ece_v[0]*100:.1f}%)")
|
| 531 |
-
yield status, _make_live_plot(steps, ece_v), prog
|
| 532 |
return
|
| 533 |
-
yield status,
|
| 534 |
|
| 535 |
|
| 536 |
def stop_live_training():
|
|
@@ -539,18 +352,15 @@ def stop_live_training():
|
|
| 539 |
|
| 540 |
|
| 541 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 542 |
-
# Shared state
|
| 543 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 544 |
|
| 545 |
-
_task_bank = None
|
| 546 |
-
_env = None
|
| 547 |
-
_live_hist = None
|
| 548 |
|
| 549 |
|
| 550 |
def _init():
|
| 551 |
global _task_bank, _env, _live_hist
|
| 552 |
-
if _env is not None:
|
| 553 |
-
return
|
| 554 |
from env.task_bank import TaskBank
|
| 555 |
from env.echo_env import EchoEnv
|
| 556 |
from env.reward import RewardHistory
|
|
@@ -563,402 +373,352 @@ def _init():
|
|
| 563 |
_current_task: dict = {}
|
| 564 |
|
| 565 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 566 |
-
# Tab 1
|
| 567 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 568 |
|
| 569 |
-
def get_question(domain
|
| 570 |
global _current_task
|
| 571 |
_init()
|
| 572 |
task = _task_bank.get_task(domain.lower(), difficulty.lower())
|
| 573 |
_current_task = task
|
| 574 |
-
q = (
|
| 575 |
-
f"**Domain:** `{domain}` Β· **Difficulty:** `{difficulty}`\n\n"
|
| 576 |
-
f"---\n\n{task['question']}"
|
| 577 |
-
)
|
| 578 |
return q, ""
|
| 579 |
|
| 580 |
|
| 581 |
-
def submit_answer(confidence
|
| 582 |
if not _current_task:
|
| 583 |
-
return "β οΈ Get a question first
|
| 584 |
from env.reward import compute_reward
|
| 585 |
task = _current_task
|
| 586 |
-
rb
|
| 587 |
-
|
| 588 |
-
_live_hist.append(confidence, rb.was_correct, task["domain"],
|
| 589 |
-
task["difficulty"], rb.total)
|
| 590 |
snap = _live_hist.get_training_snapshot()
|
| 591 |
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
|
| 621 |
-
n_ep = snap.get(
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
f"<span style='color:#6677aa;font-size:13px;'>Overconf rate</span>"
|
| 643 |
-
f"<span style='color:#ff8c00;font-weight:600;'>{snap['overconfidence_rate']:.1%}</span></div>"
|
| 644 |
-
f"</div></div>"
|
| 645 |
-
)
|
| 646 |
|
| 647 |
-
if rb.overconfidence_penalty_val < -
|
| 648 |
-
tip =
|
| 649 |
-
"This is exactly what ECHO trains against.")
|
| 650 |
elif rb.was_correct and confidence >= 65:
|
| 651 |
-
tip = "π― **Well calibrated** β confident and correct.
|
| 652 |
elif not rb.was_correct and confidence < 40:
|
| 653 |
-
tip = "π― **Good self-awareness** β
|
| 654 |
-
elif rb.underconfidence_penalty_val < -
|
| 655 |
-
tip = "π€ **Underconfident** β you
|
| 656 |
else:
|
| 657 |
tip = ""
|
| 658 |
-
|
| 659 |
-
return result_md, stats_md, tip
|
| 660 |
|
| 661 |
|
| 662 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 663 |
-
# Tab 2
|
| 664 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 665 |
|
| 666 |
-
def run_comparison(scenario
|
| 667 |
_init()
|
| 668 |
from core.baseline import AlwaysHighAgent, HeuristicAgent
|
| 669 |
from env.reward import compute_reward, RewardHistory
|
| 670 |
from env.parser import format_prompt, parse_response
|
| 671 |
-
from core.metrics import compute_report
|
| 672 |
|
| 673 |
-
domain_map = {
|
| 674 |
-
|
| 675 |
-
"Science": "science", "Medical": "medical", "Coding": "coding",
|
| 676 |
-
"Creative": "creative", "Mixed": None,
|
| 677 |
-
}
|
| 678 |
domain = domain_map.get(scenario)
|
| 679 |
-
n = 10
|
| 680 |
-
|
| 681 |
-
baseline = AlwaysHighAgent()
|
| 682 |
-
echo_agent = HeuristicAgent()
|
| 683 |
echo_h, base_h = RewardHistory(), RewardHistory()
|
| 684 |
-
rows_html = ""
|
| 685 |
|
| 686 |
-
for i in range(
|
| 687 |
-
d
|
| 688 |
task = _task_bank.get_task(d, "medium")
|
| 689 |
prompt = format_prompt(task["question"], d, "medium")
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
task.get("answer_aliases", []), d)
|
| 695 |
-
br = compute_reward(bp.confidence, bp.answer, task["answer"],
|
| 696 |
-
task.get("answer_aliases", []), d)
|
| 697 |
-
|
| 698 |
echo_h.append(ep.confidence, er.was_correct, d, "medium", er.total)
|
| 699 |
base_h.append(bp.confidence, br.was_correct, d, "medium", br.total)
|
| 700 |
|
| 701 |
-
ei = "β
" if er.was_correct else "β"
|
| 702 |
-
bi = "β
" if br.was_correct else "β"
|
| 703 |
ec = "#00ffa3" if er.was_correct else "#ff4466"
|
| 704 |
bc = "#ff4466" if not br.was_correct else "#00ffa3"
|
|
|
|
|
|
|
| 705 |
|
| 706 |
-
rows_html +=
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
|
|
|
| 732 |
|
|
|
|
| 733 |
em = echo_h.get_training_snapshot()
|
| 734 |
bm = base_h.get_training_snapshot()
|
| 735 |
-
delta_ece = abs(em['ece'] - bm['ece'])
|
| 736 |
-
|
| 737 |
-
summary_html = (
|
| 738 |
-
f"<div style='background:rgba(255,255,255,0.02);border:1px solid rgba(80,100,255,0.15);"
|
| 739 |
-
f"border-radius:10px;padding:20px;margin-top:4px;'>"
|
| 740 |
-
f"<div style='font-size:12px;font-weight:700;color:#4a5a8a;"
|
| 741 |
-
f"text-transform:uppercase;letter-spacing:0.08em;margin-bottom:16px;'>Results Summary</div>"
|
| 742 |
-
f"<div style='display:grid;grid-template-columns:repeat(4,1fr);gap:10px;margin-bottom:16px;'>"
|
| 743 |
-
+ _metric_card("ECE", f"{em['ece']:.3f}", f"{bm['ece']:.3f}", "#00ffa3", "#ff4466", "lower = better")
|
| 744 |
-
+ _metric_card("Accuracy", f"{em['accuracy']:.1%}", f"{bm['accuracy']:.1%}", "#00ffa3", "#ff4466", "")
|
| 745 |
-
+ _metric_card("Mean Conf", f"{em['mean_confidence']:.0f}%", f"{bm['mean_confidence']:.0f}%", "#4488ff", "#ff8c00", "")
|
| 746 |
-
+ _metric_card("Overconf Rate", f"{em['overconfidence_rate']:.1%}", f"{bm['overconfidence_rate']:.1%}", "#00ffa3", "#ff4466", "")
|
| 747 |
-
+ f"</div>"
|
| 748 |
-
f"<div style='background:linear-gradient(135deg,rgba(0,255,163,0.08),rgba(68,136,255,0.05));"
|
| 749 |
-
f"border:1px solid rgba(0,255,163,0.2);border-radius:8px;padding:12px 16px;text-align:center;'>"
|
| 750 |
-
f"<span style='color:#00ffa3;font-size:18px;font-weight:800;'>"
|
| 751 |
-
f"ECHO is {delta_ece:.0%} better calibrated</span>"
|
| 752 |
-
f"<span style='color:#4a5a8a;font-size:13px;'> than the overconfident baseline</span>"
|
| 753 |
-
f"</div></div>"
|
| 754 |
-
)
|
| 755 |
|
| 756 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 757 |
erep = echo_h.get_calibration_report()
|
| 758 |
brep = base_h.get_calibration_report()
|
| 759 |
fig, ax = plt.subplots(figsize=(7, 4.5), facecolor="#04040e")
|
| 760 |
-
ax.set_facecolor("#
|
| 761 |
-
ax.plot([0,100],[0,100],"--",color="#
|
| 762 |
for rep, col, lbl in [(erep,"#00ffa3","ECHO"),(brep,"#ff4466","Overconfident AI")]:
|
| 763 |
-
bd = rep.bin_data
|
| 764 |
-
xs = sorted(bd.keys())
|
| 765 |
ys = [bd[b]["accuracy"]*100 for b in xs]
|
| 766 |
-
if xs:
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
ax.
|
| 771 |
-
ax.
|
| 772 |
-
ax.
|
| 773 |
-
ax.
|
| 774 |
-
|
| 775 |
-
ax.grid(True, linestyle="--", alpha=0.12, color="#2a2a4a")
|
| 776 |
-
ax.legend(facecolor="#080820", labelcolor="#8899bb", edgecolor="#1a1a3a", fontsize=10)
|
| 777 |
plt.tight_layout()
|
| 778 |
tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
|
| 779 |
-
plt.savefig(tmp.name, dpi=
|
| 780 |
plt.close(fig)
|
| 781 |
|
| 782 |
-
return
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
def _metric_card(label, echo_val, base_val, echo_col, base_col, note):
|
| 786 |
-
return (
|
| 787 |
-
f"<div style='background:rgba(255,255,255,0.02);border:1px solid rgba(80,100,255,0.1);"
|
| 788 |
-
f"border-radius:8px;padding:12px;text-align:center;'>"
|
| 789 |
-
f"<div style='font-size:11px;color:#3a4a6a;text-transform:uppercase;"
|
| 790 |
-
f"letter-spacing:0.07em;margin-bottom:6px;'>{label}</div>"
|
| 791 |
-
f"<div style='display:flex;justify-content:center;gap:12px;align-items:baseline;'>"
|
| 792 |
-
f"<span style='color:{echo_col};font-size:16px;font-weight:800;'>{echo_val}</span>"
|
| 793 |
-
f"<span style='color:#2a3a5a;font-size:12px;'>vs</span>"
|
| 794 |
-
f"<span style='color:{base_col};font-size:16px;font-weight:800;'>{base_val}</span>"
|
| 795 |
-
f"</div>"
|
| 796 |
-
f"{'<div style=color:#2a3a5a;font-size:10px;margin-top:3px;>'+note+'</div>' if note else ''}"
|
| 797 |
-
f"</div>"
|
| 798 |
-
)
|
| 799 |
|
| 800 |
|
| 801 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 802 |
-
# Tab 3
|
| 803 |
# ββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββββββββββββββββββββββββββββββββββββ
|
| 804 |
|
| 805 |
-
def generate_fingerprint(model_label
|
| 806 |
from core.epistemic_fingerprint import _make_synthetic_fingerprint, plot_radar
|
| 807 |
_init()
|
| 808 |
-
|
| 809 |
-
fp
|
| 810 |
-
|
| 811 |
-
|
| 812 |
tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
|
| 813 |
-
plot_radar(
|
| 814 |
-
|
| 815 |
-
strongest = fp.strongest_domain.capitalize()
|
| 816 |
-
weakest = fp.weakest_domain.capitalize()
|
| 817 |
|
| 818 |
-
|
| 819 |
-
"<div style='display:flex;flex-direction:column;gap:6px;'>"
|
| 820 |
-
)
|
| 821 |
for d in cfg.DOMAINS:
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
f"<strong style='color:#ff4466;'>{weakest}</strong>.</div>"
|
| 848 |
-
f"<div style='margin-top:8px;font-size:14px;color:#6677aa;'>"
|
| 849 |
-
f"Overall ECE: <strong style='color:#ffd700;'>{fp.overall_ece:.3f}</strong></div></div>"
|
| 850 |
-
)
|
| 851 |
|
| 852 |
-
return tmp.name,
|
| 853 |
|
| 854 |
|
| 855 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 856 |
-
# Tab 5
|
| 857 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 858 |
|
| 859 |
-
def run_evaluation()
|
| 860 |
_init()
|
| 861 |
from core.tasks import TASKS, TaskRunner, TASKS_BY_ID
|
| 862 |
from core.baseline import HeuristicAgent
|
| 863 |
-
|
| 864 |
-
agent = HeuristicAgent()
|
| 865 |
-
result = runner.run_all(agent, _task_bank)
|
| 866 |
|
| 867 |
-
|
| 868 |
for r in result.tasks:
|
| 869 |
-
t
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
icon = "β
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
f"</div>"
|
| 894 |
-
)
|
| 895 |
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
)
|
| 905 |
|
| 906 |
json_str = json.dumps(result.to_dict(), indent=2, default=str)
|
| 907 |
-
return
|
| 908 |
|
| 909 |
|
| 910 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 911 |
-
#
|
| 912 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 913 |
|
| 914 |
def build_app():
|
| 915 |
import gradio as gr
|
| 916 |
|
| 917 |
plots = {k: f"{cfg.PLOTS_DIR}/{v}" for k, v in {
|
| 918 |
-
"reliability":
|
| 919 |
-
"training":
|
| 920 |
-
"fingerprint":
|
| 921 |
-
"heatmap":
|
| 922 |
-
"distribution":
|
| 923 |
-
"domain":
|
| 924 |
}.items()}
|
| 925 |
-
def _img(
|
|
|
|
|
|
|
| 926 |
|
| 927 |
with gr.Blocks(title="ECHO ULTIMATE") as demo:
|
| 928 |
|
| 929 |
# ββ Hero βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 930 |
-
gr.HTML(
|
| 931 |
|
| 932 |
-
# ββ Tab 1
|
| 933 |
with gr.Tab("π― Live Challenge"):
|
| 934 |
-
gr.HTML(
|
| 935 |
-
"
|
| 936 |
-
"Answer questions with a confidence score β discover how well-calibrated you are",
|
| 937 |
-
"#00ffa3"
|
| 938 |
-
))
|
| 939 |
with gr.Row():
|
| 940 |
-
dom_dd = gr.Dropdown(
|
| 941 |
-
|
| 942 |
-
value="Math", label="Domain"
|
| 943 |
-
)
|
| 944 |
diff_dd = gr.Dropdown(["Easy","Medium","Hard"], value="Easy", label="Difficulty")
|
| 945 |
-
get_btn = gr.Button("π² Get Question", variant="primary"
|
| 946 |
-
|
| 947 |
question_box = gr.Markdown(
|
| 948 |
-
"<div style='color:#
|
| 949 |
-
"Select a domain and difficulty, then click Get Question.</div>"
|
| 950 |
)
|
| 951 |
-
|
| 952 |
with gr.Row():
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
label="Confidence (0 = no idea Β· 100 = certain)")
|
| 956 |
-
with gr.Column(scale=3):
|
| 957 |
-
ans_box = gr.Textbox(label="Your Answer", placeholder="Type your answerβ¦",
|
| 958 |
-
lines=1)
|
| 959 |
-
|
| 960 |
sub_btn = gr.Button("β
Submit Answer", variant="primary")
|
| 961 |
-
|
| 962 |
with gr.Row():
|
| 963 |
result_html = gr.HTML()
|
| 964 |
stats_html = gr.HTML()
|
|
@@ -967,200 +727,131 @@ def build_app():
|
|
| 967 |
get_btn.click(get_question, [dom_dd, diff_dd], [question_box, ans_box])
|
| 968 |
sub_btn.click(submit_answer, [conf_sl, ans_box], [result_html, stats_html, tip_md])
|
| 969 |
|
| 970 |
-
# ββ Tab 2
|
| 971 |
-
with gr.Tab("β ECHO vs
|
| 972 |
-
gr.HTML(
|
| 973 |
-
"
|
| 974 |
-
"10-question head-to-head: calibrated ECHO vs AlwaysHigh baseline (always 90% confident)",
|
| 975 |
-
"#ff4466"
|
| 976 |
-
))
|
| 977 |
with gr.Row():
|
| 978 |
scenario_dd = gr.Dropdown(
|
| 979 |
["Mixed","Math","Logic","Factual","Science","Medical","Coding","Creative"],
|
| 980 |
-
value="Mixed", label="Test Scenario"
|
| 981 |
-
)
|
| 982 |
run_btn = gr.Button("β Run 10 Questions", variant="primary")
|
| 983 |
-
|
| 984 |
with gr.Row():
|
| 985 |
-
with gr.Column(scale=3):
|
| 986 |
-
|
| 987 |
-
|
| 988 |
-
mini_img = gr.Image(label="Live Reliability Diagram", type="filepath",
|
| 989 |
-
show_label=True, height=320)
|
| 990 |
-
|
| 991 |
run_btn.click(run_comparison, [scenario_dd], [cmp_html, mini_img])
|
| 992 |
|
| 993 |
-
# ββ Tab 3
|
| 994 |
with gr.Tab("𧬠Epistemic Fingerprint"):
|
| 995 |
-
gr.HTML(
|
| 996 |
-
"
|
| 997 |
-
"Radar chart of calibration across all 7 domains β larger green = better everywhere",
|
| 998 |
-
"#a855f7"
|
| 999 |
-
))
|
| 1000 |
with gr.Row():
|
| 1001 |
-
model_dd = gr.Dropdown(
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
)
|
| 1005 |
-
fp_btn = gr.Button("π¬ Generate Fingerprint", variant="primary")
|
| 1006 |
-
|
| 1007 |
with gr.Row():
|
| 1008 |
with gr.Column(scale=3):
|
| 1009 |
fp_img = gr.Image(label="Epistemic Fingerprint", type="filepath",
|
| 1010 |
-
|
| 1011 |
with gr.Column(scale=2):
|
| 1012 |
-
fp_bars
|
| 1013 |
fp_insight = gr.HTML()
|
| 1014 |
-
|
| 1015 |
fp_btn.click(generate_fingerprint, [model_dd], [fp_img, fp_bars, fp_insight])
|
| 1016 |
|
| 1017 |
-
# ββ Tab 4
|
| 1018 |
with gr.Tab("π Training Evidence"):
|
| 1019 |
-
gr.HTML(
|
| 1020 |
-
"
|
| 1021 |
-
|
| 1022 |
-
"#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1023 |
))
|
| 1024 |
-
|
| 1025 |
-
gr.HTML("""
|
| 1026 |
-
<div style='background:rgba(0,255,163,0.05);border:1px solid rgba(0,255,163,0.2);
|
| 1027 |
-
border-radius:10px;padding:16px 20px;margin-bottom:8px;'>
|
| 1028 |
-
<div style='font-size:15px;font-weight:700;color:#00ffa3;margin-bottom:6px;'>
|
| 1029 |
-
β
Hero Plot β Reliability Diagram</div>
|
| 1030 |
-
<div style='color:#6677aa;font-size:13px;'>
|
| 1031 |
-
The smoking gun. Untrained model (red): flat line far from the diagonal β always overconfident.
|
| 1032 |
-
ECHO trained (green): hugs the perfect calibration diagonal.
|
| 1033 |
-
</div>
|
| 1034 |
-
</div>""")
|
| 1035 |
gr.Image(value=_img("reliability"), label="Reliability Diagram", height=380)
|
| 1036 |
-
|
| 1037 |
with gr.Row():
|
| 1038 |
with gr.Column():
|
| 1039 |
-
gr.HTML("<div style='font-size:13px;font-weight:600;color:#4488ff;"
|
| 1040 |
-
|
| 1041 |
-
"<div style='font-size:12px;color:#4a5a8a;margin-bottom:8px;'>"
|
| 1042 |
-
"ECE drops 0.34 β 0.08 across 3 curriculum phases</div>")
|
| 1043 |
-
gr.Image(value=_img("training"), label="Training Curves", height=300)
|
| 1044 |
with gr.Column():
|
| 1045 |
-
gr.HTML("<div style='font-size:13px;font-weight:600;color:#a855f7;"
|
| 1046 |
-
|
| 1047 |
-
"<div style='font-size:12px;color:#4a5a8a;margin-bottom:8px;'>"
|
| 1048 |
-
"Domain-level calibration β green fills every axis</div>")
|
| 1049 |
-
gr.Image(value=_img("fingerprint"), label="Epistemic Fingerprint", height=300)
|
| 1050 |
-
|
| 1051 |
with gr.Row():
|
| 1052 |
with gr.Column():
|
| 1053 |
-
gr.HTML("<div style='font-size:13px;font-weight:600;color:#ffd700;"
|
| 1054 |
-
|
| 1055 |
-
"<div style='font-size:12px;color:#4a5a8a;margin-bottom:8px;'>"
|
| 1056 |
-
"7 domains Γ 3 difficulties β red=bad, green=good</div>")
|
| 1057 |
-
gr.Image(value=_img("heatmap"), label="Calibration Heatmap", height=300)
|
| 1058 |
with gr.Column():
|
| 1059 |
-
gr.HTML("<div style='font-size:13px;font-weight:600;color:#ff8c00;"
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
-
gr.Image(value=_img("distribution"), label="Confidence Distribution", height=300)
|
| 1064 |
-
|
| 1065 |
-
gr.HTML("<div style='font-size:13px;font-weight:600;color:#ff4466;"
|
| 1066 |
-
"margin:8px 0 4px;'>π’ Domain Comparison</div>"
|
| 1067 |
-
"<div style='font-size:12px;color:#4a5a8a;margin-bottom:8px;'>"
|
| 1068 |
-
"ECE improvement across all 7 domains</div>")
|
| 1069 |
-
gr.Image(value=_img("domain"), label="Domain Comparison", height=320)
|
| 1070 |
-
|
| 1071 |
regen_btn = gr.Button("π Regenerate All Plots", variant="secondary")
|
| 1072 |
-
|
| 1073 |
-
|
| 1074 |
def regen():
|
| 1075 |
from training.evaluate import make_synthetic_pair, compare_and_plot
|
| 1076 |
-
|
| 1077 |
-
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
-
# ββ Tab 5: Evaluation βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1086 |
-
with gr.Tab("π Official Evaluation"):
|
| 1087 |
-
gr.HTML(_section_header(
|
| 1088 |
-
"π Official OpenEnv Evaluation",
|
| 1089 |
-
"3 tasks Γ 30 episodes β validates ECHO meets the benchmark thresholds",
|
| 1090 |
-
"#ffd700"
|
| 1091 |
-
))
|
| 1092 |
gr.HTML("""
|
| 1093 |
-
<div style=
|
| 1094 |
-
<div style=
|
| 1095 |
-
|
| 1096 |
-
<div style=
|
| 1097 |
-
<div style='color:#3a4a6a;font-size:12px;margin-top:4px;'>ECE target: < 0.15</div>
|
| 1098 |
</div>
|
| 1099 |
-
<div style=
|
| 1100 |
-
|
| 1101 |
-
<div style=
|
| 1102 |
-
<div style='color:#3a4a6a;font-size:12px;margin-top:4px;'>ECE target: < 0.20</div>
|
| 1103 |
</div>
|
| 1104 |
-
<div style=
|
| 1105 |
-
|
| 1106 |
-
<div style=
|
| 1107 |
-
<div style='color:#3a4a6a;font-size:12px;margin-top:4px;'>ECE target: < 0.25</div>
|
| 1108 |
</div>
|
| 1109 |
</div>""")
|
| 1110 |
-
eval_btn
|
| 1111 |
result_html = gr.HTML()
|
| 1112 |
-
with gr.Accordion("π Raw JSON
|
| 1113 |
json_out = gr.Code(language="json")
|
| 1114 |
eval_btn.click(run_evaluation, outputs=[result_html, json_out])
|
| 1115 |
|
| 1116 |
-
# ββ Tab 6
|
| 1117 |
with gr.Tab("β‘ Live Training"):
|
| 1118 |
-
gr.HTML(
|
| 1119 |
-
"
|
| 1120 |
-
"Watch ECE drop in real-time as the model trains. Dashed lines = pass thresholds.",
|
| 1121 |
-
"#4488ff"
|
| 1122 |
-
))
|
| 1123 |
with gr.Row():
|
| 1124 |
-
|
| 1125 |
-
|
| 1126 |
-
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
-
|
| 1132 |
-
)
|
| 1133 |
-
lt_plot = gr.Image(
|
| 1134 |
-
label="ECE During Training",
|
| 1135 |
-
type="filepath", height=380,
|
| 1136 |
-
)
|
| 1137 |
-
lt_progress = gr.Slider(
|
| 1138 |
-
minimum=0, maximum=100, value=0,
|
| 1139 |
-
label="Progress (%)", interactive=False,
|
| 1140 |
-
)
|
| 1141 |
-
|
| 1142 |
-
lt_start_btn.click(start_live_training,
|
| 1143 |
-
outputs=[lt_status, lt_plot, lt_progress])
|
| 1144 |
-
lt_stop_btn.click(stop_live_training, outputs=[lt_status])
|
| 1145 |
|
| 1146 |
-
return demo
|
| 1147 |
|
| 1148 |
|
| 1149 |
def main():
|
| 1150 |
import gradio as gr
|
| 1151 |
logging.basicConfig(level=logging.INFO)
|
| 1152 |
-
demo = build_app()
|
| 1153 |
demo.launch(
|
| 1154 |
server_name="0.0.0.0",
|
| 1155 |
server_port=cfg.GRADIO_PORT,
|
| 1156 |
share=False,
|
| 1157 |
show_error=True,
|
| 1158 |
css=_CSS,
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
neutral_hue=gr.themes.colors.slate,
|
| 1162 |
-
font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
|
| 1163 |
-
),
|
| 1164 |
)
|
| 1165 |
|
| 1166 |
|
|
|
|
| 1 |
+
"""ECHO ULTIMATE β Premium Gradio 6 UI."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import json
|
| 4 |
import logging
|
|
|
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 20 |
+
# Theme (Gradio 6 — all colors via .set())
|
| 21 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 22 |
|
| 23 |
+
def _echo_theme():
    """Build the dark ECHO theme object for the Gradio UI.

    Starts from ``gr.themes.Base`` (blue / cyan / slate hues, Inter and
    JetBrains Mono fonts) and applies every colour, padding and shadow
    override through a single ``.set()`` call.

    Returns:
        The object returned by ``gr.themes.Base(...).set(...)``.
    """
    import gradio as gr

    hues = gr.themes.colors
    base_theme = gr.themes.Base(
        primary_hue=hues.blue,
        secondary_hue=hues.cyan,
        neutral_hue=hues.slate,
        font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
        font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
    )

    # Keyword overrides handed to .set(); grouped by the UI area they style.
    overrides = {
        # Page
        "body_background_fill": "#04040e",
        "body_text_color": "#b0c4ee",
        "body_text_color_subdued": "#3a4a6a",
        # Panels / blocks
        "background_fill_primary": "#09091d",
        "background_fill_secondary": "#060613",
        "block_background_fill": "#09091d",
        "block_border_color": "#1a1a3a",
        "block_border_width": "1px",
        "block_label_background_fill": "transparent",
        "block_label_text_color": "#3a4a6a",
        "block_label_text_size": "*text_xs",
        "block_title_text_color": "#8090bb",
        "block_padding": "16px",
        # Inputs
        # (input_text_color is not a valid Gradio 6 theme var; handled via CSS)
        "input_background_fill": "#060613",
        "input_border_color": "#1a1a3a",
        "input_border_color_focus": "#3366ff",
        "input_shadow_focus": "0 0 0 3px rgba(51,102,255,0.2)",
        "input_placeholder_color": "#2a3a5a",
        # Buttons
        "button_large_padding": "12px 24px",
        "button_large_text_size": "*text_md",
        "button_primary_background_fill": "linear-gradient(135deg,#1155ee,#0033bb)",
        "button_primary_background_fill_hover": "linear-gradient(135deg,#2266ff,#0044cc)",
        "button_primary_text_color": "#ffffff",
        "button_primary_border_color": "rgba(51,102,255,0.6)",
        "button_secondary_background_fill": "rgba(255,255,255,0.04)",
        "button_secondary_background_fill_hover": "rgba(255,255,255,0.08)",
        "button_secondary_text_color": "#8090bb",
        "button_secondary_border_color": "#1a1a3a",
        "button_cancel_background_fill": "linear-gradient(135deg,#bb1133,#dd2244)",
        "button_cancel_background_fill_hover": "linear-gradient(135deg,#cc2244,#ee3355)",
        "button_cancel_text_color": "#ffffff",
        "button_cancel_border_color": "rgba(255,50,80,0.5)",
        # Slider
        "slider_color": "#00ffa3",
        "slider_color_dark": "#00ffa3",
        # Dropdown
        "checkbox_background_color": "#09091d",
        "checkbox_background_color_selected": "#1155ee",
        "checkbox_border_color": "#1a1a3a",
        # Tables
        "table_even_background_fill": "rgba(30,40,100,0.15)",
        "table_odd_background_fill": "transparent",
        # Shadow
        "shadow_drop": "0 2px 12px rgba(0,0,0,0.5)",
        "shadow_drop_lg": "0 4px 24px rgba(0,0,0,0.6)",
        # Color accent
        "color_accent": "#00ffa3",
        "color_accent_soft": "rgba(0,255,163,0.1)",
        "link_text_color": "#4488ff",
        "link_text_color_active": "#00ffa3",
        "link_text_color_visited": "#3377ee",
    }
    return base_theme.set(**overrides)
|
| 92 |
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 95 |
+
# CSS (only for custom HTML sections + tab bar overrides)
|
| 96 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 97 |
+
|
| 98 |
+
_CSS = """
|
| 99 |
+
@import url('https://fonts.googleapis.com/css2?family=Inter:ital,wght@0,300;0,400;0,500;0,600;0,700;0,800;0,900;1,400&family=JetBrains+Mono:wght@400;500;600&display=swap');
|
| 100 |
+
|
| 101 |
+
html, body { background: #04040e !important; }
|
| 102 |
footer { display: none !important; }
|
| 103 |
+
.gradio-container { max-width: 1440px !important; margin: 0 auto !important; }
|
| 104 |
|
| 105 |
+
/* ββ Active tab indicator ββ */
|
| 106 |
+
.tab-nav { border-bottom: 1px solid #1a1a3a !important; background: #060613 !important; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
.tab-nav button {
|
| 108 |
+
color: #2a3a6a !important; font-weight: 500 !important;
|
| 109 |
+
font-size: 13px !important; transition: all .18s !important;
|
| 110 |
+
border-radius: 0 !important; border-bottom: 2px solid transparent !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
}
|
| 112 |
+
.tab-nav button:hover { color: #6677aa !important; background: rgba(255,255,255,.03) !important; }
|
| 113 |
.tab-nav button.selected {
|
| 114 |
+
color: #00ffa3 !important;
|
| 115 |
+
border-bottom: 2px solid #00ffa3 !important;
|
| 116 |
+
background: rgba(0,255,163,.06) !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
}
|
| 118 |
|
| 119 |
+
/* ββ Primary button glow ββ */
|
| 120 |
+
button.lg.primary, .lg.primary {
|
| 121 |
+
box-shadow: 0 4px 20px rgba(51,102,255,.4) !important;
|
| 122 |
+
transition: all .2s !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
}
|
| 124 |
+
button.lg.primary:hover { transform: translateY(-2px) !important; box-shadow: 0 8px 32px rgba(51,102,255,.6) !important; }
|
| 125 |
+
|
| 126 |
+
/* ββ Cancel/stop button ββ */
|
| 127 |
+
button.lg.stop { box-shadow: 0 4px 20px rgba(255,50,80,.35) !important; }
|
| 128 |
+
|
| 129 |
+
/* ββ Textarea / textbox ββ */
|
| 130 |
+
textarea, input[type=text] { font-family: 'Inter', sans-serif !important; }
|
| 131 |
+
|
| 132 |
+
/* ββ Input text color (not a Gradio 6 theme var) ββ */
|
| 133 |
+
input, textarea, select, .svelte-1f354aw { color: #c0d0ff !important; }
|
| 134 |
+
label span { color: #3a4a6a !important; }
|
| 135 |
+
|
| 136 |
+
/* ββ Slim scrollbar ββ */
|
| 137 |
+
::-webkit-scrollbar { width: 5px; height: 5px; }
|
| 138 |
+
::-webkit-scrollbar-track { background: #04040e; }
|
| 139 |
+
::-webkit-scrollbar-thumb { background: #1a1a3a; border-radius: 3px; }
|
| 140 |
+
::-webkit-scrollbar-thumb:hover { background: #2a2a5a; }
|
| 141 |
+
|
| 142 |
+
/* ββ Markdown table ββ */
|
| 143 |
+
table { width: 100% !important; border-collapse: collapse !important; }
|
| 144 |
+
thead tr { background: rgba(51,102,255,.12) !important; }
|
| 145 |
+
th {
|
| 146 |
+
color: #3366ff !important; font-size: 11px !important; font-weight: 700 !important;
|
| 147 |
+
text-transform: uppercase !important; letter-spacing: .08em !important;
|
| 148 |
+
padding: 10px 14px !important; border-bottom: 1px solid #1a1a3a !important;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
}
|
| 150 |
+
td { padding: 9px 14px !important; border-bottom: 1px solid rgba(30,40,100,.3) !important; color: #8090bb !important; font-size: 13px !important; }
|
| 151 |
+
tr:last-child td { border-bottom: none !important; }
|
| 152 |
+
"""
|
| 153 |
|
| 154 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 155 |
+
# JavaScript
|
| 156 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
+
_JS = """
|
| 159 |
+
function echoInit() {
|
| 160 |
+
// Animate .echo-counter elements once
|
| 161 |
+
function animateCounter(el) {
|
| 162 |
+
var end = parseFloat(el.dataset.end);
|
| 163 |
+
var decimals = parseInt(el.dataset.decimals || 0);
|
| 164 |
+
var suffix = el.dataset.suffix || '';
|
| 165 |
+
var start = 0, duration = 1400, startTs = null;
|
| 166 |
+
function step(ts) {
|
| 167 |
+
if (!startTs) startTs = ts;
|
| 168 |
+
var p = Math.min((ts - startTs) / duration, 1);
|
| 169 |
+
var ease = 1 - Math.pow(1 - p, 4);
|
| 170 |
+
var val = start + (end - start) * ease;
|
| 171 |
+
el.textContent = (decimals > 0 ? val.toFixed(decimals) : Math.floor(val)) + suffix;
|
| 172 |
+
if (p < 1) requestAnimationFrame(step);
|
| 173 |
+
}
|
| 174 |
+
requestAnimationFrame(step);
|
| 175 |
+
}
|
| 176 |
|
| 177 |
+
setTimeout(function() {
|
| 178 |
+
document.querySelectorAll('.echo-counter').forEach(function(el) {
|
| 179 |
+
if (!el.dataset.animated) { el.dataset.animated = '1'; animateCounter(el); }
|
| 180 |
+
});
|
| 181 |
+
}, 400);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
+
return [];
|
|
|
|
|
|
|
|
|
|
| 184 |
}
|
| 185 |
+
"""
|
| 186 |
|
| 187 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 188 |
+
# HTML building blocks
|
| 189 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
+
HERO = """
|
| 192 |
+
<div style="position:relative;overflow:hidden;background:linear-gradient(160deg,#04040e 0%,#070720 45%,#04040e 100%);border-bottom:1px solid #1a1a3a;padding:48px 48px 40px;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
+
<!-- Dot grid -->
|
| 195 |
+
<div style="position:absolute;inset:0;background-image:radial-gradient(circle,rgba(51,102,255,.18) 1px,transparent 1px);background-size:32px 32px;pointer-events:none;"></div>
|
| 196 |
|
| 197 |
+
<!-- Blue glow top-right -->
|
| 198 |
+
<div style="position:absolute;top:-120px;right:-80px;width:480px;height:480px;background:radial-gradient(circle,rgba(51,102,255,.1) 0%,transparent 65%);pointer-events:none;"></div>
|
| 199 |
+
<!-- Green glow bottom-left -->
|
| 200 |
+
<div style="position:absolute;bottom:-100px;left:80px;width:360px;height:360px;background:radial-gradient(circle,rgba(0,255,163,.07) 0%,transparent 65%);pointer-events:none;"></div>
|
|
|
|
|
|
|
| 201 |
|
| 202 |
+
<div style="position:relative;z-index:1;">
|
|
|
|
|
|
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
<!-- Badge -->
|
| 205 |
+
<div style="display:inline-flex;align-items:center;gap:8px;background:rgba(0,255,163,.08);border:1px solid rgba(0,255,163,.28);border-radius:999px;padding:5px 16px;margin-bottom:24px;">
|
| 206 |
+
<span style="width:7px;height:7px;border-radius:50%;background:#00ffa3;box-shadow:0 0 8px #00ffa3;display:inline-block;animation:pulse 2s infinite;"></span>
|
| 207 |
+
<span style="color:#00ffa3;font-size:11px;font-weight:700;letter-spacing:.14em;font-family:Inter,sans-serif;">OPENENV HACKATHON 2025</span>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
</div>
|
| 209 |
|
| 210 |
<!-- Title -->
|
| 211 |
+
<h1 style="margin:0 0 10px;font-size:clamp(32px,5vw,56px);font-weight:900;line-height:1.05;letter-spacing:-.03em;font-family:Inter,sans-serif;background:linear-gradient(135deg,#fff 0%,#88aaff 45%,#00ffa3 100%);-webkit-background-clip:text;-webkit-text-fill-color:transparent;background-clip:text;">
|
| 212 |
+
πͺ ECHO ULTIMATE
|
| 213 |
+
</h1>
|
| 214 |
+
|
| 215 |
+
<p style="margin:0 0 8px;font-size:20px;color:#4a5a8a;font-weight:300;font-family:Inter,sans-serif;letter-spacing:-.01em;">
|
| 216 |
+
Training LLMs to accurately predict their own confidence
|
| 217 |
+
</p>
|
| 218 |
+
<p style="margin:0 0 36px;font-size:14px;color:#2a3a5a;font-family:Inter,sans-serif;">
|
| 219 |
+
via GRPO Β· 7 domains Β· 5 calibration metrics Β· 3-phase curriculum Β· Phase 4 adversarial self-play
|
| 220 |
+
</p>
|
| 221 |
+
|
| 222 |
+
<!-- Stat cards -->
|
| 223 |
+
<div style="display:flex;gap:12px;flex-wrap:wrap;">
|
| 224 |
+
|
| 225 |
+
<div style="background:rgba(0,255,163,.07);border:1px solid rgba(0,255,163,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
|
| 226 |
+
<div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#00ffa3;line-height:1;">
|
| 227 |
+
<span class="echo-counter" data-end="0.080" data-decimals="3">0.080</span>
|
| 228 |
+
</div>
|
| 229 |
+
<div style="font-size:10px;color:#1a4a2a;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">Final ECE</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
</div>
|
| 231 |
+
|
| 232 |
+
<div style="background:rgba(51,102,255,.07);border:1px solid rgba(51,102,255,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
|
| 233 |
+
<div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#4488ff;line-height:1;">
|
| 234 |
+
<span class="echo-counter" data-end="76" data-suffix="%">0%</span>
|
| 235 |
+
</div>
|
| 236 |
+
<div style="font-size:10px;color:#1a2a5a;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">ECE Reduction</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
</div>
|
| 238 |
+
|
| 239 |
+
<div style="background:rgba(168,85,247,.07);border:1px solid rgba(168,85,247,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
|
| 240 |
+
<div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#a855f7;line-height:1;">
|
| 241 |
+
<span class="echo-counter" data-end="7">0</span>
|
| 242 |
+
</div>
|
| 243 |
+
<div style="font-size:10px;color:#2a1a4a;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">Domains</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
</div>
|
| 245 |
+
|
| 246 |
+
<div style="background:rgba(255,215,0,.07);border:1px solid rgba(255,215,0,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
|
| 247 |
+
<div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#ffd700;line-height:1;">
|
| 248 |
+
<span class="echo-counter" data-end="3500">0</span>
|
| 249 |
+
</div>
|
| 250 |
+
<div style="font-size:10px;color:#3a3000;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">GRPO Steps</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
</div>
|
| 252 |
+
|
| 253 |
+
<div style="background:rgba(255,68,102,.07);border:1px solid rgba(255,68,102,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
|
| 254 |
+
<div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#ff4466;line-height:1;">
|
| 255 |
+
<span class="echo-counter" data-end="5">0</span>
|
| 256 |
+
</div>
|
| 257 |
+
<div style="font-size:10px;color:#3a1020;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">Metrics</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
</div>
|
| 259 |
+
|
| 260 |
</div>
|
| 261 |
</div>
|
| 262 |
</div>
|
| 263 |
+
<style>
|
| 264 |
+
@keyframes pulse { 0%,100%{opacity:1;box-shadow:0 0 6px #00ffa3} 50%{opacity:.5;box-shadow:0 0 14px #00ffa3} }
|
| 265 |
+
</style>
|
| 266 |
"""
|
| 267 |
|
| 268 |
|
| 269 |
+
def _tab_header(title: str, sub: str, accent: str = "#4488ff") -> str:
    """Return the HTML banner placed at the top of a tab.

    Args:
        title: Heading text, rendered in the larger, bold line.
        sub: Sub-heading text, rendered below the title in a smaller font.
        accent: CSS colour used for the left border stripe.

    Returns:
        An HTML string. All three arguments are interpolated verbatim
        (no escaping is applied here).
    """
    return f"""
<div style="border-left:3px solid {accent};padding:10px 16px 10px 18px;margin-bottom:4px;
background:linear-gradient(90deg,rgba(10,10,30,.6) 0%,transparent 100%);border-radius:0 8px 8px 0;">
<div style="font-size:17px;font-weight:700;color:#d0dcff;font-family:Inter,sans-serif;letter-spacing:-.01em;">{title}</div>
<div style="font-size:13px;color:#3a4a6a;margin-top:3px;font-family:Inter,sans-serif;">{sub}</div>
</div>"""
|
| 276 |
|
| 277 |
|
| 278 |
+
def _card(content: str, border_color: str = "rgba(30,40,100,.4)") -> str:
|
| 279 |
+
return (f'<div style="background:#09091d;border:1px solid {border_color};'
|
| 280 |
+
f'border-radius:10px;padding:16px 20px;margin:4px 0;">{content}</div>')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
|
| 282 |
|
| 283 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 284 |
+
# Tab 6 — Live Training
|
| 285 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 286 |
|
| 287 |
_training_state: dict = {"running": False, "steps": [], "ece_values": [], "stop": False}
|
| 288 |
|
| 289 |
|
| 290 |
+
def _live_plot(steps, ece_values):
    """Render the live-training ECE curve to a PNG and return its path.

    Args:
        steps: Training step numbers recorded so far (may be empty).
        ece_values: ECE value recorded for each entry of ``steps``.

    Returns:
        Path of a newly created temporary ``.png`` file. The file is not
        deleted here; the caller is responsible for it.
    """
    # The two sequences are snapshotted separately by the caller while a
    # worker thread appends to them, so they can differ in length by one.
    # Plot only the points both of them have.
    n_points = min(len(steps), len(ece_values))

    fig, ax = plt.subplots(figsize=(10, 4.5), facecolor="#04040e")
    try:
        ax.set_facecolor("#07071a")
        if n_points:
            xs = np.array(steps[:n_points])
            ys = np.array(ece_values[:n_points])
            ax.fill_between(xs, ys, alpha=.10, color="#00ffa3", zorder=2)
            ax.plot(xs, ys, color="#00ffa3", lw=2.5, marker="o", ms=5,
                    mfc="#00ffa3", mec="#04040e", mew=1.5, zorder=4)
            # Label the most recent point with its value.
            ax.annotate(f" {ys[-1]:.4f}", (xs[-1], ys[-1]),
                        color="#00ffa3", fontsize=11, fontweight="bold", va="center")

        # Pass thresholds are drawn even before the first data point arrives.
        ax.axhline(.15, color="#ff4466", ls="--", lw=1.5, alpha=.7, label="Task 1 threshold ECE < 0.15")
        ax.axhline(.20, color="#ffbb00", ls="--", lw=1.5, alpha=.7, label="Task 2 threshold ECE < 0.20")
        ax.set_xlabel("Training Step", color="#3a4a6a", fontsize=11, labelpad=8)
        ax.set_ylabel("ECE (↓ lower = better)", color="#3a4a6a", fontsize=11, labelpad=8)
        ax.set_title("Live GRPO Training — ECE Curve", color="#8090bb", fontsize=13, fontweight="bold", pad=14)
        ax.tick_params(colors="#2a3a5a", labelsize=10)
        ax.set_ylim(0, .50)
        ax.set_xlim(-2, 105)
        for spine in ax.spines.values():
            spine.set_color("#12122a")
        ax.grid(True, ls="--", alpha=.1, color="#1a1a3a")
        ax.legend(facecolor="#07071a", labelcolor="#5a6a8a", edgecolor="#12122a", fontsize=10, loc="upper right")
        fig.tight_layout()

        # Reserve a path and close the handle before savefig() reopens it:
        # reopening a still-open NamedTemporaryFile by name is not portable.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            out_path = tmp.name
        fig.savefig(out_path, dpi=130, bbox_inches="tight", facecolor="#04040e")
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    return out_path
|
| 315 |
|
| 316 |
|
| 317 |
+
def _train_thread():
|
| 318 |
import random
|
| 319 |
_training_state.update({"running": True, "steps": [], "ece_values": [], "stop": False})
|
| 320 |
ece = 0.42
|
| 321 |
for step in range(0, 101, 10):
|
| 322 |
+
if _training_state["stop"]: break
|
| 323 |
+
ece = max(.07, ece - random.uniform(.02, .05) + random.uniform(-.007, .007))
|
|
|
|
| 324 |
_training_state["steps"].append(step)
|
| 325 |
_training_state["ece_values"].append(round(ece, 4))
|
| 326 |
time.sleep(1.5)
|
|
|
|
| 328 |
|
| 329 |
|
| 330 |
def start_live_training():
|
| 331 |
+
threading.Thread(target=_train_thread, daemon=True).start()
|
|
|
|
| 332 |
for _ in range(60):
|
| 333 |
time.sleep(1.5)
|
| 334 |
+
s, v = _training_state["steps"][:], _training_state["ece_values"][:]
|
| 335 |
+
n = len(s)
|
|
|
|
| 336 |
prog = round((n / 11) * 100)
|
| 337 |
+
if s:
|
| 338 |
+
drop_pct = (v[0] - v[-1]) / v[0] * 100 if len(v) > 1 else 0
|
| 339 |
+
status = f"Step {s[-1]:>3}/100 β ECE {v[-1]:.4f} β β{drop_pct:.1f}% from start"
|
|
|
|
| 340 |
else:
|
| 341 |
status = "Initializing GRPO trainerβ¦"
|
|
|
|
| 342 |
if not _training_state["running"] and n > 0:
|
| 343 |
+
status = f"β
Done! ECE {v[0]:.4f} β {v[-1]:.4f} (β{(v[0]-v[-1])/v[0]*100:.1f}%)"
|
| 344 |
+
yield status, _live_plot(s, v), prog
|
|
|
|
|
|
|
| 345 |
return
|
| 346 |
+
yield status, _live_plot(s, v), prog
|
| 347 |
|
| 348 |
|
| 349 |
def stop_live_training():
|
|
|
|
| 352 |
|
| 353 |
|
| 354 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 355 |
+
# Shared state + init
|
| 356 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 357 |
|
| 358 |
+
_task_bank = _env = _live_hist = None
|
|
|
|
|
|
|
| 359 |
|
| 360 |
|
| 361 |
def _init():
|
| 362 |
global _task_bank, _env, _live_hist
|
| 363 |
+
if _env is not None: return
|
|
|
|
| 364 |
from env.task_bank import TaskBank
|
| 365 |
from env.echo_env import EchoEnv
|
| 366 |
from env.reward import RewardHistory
|
|
|
|
| 373 |
_current_task: dict = {}
|
| 374 |
|
| 375 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 376 |
+
# Tab 1 logic
|
| 377 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 378 |
|
| 379 |
+
def get_question(domain, difficulty):
|
| 380 |
global _current_task
|
| 381 |
_init()
|
| 382 |
task = _task_bank.get_task(domain.lower(), difficulty.lower())
|
| 383 |
_current_task = task
|
| 384 |
+
q = (f"**`{domain}`** Β· **`{difficulty}`**\n\n---\n\n{task['question']}")
|
|
|
|
|
|
|
|
|
|
| 385 |
return q, ""
|
| 386 |
|
| 387 |
|
| 388 |
+
def submit_answer(confidence, user_answer):
|
| 389 |
if not _current_task:
|
| 390 |
+
return _card("<span style='color:#ff4466'>β οΈ Get a question first.</span>"), "", ""
|
| 391 |
from env.reward import compute_reward
|
| 392 |
task = _current_task
|
| 393 |
+
rb = compute_reward(confidence, user_answer, task["answer"],
|
| 394 |
+
task.get("answer_aliases", []), task["domain"])
|
| 395 |
+
_live_hist.append(confidence, rb.was_correct, task["domain"], task["difficulty"], rb.total)
|
|
|
|
| 396 |
snap = _live_hist.get_training_snapshot()
|
| 397 |
|
| 398 |
+
c = "#00ffa3" if rb.was_correct else "#ff4466"
|
| 399 |
+
icon = "β
Correct!" if rb.was_correct else "β Incorrect"
|
| 400 |
+
|
| 401 |
+
result_html = f"""
|
| 402 |
+
<div style="background:#09091d;border:1px solid {c}33;border-left:3px solid {c};
|
| 403 |
+
border-radius:10px;padding:18px 20px;">
|
| 404 |
+
<div style="font-size:19px;font-weight:800;color:{c};margin-bottom:14px;font-family:Inter,sans-serif;">{icon}</div>
|
| 405 |
+
<div style="font-size:11px;color:#2a3a5a;text-transform:uppercase;letter-spacing:.08em;margin-bottom:4px;">Correct Answer</div>
|
| 406 |
+
<div style="font-size:16px;font-weight:700;color:#c0d0ff;font-family:'JetBrains Mono',monospace;margin-bottom:18px;">{task['answer']}</div>
|
| 407 |
+
<div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;">
|
| 408 |
+
<div style="background:rgba(51,102,255,.08);border-radius:8px;padding:10px 14px;">
|
| 409 |
+
<div style="font-size:11px;color:#2a3a5a;margin-bottom:3px;">Accuracy</div>
|
| 410 |
+
<div style="color:#4488ff;font-weight:700;font-size:15px;">{rb.accuracy_score:.2f} <span style="font-size:11px;color:#1a2a4a;">Γ 0.40</span></div>
|
| 411 |
+
</div>
|
| 412 |
+
<div style="background:rgba(0,255,163,.06);border-radius:8px;padding:10px 14px;">
|
| 413 |
+
<div style="font-size:11px;color:#2a3a5a;margin-bottom:3px;">Brier Calibration</div>
|
| 414 |
+
<div style="color:#00ffa3;font-weight:700;font-size:15px;">{rb.brier_reward_val:.2f} <span style="font-size:11px;color:#1a3a2a;">Γ 0.40</span></div>
|
| 415 |
+
</div>
|
| 416 |
+
<div style="background:rgba(255,68,102,.06);border-radius:8px;padding:10px 14px;">
|
| 417 |
+
<div style="font-size:11px;color:#2a3a5a;margin-bottom:3px;">Overconf penalty</div>
|
| 418 |
+
<div style="color:#ff4466;font-weight:700;font-size:15px;">{rb.overconfidence_penalty_val:.3f}</div>
|
| 419 |
+
</div>
|
| 420 |
+
<div style="background:rgba(255,215,0,.06);border-radius:8px;padding:10px 14px;">
|
| 421 |
+
<div style="font-size:11px;color:#2a3a5a;margin-bottom:3px;">Total Reward</div>
|
| 422 |
+
<div style="color:#ffd700;font-weight:900;font-size:18px;">{rb.total:+.3f}</div>
|
| 423 |
+
</div>
|
| 424 |
+
</div>
|
| 425 |
+
</div>"""
|
| 426 |
|
| 427 |
+
n_ep = snap.get("episodes", len(_live_hist))
|
| 428 |
+
ece_v = snap["ece"]
|
| 429 |
+
ec = "#00ffa3" if ece_v < .20 else ("#ffbb00" if ece_v < .35 else "#ff4466")
|
| 430 |
+
|
| 431 |
+
stats_html = f"""
|
| 432 |
+
<div style="background:#09091d;border:1px solid #1a1a3a;border-radius:10px;padding:16px 20px;">
|
| 433 |
+
<div style="font-size:11px;color:#2a3a5a;text-transform:uppercase;letter-spacing:.08em;margin-bottom:14px;">
|
| 434 |
+
Your Stats β {n_ep} questions
|
| 435 |
+
</div>
|
| 436 |
+
<div style="display:flex;flex-direction:column;gap:10px;">
|
| 437 |
+
{"".join(f'''<div style="display:flex;justify-content:space-between;align-items:center;">
|
| 438 |
+
<span style="color:#3a4a6a;font-size:13px;">{label}</span>
|
| 439 |
+
<span style="color:{vc};font-weight:700;font-size:14px;">{val}</span>
|
| 440 |
+
</div>''' for label, val, vc in [
|
| 441 |
+
("Accuracy", f"{snap['accuracy']:.1%}", "#c0d0ff"),
|
| 442 |
+
("ECE", f"{ece_v:.3f}", ec),
|
| 443 |
+
("Mean Confidence", f"{snap['mean_confidence']:.0f}%", "#c0d0ff"),
|
| 444 |
+
("Overconf Rate", f"{snap['overconfidence_rate']:.1%}", "#ff8c00"),
|
| 445 |
+
])}
|
| 446 |
+
</div>
|
| 447 |
+
</div>"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
+
if rb.overconfidence_penalty_val < -.1:
|
| 450 |
+
tip = "β οΈ **Overconfident** β high confidence, wrong answer. ECHO trains against this exact pattern."
|
|
|
|
| 451 |
elif rb.was_correct and confidence >= 65:
|
| 452 |
+
tip = "π― **Well calibrated** β confident and correct."
|
| 453 |
elif not rb.was_correct and confidence < 40:
|
| 454 |
+
tip = "π― **Good self-awareness** β sensed uncertainty correctly."
|
| 455 |
+
elif rb.underconfidence_penalty_val < -.1:
|
| 456 |
+
tip = "π€ **Underconfident** β you knew it but doubted yourself."
|
| 457 |
else:
|
| 458 |
tip = ""
|
| 459 |
+
return result_html, stats_html, tip
|
|
|
|
| 460 |
|
| 461 |
|
| 462 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 463 |
+
# Tab 2 logic
|
| 464 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 465 |
|
| 466 |
+
def run_comparison(scenario):
    """Run a 10-question head-to-head between HeuristicAgent and AlwaysHighAgent.

    Every question is drawn from the task bank at "medium" difficulty, answered
    by both agents, scored with ``compute_reward`` and recorded in a separate
    ``RewardHistory`` per agent.

    Args:
        scenario: Scenario label from the UI dropdown. A known domain label pins
            all questions to that domain; "Mixed" (or any unrecognised label)
            cycles through ``cfg.DOMAINS``.

    Returns:
        A ``(html, image_path)`` tuple: the per-question cards followed by the
        summary panel as one HTML string, and the path of a PNG reliability
        diagram written to a temporary file (the file is not deleted here).
    """
    from html import escape

    _init()
    from core.baseline import AlwaysHighAgent, HeuristicAgent
    from env.reward import compute_reward, RewardHistory
    from env.parser import format_prompt, parse_response

    domain_map = {"Math":"math","Logic":"logic","Factual":"factual","Science":"science",
                  "Medical":"medical","Coding":"coding","Creative":"creative","Mixed":None}
    domain = domain_map.get(scenario)
    echo_h, base_h = RewardHistory(), RewardHistory()

    # Collect the HTML fragments in a list and join once at the end.
    cards = ['<div style="display:flex;flex-direction:column;gap:6px;">']

    for i in range(10):
        d = domain or cfg.DOMAINS[i % len(cfg.DOMAINS)]
        task = _task_bank.get_task(d, "medium")
        prompt = format_prompt(task["question"], d, "medium")
        ep = parse_response(HeuristicAgent()(prompt))
        bp = parse_response(AlwaysHighAgent()(prompt))
        aliases = task.get("answer_aliases",[])
        er = compute_reward(ep.confidence, ep.answer, task["answer"], aliases, d)
        br = compute_reward(bp.confidence, bp.answer, task["answer"], aliases, d)
        echo_h.append(ep.confidence, er.was_correct, d, "medium", er.total)
        base_h.append(bp.confidence, br.was_correct, d, "medium", br.total)

        ec = "#00ffa3" if er.was_correct else "#ff4466"
        bc = "#ff4466" if not br.was_correct else "#00ffa3"
        # NOTE(review): the success and failure glyphs are indistinguishable in this
        # copy of the file (looks like an encoding problem) — confirm the intended
        # characters; the colours above still tell the two outcomes apart.
        ei = "β" if er.was_correct else "β"
        bi = "β" if br.was_correct else "β"

        # Question text is data, not markup: escape it so '<' or '&' in a
        # question cannot break the surrounding card.
        snippet = escape(task['question'][:70])

        cards.append(f"""
        <div style="display:grid;grid-template-columns:1fr 1fr;gap:6px;">
          <div style="background:rgba(0,255,163,.04);border:1px solid rgba(0,255,163,.12);
                      border-radius:8px;padding:10px 14px;">
            <div style="font-size:10px;color:#1a4a2a;text-transform:uppercase;
                        letter-spacing:.08em;margin-bottom:5px;">ECHO Β· {d} Q{i+1}</div>
            <div style="color:#4a5a8a;font-size:12px;margin-bottom:7px;line-height:1.4;">
              {snippet}β¦</div>
            <div style="display:flex;gap:8px;align-items:center;">
              <span style="color:{ec};font-weight:800;font-size:15px;">{ei}</span>
              <span style="background:rgba(0,255,163,.1);border-radius:4px;padding:2px 8px;
                           color:#00ffa3;font-size:11px;font-weight:700;">conf {ep.confidence}%</span>
            </div>
          </div>
          <div style="background:rgba(255,68,102,.04);border:1px solid rgba(255,68,102,.12);
                      border-radius:8px;padding:10px 14px;">
            <div style="font-size:10px;color:#4a1020;text-transform:uppercase;
                        letter-spacing:.08em;margin-bottom:5px;">OVERCONFIDENT Β· Q{i+1}</div>
            <div style="color:#4a5a8a;font-size:12px;margin-bottom:7px;line-height:1.4;">
              {snippet}β¦</div>
            <div style="display:flex;gap:8px;align-items:center;">
              <span style="color:{bc};font-weight:800;font-size:15px;">{bi}</span>
              <span style="background:rgba(255,68,102,.1);border-radius:4px;padding:2px 8px;
                           color:#ff4466;font-size:11px;font-weight:700;">conf {bp.confidence}%</span>
            </div>
          </div>
        </div>""")

    cards.append("</div>")
    rows_html = "".join(cards)

    em = echo_h.get_training_snapshot()
    bm = base_h.get_training_snapshot()

    def _mc(label, ev, bv, good_low=True):
        """Render one metric tile; the better of the two values is shown in green.

        ``ev``/``bv`` are pre-formatted strings (optionally ending in '%').
        ``good_low`` says whether a smaller value counts as better.
        """
        if "%" in ev:
            e_better = float(ev.strip("%")) < float(bv.strip("%"))
        else:
            e_better = float(ev) < float(bv)
        if not good_low:
            e_better = not e_better
        ec2 = "#00ffa3" if e_better else "#ff4466"
        bc2 = "#ff4466" if e_better else "#00ffa3"
        return f"""<div style="background:#06061a;border:1px solid #1a1a3a;border-radius:8px;padding:12px;text-align:center;">
          <div style="font-size:10px;color:#2a3a5a;text-transform:uppercase;letter-spacing:.07em;margin-bottom:8px;">{label}</div>
          <div style="display:flex;justify-content:center;gap:14px;align-items:baseline;">
            <span style="color:{ec2};font-size:17px;font-weight:800;">{ev}</span>
            <span style="color:#1a2a4a;font-size:11px;">vs</span>
            <span style="color:{bc2};font-size:17px;font-weight:800;">{bv}</span>
          </div>
          <div style="display:flex;justify-content:center;gap:14px;margin-top:4px;">
            <span style="font-size:10px;color:#1a3a2a;">ECHO</span>
            <span style="font-size:10px;color:#3a1020;">Baseline</span>
          </div>
        </div>"""

    # Signed gap: positive means ECHO has the lower (better) ECE. The banner must
    # not claim "better" when the heuristic agent actually calibrated worse.
    ece_gap = bm['ece'] - em['ece']
    gap_word = "better" if ece_gap >= 0 else "worse"

    summary_html = f"""
    <div style="background:#06061a;border:1px solid #1a1a3a;border-radius:10px;padding:16px 20px;margin-top:8px;">
      <div style="font-size:11px;color:#2a3a5a;text-transform:uppercase;letter-spacing:.08em;margin-bottom:14px;">Results</div>
      <div style="display:grid;grid-template-columns:repeat(4,1fr);gap:8px;margin-bottom:14px;">
        {_mc("ECE β", f"{em['ece']:.3f}", f"{bm['ece']:.3f}", good_low=True)}
        {_mc("Accuracy β", f"{em['accuracy']:.1%}", f"{bm['accuracy']:.1%}", good_low=False)}
        {_mc("Mean Conf", f"{em['mean_confidence']:.0f}%", f"{bm['mean_confidence']:.0f}%", good_low=True)}
        {_mc("Overconf β", f"{em['overconfidence_rate']:.1%}", f"{bm['overconfidence_rate']:.1%}", good_low=True)}
      </div>
      <div style="background:rgba(0,255,163,.08);border:1px solid rgba(0,255,163,.2);
                  border-radius:8px;padding:12px;text-align:center;">
        <span style="color:#00ffa3;font-size:17px;font-weight:900;">
          ECHO is {abs(ece_gap):.0%} {gap_word} calibrated
        </span>
        <span style="color:#2a3a5a;font-size:13px;"> than the overconfident baseline</span>
      </div>
    </div>"""

    # Reliability diagram
    erep = echo_h.get_calibration_report()
    brep = base_h.get_calibration_report()
    fig, ax = plt.subplots(figsize=(7, 4.5), facecolor="#04040e")
    try:
        ax.set_facecolor("#07071a")
        ax.plot([0,100],[0,100],"--",color="#1a2a3a",lw=1.5,label="Perfect calibration",zorder=1)
        for rep, col, lbl in [(erep,"#00ffa3","ECHO"),(brep,"#ff4466","Overconfident AI")]:
            bd = rep.bin_data
            xs = sorted(bd.keys())
            ys = [bd[b]["accuracy"]*100 for b in xs]
            if xs:
                ax.plot(xs, ys, "-o", color=col, lw=2.5, ms=7, label=f"{lbl} ECE={rep.ece:.2f}",
                        mfc=col, mec="#04040e", mew=1.5, zorder=3)
        ax.set_xlabel("Stated Confidence (%)", color="#3a4a6a", fontsize=11)
        ax.set_ylabel("Actual Accuracy (%)", color="#3a4a6a", fontsize=11)
        ax.set_title("Live Reliability Diagram", color="#8090bb", fontsize=13, fontweight="bold")
        ax.tick_params(colors="#2a3a5a")
        ax.set_xlim(0,100)
        ax.set_ylim(0,100)
        for sp in ax.spines.values():
            sp.set_color("#12122a")
        ax.grid(True, ls="--", alpha=.1, color="#1a1a3a")
        ax.legend(facecolor="#07071a", labelcolor="#5a6a8a", edgecolor="#12122a", fontsize=10)

        # Operate on this figure explicitly instead of pyplot's global "current
        # figure", which may be a different one if handlers run concurrently.
        fig.tight_layout()

        # Reserve a path and close the handle straight away so no descriptor is
        # leaked; delete=False keeps the file on disk for the caller.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            image_path = tmp.name
        fig.savefig(image_path, dpi=130, bbox_inches="tight", facecolor="#04040e")
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

    return rows_html + summary_html, image_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 586 |
|
| 587 |
|
| 588 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 589 |
+
# Tab 3 logic
|
| 590 |
# ββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββββββββββββββββββββββββββββββββββββ
|
| 591 |
|
| 592 |
+
def generate_fingerprint(model_label):
    """Build the radar image, per-domain score bars and insight panel for a model.

    A synthetic fingerprint is generated for ``model_label`` and plotted against
    a synthetic "Untrained" fingerprint (offset .30) used as the reference.

    Args:
        model_label: Label from the UI dropdown. "Untrained", "ECHO Trained" and
            "Heuristic" select offsets .30, .0 and .15; any other value uses .15.

    Returns:
        A ``(image_path, bars_html, insight_html)`` tuple: the path of the radar
        PNG written to a temporary file (not deleted here), the HTML for the
        per-domain bars, and the HTML insight panel.
    """
    from html import escape
    from core.epistemic_fingerprint import _make_synthetic_fingerprint, plot_radar
    _init()

    offset = {"Untrained": .30, "ECHO Trained": .0, "Heuristic": .15}.get(model_label, .15)
    fp = _make_synthetic_fingerprint(offset, model_label)
    b = _make_synthetic_fingerprint(.30, "Untrained")

    # Reserve a path and close the handle immediately so no file descriptor is
    # leaked; delete=False keeps the file on disk for plot_radar to write to.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        image_path = tmp.name
    plot_radar(b, fp, image_path)

    # One bar per configured domain; domains missing from the fingerprint
    # fall back to a neutral .5.
    parts = ['<div style="display:flex;flex-direction:column;gap:8px;">']
    for d in cfg.DOMAINS:
        s = fp.domain_scores.get(d, .5)
        col = "#00ffa3" if s > .75 else ("#ffbb00" if s > .55 else "#ff4466")
        pct = int(s * 100)
        parts.append(f"""
        <div style="display:flex;align-items:center;gap:10px;">
          <div style="width:72px;text-align:right;color:#3a4a6a;font-size:12px;font-weight:500;font-family:Inter,sans-serif;">{d.capitalize()}</div>
          <div style="flex:1;background:rgba(255,255,255,.04);border-radius:4px;height:7px;">
            <div style="width:{pct}%;height:100%;border-radius:4px;background:{col};box-shadow:0 0 6px {col}77;transition:width .6s ease;"></div>
          </div>
          <div style="width:36px;text-align:right;color:{col};font-size:12px;font-weight:700;font-family:Inter,sans-serif;">{s:.2f}</div>
        </div>""")
    parts.append("</div>")
    bars = "".join(parts)

    # The label arrives from the client; escape it before embedding in markup.
    safe_label = escape(str(model_label))

    insight = f"""
    <div style="background:rgba(168,85,247,.06);border:1px solid rgba(168,85,247,.2);
                border-radius:8px;padding:14px 16px;margin-top:8px;">
      <div style="font-size:13px;color:#b0c0dd;line-height:1.6;font-family:Inter,sans-serif;">
        <strong style="color:#a855f7;">{safe_label}</strong> is strongest in
        <strong style="color:#00ffa3;">{fp.strongest_domain.capitalize()}</strong> and most
        uncertain in <strong style="color:#ff4466;">{fp.weakest_domain.capitalize()}</strong>.
      </div>
      <div style="margin-top:8px;font-size:14px;color:#3a4a6a;">
        Overall ECE: <strong style="color:#ffd700;font-size:16px;">{fp.overall_ece:.3f}</strong>
      </div>
    </div>"""

    return image_path, bars, insight
|
| 630 |
|
| 631 |
|
| 632 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 633 |
+
# Tab 5 logic
|
| 634 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 635 |
|
| 636 |
+
def run_evaluation():
    """Run every evaluation task with HeuristicAgent and render the results.

    Returns:
        A ``(html, json_text)`` tuple: one HTML card per task followed by the
        overall verdict banner, and the full result serialised as indented JSON
        (values that are not JSON-native are stringified via ``default=str``).
    """
    _init()
    from core.tasks import TaskRunner, TASKS_BY_ID
    from core.baseline import HeuristicAgent
    result = TaskRunner().run_all(HeuristicAgent(), _task_bank)

    card_parts = []
    for r in result.tasks:
        t = TASKS_BY_ID[r.task_id]
        col = "#00ffa3" if r.passed else "#ff4466"
        bg = "rgba(0,255,163,.05)" if r.passed else "rgba(255,68,102,.05)"
        brd = "rgba(0,255,163,.2)" if r.passed else "rgba(255,68,102,.2)"
        # Progress toward the pass threshold, kept within 0..100 so the value is
        # always a valid CSS width (a negative score would otherwise produce a
        # negative percentage). The .001 floor guards against a zero threshold.
        raw_pct = int(r.score / max(t.pass_threshold,.001) * 100)
        pct = max(0, min(raw_pct, 100))
        # NOTE(review): the pass and fail glyphs are indistinguishable in this copy
        # of the file (looks like an encoding problem) — confirm the intended
        # characters.
        icon = "β" if r.passed else "β"
        card_parts.append(f"""
        <div style="background:{bg};border:1px solid {brd};border-radius:10px;padding:16px 20px;margin-bottom:8px;">
          <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:10px;">
            <div style="display:flex;align-items:center;gap:10px;">
              <span style="font-size:18px;">{icon}</span>
              <span style="color:#c0d0ff;font-size:14px;font-weight:700;font-family:Inter,sans-serif;">{t.name}</span>
              <span style="background:rgba(255,255,255,.05);border-radius:4px;padding:2px 8px;
                           color:#2a3a5a;font-size:11px;">{r.task_id}</span>
            </div>
            <div style="font-family:'JetBrains Mono',monospace;font-size:13px;">
              <span style="color:{col};font-weight:800;">{r.score:.3f}</span>
              <span style="color:#1a2a4a;"> / {t.pass_threshold}</span>
            </div>
          </div>
          <div style="background:rgba(255,255,255,.03);border-radius:4px;height:5px;">
            <div style="width:{pct}%;height:100%;border-radius:4px;background:{col};"></div>
          </div>
        </div>""")
    cards = "".join(card_parts)

    verdict_col = "#00ffa3" if result.overall_pass else "#ff4466"
    verdict_text = "π ALL TASKS PASSED" if result.overall_pass else "β οΈ Some tasks below threshold"
    verdict = f"""
    <div style="background:linear-gradient(135deg,rgba(0,255,163,.08),rgba(51,102,255,.05));
                border:1px solid {verdict_col}44;border-radius:10px;padding:18px;text-align:center;margin-top:4px;">
      <div style="font-size:22px;font-weight:900;color:{verdict_col};font-family:Inter,sans-serif;">
        {verdict_text}
      </div>
    </div>"""

    json_str = json.dumps(result.to_dict(), indent=2, default=str)
    return cards + verdict, json_str
|
| 680 |
|
| 681 |
|
| 682 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 683 |
+
# App builder
|
| 684 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 685 |
|
| 686 |
def build_app():
|
| 687 |
import gradio as gr
|
| 688 |
|
| 689 |
plots = {k: f"{cfg.PLOTS_DIR}/{v}" for k, v in {
|
| 690 |
+
"reliability": "reliability_diagram.png",
|
| 691 |
+
"training": "training_curves.png",
|
| 692 |
+
"fingerprint": "epistemic_fingerprint.png",
|
| 693 |
+
"heatmap": "calibration_heatmap.png",
|
| 694 |
+
"distribution":"confidence_distribution.png",
|
| 695 |
+
"domain": "domain_comparison.png",
|
| 696 |
}.items()}
|
| 697 |
+
def _img(k): return plots[k] if Path(plots[k]).exists() else None
|
| 698 |
+
|
| 699 |
+
theme = _echo_theme()
|
| 700 |
|
| 701 |
with gr.Blocks(title="ECHO ULTIMATE") as demo:
|
| 702 |
|
| 703 |
# ββ Hero βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 704 |
+
gr.HTML(HERO)
|
| 705 |
|
| 706 |
+
# ββ Tab 1 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 707 |
with gr.Tab("π― Live Challenge"):
|
| 708 |
+
gr.HTML(_tab_header("π― Live Challenge",
|
| 709 |
+
"Answer with a confidence score β see if you're as well-calibrated as ECHO", "#00ffa3"))
|
|
|
|
|
|
|
|
|
|
| 710 |
with gr.Row():
|
| 711 |
+
dom_dd = gr.Dropdown(["Math","Logic","Factual","Science","Medical","Coding","Creative"],
|
| 712 |
+
value="Math", label="Domain")
|
|
|
|
|
|
|
| 713 |
diff_dd = gr.Dropdown(["Easy","Medium","Hard"], value="Easy", label="Difficulty")
|
| 714 |
+
get_btn = gr.Button("π² Get Question", variant="primary")
|
|
|
|
| 715 |
question_box = gr.Markdown(
|
| 716 |
+
"<div style='color:#2a3a5a;padding:10px;font-style:italic;'>Select domain & difficulty, then click Get Question.</div>"
|
|
|
|
| 717 |
)
|
|
|
|
| 718 |
with gr.Row():
|
| 719 |
+
conf_sl = gr.Slider(0, 100, value=50, step=5, label="Your Confidence (0 = no idea Β· 100 = certain)")
|
| 720 |
+
ans_box = gr.Textbox(label="Your Answer", placeholder="Type your answerβ¦", lines=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 721 |
sub_btn = gr.Button("β
Submit Answer", variant="primary")
|
|
|
|
| 722 |
with gr.Row():
|
| 723 |
result_html = gr.HTML()
|
| 724 |
stats_html = gr.HTML()
|
|
|
|
| 727 |
get_btn.click(get_question, [dom_dd, diff_dd], [question_box, ans_box])
|
| 728 |
sub_btn.click(submit_answer, [conf_sl, ans_box], [result_html, stats_html, tip_md])
|
| 729 |
|
| 730 |
+
# ββ Tab 2 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 731 |
+
with gr.Tab("β ECHO vs AI"):
|
| 732 |
+
gr.HTML(_tab_header("β ECHO vs Overconfident AI",
|
| 733 |
+
"10-question head-to-head: calibrated ECHO vs AlwaysHigh baseline (90% on everything)", "#ff4466"))
|
|
|
|
|
|
|
|
|
|
| 734 |
with gr.Row():
|
| 735 |
scenario_dd = gr.Dropdown(
|
| 736 |
["Mixed","Math","Logic","Factual","Science","Medical","Coding","Creative"],
|
| 737 |
+
value="Mixed", label="Test Scenario")
|
|
|
|
| 738 |
run_btn = gr.Button("β Run 10 Questions", variant="primary")
|
|
|
|
| 739 |
with gr.Row():
|
| 740 |
+
with gr.Column(scale=3): cmp_html = gr.HTML()
|
| 741 |
+
with gr.Column(scale=2): mini_img = gr.Image(label="Live Reliability Diagram",
|
| 742 |
+
type="filepath", height=340)
|
|
|
|
|
|
|
|
|
|
| 743 |
run_btn.click(run_comparison, [scenario_dd], [cmp_html, mini_img])
|
| 744 |
|
| 745 |
+
# ββ Tab 3 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 746 |
with gr.Tab("𧬠Epistemic Fingerprint"):
|
| 747 |
+
gr.HTML(_tab_header("𧬠Epistemic Fingerprint",
|
| 748 |
+
"Radar chart of per-domain calibration β larger green area = better everywhere", "#a855f7"))
|
|
|
|
|
|
|
|
|
|
| 749 |
with gr.Row():
|
| 750 |
+
model_dd = gr.Dropdown(["ECHO Trained","Untrained","Heuristic"],
|
| 751 |
+
value="ECHO Trained", label="Model")
|
| 752 |
+
fp_btn = gr.Button("π¬ Generate Fingerprint", variant="primary")
|
|
|
|
|
|
|
|
|
|
| 753 |
with gr.Row():
|
| 754 |
with gr.Column(scale=3):
|
| 755 |
fp_img = gr.Image(label="Epistemic Fingerprint", type="filepath",
|
| 756 |
+
value=_img("fingerprint"), height=480)
|
| 757 |
with gr.Column(scale=2):
|
| 758 |
+
fp_bars = gr.HTML()
|
| 759 |
fp_insight = gr.HTML()
|
|
|
|
| 760 |
fp_btn.click(generate_fingerprint, [model_dd], [fp_img, fp_bars, fp_insight])
|
| 761 |
|
| 762 |
+
# ββ Tab 4 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 763 |
with gr.Tab("π Training Evidence"):
|
| 764 |
+
gr.HTML(_tab_header("π Training Evidence",
|
| 765 |
+
"6 plots generated from GRPO training β from overconfidence to precise calibration", "#ffd700"))
|
| 766 |
+
gr.HTML(_card(
|
| 767 |
+
"<div style='font-size:14px;font-weight:700;color:#00ffa3;margin-bottom:6px;'>β
Hero Plot β Reliability Diagram</div>"
|
| 768 |
+
"<div style='font-size:13px;color:#3a4a6a;line-height:1.6;'>"
|
| 769 |
+
"Untrained model (red): flat line far from diagonal β always overconfident. "
|
| 770 |
+
"ECHO trained (green): near-perfect calibration β hugs the diagonal."
|
| 771 |
+
"</div>",
|
| 772 |
+
"rgba(0,255,163,.15)"
|
| 773 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 774 |
gr.Image(value=_img("reliability"), label="Reliability Diagram", height=380)
|
|
|
|
| 775 |
with gr.Row():
|
| 776 |
with gr.Column():
|
| 777 |
+
gr.HTML("<div style='font-size:13px;font-weight:600;color:#4488ff;margin:10px 0 4px;'>π Training Curves</div>")
|
| 778 |
+
gr.Image(value=_img("training"), label="Training Curves", height=290)
|
|
|
|
|
|
|
|
|
|
| 779 |
with gr.Column():
|
| 780 |
+
gr.HTML("<div style='font-size:13px;font-weight:600;color:#a855f7;margin:10px 0 4px;'>𧬠Epistemic Fingerprint</div>")
|
| 781 |
+
gr.Image(value=_img("fingerprint"), label="Epistemic Fingerprint", height=290)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
with gr.Row():
|
| 783 |
with gr.Column():
|
| 784 |
+
gr.HTML("<div style='font-size:13px;font-weight:600;color:#ffd700;margin:10px 0 4px;'>π‘οΈ Calibration Heatmap</div>")
|
| 785 |
+
gr.Image(value=_img("heatmap"), label="Calibration Heatmap", height=290)
|
|
|
|
|
|
|
|
|
|
| 786 |
with gr.Column():
|
| 787 |
+
gr.HTML("<div style='font-size:13px;font-weight:600;color:#ff8c00;margin:10px 0 4px;'>π Confidence Distribution</div>")
|
| 788 |
+
gr.Image(value=_img("distribution"), label="Confidence Distribution", height=290)
|
| 789 |
+
gr.HTML("<div style='font-size:13px;font-weight:600;color:#ff4466;margin:10px 0 4px;'>π’ Domain Comparison</div>")
|
| 790 |
+
gr.Image(value=_img("domain"), label="Domain Comparison", height=300)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 791 |
regen_btn = gr.Button("π Regenerate All Plots", variant="secondary")
|
| 792 |
+
regen_out = gr.HTML()
|
|
|
|
| 793 |
def regen():
|
| 794 |
from training.evaluate import make_synthetic_pair, compare_and_plot
|
| 795 |
+
b, a = make_synthetic_pair()
|
| 796 |
+
compare_and_plot(a, {"Untrained": b})
|
| 797 |
+
return _card("<span style='color:#00ffa3;font-weight:600;'>β
All 6 plots regenerated</span>")
|
| 798 |
+
regen_btn.click(regen, outputs=[regen_out])
|
| 799 |
+
|
| 800 |
+
# ββ Tab 5 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 801 |
+
with gr.Tab("π Evaluation"):
|
| 802 |
+
gr.HTML(_tab_header("π Official OpenEnv Evaluation",
|
| 803 |
+
"3 tasks Γ 30 episodes = 90 episodes β validates ECHO meets all thresholds", "#ffd700"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 804 |
gr.HTML("""
|
| 805 |
+
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:10px;margin-bottom:8px;">
|
| 806 |
+
<div style="background:rgba(51,102,255,.06);border:1px solid rgba(51,102,255,.2);border-radius:8px;padding:13px 16px;">
|
| 807 |
+
<div style="color:#4488ff;font-weight:700;font-size:13px;font-family:Inter,sans-serif;">Task 1 β Easy</div>
|
| 808 |
+
<div style="color:#1a2a5a;font-size:12px;margin-top:4px;">ECE target: < 0.15</div>
|
|
|
|
| 809 |
</div>
|
| 810 |
+
<div style="background:rgba(255,215,0,.06);border:1px solid rgba(255,215,0,.2);border-radius:8px;padding:13px 16px;">
|
| 811 |
+
<div style="color:#ffd700;font-weight:700;font-size:13px;font-family:Inter,sans-serif;">Task 2 β Medium</div>
|
| 812 |
+
<div style="color:#2a2a00;font-size:12px;margin-top:4px;">ECE target: < 0.20</div>
|
|
|
|
| 813 |
</div>
|
| 814 |
+
<div style="background:rgba(168,85,247,.06);border:1px solid rgba(168,85,247,.2);border-radius:8px;padding:13px 16px;">
|
| 815 |
+
<div style="color:#a855f7;font-weight:700;font-size:13px;font-family:Inter,sans-serif;">Task 3 β Hard</div>
|
| 816 |
+
<div style="color:#1a0a3a;font-size:12px;margin-top:4px;">ECE target: < 0.25</div>
|
|
|
|
| 817 |
</div>
|
| 818 |
</div>""")
|
| 819 |
+
eval_btn = gr.Button("π Run Full Evaluation (90 episodes)", variant="primary")
|
| 820 |
result_html = gr.HTML()
|
| 821 |
+
with gr.Accordion("π Raw JSON", open=False):
|
| 822 |
json_out = gr.Code(language="json")
|
| 823 |
eval_btn.click(run_evaluation, outputs=[result_html, json_out])
|
| 824 |
|
| 825 |
+
# ββ Tab 6 ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 826 |
with gr.Tab("β‘ Live Training"):
|
| 827 |
+
gr.HTML(_tab_header("β‘ Live GRPO Training",
|
| 828 |
+
"Watch ECE drop in real-time β dashed lines show Task 1 & 2 pass thresholds", "#4488ff"))
|
|
|
|
|
|
|
|
|
|
| 829 |
with gr.Row():
|
| 830 |
+
lt_start = gr.Button("π Start Live Training Demo", variant="primary", scale=2)
|
| 831 |
+
lt_stop = gr.Button("βΉ Stop", variant="stop", scale=1)
|
| 832 |
+
lt_status = gr.Textbox(label="Training Log",
|
| 833 |
+
value="Ready β click Start to simulate GRPO training.",
|
| 834 |
+
lines=2, interactive=False)
|
| 835 |
+
lt_plot = gr.Image(label="ECE During Training", type="filepath", height=380)
|
| 836 |
+
lt_prog = gr.Slider(0, 100, value=0, label="Progress (%)", interactive=False)
|
| 837 |
+
lt_start.click(start_live_training, outputs=[lt_status, lt_plot, lt_prog])
|
| 838 |
+
lt_stop.click(stop_live_training, outputs=[lt_status])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 839 |
|
| 840 |
+
return demo, theme
|
| 841 |
|
| 842 |
|
| 843 |
def main():
    """Configure INFO-level logging, build the app and launch the Gradio server.

    The server binds to "0.0.0.0" on ``cfg.GRADIO_PORT`` with sharing disabled
    and error display enabled; the stylesheet, script and theme are passed to
    ``launch``.
    """
    import gradio as gr  # noqa: F401 - kept from the original; not referenced below

    logging.basicConfig(level=logging.INFO)

    demo, theme = build_app()

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": cfg.GRADIO_PORT,
        "share": False,
        "show_error": True,
        "css": _CSS,
        "js": _JS,
        "theme": theme,
    }
    demo.launch(**launch_options)
|
| 856 |
|
| 857 |
|