Vikaspandey582003 committed on
Commit
4e366bc
·
verified ·
1 Parent(s): 023ed75

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +4 -10
  2. ui/app.py +574 -883
app.py CHANGED
@@ -2,20 +2,14 @@
2
  import sys, os
3
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
4
 
5
- import gradio as gr
6
- from ui.app import build_app
7
-
8
- demo = build_app()
9
- from ui.app import _CSS
10
 
 
11
  demo.queue()
12
  demo.launch(
13
  server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
14
  server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
15
  css=_CSS,
16
- theme=gr.themes.Base(
17
- primary_hue=gr.themes.colors.blue,
18
- neutral_hue=gr.themes.colors.slate,
19
- font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
20
- ),
21
  )
 
2
  import sys, os
3
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
4
 
5
+ from ui.app import build_app, _CSS, _JS
 
 
 
 
6
 
7
+ demo, theme = build_app()
8
  demo.queue()
9
  demo.launch(
10
  server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
11
  server_port=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
12
  css=_CSS,
13
+ js=_JS,
14
+ theme=theme,
 
 
 
15
  )
ui/app.py CHANGED
@@ -1,13 +1,4 @@
1
- """
2
- ECHO ULTIMATE — Premium Gradio UI.
3
-
4
- Tab 1: 🎯 Live Challenge
5
- Tab 2: ⚔ ECHO vs Overconfident AI
6
- Tab 3: 🧬 Epistemic Fingerprint
7
- Tab 4: 📊 Training Evidence
8
- Tab 5: 🏆 Official Evaluation
9
- Tab 6: ⚡ Live Training
10
- """
11
 
12
  import json
13
  import logging
@@ -26,482 +17,310 @@ from config import cfg
26
  logger = logging.getLogger(__name__)
27
 
28
  # ─────────────────────────────────────────────────────────────────────────────
29
- # CSS
30
  # ─────────────────────────────────────────────────────────────────────────────
31
 
32
- _CSS = """
33
- @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500&display=swap');
34
-
35
- :root {
36
- --bg: #04040e;
37
- --surface: #080818;
38
- --card: #0c0c22;
39
- --card2: #0f0f2a;
40
- --border: rgba(80,100,255,0.18);
41
- --green: #00ffa3;
42
- --blue: #4488ff;
43
- --purple: #a855f7;
44
- --gold: #ffd700;
45
- --red: #ff4466;
46
- --orange: #ff8c00;
47
- --text: #c8d8ff;
48
- --dim: #4a5a8a;
49
- --glow-g: 0 0 24px rgba(0,255,163,0.35);
50
- --glow-b: 0 0 24px rgba(68,136,255,0.35);
51
- --glow-p: 0 0 24px rgba(168,85,247,0.35);
52
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- /* ── Base ── */
55
- *, *::before, *::after { box-sizing: border-box; }
56
 
57
- .gradio-container {
58
- background: var(--bg) !important;
59
- font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
60
- max-width: 1440px !important;
61
- margin: 0 auto !important;
62
- }
63
- body, html { background: var(--bg) !important; }
 
64
  footer { display: none !important; }
 
65
 
66
- /* ── Tabs ── */
67
- .tab-nav {
68
- background: var(--surface) !important;
69
- border-bottom: 1px solid var(--border) !important;
70
- padding: 0 8px !important;
71
- border-radius: 0 !important;
72
- gap: 4px !important;
73
- }
74
  .tab-nav button {
75
- color: var(--dim) !important;
76
- font-size: 13px !important;
77
- font-weight: 500 !important;
78
- padding: 12px 20px !important;
79
- border-radius: 0 !important;
80
- border-bottom: 2px solid transparent !important;
81
- transition: all 0.2s !important;
82
- background: transparent !important;
83
- letter-spacing: 0.02em !important;
84
- }
85
- .tab-nav button:hover {
86
- color: var(--text) !important;
87
- background: rgba(255,255,255,0.04) !important;
88
  }
 
89
  .tab-nav button.selected {
90
- color: var(--green) !important;
91
- border-bottom: 2px solid var(--green) !important;
92
- background: rgba(0,255,163,0.06) !important;
93
- text-shadow: 0 0 12px rgba(0,255,163,0.5) !important;
94
- }
95
-
96
- /* ── Blocks / panels ── */
97
- .block, .panel, .form {
98
- background: var(--card) !important;
99
- border: 1px solid var(--border) !important;
100
- border-radius: 12px !important;
101
- }
102
-
103
- /* ── Markdown text ── */
104
- .prose, .markdown, .prose p, .prose li, .prose td, .prose th {
105
- color: var(--text) !important;
106
- }
107
- .prose h1, .prose h2, .prose h3, .prose h4 {
108
- color: #fff !important;
109
- letter-spacing: -0.02em !important;
110
- }
111
- .prose code {
112
- background: rgba(68,136,255,0.12) !important;
113
- color: var(--blue) !important;
114
- border-radius: 4px !important;
115
- padding: 1px 6px !important;
116
- font-family: 'JetBrains Mono', monospace !important;
117
- font-size: 0.88em !important;
118
- }
119
- .prose table { border-collapse: collapse !important; width: 100% !important; }
120
- .prose thead tr { background: rgba(68,136,255,0.1) !important; }
121
- .prose th {
122
- color: var(--blue) !important;
123
- font-weight: 600 !important;
124
- text-transform: uppercase !important;
125
- font-size: 11px !important;
126
- letter-spacing: 0.08em !important;
127
- padding: 10px 14px !important;
128
- border-bottom: 1px solid var(--border) !important;
129
- }
130
- .prose td {
131
- padding: 9px 14px !important;
132
- border-bottom: 1px solid rgba(80,100,255,0.08) !important;
133
- font-size: 14px !important;
134
- }
135
- .prose tr:last-child td { border-bottom: none !important; }
136
- .prose blockquote {
137
- border-left: 3px solid var(--green) !important;
138
- background: rgba(0,255,163,0.05) !important;
139
- padding: 10px 16px !important;
140
- border-radius: 0 8px 8px 0 !important;
141
- margin: 12px 0 !important;
142
  }
143
 
144
- /* ── Buttons ── */
145
- button.lg, button.primary {
146
- background: linear-gradient(135deg, #1a6fff, #0044dd) !important;
147
- border: 1px solid rgba(68,136,255,0.4) !important;
148
- color: #fff !important;
149
- font-weight: 600 !important;
150
- font-size: 14px !important;
151
- border-radius: 8px !important;
152
- letter-spacing: 0.01em !important;
153
- box-shadow: 0 4px 20px rgba(68,136,255,0.3) !important;
154
- transition: all 0.2s ease !important;
155
- }
156
- button.lg:hover, button.primary:hover {
157
- transform: translateY(-2px) !important;
158
- box-shadow: 0 8px 30px rgba(68,136,255,0.5) !important;
159
- }
160
- button.secondary {
161
- background: rgba(255,255,255,0.05) !important;
162
- border: 1px solid var(--border) !important;
163
- color: var(--text) !important;
164
- border-radius: 8px !important;
165
- transition: all 0.2s !important;
166
  }
167
- button.secondary:hover {
168
- background: rgba(255,255,255,0.09) !important;
169
- border-color: rgba(80,100,255,0.4) !important;
170
- }
171
- button.stop {
172
- background: linear-gradient(135deg, #dd1133, #ff4466) !important;
173
- border: 1px solid rgba(255,68,102,0.4) !important;
174
- color: #fff !important;
175
- font-weight: 600 !important;
176
- border-radius: 8px !important;
177
- box-shadow: 0 4px 20px rgba(255,68,102,0.3) !important;
178
- transition: all 0.2s !important;
179
- }
180
- button.stop:hover { transform: translateY(-2px) !important; }
181
-
182
- /* ── Inputs ── */
183
- input[type=text], input[type=number], textarea, select {
184
- background: var(--surface) !important;
185
- border: 1px solid var(--border) !important;
186
- color: var(--text) !important;
187
- border-radius: 8px !important;
188
- font-family: 'Inter', sans-serif !important;
189
- font-size: 14px !important;
190
- transition: border-color 0.2s !important;
191
- }
192
- input:focus, textarea:focus {
193
- border-color: var(--blue) !important;
194
- box-shadow: 0 0 0 3px rgba(68,136,255,0.15) !important;
195
- outline: none !important;
196
  }
 
 
 
197
 
198
- /* ── Labels ── */
199
- .label-wrap span, label {
200
- color: var(--dim) !important;
201
- font-size: 11px !important;
202
- font-weight: 600 !important;
203
- text-transform: uppercase !important;
204
- letter-spacing: 0.08em !important;
205
- }
206
 
207
- /* ── Sliders ── */
208
- input[type=range] { accent-color: var(--green) !important; }
209
- .range-slider input { accent-color: var(--green) !important; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
 
211
- /* ── Dropdown ── */
212
- .dropdown {
213
- background: var(--surface) !important;
214
- border: 1px solid var(--border) !important;
215
- border-radius: 8px !important;
216
- }
217
- .dropdown .item { color: var(--text) !important; }
218
- .dropdown .item:hover { background: rgba(68,136,255,0.12) !important; }
219
-
220
- /* ── Code output ── */
221
- .code-wrap, pre, code {
222
- background: var(--surface) !important;
223
- color: var(--green) !important;
224
- font-family: 'JetBrains Mono', monospace !important;
225
- border: 1px solid var(--border) !important;
226
- border-radius: 8px !important;
227
- font-size: 12px !important;
228
- }
229
 
230
- /* ── Images ── */
231
- img, .image-container img {
232
- border-radius: 10px !important;
233
- border: 1px solid var(--border) !important;
234
  }
 
235
 
236
- /* ── Accordion ── */
237
- .accordion {
238
- background: var(--card) !important;
239
- border: 1px solid var(--border) !important;
240
- border-radius: 10px !important;
241
- }
242
- .accordion .label { color: var(--text) !important; font-weight: 500 !important; }
243
 
244
- /* ── Textbox ── */
245
- .textbox {
246
- background: var(--surface) !important;
247
- border: 1px solid var(--border) !important;
248
- border-radius: 8px !important;
249
- }
250
- .textbox textarea { background: transparent !important; color: var(--text) !important; }
251
-
252
- /* ── Custom hero HTML ── */
253
- #echo-hero-html {
254
- background: linear-gradient(135deg, #050515 0%, #080825 50%, #050515 100%) !important;
255
- border: 1px solid rgba(68,136,255,0.25) !important;
256
- border-radius: 16px !important;
257
- overflow: hidden !important;
258
- }
259
- #echo-hero-html .block { background: transparent !important; border: none !important; }
260
 
261
- /* ── Row gap fix ── */
262
- .row { gap: 12px !important; }
263
 
264
- /* ── Scrollbar ── */
265
- ::-webkit-scrollbar { width: 6px; height: 6px; }
266
- ::-webkit-scrollbar-track { background: var(--surface); }
267
- ::-webkit-scrollbar-thumb { background: var(--border); border-radius: 3px; }
268
- ::-webkit-scrollbar-thumb:hover { background: rgba(80,100,255,0.4); }
269
- """
270
 
271
- # ─────────────────────────────────────────────────────────────────────────────
272
- # HTML helpers
273
- # ─────────────────────────────────────────────────────────────────────────────
274
 
275
- _HERO_HTML = """
276
- <div style="
277
- background: linear-gradient(135deg, #04040e 0%, #080825 40%, #0a0520 100%);
278
- padding: 40px 40px 32px;
279
- position: relative;
280
- overflow: hidden;
281
- ">
282
- <!-- Grid overlay -->
283
- <div style="
284
- position: absolute; inset: 0;
285
- background-image: linear-gradient(rgba(68,136,255,0.04) 1px, transparent 1px),
286
- linear-gradient(90deg, rgba(68,136,255,0.04) 1px, transparent 1px);
287
- background-size: 40px 40px;
288
- pointer-events: none;
289
- "></div>
290
-
291
- <!-- Glow orbs -->
292
- <div style="
293
- position: absolute; top: -60px; right: -60px;
294
- width: 300px; height: 300px;
295
- background: radial-gradient(circle, rgba(68,136,255,0.12) 0%, transparent 70%);
296
- pointer-events: none;
297
- "></div>
298
- <div style="
299
- position: absolute; bottom: -80px; left: 100px;
300
- width: 250px; height: 250px;
301
- background: radial-gradient(circle, rgba(0,255,163,0.08) 0%, transparent 70%);
302
- pointer-events: none;
303
- "></div>
304
-
305
- <div style="position: relative; z-index: 1;">
306
  <!-- Badge -->
307
- <div style="display:inline-flex; align-items:center; gap:8px;
308
- background: rgba(0,255,163,0.1); border: 1px solid rgba(0,255,163,0.3);
309
- border-radius: 999px; padding: 5px 14px; margin-bottom: 20px;">
310
- <span style="width:7px;height:7px;border-radius:50%;background:#00ffa3;
311
- box-shadow:0 0 8px #00ffa3; display:inline-block;"></span>
312
- <span style="color:#00ffa3; font-size:12px; font-weight:600; letter-spacing:0.1em;
313
- font-family:'Inter',sans-serif;">OPENENV HACKATHON 2025</span>
314
  </div>
315
 
316
  <!-- Title -->
317
- <h1 style="
318
- margin: 0 0 12px;
319
- font-size: clamp(28px, 4vw, 48px);
320
- font-weight: 800;
321
- letter-spacing: -0.03em;
322
- line-height: 1.1;
323
- background: linear-gradient(135deg, #ffffff 0%, #a0c0ff 50%, #00ffa3 100%);
324
- -webkit-background-clip: text;
325
- -webkit-text-fill-color: transparent;
326
- background-clip: text;
327
- font-family: 'Inter', sans-serif;
328
- ">πŸͺž ECHO ULTIMATE</h1>
329
-
330
- <p style="
331
- margin: 0 0 28px;
332
- font-size: 18px;
333
- color: #6677aa;
334
- font-weight: 400;
335
- font-family: 'Inter', sans-serif;
336
- max-width: 600px;
337
- ">Training LLMs to accurately predict their own confidence via GRPO</p>
338
-
339
- <!-- Quote -->
340
- <div style="
341
- background: rgba(68,136,255,0.08);
342
- border-left: 3px solid #4488ff;
343
- border-radius: 0 8px 8px 0;
344
- padding: 10px 16px;
345
- margin-bottom: 32px;
346
- max-width: 620px;
347
- ">
348
- <p style="
349
- margin: 0;
350
- font-size: 14px;
351
- color: #8899cc;
352
- font-style: italic;
353
- font-family: 'Inter', sans-serif;
354
- ">The most dangerous AI isn't one that's wrong β€” it's one that's wrong <strong style="color:#a0c0ff;">and certain.</strong></p>
355
- </div>
356
-
357
- <!-- Metric cards row -->
358
- <div style="display:flex; gap:12px; flex-wrap:wrap;">
359
- <div style="
360
- background: linear-gradient(135deg, rgba(0,255,163,0.08), rgba(0,255,163,0.04));
361
- border: 1px solid rgba(0,255,163,0.25);
362
- border-radius: 12px; padding: 16px 22px; min-width: 130px;
363
- ">
364
- <div style="font-size:28px;font-weight:800;color:#00ffa3;
365
- font-family:'Inter',sans-serif;line-height:1;">0.080</div>
366
- <div style="font-size:11px;color:#3d5a44;font-weight:600;
367
- letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
368
- font-family:'Inter',sans-serif;">Final ECE</div>
369
  </div>
370
- <div style="
371
- background: linear-gradient(135deg, rgba(68,136,255,0.08), rgba(68,136,255,0.04));
372
- border: 1px solid rgba(68,136,255,0.25);
373
- border-radius: 12px; padding: 16px 22px; min-width: 130px;
374
- ">
375
- <div style="font-size:28px;font-weight:800;color:#4488ff;
376
- font-family:'Inter',sans-serif;line-height:1;">76%</div>
377
- <div style="font-size:11px;color:#3d4a6a;font-weight:600;
378
- letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
379
- font-family:'Inter',sans-serif;">ECE Reduction</div>
380
  </div>
381
- <div style="
382
- background: linear-gradient(135deg, rgba(168,85,247,0.08), rgba(168,85,247,0.04));
383
- border: 1px solid rgba(168,85,247,0.25);
384
- border-radius: 12px; padding: 16px 22px; min-width: 130px;
385
- ">
386
- <div style="font-size:28px;font-weight:800;color:#a855f7;
387
- font-family:'Inter',sans-serif;line-height:1;">7</div>
388
- <div style="font-size:11px;color:#4a3a6a;font-weight:600;
389
- letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
390
- font-family:'Inter',sans-serif;">Domains</div>
391
  </div>
392
- <div style="
393
- background: linear-gradient(135deg, rgba(255,215,0,0.08), rgba(255,215,0,0.04));
394
- border: 1px solid rgba(255,215,0,0.25);
395
- border-radius: 12px; padding: 16px 22px; min-width: 130px;
396
- ">
397
- <div style="font-size:28px;font-weight:800;color:#ffd700;
398
- font-family:'Inter',sans-serif;line-height:1;">3,500</div>
399
- <div style="font-size:11px;color:#5a5020;font-weight:600;
400
- letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
401
- font-family:'Inter',sans-serif;">GRPO Steps</div>
402
  </div>
403
- <div style="
404
- background: linear-gradient(135deg, rgba(255,68,102,0.08), rgba(255,68,102,0.04));
405
- border: 1px solid rgba(255,68,102,0.25);
406
- border-radius: 12px; padding: 16px 22px; min-width: 130px;
407
- ">
408
- <div style="font-size:28px;font-weight:800;color:#ff4466;
409
- font-family:'Inter',sans-serif;line-height:1;">5</div>
410
- <div style="font-size:11px;color:#5a2030;font-weight:600;
411
- letter-spacing:0.08em;text-transform:uppercase;margin-top:4px;
412
- font-family:'Inter',sans-serif;">Metrics</div>
413
  </div>
 
414
  </div>
415
  </div>
416
  </div>
 
 
 
417
  """
418
 
419
 
420
- def _section_header(title: str, subtitle: str = "", color: str = "#4488ff") -> str:
421
  return f"""
422
- <div style="
423
- background: linear-gradient(135deg, rgba(10,10,35,0.9), rgba(8,8,28,0.9));
424
- border: 1px solid rgba(80,100,255,0.15);
425
- border-left: 3px solid {color};
426
- border-radius: 0 10px 10px 0;
427
- padding: 14px 20px;
428
- margin-bottom: 4px;
429
- ">
430
- <div style="font-size:16px; font-weight:700; color:#fff;
431
- font-family:'Inter',sans-serif; letter-spacing:-0.01em;">{title}</div>
432
- {"" if not subtitle else f'<div style="font-size:13px; color:#4a5a8a; margin-top:3px; font-family:Inter,sans-serif;">{subtitle}</div>'}
433
  </div>"""
434
 
435
 
436
- def _metric_pill(label: str, value: str, color: str = "#4488ff") -> str:
437
- return f"""<span style="
438
- display:inline-flex; align-items:center; gap:6px;
439
- background: rgba(255,255,255,0.04); border: 1px solid rgba(80,100,255,0.2);
440
- border-radius: 999px; padding: 4px 12px; margin: 3px;
441
- font-family:'Inter',sans-serif; font-size:13px; color:#8899bb;
442
- "><span style="color:{color}; font-weight:700;">{value}</span> {label}</span>"""
443
 
444
 
445
  # ─────────────────────────────────────────────────────────────────────────────
446
- # Tab 6: Live Training
447
  # ─────────────────────────────────────────────────────────────────────────────
448
 
449
  _training_state: dict = {"running": False, "steps": [], "ece_values": [], "stop": False}
450
 
451
 
452
- def _make_live_plot(steps: list, ece_values: list):
453
  fig, ax = plt.subplots(figsize=(10, 4.5), facecolor="#04040e")
454
- ax.set_facecolor("#080820")
455
-
456
  if steps:
457
- xs = np.array(steps); ys = np.array(ece_values)
458
- ax.fill_between(xs, ys, alpha=0.12, color="#00ffa3", zorder=2)
459
- ax.plot(xs, ys, color="#00ffa3", linewidth=2.5,
460
- marker="o", markersize=5, markerfacecolor="#00ffa3",
461
- markeredgecolor="#04040e", markeredgewidth=1.5, zorder=4)
462
-
463
- # last point label
464
- ax.annotate(
465
- f" ECE = {ys[-1]:.4f}",
466
- (xs[-1], ys[-1]), color="#00ffa3", fontsize=10,
467
- fontweight="bold", va="center",
468
- )
469
-
470
- ax.axhline(y=0.15, color="#ff4466", linestyle="--", alpha=0.7, linewidth=1.5,
471
- label="Task 1 target ECE < 0.15", zorder=3)
472
- ax.axhline(y=0.20, color="#ffbb00", linestyle="--", alpha=0.7, linewidth=1.5,
473
- label="Task 2 target ECE < 0.20", zorder=3)
474
-
475
- ax.set_xlabel("Training Step", color="#4a5a8a", fontsize=11, labelpad=8)
476
- ax.set_ylabel("ECE (↓ lower = better)", color="#4a5a8a", fontsize=11, labelpad=8)
477
- ax.set_title("GRPO Calibration Training — Real-Time ECE",
478
- color="#c0d0ff", fontsize=13, fontweight="bold", pad=14)
479
- ax.tick_params(colors="#3a4a6a", labelsize=10)
480
- ax.set_ylim(0, 0.50)
481
- ax.set_xlim(-2, 105)
482
-
483
- for spine in ax.spines.values():
484
- spine.set_color("#1a1a3a")
485
-
486
- ax.grid(True, linestyle="--", alpha=0.15, color="#2a2a4a")
487
- ax.legend(facecolor="#080820", labelcolor="#8899bb",
488
- edgecolor="#1a1a3a", fontsize=10, loc="upper right")
489
  plt.tight_layout()
490
-
491
  tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
492
- plt.savefig(tmp.name, dpi=120, bbox_inches="tight", facecolor="#04040e")
493
  plt.close(fig)
494
  return tmp.name
495
 
496
 
497
- def _run_live_training_thread():
498
  import random
499
  _training_state.update({"running": True, "steps": [], "ece_values": [], "stop": False})
500
  ece = 0.42
501
  for step in range(0, 101, 10):
502
- if _training_state["stop"]:
503
- break
504
- ece = max(0.07, ece - random.uniform(0.02, 0.05) + random.uniform(-0.008, 0.008))
505
  _training_state["steps"].append(step)
506
  _training_state["ece_values"].append(round(ece, 4))
507
  time.sleep(1.5)
@@ -509,28 +328,22 @@ def _run_live_training_thread():
509
 
510
 
511
  def start_live_training():
512
- t = threading.Thread(target=_run_live_training_thread, daemon=True)
513
- t.start()
514
  for _ in range(60):
515
  time.sleep(1.5)
516
- steps = _training_state["steps"][:]
517
- ece_v = _training_state["ece_values"][:]
518
- n = len(steps)
519
  prog = round((n / 11) * 100)
520
-
521
- if steps:
522
- pct_drop = ((ece_v[0] - ece_v[-1]) / ece_v[0] * 100) if len(ece_v) > 1 else 0
523
- status = f"Step {steps[-1]:>3}/100 │ ECE {ece_v[-1]:.4f} │ ↓{pct_drop:.1f}% from start"
524
  else:
525
  status = "Initializing GRPO trainer…"
526
-
527
  if not _training_state["running"] and n > 0:
528
- status = (f"✅ Training complete! "
529
- f"ECE {ece_v[0]:.4f} → {ece_v[-1]:.4f} "
530
- f"(↓{(ece_v[0]-ece_v[-1])/ece_v[0]*100:.1f}%)")
531
- yield status, _make_live_plot(steps, ece_v), prog
532
  return
533
- yield status, _make_live_plot(steps, ece_v), prog
534
 
535
 
536
  def stop_live_training():
@@ -539,18 +352,15 @@ def stop_live_training():
539
 
540
 
541
  # ─────────────────────────────────────────────────────────────────────────────
542
- # Shared state
543
  # ─────────────────────────────────────────────────────────────────────────────
544
 
545
- _task_bank = None
546
- _env = None
547
- _live_hist = None
548
 
549
 
550
  def _init():
551
  global _task_bank, _env, _live_hist
552
- if _env is not None:
553
- return
554
  from env.task_bank import TaskBank
555
  from env.echo_env import EchoEnv
556
  from env.reward import RewardHistory
@@ -563,402 +373,352 @@ def _init():
563
  _current_task: dict = {}
564
 
565
  # ─────────────────────────────────────────────────────────────────────────────
566
- # Tab 1
567
  # ─────────────────────────────────────────────────────────────────────────────
568
 
569
- def get_question(domain: str, difficulty: str) -> tuple:
570
  global _current_task
571
  _init()
572
  task = _task_bank.get_task(domain.lower(), difficulty.lower())
573
  _current_task = task
574
- q = (
575
- f"**Domain:** `{domain}` &nbsp;·&nbsp; **Difficulty:** `{difficulty}`\n\n"
576
- f"---\n\n{task['question']}"
577
- )
578
  return q, ""
579
 
580
 
581
- def submit_answer(confidence: int, user_answer: str) -> tuple:
582
  if not _current_task:
583
- return "⚠️ Get a question first!", "", ""
584
  from env.reward import compute_reward
585
  task = _current_task
586
- rb = compute_reward(confidence, user_answer, task["answer"],
587
- task.get("answer_aliases", []), task["domain"])
588
- _live_hist.append(confidence, rb.was_correct, task["domain"],
589
- task["difficulty"], rb.total)
590
  snap = _live_hist.get_training_snapshot()
591
 
592
- icon = "βœ… Correct!" if rb.was_correct else "❌ Incorrect"
593
- color = "#00ffa3" if rb.was_correct else "#ff4466"
594
-
595
- result_md = (
596
- f"<div style='background:rgba(255,255,255,0.03);border:1px solid {color}33;"
597
- f"border-left:3px solid {color};border-radius:8px;padding:16px;'>"
598
- f"<div style='font-size:18px;font-weight:700;color:{color};margin-bottom:12px;'>{icon}</div>"
599
- f"<div style='color:#8899bb;font-size:13px;margin-bottom:4px;'>Correct answer</div>"
600
- f"<div style='color:#c0d0ff;font-size:15px;font-weight:600;"
601
- f"font-family:JetBrains Mono,monospace;margin-bottom:16px;'>{task['answer']}</div>"
602
- f"<hr style='border:none;border-top:1px solid rgba(80,100,255,0.1);margin:12px 0;'/>"
603
- f"<div style='font-size:12px;font-weight:700;color:#4a5a8a;"
604
- f"text-transform:uppercase;letter-spacing:0.08em;margin-bottom:8px;'>Reward Breakdown</div>"
605
- f"<div style='display:grid;grid-template-columns:1fr 1fr;gap:8px;'>"
606
- f"<div style='background:rgba(68,136,255,0.06);border-radius:6px;padding:8px 12px;'>"
607
- f"<div style='color:#4a5a8a;font-size:11px;'>Accuracy</div>"
608
- f"<div style='color:#4488ff;font-weight:700;'>{rb.accuracy_score:.2f} × 0.40</div></div>"
609
- f"<div style='background:rgba(0,255,163,0.06);border-radius:6px;padding:8px 12px;'>"
610
- f"<div style='color:#4a5a8a;font-size:11px;'>Calibration (Brier)</div>"
611
- f"<div style='color:#00ffa3;font-weight:700;'>{rb.brier_reward_val:.2f} × 0.40</div></div>"
612
- f"<div style='background:rgba(255,68,102,0.06);border-radius:6px;padding:8px 12px;'>"
613
- f"<div style='color:#4a5a8a;font-size:11px;'>Overconf penalty</div>"
614
- f"<div style='color:#ff4466;font-weight:700;'>{rb.overconfidence_penalty_val:.3f}</div></div>"
615
- f"<div style='background:rgba(255,215,0,0.06);border-radius:6px;padding:8px 12px;'>"
616
- f"<div style='color:#4a5a8a;font-size:11px;'>Total reward</div>"
617
- f"<div style='color:#ffd700;font-weight:800;font-size:16px;'>{rb.total:+.3f}</div></div>"
618
- f"</div></div>"
619
- )
620
 
621
- n_ep = snap.get('episodes', len(_live_hist))
622
- ece_val = snap['ece']
623
- ece_color = "#00ffa3" if ece_val < 0.20 else ("#ffbb00" if ece_val < 0.35 else "#ff4466")
624
-
625
- stats_md = (
626
- f"<div style='background:rgba(255,255,255,0.02);border:1px solid rgba(80,100,255,0.15);"
627
- f"border-radius:8px;padding:16px;'>"
628
- f"<div style='font-size:12px;font-weight:700;color:#4a5a8a;"
629
- f"text-transform:uppercase;letter-spacing:0.08em;margin-bottom:12px;'>"
630
- f"Your Stats — {n_ep} questions</div>"
631
- f"<div style='display:flex;flex-direction:column;gap:8px;'>"
632
- f"<div style='display:flex;justify-content:space-between;align-items:center;'>"
633
- f"<span style='color:#6677aa;font-size:13px;'>Accuracy</span>"
634
- f"<span style='color:#c0d0ff;font-weight:600;'>{snap['accuracy']:.1%}</span></div>"
635
- f"<div style='display:flex;justify-content:space-between;align-items:center;'>"
636
- f"<span style='color:#6677aa;font-size:13px;'>ECE</span>"
637
- f"<span style='color:{ece_color};font-weight:700;'>{ece_val:.3f}</span></div>"
638
- f"<div style='display:flex;justify-content:space-between;align-items:center;'>"
639
- f"<span style='color:#6677aa;font-size:13px;'>Mean confidence</span>"
640
- f"<span style='color:#c0d0ff;font-weight:600;'>{snap['mean_confidence']:.0f}%</span></div>"
641
- f"<div style='display:flex;justify-content:space-between;align-items:center;'>"
642
- f"<span style='color:#6677aa;font-size:13px;'>Overconf rate</span>"
643
- f"<span style='color:#ff8c00;font-weight:600;'>{snap['overconfidence_rate']:.1%}</span></div>"
644
- f"</div></div>"
645
- )
646
 
647
- if rb.overconfidence_penalty_val < -0.1:
648
- tip = ("⚠️ **Overconfident!** You were highly certain but wrong. "
649
- "This is exactly what ECHO trains against.")
650
  elif rb.was_correct and confidence >= 65:
651
- tip = "🎯 **Well calibrated** — confident and correct. That's the target behavior."
652
  elif not rb.was_correct and confidence < 40:
653
- tip = "🎯 **Good self-awareness** — you sensed your uncertainty correctly."
654
- elif rb.underconfidence_penalty_val < -0.1:
655
- tip = "🤔 **Underconfident** — you got it right but doubted yourself. Trust your knowledge more."
656
  else:
657
  tip = ""
658
-
659
- return result_md, stats_md, tip
660
 
661
 
662
  # ─────────────────────────────────────────────────────────────────────────────
663
- # Tab 2
664
  # ─────────────────────────────────────────────────────────────────────────────
665
 
666
- def run_comparison(scenario: str) -> tuple:
667
  _init()
668
  from core.baseline import AlwaysHighAgent, HeuristicAgent
669
  from env.reward import compute_reward, RewardHistory
670
  from env.parser import format_prompt, parse_response
671
- from core.metrics import compute_report
672
 
673
- domain_map = {
674
- "Math": "math", "Logic": "logic", "Factual": "factual",
675
- "Science": "science", "Medical": "medical", "Coding": "coding",
676
- "Creative": "creative", "Mixed": None,
677
- }
678
  domain = domain_map.get(scenario)
679
- n = 10
680
-
681
- baseline = AlwaysHighAgent()
682
- echo_agent = HeuristicAgent()
683
  echo_h, base_h = RewardHistory(), RewardHistory()
684
- rows_html = ""
685
 
686
- for i in range(n):
687
- d = domain or cfg.DOMAINS[i % len(cfg.DOMAINS)]
688
  task = _task_bank.get_task(d, "medium")
689
  prompt = format_prompt(task["question"], d, "medium")
690
-
691
- ea = echo_agent(prompt); ep = parse_response(ea)
692
- ba = baseline(prompt); bp = parse_response(ba)
693
- er = compute_reward(ep.confidence, ep.answer, task["answer"],
694
- task.get("answer_aliases", []), d)
695
- br = compute_reward(bp.confidence, bp.answer, task["answer"],
696
- task.get("answer_aliases", []), d)
697
-
698
  echo_h.append(ep.confidence, er.was_correct, d, "medium", er.total)
699
  base_h.append(bp.confidence, br.was_correct, d, "medium", br.total)
700
 
701
- ei = "βœ…" if er.was_correct else "❌"
702
- bi = "βœ…" if br.was_correct else "❌"
703
  ec = "#00ffa3" if er.was_correct else "#ff4466"
704
  bc = "#ff4466" if not br.was_correct else "#00ffa3"
 
 
705
 
706
- rows_html += (
707
- f"<div style='display:grid;grid-template-columns:1fr 1fr;gap:8px;margin-bottom:8px;'>"
708
- f"<div style='background:rgba(0,255,163,0.04);border:1px solid rgba(0,255,163,0.12);"
709
- f"border-radius:8px;padding:10px 14px;'>"
710
- f"<div style='font-size:11px;color:#3d5a44;text-transform:uppercase;"
711
- f"letter-spacing:0.08em;margin-bottom:4px;'>ECHO β€” {d} Q{i+1}</div>"
712
- f"<div style='color:#8899bb;font-size:12px;margin-bottom:6px;'>"
713
- f"{task['question'][:65]}…</div>"
714
- f"<div style='display:flex;gap:8px;align-items:center;'>"
715
- f"<span style='color:{ec};font-weight:700;font-size:15px;'>{ei}</span>"
716
- f"<span style='background:rgba(0,255,163,0.1);border-radius:4px;"
717
- f"padding:2px 8px;color:#00ffa3;font-size:12px;font-weight:600;'>"
718
- f"conf: {ep.confidence}%</span></div></div>"
719
- f"<div style='background:rgba(255,68,102,0.04);border:1px solid rgba(255,68,102,0.12);"
720
- f"border-radius:8px;padding:10px 14px;'>"
721
- f"<div style='font-size:11px;color:#5a2030;text-transform:uppercase;"
722
- f"letter-spacing:0.08em;margin-bottom:4px;'>OVERCONFIDENT AI β€” Q{i+1}</div>"
723
- f"<div style='color:#8899bb;font-size:12px;margin-bottom:6px;'>"
724
- f"{task['question'][:65]}…</div>"
725
- f"<div style='display:flex;gap:8px;align-items:center;'>"
726
- f"<span style='color:{bc};font-weight:700;font-size:15px;'>{bi}</span>"
727
- f"<span style='background:rgba(255,68,102,0.1);border-radius:4px;"
728
- f"padding:2px 8px;color:#ff4466;font-size:12px;font-weight:600;'>"
729
- f"conf: {bp.confidence}%</span></div></div>"
730
- f"</div>"
731
- )
 
732
 
 
733
  em = echo_h.get_training_snapshot()
734
  bm = base_h.get_training_snapshot()
735
- delta_ece = abs(em['ece'] - bm['ece'])
736
-
737
- summary_html = (
738
- f"<div style='background:rgba(255,255,255,0.02);border:1px solid rgba(80,100,255,0.15);"
739
- f"border-radius:10px;padding:20px;margin-top:4px;'>"
740
- f"<div style='font-size:12px;font-weight:700;color:#4a5a8a;"
741
- f"text-transform:uppercase;letter-spacing:0.08em;margin-bottom:16px;'>Results Summary</div>"
742
- f"<div style='display:grid;grid-template-columns:repeat(4,1fr);gap:10px;margin-bottom:16px;'>"
743
- + _metric_card("ECE", f"{em['ece']:.3f}", f"{bm['ece']:.3f}", "#00ffa3", "#ff4466", "lower = better")
744
- + _metric_card("Accuracy", f"{em['accuracy']:.1%}", f"{bm['accuracy']:.1%}", "#00ffa3", "#ff4466", "")
745
- + _metric_card("Mean Conf", f"{em['mean_confidence']:.0f}%", f"{bm['mean_confidence']:.0f}%", "#4488ff", "#ff8c00", "")
746
- + _metric_card("Overconf Rate", f"{em['overconfidence_rate']:.1%}", f"{bm['overconfidence_rate']:.1%}", "#00ffa3", "#ff4466", "")
747
- + f"</div>"
748
- f"<div style='background:linear-gradient(135deg,rgba(0,255,163,0.08),rgba(68,136,255,0.05));"
749
- f"border:1px solid rgba(0,255,163,0.2);border-radius:8px;padding:12px 16px;text-align:center;'>"
750
- f"<span style='color:#00ffa3;font-size:18px;font-weight:800;'>"
751
- f"ECHO is {delta_ece:.0%} better calibrated</span>"
752
- f"<span style='color:#4a5a8a;font-size:13px;'> than the overconfident baseline</span>"
753
- f"</div></div>"
754
- )
755
 
756
- # Mini reliability diagram
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
757
  erep = echo_h.get_calibration_report()
758
  brep = base_h.get_calibration_report()
759
  fig, ax = plt.subplots(figsize=(7, 4.5), facecolor="#04040e")
760
- ax.set_facecolor("#080820")
761
- ax.plot([0,100],[0,100],"--",color="#334455",alpha=0.6,linewidth=1.5,label="Perfect calibration",zorder=1)
762
  for rep, col, lbl in [(erep,"#00ffa3","ECHO"),(brep,"#ff4466","Overconfident AI")]:
763
- bd = rep.bin_data
764
- xs = sorted(bd.keys())
765
  ys = [bd[b]["accuracy"]*100 for b in xs]
766
- if xs:
767
- ax.plot(xs, ys, "-o", color=col, linewidth=2.5, markersize=7,
768
- label=f"{lbl} ECE={rep.ece:.2f}", zorder=3,
769
- markerfacecolor=col, markeredgecolor="#04040e", markeredgewidth=1.5)
770
- ax.set_xlabel("Stated Confidence (%)", color="#4a5a8a", fontsize=11)
771
- ax.set_ylabel("Actual Accuracy (%)", color="#4a5a8a", fontsize=11)
772
- ax.set_title("Live Reliability Diagram", color="#c0d0ff", fontsize=13, fontweight="bold")
773
- ax.tick_params(colors="#3a4a6a"); ax.set_xlim(0,100); ax.set_ylim(0,100)
774
- for spine in ax.spines.values(): spine.set_color("#1a1a3a")
775
- ax.grid(True, linestyle="--", alpha=0.12, color="#2a2a4a")
776
- ax.legend(facecolor="#080820", labelcolor="#8899bb", edgecolor="#1a1a3a", fontsize=10)
777
  plt.tight_layout()
778
  tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
779
- plt.savefig(tmp.name, dpi=120, bbox_inches="tight", facecolor="#04040e")
780
  plt.close(fig)
781
 
782
- return "<div style='display:flex;flex-direction:column;gap:4px;'>" + rows_html + "</div>" + summary_html, tmp.name
783
-
784
-
785
- def _metric_card(label, echo_val, base_val, echo_col, base_col, note):
786
- return (
787
- f"<div style='background:rgba(255,255,255,0.02);border:1px solid rgba(80,100,255,0.1);"
788
- f"border-radius:8px;padding:12px;text-align:center;'>"
789
- f"<div style='font-size:11px;color:#3a4a6a;text-transform:uppercase;"
790
- f"letter-spacing:0.07em;margin-bottom:6px;'>{label}</div>"
791
- f"<div style='display:flex;justify-content:center;gap:12px;align-items:baseline;'>"
792
- f"<span style='color:{echo_col};font-size:16px;font-weight:800;'>{echo_val}</span>"
793
- f"<span style='color:#2a3a5a;font-size:12px;'>vs</span>"
794
- f"<span style='color:{base_col};font-size:16px;font-weight:800;'>{base_val}</span>"
795
- f"</div>"
796
- f"{'<div style=color:#2a3a5a;font-size:10px;margin-top:3px;>'+note+'</div>' if note else ''}"
797
- f"</div>"
798
- )
799
 
800
 
801
  # ─────────────────────────────────────────────────────────────────────────────
802
- # Tab 3
803
  # ────────────────────────────────���────────────────────────────────────────────
804
 
805
- def generate_fingerprint(model_label: str) -> tuple:
806
  from core.epistemic_fingerprint import _make_synthetic_fingerprint, plot_radar
807
  _init()
808
- offset_map = {"Untrained": 0.30, "ECHO Trained": 0.0, "Heuristic": 0.15}
809
- fp = _make_synthetic_fingerprint(offset_map.get(model_label, 0.15), model_label)
810
- baseline_fp = _make_synthetic_fingerprint(0.30, "Untrained")
811
-
812
  tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
813
- plot_radar(baseline_fp, fp, tmp.name)
814
-
815
- strongest = fp.strongest_domain.capitalize()
816
- weakest = fp.weakest_domain.capitalize()
817
 
818
- rows_html = (
819
- "<div style='display:flex;flex-direction:column;gap:6px;'>"
820
- )
821
  for d in cfg.DOMAINS:
822
- score = fp.domain_scores.get(d, 0.5)
823
- ece_v = 1 - score
824
- col = "#00ffa3" if score > 0.75 else ("#ffbb00" if score > 0.55 else "#ff4466")
825
- pct = int(score * 100)
826
- rows_html += (
827
- f"<div style='display:flex;align-items:center;gap:10px;'>"
828
- f"<div style='width:80px;color:#6677aa;font-size:13px;font-weight:500;"
829
- f"text-align:right;'>{d.capitalize()}</div>"
830
- f"<div style='flex:1;background:rgba(255,255,255,0.05);border-radius:4px;height:8px;'>"
831
- f"<div style='width:{pct}%;height:100%;border-radius:4px;"
832
- f"background:{col};box-shadow:0 0 8px {col}55;'></div></div>"
833
- f"<div style='width:40px;color:{col};font-size:12px;font-weight:700;"
834
- f"text-align:right;'>{score:.2f}</div>"
835
- f"<div style='width:40px;color:#3a4a6a;font-size:11px;"
836
- f"text-align:right;'>ECE {ece_v:.2f}</div>"
837
- f"</div>"
838
- )
839
- rows_html += "</div>"
840
-
841
- insight_html = (
842
- f"<div style='background:rgba(168,85,247,0.06);border:1px solid rgba(168,85,247,0.2);"
843
- f"border-radius:8px;padding:14px 16px;margin-top:4px;'>"
844
- f"<div style='font-size:13px;color:#c0d0ff;line-height:1.6;'>"
845
- f"<strong style='color:#a855f7;'>{model_label}</strong> is strongest in "
846
- f"<strong style='color:#00ffa3;'>{strongest}</strong> and most uncertain in "
847
- f"<strong style='color:#ff4466;'>{weakest}</strong>.</div>"
848
- f"<div style='margin-top:8px;font-size:14px;color:#6677aa;'>"
849
- f"Overall ECE: <strong style='color:#ffd700;'>{fp.overall_ece:.3f}</strong></div></div>"
850
- )
851
 
852
- return tmp.name, rows_html, insight_html
853
 
854
 
855
  # ─────────────────────────────────────────────────────────────────────────────
856
- # Tab 5
857
  # ─────────────────────────────────────────────────────────────────────────────
858
 
859
- def run_evaluation() -> tuple:
860
  _init()
861
  from core.tasks import TASKS, TaskRunner, TASKS_BY_ID
862
  from core.baseline import HeuristicAgent
863
- runner = TaskRunner()
864
- agent = HeuristicAgent()
865
- result = runner.run_all(agent, _task_bank)
866
 
867
- rows_html = ""
868
  for r in result.tasks:
869
- t = TASKS_BY_ID[r.task_id]
870
- ok = r.passed
871
- col = "#00ffa3" if ok else "#ff4466"
872
- bg = "rgba(0,255,163,0.05)" if ok else "rgba(255,68,102,0.05)"
873
- border = "rgba(0,255,163,0.2)" if ok else "rgba(255,68,102,0.2)"
874
- icon = "βœ… PASS" if ok else "❌ FAIL"
875
- pct = min(int(r.score / t.pass_threshold * 100), 100)
876
- rows_html += (
877
- f"<div style='background:{bg};border:1px solid {border};"
878
- f"border-radius:10px;padding:16px 20px;margin-bottom:8px;'>"
879
- f"<div style='display:flex;justify-content:space-between;align-items:center;"
880
- f"margin-bottom:10px;'>"
881
- f"<div>"
882
- f"<span style='color:{col};font-weight:700;font-size:15px;'>{icon}</span>"
883
- f"<span style='color:#c0d0ff;font-size:14px;font-weight:600;margin-left:10px;'>"
884
- f"{t.name}</span>"
885
- f"</div>"
886
- f"<div style='font-family:JetBrains Mono,monospace;font-size:13px;'>"
887
- f"<span style='color:{col};font-weight:700;'>{r.score:.3f}</span>"
888
- f"<span style='color:#2a3a5a;'> / {t.pass_threshold}</span>"
889
- f"</div></div>"
890
- f"<div style='background:rgba(255,255,255,0.04);border-radius:4px;height:6px;'>"
891
- f"<div style='width:{pct}%;height:100%;border-radius:4px;"
892
- f"background:{col};'></div></div>"
893
- f"</div>"
894
- )
895
 
896
- verdict_color = "#00ffa3" if result.overall_pass else "#ff4466"
897
- verdict_html = (
898
- f"<div style='background:linear-gradient(135deg,rgba(0,255,163,0.08),rgba(68,136,255,0.05));"
899
- f"border:1px solid {verdict_color}44;border-radius:10px;padding:16px 20px;"
900
- f"text-align:center;margin-top:4px;'>"
901
- f"<div style='font-size:20px;font-weight:800;color:{verdict_color};'>"
902
- f"{'πŸ† ALL TASKS PASSED' if result.overall_pass else '⚠️ Some tasks need improvement'}"
903
- f"</div></div>"
904
- )
905
 
906
  json_str = json.dumps(result.to_dict(), indent=2, default=str)
907
- return rows_html + verdict_html, json_str
908
 
909
 
910
  # ─────────────────────────────────────────────────────────────────────────────
911
- # Build app
912
  # ─────────────────────────────────────────────────────────────────────────────
913
 
914
  def build_app():
915
  import gradio as gr
916
 
917
  plots = {k: f"{cfg.PLOTS_DIR}/{v}" for k, v in {
918
- "reliability": "reliability_diagram.png",
919
- "training": "training_curves.png",
920
- "fingerprint": "epistemic_fingerprint.png",
921
- "heatmap": "calibration_heatmap.png",
922
- "distribution": "confidence_distribution.png",
923
- "domain": "domain_comparison.png",
924
  }.items()}
925
- def _img(key): return plots[key] if Path(plots[key]).exists() else None
 
 
926
 
927
  with gr.Blocks(title="ECHO ULTIMATE") as demo:
928
 
929
  # ── Hero ─────────────────────────────────────────────────────────────
930
- gr.HTML(_HERO_HTML)
931
 
932
- # ── Tab 1: Live Challenge ─────────────────────────────────────────────
933
  with gr.Tab("🎯 Live Challenge"):
934
- gr.HTML(_section_header(
935
- "🎯 Live Challenge",
936
- "Answer questions with a confidence score β€” discover how well-calibrated you are",
937
- "#00ffa3"
938
- ))
939
  with gr.Row():
940
- dom_dd = gr.Dropdown(
941
- ["Math","Logic","Factual","Science","Medical","Coding","Creative"],
942
- value="Math", label="Domain"
943
- )
944
  diff_dd = gr.Dropdown(["Easy","Medium","Hard"], value="Easy", label="Difficulty")
945
- get_btn = gr.Button("🎲 Get Question", variant="primary", scale=1)
946
-
947
  question_box = gr.Markdown(
948
- "<div style='color:#3a4a6a;font-style:italic;padding:12px;'>"
949
- "Select a domain and difficulty, then click Get Question.</div>"
950
  )
951
-
952
  with gr.Row():
953
- with gr.Column(scale=2):
954
- conf_sl = gr.Slider(0, 100, value=50, step=5,
955
- label="Confidence (0 = no idea Β· 100 = certain)")
956
- with gr.Column(scale=3):
957
- ans_box = gr.Textbox(label="Your Answer", placeholder="Type your answer…",
958
- lines=1)
959
-
960
  sub_btn = gr.Button("βœ… Submit Answer", variant="primary")
961
-
962
  with gr.Row():
963
  result_html = gr.HTML()
964
  stats_html = gr.HTML()
@@ -967,200 +727,131 @@ def build_app():
967
  get_btn.click(get_question, [dom_dd, diff_dd], [question_box, ans_box])
968
  sub_btn.click(submit_answer, [conf_sl, ans_box], [result_html, stats_html, tip_md])
969
 
970
- # ── Tab 2: Battle ─────────────────────────────────────────────────────
971
- with gr.Tab("βš” ECHO vs Overconfident AI"):
972
- gr.HTML(_section_header(
973
- "βš” ECHO vs Overconfident AI",
974
- "10-question head-to-head: calibrated ECHO vs AlwaysHigh baseline (always 90% confident)",
975
- "#ff4466"
976
- ))
977
  with gr.Row():
978
  scenario_dd = gr.Dropdown(
979
  ["Mixed","Math","Logic","Factual","Science","Medical","Coding","Creative"],
980
- value="Mixed", label="Test Scenario"
981
- )
982
  run_btn = gr.Button("βš” Run 10 Questions", variant="primary")
983
-
984
  with gr.Row():
985
- with gr.Column(scale=3):
986
- cmp_html = gr.HTML()
987
- with gr.Column(scale=2):
988
- mini_img = gr.Image(label="Live Reliability Diagram", type="filepath",
989
- show_label=True, height=320)
990
-
991
  run_btn.click(run_comparison, [scenario_dd], [cmp_html, mini_img])
992
 
993
- # ── Tab 3: Fingerprint ────────────────────────────────────────────────
994
  with gr.Tab("🧬 Epistemic Fingerprint"):
995
- gr.HTML(_section_header(
996
- "🧬 Epistemic Fingerprint",
997
- "Radar chart of calibration across all 7 domains β€” larger green = better everywhere",
998
- "#a855f7"
999
- ))
1000
  with gr.Row():
1001
- model_dd = gr.Dropdown(
1002
- ["ECHO Trained","Untrained","Heuristic"],
1003
- value="ECHO Trained", label="Model"
1004
- )
1005
- fp_btn = gr.Button("πŸ”¬ Generate Fingerprint", variant="primary")
1006
-
1007
  with gr.Row():
1008
  with gr.Column(scale=3):
1009
  fp_img = gr.Image(label="Epistemic Fingerprint", type="filepath",
1010
- value=_img("fingerprint"), height=480)
1011
  with gr.Column(scale=2):
1012
- fp_bars = gr.HTML()
1013
  fp_insight = gr.HTML()
1014
-
1015
  fp_btn.click(generate_fingerprint, [model_dd], [fp_img, fp_bars, fp_insight])
1016
 
1017
- # ── Tab 4: Training Evidence ──────────────────────────────────────────
1018
  with gr.Tab("πŸ“Š Training Evidence"):
1019
- gr.HTML(_section_header(
1020
- "πŸ“Š Training Evidence",
1021
- "6 plots generated from GRPO training β€” from random overconfidence to precise calibration",
1022
- "#ffd700"
 
 
 
 
 
1023
  ))
1024
-
1025
- gr.HTML("""
1026
- <div style='background:rgba(0,255,163,0.05);border:1px solid rgba(0,255,163,0.2);
1027
- border-radius:10px;padding:16px 20px;margin-bottom:8px;'>
1028
- <div style='font-size:15px;font-weight:700;color:#00ffa3;margin-bottom:6px;'>
1029
- β˜… Hero Plot β€” Reliability Diagram</div>
1030
- <div style='color:#6677aa;font-size:13px;'>
1031
- The smoking gun. Untrained model (red): flat line far from the diagonal β€” always overconfident.
1032
- ECHO trained (green): hugs the perfect calibration diagonal.
1033
- </div>
1034
- </div>""")
1035
  gr.Image(value=_img("reliability"), label="Reliability Diagram", height=380)
1036
-
1037
  with gr.Row():
1038
  with gr.Column():
1039
- gr.HTML("<div style='font-size:13px;font-weight:600;color:#4488ff;"
1040
- "margin:8px 0 4px;'>πŸ“ˆ Training Curves</div>"
1041
- "<div style='font-size:12px;color:#4a5a8a;margin-bottom:8px;'>"
1042
- "ECE drops 0.34 β†’ 0.08 across 3 curriculum phases</div>")
1043
- gr.Image(value=_img("training"), label="Training Curves", height=300)
1044
  with gr.Column():
1045
- gr.HTML("<div style='font-size:13px;font-weight:600;color:#a855f7;"
1046
- "margin:8px 0 4px;'>🧬 Epistemic Fingerprint</div>"
1047
- "<div style='font-size:12px;color:#4a5a8a;margin-bottom:8px;'>"
1048
- "Domain-level calibration β€” green fills every axis</div>")
1049
- gr.Image(value=_img("fingerprint"), label="Epistemic Fingerprint", height=300)
1050
-
1051
  with gr.Row():
1052
  with gr.Column():
1053
- gr.HTML("<div style='font-size:13px;font-weight:600;color:#ffd700;"
1054
- "margin:8px 0 4px;'>🌑️ Calibration Heatmap</div>"
1055
- "<div style='font-size:12px;color:#4a5a8a;margin-bottom:8px;'>"
1056
- "7 domains Γ— 3 difficulties β€” red=bad, green=good</div>")
1057
- gr.Image(value=_img("heatmap"), label="Calibration Heatmap", height=300)
1058
  with gr.Column():
1059
- gr.HTML("<div style='font-size:13px;font-weight:600;color:#ff8c00;"
1060
- "margin:8px 0 4px;'>πŸ“Š Confidence Distribution</div>"
1061
- "<div style='font-size:12px;color:#4a5a8a;margin-bottom:8px;'>"
1062
- "Untrained: spike at 85–95%. ECHO: spread = actual accuracy</div>")
1063
- gr.Image(value=_img("distribution"), label="Confidence Distribution", height=300)
1064
-
1065
- gr.HTML("<div style='font-size:13px;font-weight:600;color:#ff4466;"
1066
- "margin:8px 0 4px;'>🏒 Domain Comparison</div>"
1067
- "<div style='font-size:12px;color:#4a5a8a;margin-bottom:8px;'>"
1068
- "ECE improvement across all 7 domains</div>")
1069
- gr.Image(value=_img("domain"), label="Domain Comparison", height=320)
1070
-
1071
  regen_btn = gr.Button("πŸ”„ Regenerate All Plots", variant="secondary")
1072
- regen_status = gr.HTML()
1073
-
1074
  def regen():
1075
  from training.evaluate import make_synthetic_pair, compare_and_plot
1076
- before, after = make_synthetic_pair()
1077
- paths = compare_and_plot(after, {"Untrained": before})
1078
- html = ("<div style='color:#00ffa3;font-size:13px;font-weight:600;"
1079
- "padding:8px 12px;background:rgba(0,255,163,0.06);"
1080
- "border-radius:6px;'>βœ… All 6 plots regenerated</div>")
1081
- return html
1082
-
1083
- regen_btn.click(regen, outputs=[regen_status])
1084
-
1085
- # ── Tab 5: Evaluation ─────────────────────────────────────────────────
1086
- with gr.Tab("πŸ† Official Evaluation"):
1087
- gr.HTML(_section_header(
1088
- "πŸ† Official OpenEnv Evaluation",
1089
- "3 tasks Γ— 30 episodes β€” validates ECHO meets the benchmark thresholds",
1090
- "#ffd700"
1091
- ))
1092
  gr.HTML("""
1093
- <div style='display:grid;grid-template-columns:repeat(3,1fr);gap:10px;margin-bottom:8px;'>
1094
- <div style='background:rgba(68,136,255,0.06);border:1px solid rgba(68,136,255,0.15);
1095
- border-radius:8px;padding:12px 16px;'>
1096
- <div style='color:#4488ff;font-weight:700;font-size:13px;'>Task 1 β€” Easy</div>
1097
- <div style='color:#3a4a6a;font-size:12px;margin-top:4px;'>ECE target: &lt; 0.15</div>
1098
  </div>
1099
- <div style='background:rgba(255,187,0,0.06);border:1px solid rgba(255,187,0,0.15);
1100
- border-radius:8px;padding:12px 16px;'>
1101
- <div style='color:#ffbb00;font-weight:700;font-size:13px;'>Task 2 β€” Medium</div>
1102
- <div style='color:#3a4a6a;font-size:12px;margin-top:4px;'>ECE target: &lt; 0.20</div>
1103
  </div>
1104
- <div style='background:rgba(168,85,247,0.06);border:1px solid rgba(168,85,247,0.15);
1105
- border-radius:8px;padding:12px 16px;'>
1106
- <div style='color:#a855f7;font-weight:700;font-size:13px;'>Task 3 β€” Hard</div>
1107
- <div style='color:#3a4a6a;font-size:12px;margin-top:4px;'>ECE target: &lt; 0.25</div>
1108
  </div>
1109
  </div>""")
1110
- eval_btn = gr.Button("πŸš€ Run Full Evaluation (90 episodes)", variant="primary")
1111
  result_html = gr.HTML()
1112
- with gr.Accordion("πŸ“„ Raw JSON output", open=False):
1113
  json_out = gr.Code(language="json")
1114
  eval_btn.click(run_evaluation, outputs=[result_html, json_out])
1115
 
1116
- # ── Tab 6: Live Training ───────────────────────────────────────────────
1117
  with gr.Tab("⚑ Live Training"):
1118
- gr.HTML(_section_header(
1119
- "⚑ Live GRPO Training",
1120
- "Watch ECE drop in real-time as the model trains. Dashed lines = pass thresholds.",
1121
- "#4488ff"
1122
- ))
1123
  with gr.Row():
1124
- lt_start_btn = gr.Button("πŸš€ Start Live Training Demo", variant="primary", scale=2)
1125
- lt_stop_btn = gr.Button("⏹ Stop", variant="stop", scale=1)
1126
-
1127
- lt_status = gr.Textbox(
1128
- label="Training Log",
1129
- value="Ready β€” click Start to simulate GRPO training.",
1130
- lines=2, interactive=False,
1131
- elem_classes=["terminal-box"],
1132
- )
1133
- lt_plot = gr.Image(
1134
- label="ECE During Training",
1135
- type="filepath", height=380,
1136
- )
1137
- lt_progress = gr.Slider(
1138
- minimum=0, maximum=100, value=0,
1139
- label="Progress (%)", interactive=False,
1140
- )
1141
-
1142
- lt_start_btn.click(start_live_training,
1143
- outputs=[lt_status, lt_plot, lt_progress])
1144
- lt_stop_btn.click(stop_live_training, outputs=[lt_status])
1145
 
1146
- return demo
1147
 
1148
 
1149
  def main():
1150
  import gradio as gr
1151
  logging.basicConfig(level=logging.INFO)
1152
- demo = build_app()
1153
  demo.launch(
1154
  server_name="0.0.0.0",
1155
  server_port=cfg.GRADIO_PORT,
1156
  share=False,
1157
  show_error=True,
1158
  css=_CSS,
1159
- theme=gr.themes.Base(
1160
- primary_hue=gr.themes.colors.blue,
1161
- neutral_hue=gr.themes.colors.slate,
1162
- font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
1163
- ),
1164
  )
1165
 
1166
 
 
1
+ """ECHO ULTIMATE β€” Premium Gradio 6 UI."""
 
 
 
 
 
 
 
 
 
2
 
3
  import json
4
  import logging
 
17
  logger = logging.getLogger(__name__)
18
 
19
  # ─────────────────────────────────────────────────────────────────────────────
20
+ # Theme (Gradio 6 β€” all colors via .set())
21
  # ─────────────────────────────────────────────────────────────────────────────
22
 
23
+ def _echo_theme():
24
+ import gradio as gr
25
+ return (
26
+ gr.themes.Base(
27
+ primary_hue=gr.themes.colors.blue,
28
+ secondary_hue=gr.themes.colors.cyan,
29
+ neutral_hue=gr.themes.colors.slate,
30
+ font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
31
+ font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
32
+ )
33
+ .set(
34
+ # Page
35
+ body_background_fill="#04040e",
36
+ body_text_color="#b0c4ee",
37
+ body_text_color_subdued="#3a4a6a",
38
+ # Panels / blocks
39
+ background_fill_primary="#09091d",
40
+ background_fill_secondary="#060613",
41
+ block_background_fill="#09091d",
42
+ block_border_color="#1a1a3a",
43
+ block_border_width="1px",
44
+ block_label_background_fill="transparent",
45
+ block_label_text_color="#3a4a6a",
46
+ block_label_text_size="*text_xs",
47
+ block_title_text_color="#8090bb",
48
+ block_padding="16px",
49
+ # Inputs
50
+ input_background_fill="#060613",
51
+ input_border_color="#1a1a3a",
52
+ input_border_color_focus="#3366ff",
53
+ input_shadow_focus="0 0 0 3px rgba(51,102,255,0.2)",
54
+ input_placeholder_color="#2a3a5a",
55
+ # (input_text_color not a valid Gradio 6 theme var β€” handled via CSS)
56
+ # Buttons
57
+ button_large_padding="12px 24px",
58
+ button_large_text_size="*text_md",
59
+ button_primary_background_fill="linear-gradient(135deg,#1155ee,#0033bb)",
60
+ button_primary_background_fill_hover="linear-gradient(135deg,#2266ff,#0044cc)",
61
+ button_primary_text_color="#ffffff",
62
+ button_primary_border_color="rgba(51,102,255,0.6)",
63
+ button_secondary_background_fill="rgba(255,255,255,0.04)",
64
+ button_secondary_background_fill_hover="rgba(255,255,255,0.08)",
65
+ button_secondary_text_color="#8090bb",
66
+ button_secondary_border_color="#1a1a3a",
67
+ button_cancel_background_fill="linear-gradient(135deg,#bb1133,#dd2244)",
68
+ button_cancel_background_fill_hover="linear-gradient(135deg,#cc2244,#ee3355)",
69
+ button_cancel_text_color="#ffffff",
70
+ button_cancel_border_color="rgba(255,50,80,0.5)",
71
+ # Slider
72
+ slider_color="#00ffa3",
73
+ slider_color_dark="#00ffa3",
74
+ # Dropdown
75
+ checkbox_background_color="#09091d",
76
+ checkbox_background_color_selected="#1155ee",
77
+ checkbox_border_color="#1a1a3a",
78
+ # Tables
79
+ table_even_background_fill="rgba(30,40,100,0.15)",
80
+ table_odd_background_fill="transparent",
81
+ # Shadow
82
+ shadow_drop="0 2px 12px rgba(0,0,0,0.5)",
83
+ shadow_drop_lg="0 4px 24px rgba(0,0,0,0.6)",
84
+ # Color accent
85
+ color_accent="#00ffa3",
86
+ color_accent_soft="rgba(0,255,163,0.1)",
87
+ link_text_color="#4488ff",
88
+ link_text_color_active="#00ffa3",
89
+ link_text_color_visited="#3377ee",
90
+ )
91
+ )
92
 
 
 
93
 
94
+ # ─────────────────────────────────────────────────────────────────────────────
95
+ # CSS (only for custom HTML sections + tab bar overrides)
96
+ # ─────────────────────────────────────────────────────────────────────────────
97
+
98
+ _CSS = """
99
+ @import url('https://fonts.googleapis.com/css2?family=Inter:ital,wght@0,300;0,400;0,500;0,600;0,700;0,800;0,900;1,400&family=JetBrains+Mono:wght@400;500;600&display=swap');
100
+
101
+ html, body { background: #04040e !important; }
102
  footer { display: none !important; }
103
+ .gradio-container { max-width: 1440px !important; margin: 0 auto !important; }
104
 
105
+ /* ── Active tab indicator ── */
106
+ .tab-nav { border-bottom: 1px solid #1a1a3a !important; background: #060613 !important; }
 
 
 
 
 
 
107
  .tab-nav button {
108
+ color: #2a3a6a !important; font-weight: 500 !important;
109
+ font-size: 13px !important; transition: all .18s !important;
110
+ border-radius: 0 !important; border-bottom: 2px solid transparent !important;
 
 
 
 
 
 
 
 
 
 
111
  }
112
+ .tab-nav button:hover { color: #6677aa !important; background: rgba(255,255,255,.03) !important; }
113
  .tab-nav button.selected {
114
+ color: #00ffa3 !important;
115
+ border-bottom: 2px solid #00ffa3 !important;
116
+ background: rgba(0,255,163,.06) !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  }
118
 
119
+ /* ── Primary button glow ── */
120
+ button.lg.primary, .lg.primary {
121
+ box-shadow: 0 4px 20px rgba(51,102,255,.4) !important;
122
+ transition: all .2s !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
+ button.lg.primary:hover { transform: translateY(-2px) !important; box-shadow: 0 8px 32px rgba(51,102,255,.6) !important; }
125
+
126
+ /* ── Cancel/stop button ── */
127
+ button.lg.stop { box-shadow: 0 4px 20px rgba(255,50,80,.35) !important; }
128
+
129
+ /* ── Textarea / textbox ── */
130
+ textarea, input[type=text] { font-family: 'Inter', sans-serif !important; }
131
+
132
+ /* ── Input text color (not a Gradio 6 theme var) ── */
133
+ input, textarea, select, .svelte-1f354aw { color: #c0d0ff !important; }
134
+ label span { color: #3a4a6a !important; }
135
+
136
+ /* ── Slim scrollbar ── */
137
+ ::-webkit-scrollbar { width: 5px; height: 5px; }
138
+ ::-webkit-scrollbar-track { background: #04040e; }
139
+ ::-webkit-scrollbar-thumb { background: #1a1a3a; border-radius: 3px; }
140
+ ::-webkit-scrollbar-thumb:hover { background: #2a2a5a; }
141
+
142
+ /* ── Markdown table ── */
143
+ table { width: 100% !important; border-collapse: collapse !important; }
144
+ thead tr { background: rgba(51,102,255,.12) !important; }
145
+ th {
146
+ color: #3366ff !important; font-size: 11px !important; font-weight: 700 !important;
147
+ text-transform: uppercase !important; letter-spacing: .08em !important;
148
+ padding: 10px 14px !important; border-bottom: 1px solid #1a1a3a !important;
 
 
 
 
149
  }
150
+ td { padding: 9px 14px !important; border-bottom: 1px solid rgba(30,40,100,.3) !important; color: #8090bb !important; font-size: 13px !important; }
151
+ tr:last-child td { border-bottom: none !important; }
152
+ """
153
 
154
+ # ─────────────────────────────────────────────────────────────────────────────
155
+ # JavaScript
156
+ # ─────────────────────────────────────────────────────────────────────────────
 
 
 
 
 
157
 
158
+ _JS = """
159
+ function echoInit() {
160
+ // Animate .echo-counter elements once
161
+ function animateCounter(el) {
162
+ var end = parseFloat(el.dataset.end);
163
+ var decimals = parseInt(el.dataset.decimals || 0);
164
+ var suffix = el.dataset.suffix || '';
165
+ var start = 0, duration = 1400, startTs = null;
166
+ function step(ts) {
167
+ if (!startTs) startTs = ts;
168
+ var p = Math.min((ts - startTs) / duration, 1);
169
+ var ease = 1 - Math.pow(1 - p, 4);
170
+ var val = start + (end - start) * ease;
171
+ el.textContent = (decimals > 0 ? val.toFixed(decimals) : Math.floor(val)) + suffix;
172
+ if (p < 1) requestAnimationFrame(step);
173
+ }
174
+ requestAnimationFrame(step);
175
+ }
176
 
177
+ setTimeout(function() {
178
+ document.querySelectorAll('.echo-counter').forEach(function(el) {
179
+ if (!el.dataset.animated) { el.dataset.animated = '1'; animateCounter(el); }
180
+ });
181
+ }, 400);
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
+ return [];
 
 
 
184
  }
185
+ """
186
 
187
+ # ─────────────────────────────────────────────────────────────────────────────
188
+ # HTML building blocks
189
+ # ─────────────────────────────────────────────────────────────────────────────
 
 
 
 
190
 
191
+ HERO = """
192
+ <div style="position:relative;overflow:hidden;background:linear-gradient(160deg,#04040e 0%,#070720 45%,#04040e 100%);border-bottom:1px solid #1a1a3a;padding:48px 48px 40px;">
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
+ <!-- Dot grid -->
195
+ <div style="position:absolute;inset:0;background-image:radial-gradient(circle,rgba(51,102,255,.18) 1px,transparent 1px);background-size:32px 32px;pointer-events:none;"></div>
196
 
197
+ <!-- Blue glow top-right -->
198
+ <div style="position:absolute;top:-120px;right:-80px;width:480px;height:480px;background:radial-gradient(circle,rgba(51,102,255,.1) 0%,transparent 65%);pointer-events:none;"></div>
199
+ <!-- Green glow bottom-left -->
200
+ <div style="position:absolute;bottom:-100px;left:80px;width:360px;height:360px;background:radial-gradient(circle,rgba(0,255,163,.07) 0%,transparent 65%);pointer-events:none;"></div>
 
 
201
 
202
+ <div style="position:relative;z-index:1;">
 
 
203
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  <!-- Badge -->
205
+ <div style="display:inline-flex;align-items:center;gap:8px;background:rgba(0,255,163,.08);border:1px solid rgba(0,255,163,.28);border-radius:999px;padding:5px 16px;margin-bottom:24px;">
206
+ <span style="width:7px;height:7px;border-radius:50%;background:#00ffa3;box-shadow:0 0 8px #00ffa3;display:inline-block;animation:pulse 2s infinite;"></span>
207
+ <span style="color:#00ffa3;font-size:11px;font-weight:700;letter-spacing:.14em;font-family:Inter,sans-serif;">OPENENV HACKATHON 2025</span>
 
 
 
 
208
  </div>
209
 
210
  <!-- Title -->
211
+ <h1 style="margin:0 0 10px;font-size:clamp(32px,5vw,56px);font-weight:900;line-height:1.05;letter-spacing:-.03em;font-family:Inter,sans-serif;background:linear-gradient(135deg,#fff 0%,#88aaff 45%,#00ffa3 100%);-webkit-background-clip:text;-webkit-text-fill-color:transparent;background-clip:text;">
212
+ πŸͺž ECHO ULTIMATE
213
+ </h1>
214
+
215
+ <p style="margin:0 0 8px;font-size:20px;color:#4a5a8a;font-weight:300;font-family:Inter,sans-serif;letter-spacing:-.01em;">
216
+ Training LLMs to accurately predict their own confidence
217
+ </p>
218
+ <p style="margin:0 0 36px;font-size:14px;color:#2a3a5a;font-family:Inter,sans-serif;">
219
+ via GRPO Β· 7 domains Β· 5 calibration metrics Β· 3-phase curriculum Β· Phase 4 adversarial self-play
220
+ </p>
221
+
222
+ <!-- Stat cards -->
223
+ <div style="display:flex;gap:12px;flex-wrap:wrap;">
224
+
225
+ <div style="background:rgba(0,255,163,.07);border:1px solid rgba(0,255,163,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
226
+ <div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#00ffa3;line-height:1;">
227
+ <span class="echo-counter" data-end="0.080" data-decimals="3">0.080</span>
228
+ </div>
229
+ <div style="font-size:10px;color:#1a4a2a;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">Final ECE</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  </div>
231
+
232
+ <div style="background:rgba(51,102,255,.07);border:1px solid rgba(51,102,255,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
233
+ <div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#4488ff;line-height:1;">
234
+ <span class="echo-counter" data-end="76" data-suffix="%">0%</span>
235
+ </div>
236
+ <div style="font-size:10px;color:#1a2a5a;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">ECE Reduction</div>
 
 
 
 
237
  </div>
238
+
239
+ <div style="background:rgba(168,85,247,.07);border:1px solid rgba(168,85,247,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
240
+ <div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#a855f7;line-height:1;">
241
+ <span class="echo-counter" data-end="7">0</span>
242
+ </div>
243
+ <div style="font-size:10px;color:#2a1a4a;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">Domains</div>
 
 
 
 
244
  </div>
245
+
246
+ <div style="background:rgba(255,215,0,.07);border:1px solid rgba(255,215,0,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
247
+ <div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#ffd700;line-height:1;">
248
+ <span class="echo-counter" data-end="3500">0</span>
249
+ </div>
250
+ <div style="font-size:10px;color:#3a3000;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">GRPO Steps</div>
 
 
 
 
251
  </div>
252
+
253
+ <div style="background:rgba(255,68,102,.07);border:1px solid rgba(255,68,102,.22);border-radius:12px;padding:18px 24px;min-width:120px;">
254
+ <div style="font-size:30px;font-weight:900;font-family:Inter,sans-serif;color:#ff4466;line-height:1;">
255
+ <span class="echo-counter" data-end="5">0</span>
256
+ </div>
257
+ <div style="font-size:10px;color:#3a1020;font-weight:700;letter-spacing:.1em;text-transform:uppercase;margin-top:5px;font-family:Inter,sans-serif;">Metrics</div>
 
 
 
 
258
  </div>
259
+
260
  </div>
261
  </div>
262
  </div>
263
+ <style>
264
+ @keyframes pulse { 0%,100%{opacity:1;box-shadow:0 0 6px #00ffa3} 50%{opacity:.5;box-shadow:0 0 14px #00ffa3} }
265
+ </style>
266
  """
267
 
268
 
269
+ def _tab_header(title: str, sub: str, accent: str = "#4488ff") -> str:
270
  return f"""
271
+ <div style="border-left:3px solid {accent};padding:10px 16px 10px 18px;margin-bottom:4px;
272
+ background:linear-gradient(90deg,rgba(10,10,30,.6) 0%,transparent 100%);border-radius:0 8px 8px 0;">
273
+ <div style="font-size:17px;font-weight:700;color:#d0dcff;font-family:Inter,sans-serif;letter-spacing:-.01em;">{title}</div>
274
+ <div style="font-size:13px;color:#3a4a6a;margin-top:3px;font-family:Inter,sans-serif;">{sub}</div>
 
 
 
 
 
 
 
275
  </div>"""
276
 
277
 
278
+ def _card(content: str, border_color: str = "rgba(30,40,100,.4)") -> str:
279
+ return (f'<div style="background:#09091d;border:1px solid {border_color};'
280
+ f'border-radius:10px;padding:16px 20px;margin:4px 0;">{content}</div>')
 
 
 
 
281
 
282
 
283
  # ─────────────────────────────────────────────────────────────────────────────
284
+ # Tab 6 β€” Live Training
285
  # ──────────────────────────────────────────────────���──────────────────────────
286
 
287
# State shared between the background worker (_train_thread) and the polling
# generator (start_live_training). "running" and "stop" are boolean flags;
# "steps" and "ece_values" are parallel lists the worker appends to.
_training_state: dict = {"running": False, "steps": [], "ece_values": [], "stop": False}
288
 
289
 
290
def _live_plot(steps, ece_values):
    """Render the ECE-versus-training-step curve to a PNG file.

    Args:
        steps: Training step numbers recorded so far (may be empty).
        ece_values: ECE value recorded for each entry of ``steps``.

    Returns:
        Path of a temporary ``.png`` file. The file is created with
        ``delete=False``, so it is not removed automatically.
    """
    # Draw through the figure object rather than pyplot's "current figure":
    # callbacks may run concurrently and the pyplot state is global.
    fig, ax = plt.subplots(figsize=(10, 4.5), facecolor="#04040e")
    try:
        ax.set_facecolor("#07071a")

        # Only plot points that have both a step and a value.
        count = min(len(steps), len(ece_values))
        if count:
            xs = np.array(steps[:count])
            ys = np.array(ece_values[:count])
            ax.fill_between(xs, ys, alpha=.10, color="#00ffa3", zorder=2)
            ax.plot(xs, ys, color="#00ffa3", lw=2.5, marker="o", ms=5,
                    mfc="#00ffa3", mec="#04040e", mew=1.5, zorder=4)
            # Label the most recent point with its value.
            ax.annotate(f" {ys[-1]:.4f}", (xs[-1], ys[-1]),
                        color="#00ffa3", fontsize=11, fontweight="bold", va="center")

        # Horizontal reference lines for the two ECE thresholds.
        ax.axhline(.15, color="#ff4466", ls="--", lw=1.5, alpha=.7, label="Task 1 threshold ECE < 0.15")
        ax.axhline(.20, color="#ffbb00", ls="--", lw=1.5, alpha=.7, label="Task 2 threshold ECE < 0.20")
        ax.set_xlabel("Training Step", color="#3a4a6a", fontsize=11, labelpad=8)
        ax.set_ylabel("ECE (↓ lower = better)", color="#3a4a6a", fontsize=11, labelpad=8)
        ax.set_title("Live GRPO Training — ECE Curve", color="#8090bb", fontsize=13, fontweight="bold", pad=14)
        ax.tick_params(colors="#2a3a5a", labelsize=10)
        ax.set_ylim(0, .50)
        ax.set_xlim(-2, 105)
        for spine in ax.spines.values():
            spine.set_color("#12122a")
        ax.grid(True, ls="--", alpha=.1, color="#1a1a3a")
        ax.legend(facecolor="#07071a", labelcolor="#5a6a8a", edgecolor="#12122a", fontsize=10, loc="upper right")
        fig.tight_layout()

        # Reserve a file name, then close the handle before matplotlib writes
        # to it so no descriptor is leaked per render.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            out_path = tmp.name
        fig.savefig(out_path, dpi=130, bbox_inches="tight", facecolor="#04040e")
    finally:
        plt.close(fig)
    return out_path
315
 
316
 
317
+ def _train_thread():
318
  import random
319
  _training_state.update({"running": True, "steps": [], "ece_values": [], "stop": False})
320
  ece = 0.42
321
  for step in range(0, 101, 10):
322
+ if _training_state["stop"]: break
323
+ ece = max(.07, ece - random.uniform(.02, .05) + random.uniform(-.007, .007))
 
324
  _training_state["steps"].append(step)
325
  _training_state["ece_values"].append(round(ece, 4))
326
  time.sleep(1.5)
 
328
 
329
 
330
def start_live_training():
    """Start the simulated training run and stream its progress.

    Spawns ``_train_thread`` in a daemon thread (unless a run is already in
    progress) and then polls ``_training_state`` every 1.5 s, for at most 60
    polls.

    Yields:
        ``(status_text, plot_path, progress_percent)`` tuples. The generator
        stops after yielding the final "Done" status once the worker is no
        longer running.
    """
    total_points = 11  # the worker records steps 0, 10, ..., 100

    # Starting a second worker while one is active would make both append to
    # the same lists. NOTE(review): relies on the worker resetting "running"
    # to False when it ends, as the completion check below does too — confirm.
    if not _training_state["running"]:
        threading.Thread(target=_train_thread, daemon=True).start()

    plot_path = None
    plotted_points = -1
    for _ in range(60):
        time.sleep(1.5)
        # Read the flag before the data: if the worker had already stopped,
        # the lists copied afterwards are final.
        still_running = _training_state["running"]
        steps = list(_training_state["steps"])
        eces = list(_training_state["ece_values"])
        # The worker appends to the two lists separately, so a copy can catch
        # them at different lengths; keep only complete (step, ece) pairs.
        n = min(len(steps), len(eces))
        steps, eces = steps[:n], eces[:n]
        prog = min(100, round((n / total_points) * 100))

        # Re-render only when a new point arrived; every render creates a new
        # temporary PNG.
        if n != plotted_points:
            plot_path = _live_plot(steps, eces)
            plotted_points = n

        finished = n > 0 and not still_running
        if n == 0:
            status = "Initializing GRPO trainer…"
        else:
            first, last = eces[0], eces[-1]
            drop_pct = (first - last) / first * 100 if n > 1 and first else 0.0
            if finished:
                status = f"✅ Done! ECE {first:.4f} → {last:.4f} (↓{drop_pct:.1f}%)"
            else:
                status = f"Step {steps[-1]:>3}/100 │ ECE {last:.4f} │ ↓{drop_pct:.1f}% from start"

        yield status, plot_path, prog
        if finished:
            return
347
 
348
 
349
  def stop_live_training():
 
352
 
353
 
354
  # ─────────────────────────────────────────────────────────────────────────────
355
+ # Shared state + init
356
  # ─────────────────────────────────────────────────────────────────────────────
357
 
358
# Lazily-created shared objects; they stay None until _init() assigns them.
_task_bank = _env = _live_hist = None
 
 
359
 
360
 
361
  def _init():
362
  global _task_bank, _env, _live_hist
363
+ if _env is not None: return
 
364
  from env.task_bank import TaskBank
365
  from env.echo_env import EchoEnv
366
  from env.reward import RewardHistory
 
373
# Task most recently served by get_question(); submit_answer() scores against
# it. Empty until the first question is requested.
_current_task: dict = {}
374
 
375
  # ─────────────────────────────────────────────────────────────────────────────
376
+ # Tab 1 logic
377
  # ─────────────────────────────────────────────────────────────────────────────
378
 
379
def get_question(domain, difficulty):
    """Fetch a task for the chosen domain/difficulty and remember it.

    Args:
        domain: Domain label from the dropdown (lower-cased for the lookup).
        difficulty: Difficulty label from the dropdown (lower-cased likewise).

    Returns:
        ``(markdown, "")`` – the question rendered as Markdown, and an empty
        string that clears the answer box.
    """
    global _current_task
    _init()
    _current_task = _task_bank.get_task(domain.lower(), difficulty.lower())
    heading = f"**`{domain}`** · **`{difficulty}`**"
    markdown = f"{heading}\n\n---\n\n{_current_task['question']}"
    return markdown, ""
386
 
387
 
388
def submit_answer(confidence, user_answer):
    """Score the user's answer to the current question and render feedback.

    Args:
        confidence: Confidence value from the slider (0–100).
        user_answer: Free-text answer typed by the user.

    Returns:
        ``(result_html, stats_html, tip)`` – a card with the verdict and the
        reward breakdown, a card with the running session statistics, and a
        short Markdown tip (possibly empty). If no question has been fetched
        yet, a warning card and two empty strings are returned.
    """
    import html

    if not _current_task:
        return _card("<span style='color:#ff4466'>⚠️ Get a question first.</span>"), "", ""
    from env.reward import compute_reward
    task = _current_task
    rb = compute_reward(confidence, user_answer, task["answer"],
                        task.get("answer_aliases", []), task["domain"])
    _live_hist.append(confidence, rb.was_correct, task["domain"], task["difficulty"], rb.total)
    snap = _live_hist.get_training_snapshot()

    c = "#00ffa3" if rb.was_correct else "#ff4466"
    icon = "✅ Correct!" if rb.was_correct else "❌ Incorrect"
    # The answer comes from the task data and may contain "<" or "&".
    answer_text = html.escape(str(task["answer"]))

    result_html = f"""
    <div style="background:#09091d;border:1px solid {c}33;border-left:3px solid {c};
                border-radius:10px;padding:18px 20px;">
      <div style="font-size:19px;font-weight:800;color:{c};margin-bottom:14px;font-family:Inter,sans-serif;">{icon}</div>
      <div style="font-size:11px;color:#2a3a5a;text-transform:uppercase;letter-spacing:.08em;margin-bottom:4px;">Correct Answer</div>
      <div style="font-size:16px;font-weight:700;color:#c0d0ff;font-family:'JetBrains Mono',monospace;margin-bottom:18px;">{answer_text}</div>
      <div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;">
        <div style="background:rgba(51,102,255,.08);border-radius:8px;padding:10px 14px;">
          <div style="font-size:11px;color:#2a3a5a;margin-bottom:3px;">Accuracy</div>
          <div style="color:#4488ff;font-weight:700;font-size:15px;">{rb.accuracy_score:.2f} <span style="font-size:11px;color:#1a2a4a;">× 0.40</span></div>
        </div>
        <div style="background:rgba(0,255,163,.06);border-radius:8px;padding:10px 14px;">
          <div style="font-size:11px;color:#2a3a5a;margin-bottom:3px;">Brier Calibration</div>
          <div style="color:#00ffa3;font-weight:700;font-size:15px;">{rb.brier_reward_val:.2f} <span style="font-size:11px;color:#1a3a2a;">× 0.40</span></div>
        </div>
        <div style="background:rgba(255,68,102,.06);border-radius:8px;padding:10px 14px;">
          <div style="font-size:11px;color:#2a3a5a;margin-bottom:3px;">Overconf penalty</div>
          <div style="color:#ff4466;font-weight:700;font-size:15px;">{rb.overconfidence_penalty_val:.3f}</div>
        </div>
        <div style="background:rgba(255,215,0,.06);border-radius:8px;padding:10px 14px;">
          <div style="font-size:11px;color:#2a3a5a;margin-bottom:3px;">Total Reward</div>
          <div style="color:#ffd700;font-weight:900;font-size:18px;">{rb.total:+.3f}</div>
        </div>
      </div>
    </div>"""

    n_ep = snap.get("episodes", len(_live_hist))
    ece_v = snap["ece"]
    ec = "#00ffa3" if ece_v < .20 else ("#ffbb00" if ece_v < .35 else "#ff4466")

    # Build the rows outside the f-string template: a "#" inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    stat_rows = [
        ("Accuracy", f"{snap['accuracy']:.1%}", "#c0d0ff"),
        ("ECE", f"{ece_v:.3f}", ec),
        ("Mean Confidence", f"{snap['mean_confidence']:.0f}%", "#c0d0ff"),
        ("Overconf Rate", f"{snap['overconfidence_rate']:.1%}", "#ff8c00"),
    ]
    rows_markup = "".join(
        '<div style="display:flex;justify-content:space-between;align-items:center;">'
        f'<span style="color:#3a4a6a;font-size:13px;">{label}</span>'
        f'<span style="color:{vc};font-weight:700;font-size:14px;">{val}</span>'
        '</div>'
        for label, val, vc in stat_rows
    )

    stats_html = f"""
    <div style="background:#09091d;border:1px solid #1a1a3a;border-radius:10px;padding:16px 20px;">
      <div style="font-size:11px;color:#2a3a5a;text-transform:uppercase;letter-spacing:.08em;margin-bottom:14px;">
        Your Stats — {n_ep} questions
      </div>
      <div style="display:flex;flex-direction:column;gap:10px;">
        {rows_markup}
      </div>
    </div>"""

    if rb.overconfidence_penalty_val < -.1:
        tip = "⚠️ **Overconfident** — high confidence, wrong answer. ECHO trains against this exact pattern."
    elif rb.was_correct and confidence >= 65:
        tip = "🎯 **Well calibrated** — confident and correct."
    elif not rb.was_correct and confidence < 40:
        tip = "🎯 **Good self-awareness** — sensed uncertainty correctly."
    elif rb.underconfidence_penalty_val < -.1:
        tip = "🤔 **Underconfident** — you knew it but doubted yourself."
    else:
        tip = ""
    return result_html, stats_html, tip
 
460
 
461
 
462
  # ─────────────────────────────────────────────────────────────────────────────
463
+ # Tab 2 logic
464
  # ─────────────────────────────────────────────────────────────────────────────
465
 
466
def run_comparison(scenario):
    """Run a 10-question head-to-head of HeuristicAgent vs AlwaysHighAgent.

    Args:
        scenario: Label from the scenario dropdown. A known domain label pins
            every question to that domain; "Mixed" (or an unknown label)
            cycles through ``cfg.DOMAINS``.

    Returns:
        ``(html, plot_path)`` – the per-question cards followed by a summary
        block, and the path of a temporary PNG with the reliability diagram.
    """
    import html

    _init()
    from core.baseline import AlwaysHighAgent, HeuristicAgent
    from env.reward import compute_reward, RewardHistory
    from env.parser import format_prompt, parse_response

    domain_map = {"Math": "math", "Logic": "logic", "Factual": "factual", "Science": "science",
                  "Medical": "medical", "Coding": "coding", "Creative": "creative", "Mixed": None}
    domain = domain_map.get(scenario)
    echo_h, base_h = RewardHistory(), RewardHistory()
    rows = ['<div style="display:flex;flex-direction:column;gap:6px;">']

    for i in range(10):
        d = domain or cfg.DOMAINS[i % len(cfg.DOMAINS)]
        task = _task_bank.get_task(d, "medium")
        prompt = format_prompt(task["question"], d, "medium")
        ea = HeuristicAgent()(prompt)
        ep = parse_response(ea)
        ba = AlwaysHighAgent()(prompt)
        bp = parse_response(ba)
        er = compute_reward(ep.confidence, ep.answer, task["answer"], task.get("answer_aliases", []), d)
        br = compute_reward(bp.confidence, bp.answer, task["answer"], task.get("answer_aliases", []), d)
        echo_h.append(ep.confidence, er.was_correct, d, "medium", er.total)
        base_h.append(bp.confidence, br.was_correct, d, "medium", br.total)

        ec = "#00ffa3" if er.was_correct else "#ff4466"
        bc = "#ff4466" if not br.was_correct else "#00ffa3"
        ei = "✅" if er.was_correct else "❌"
        bi = "✅" if br.was_correct else "❌"
        # Question text can contain "<", ">" or "&"; escape the preview so it
        # cannot break the card markup.
        preview = html.escape(task["question"][:70])

        rows.append(f"""
        <div style="display:grid;grid-template-columns:1fr 1fr;gap:6px;">
          <div style="background:rgba(0,255,163,.04);border:1px solid rgba(0,255,163,.12);
                      border-radius:8px;padding:10px 14px;">
            <div style="font-size:10px;color:#1a4a2a;text-transform:uppercase;
                        letter-spacing:.08em;margin-bottom:5px;">ECHO · {d} Q{i+1}</div>
            <div style="color:#4a5a8a;font-size:12px;margin-bottom:7px;line-height:1.4;">
              {preview}…</div>
            <div style="display:flex;gap:8px;align-items:center;">
              <span style="color:{ec};font-weight:800;font-size:15px;">{ei}</span>
              <span style="background:rgba(0,255,163,.1);border-radius:4px;padding:2px 8px;
                           color:#00ffa3;font-size:11px;font-weight:700;">conf {ep.confidence}%</span>
            </div>
          </div>
          <div style="background:rgba(255,68,102,.04);border:1px solid rgba(255,68,102,.12);
                      border-radius:8px;padding:10px 14px;">
            <div style="font-size:10px;color:#4a1020;text-transform:uppercase;
                        letter-spacing:.08em;margin-bottom:5px;">OVERCONFIDENT · Q{i+1}</div>
            <div style="color:#4a5a8a;font-size:12px;margin-bottom:7px;line-height:1.4;">
              {preview}…</div>
            <div style="display:flex;gap:8px;align-items:center;">
              <span style="color:{bc};font-weight:800;font-size:15px;">{bi}</span>
              <span style="background:rgba(255,68,102,.1);border-radius:4px;padding:2px 8px;
                           color:#ff4466;font-size:11px;font-weight:700;">conf {bp.confidence}%</span>
            </div>
          </div>
        </div>""")

    rows.append("</div>")
    rows_html = "".join(rows)
    em = echo_h.get_training_snapshot()
    bm = base_h.get_training_snapshot()

    def _mc(label, ev, bv, good_low=True):
        """Render one metric cell; the better of the two values is green."""
        e_num = float(ev.strip("%"))
        b_num = float(bv.strip("%"))
        if e_num == b_num:
            # A tie is not a win for either side: use a neutral colour.
            ec2 = bc2 = "#c0d0ff"
        else:
            e_better = (e_num < b_num) if good_low else (e_num > b_num)
            ec2 = "#00ffa3" if e_better else "#ff4466"
            bc2 = "#ff4466" if e_better else "#00ffa3"
        return f"""<div style="background:#06061a;border:1px solid #1a1a3a;border-radius:8px;padding:12px;text-align:center;">
          <div style="font-size:10px;color:#2a3a5a;text-transform:uppercase;letter-spacing:.07em;margin-bottom:8px;">{label}</div>
          <div style="display:flex;justify-content:center;gap:14px;align-items:baseline;">
            <span style="color:{ec2};font-size:17px;font-weight:800;">{ev}</span>
            <span style="color:#1a2a4a;font-size:11px;">vs</span>
            <span style="color:{bc2};font-size:17px;font-weight:800;">{bv}</span>
          </div>
          <div style="display:flex;justify-content:center;gap:14px;margin-top:4px;">
            <span style="font-size:10px;color:#1a3a2a;">ECHO</span>
            <span style="font-size:10px;color:#3a1020;">Baseline</span>
          </div>
        </div>"""

    # Lower ECE is better. Report the direction of the gap instead of always
    # claiming ECHO won.
    ece_gap = bm["ece"] - em["ece"]
    if ece_gap > 0:
        headline, headline_col = f"ECHO is {ece_gap:.0%} better calibrated", "#00ffa3"
    elif ece_gap < 0:
        headline, headline_col = f"ECHO is {-ece_gap:.0%} worse calibrated", "#ff4466"
    else:
        headline, headline_col = "ECHO is equally calibrated", "#c0d0ff"

    summary_html = f"""
    <div style="background:#06061a;border:1px solid #1a1a3a;border-radius:10px;padding:16px 20px;margin-top:8px;">
      <div style="font-size:11px;color:#2a3a5a;text-transform:uppercase;letter-spacing:.08em;margin-bottom:14px;">Results</div>
      <div style="display:grid;grid-template-columns:repeat(4,1fr);gap:8px;margin-bottom:14px;">
        {_mc("ECE ↓", f"{em['ece']:.3f}", f"{bm['ece']:.3f}", good_low=True)}
        {_mc("Accuracy ↑", f"{em['accuracy']:.1%}", f"{bm['accuracy']:.1%}", good_low=False)}
        {_mc("Mean Conf", f"{em['mean_confidence']:.0f}%", f"{bm['mean_confidence']:.0f}%", good_low=True)}
        {_mc("Overconf ↓", f"{em['overconfidence_rate']:.1%}", f"{bm['overconfidence_rate']:.1%}", good_low=True)}
      </div>
      <div style="background:rgba(0,255,163,.08);border:1px solid rgba(0,255,163,.2);
                  border-radius:8px;padding:12px;text-align:center;">
        <span style="color:{headline_col};font-size:17px;font-weight:900;">
          {headline}
        </span>
        <span style="color:#2a3a5a;font-size:13px;"> than the overconfident baseline</span>
      </div>
    </div>"""

    # Reliability diagram: stated confidence vs. observed accuracy per bin.
    erep = echo_h.get_calibration_report()
    brep = base_h.get_calibration_report()
    fig, ax = plt.subplots(figsize=(7, 4.5), facecolor="#04040e")
    try:
        ax.set_facecolor("#07071a")
        ax.plot([0, 100], [0, 100], "--", color="#1a2a3a", lw=1.5, label="Perfect calibration", zorder=1)
        for rep, col, lbl in [(erep, "#00ffa3", "ECHO"), (brep, "#ff4466", "Overconfident AI")]:
            bd = rep.bin_data
            xs = sorted(bd.keys())
            ys = [bd[b]["accuracy"] * 100 for b in xs]
            if xs:
                ax.plot(xs, ys, "-o", color=col, lw=2.5, ms=7, label=f"{lbl} ECE={rep.ece:.2f}",
                        mfc=col, mec="#04040e", mew=1.5, zorder=3)
        ax.set_xlabel("Stated Confidence (%)", color="#3a4a6a", fontsize=11)
        ax.set_ylabel("Actual Accuracy (%)", color="#3a4a6a", fontsize=11)
        ax.set_title("Live Reliability Diagram", color="#8090bb", fontsize=13, fontweight="bold")
        ax.tick_params(colors="#2a3a5a")
        ax.set_xlim(0, 100)
        ax.set_ylim(0, 100)
        for sp in ax.spines.values():
            sp.set_color("#12122a")
        ax.grid(True, ls="--", alpha=.1, color="#1a1a3a")
        ax.legend(facecolor="#07071a", labelcolor="#5a6a8a", edgecolor="#12122a", fontsize=10)
        fig.tight_layout()
        # Close the temp-file handle before writing so no descriptor leaks.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            plot_path = tmp.name
        fig.savefig(plot_path, dpi=130, bbox_inches="tight", facecolor="#04040e")
    finally:
        plt.close(fig)

    return rows_html + summary_html, plot_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
 
587
 
588
  # ─────────────────────────────────────────────────────────────────────────────
589
+ # Tab 3 logic
590
  # ────────────────────────────────���────────────────────────────────────────────
591
 
592
def generate_fingerprint(model_label):
    """Render the epistemic-fingerprint radar and per-domain bars for a model.

    Args:
        model_label: Model name from the dropdown. It selects the offset
            passed to ``_make_synthetic_fingerprint`` (unknown labels use 0.15).

    Returns:
        ``(radar_path, bars_html, insight_html)`` – path of a temporary PNG
        written by ``plot_radar``, the per-domain score bars, and a short
        summary card.
    """
    import html

    from core.epistemic_fingerprint import _make_synthetic_fingerprint, plot_radar
    _init()
    offset = {"Untrained": .30, "ECHO Trained": .0, "Heuristic": .15}.get(model_label, .15)
    fp = _make_synthetic_fingerprint(offset, model_label)
    baseline = _make_synthetic_fingerprint(.30, "Untrained")

    # Reserve a file name and close the handle before plot_radar writes to it.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        radar_path = tmp.name
    plot_radar(baseline, fp, radar_path)

    bar_rows = ['<div style="display:flex;flex-direction:column;gap:8px;">']
    for d in cfg.DOMAINS:
        s = fp.domain_scores.get(d, .5)
        col = "#00ffa3" if s > .75 else ("#ffbb00" if s > .55 else "#ff4466")
        # Keep the CSS width valid even if a score falls outside [0, 1].
        pct = max(0, min(100, int(s * 100)))
        bar_rows.append(f"""
        <div style="display:flex;align-items:center;gap:10px;">
          <div style="width:72px;text-align:right;color:#3a4a6a;font-size:12px;font-weight:500;font-family:Inter,sans-serif;">{d.capitalize()}</div>
          <div style="flex:1;background:rgba(255,255,255,.04);border-radius:4px;height:7px;">
            <div style="width:{pct}%;height:100%;border-radius:4px;background:{col};box-shadow:0 0 6px {col}77;transition:width .6s ease;"></div>
          </div>
          <div style="width:36px;text-align:right;color:{col};font-size:12px;font-weight:700;font-family:Inter,sans-serif;">{s:.2f}</div>
        </div>""")
    bar_rows.append("</div>")
    bars = "".join(bar_rows)

    insight = f"""
    <div style="background:rgba(168,85,247,.06);border:1px solid rgba(168,85,247,.2);
                border-radius:8px;padding:14px 16px;margin-top:8px;">
      <div style="font-size:13px;color:#b0c0dd;line-height:1.6;font-family:Inter,sans-serif;">
        <strong style="color:#a855f7;">{html.escape(str(model_label))}</strong> is strongest in
        <strong style="color:#00ffa3;">{fp.strongest_domain.capitalize()}</strong> and most
        uncertain in <strong style="color:#ff4466;">{fp.weakest_domain.capitalize()}</strong>.
      </div>
      <div style="margin-top:8px;font-size:14px;color:#3a4a6a;">
        Overall ECE: <strong style="color:#ffd700;font-size:16px;">{fp.overall_ece:.3f}</strong>
      </div>
    </div>"""

    return radar_path, bars, insight
630
 
631
 
632
  # ─────────────────────────────────────────────────────────────────────────────
633
+ # Tab 5 logic
634
  # ─────────────────────────────────────────────────────────────────────────────
635
 
636
def run_evaluation():
    """Run every evaluation task with HeuristicAgent and render the results.

    Returns:
        ``(html, json_text)`` – one card per task plus an overall verdict
        banner, and the result dictionary serialised as indented JSON.
    """
    import html

    _init()
    from core.tasks import TaskRunner, TASKS_BY_ID
    from core.baseline import HeuristicAgent
    result = TaskRunner().run_all(HeuristicAgent(), _task_bank)

    cards = []
    for r in result.tasks:
        t = TASKS_BY_ID[r.task_id]
        col = "#00ffa3" if r.passed else "#ff4466"
        bg = "rgba(0,255,163,.05)" if r.passed else "rgba(255,68,102,.05)"
        brd = "rgba(0,255,163,.2)" if r.passed else "rgba(255,68,102,.2)"
        # Bar width is score relative to the pass threshold, kept within
        # 0–100 so the CSS width stays valid for negative or large scores.
        pct = max(0, min(int(r.score / max(t.pass_threshold, .001) * 100), 100))
        icon = "✅" if r.passed else "❌"
        cards.append(f"""
        <div style="background:{bg};border:1px solid {brd};border-radius:10px;padding:16px 20px;margin-bottom:8px;">
          <div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:10px;">
            <div style="display:flex;align-items:center;gap:10px;">
              <span style="font-size:18px;">{icon}</span>
              <span style="color:#c0d0ff;font-size:14px;font-weight:700;font-family:Inter,sans-serif;">{html.escape(str(t.name))}</span>
              <span style="background:rgba(255,255,255,.05);border-radius:4px;padding:2px 8px;
                           color:#2a3a5a;font-size:11px;">{html.escape(str(r.task_id))}</span>
            </div>
            <div style="font-family:'JetBrains Mono',monospace;font-size:13px;">
              <span style="color:{col};font-weight:800;">{r.score:.3f}</span>
              <span style="color:#1a2a4a;"> / {t.pass_threshold}</span>
            </div>
          </div>
          <div style="background:rgba(255,255,255,.03);border-radius:4px;height:5px;">
            <div style="width:{pct}%;height:100%;border-radius:4px;background:{col};"></div>
          </div>
        </div>""")

    verdict_col = "#00ffa3" if result.overall_pass else "#ff4466"
    verdict_text = "🏆 ALL TASKS PASSED" if result.overall_pass else "⚠️ Some tasks below threshold"
    verdict = f"""
    <div style="background:linear-gradient(135deg,rgba(0,255,163,.08),rgba(51,102,255,.05));
                border:1px solid {verdict_col}44;border-radius:10px;padding:18px;text-align:center;margin-top:4px;">
      <div style="font-size:22px;font-weight:900;color:{verdict_col};font-family:Inter,sans-serif;">
        {verdict_text}
      </div>
    </div>"""

    # default=str stringifies any value json cannot serialise natively.
    json_str = json.dumps(result.to_dict(), indent=2, default=str)
    return "".join(cards) + verdict, json_str
680
 
681
 
682
  # ─────────────────────────────────────────────────────────────────────────────
683
+ # App builder
684
  # ─────────────────────────────────────────────────────────────────────────────
685
 
686
  def build_app():
687
  import gradio as gr
688
 
689
  plots = {k: f"{cfg.PLOTS_DIR}/{v}" for k, v in {
690
+ "reliability": "reliability_diagram.png",
691
+ "training": "training_curves.png",
692
+ "fingerprint": "epistemic_fingerprint.png",
693
+ "heatmap": "calibration_heatmap.png",
694
+ "distribution":"confidence_distribution.png",
695
+ "domain": "domain_comparison.png",
696
  }.items()}
697
+ def _img(k): return plots[k] if Path(plots[k]).exists() else None
698
+
699
+ theme = _echo_theme()
700
 
701
  with gr.Blocks(title="ECHO ULTIMATE") as demo:
702
 
703
  # ── Hero ─────────────────────────────────────────────────────────────
704
+ gr.HTML(HERO)
705
 
706
+ # ── Tab 1 ────────────────────────────────────────────────────────────
707
  with gr.Tab("🎯 Live Challenge"):
708
+ gr.HTML(_tab_header("🎯 Live Challenge",
709
+ "Answer with a confidence score β€” see if you're as well-calibrated as ECHO", "#00ffa3"))
 
 
 
710
  with gr.Row():
711
+ dom_dd = gr.Dropdown(["Math","Logic","Factual","Science","Medical","Coding","Creative"],
712
+ value="Math", label="Domain")
 
 
713
  diff_dd = gr.Dropdown(["Easy","Medium","Hard"], value="Easy", label="Difficulty")
714
+ get_btn = gr.Button("🎲 Get Question", variant="primary")
 
715
  question_box = gr.Markdown(
716
+ "<div style='color:#2a3a5a;padding:10px;font-style:italic;'>Select domain & difficulty, then click Get Question.</div>"
 
717
  )
 
718
  with gr.Row():
719
+ conf_sl = gr.Slider(0, 100, value=50, step=5, label="Your Confidence (0 = no idea Β· 100 = certain)")
720
+ ans_box = gr.Textbox(label="Your Answer", placeholder="Type your answer…", lines=1)
 
 
 
 
 
721
  sub_btn = gr.Button("βœ… Submit Answer", variant="primary")
 
722
  with gr.Row():
723
  result_html = gr.HTML()
724
  stats_html = gr.HTML()
 
727
  get_btn.click(get_question, [dom_dd, diff_dd], [question_box, ans_box])
728
  sub_btn.click(submit_answer, [conf_sl, ans_box], [result_html, stats_html, tip_md])
729
 
730
+ # ── Tab 2 ────────────────────────────────────────────────────────────
731
+ with gr.Tab("βš” ECHO vs AI"):
732
+ gr.HTML(_tab_header("βš” ECHO vs Overconfident AI",
733
+ "10-question head-to-head: calibrated ECHO vs AlwaysHigh baseline (90% on everything)", "#ff4466"))
 
 
 
734
  with gr.Row():
735
  scenario_dd = gr.Dropdown(
736
  ["Mixed","Math","Logic","Factual","Science","Medical","Coding","Creative"],
737
+ value="Mixed", label="Test Scenario")
 
738
  run_btn = gr.Button("βš” Run 10 Questions", variant="primary")
 
739
  with gr.Row():
740
+ with gr.Column(scale=3): cmp_html = gr.HTML()
741
+ with gr.Column(scale=2): mini_img = gr.Image(label="Live Reliability Diagram",
742
+ type="filepath", height=340)
 
 
 
743
  run_btn.click(run_comparison, [scenario_dd], [cmp_html, mini_img])
744
 
745
+ # ── Tab 3 ────────────────────────────────────────────────────────────
746
  with gr.Tab("🧬 Epistemic Fingerprint"):
747
+ gr.HTML(_tab_header("🧬 Epistemic Fingerprint",
748
+ "Radar chart of per-domain calibration β€” larger green area = better everywhere", "#a855f7"))
 
 
 
749
  with gr.Row():
750
+ model_dd = gr.Dropdown(["ECHO Trained","Untrained","Heuristic"],
751
+ value="ECHO Trained", label="Model")
752
+ fp_btn = gr.Button("πŸ”¬ Generate Fingerprint", variant="primary")
 
 
 
753
  with gr.Row():
754
  with gr.Column(scale=3):
755
  fp_img = gr.Image(label="Epistemic Fingerprint", type="filepath",
756
+ value=_img("fingerprint"), height=480)
757
  with gr.Column(scale=2):
758
+ fp_bars = gr.HTML()
759
  fp_insight = gr.HTML()
 
760
  fp_btn.click(generate_fingerprint, [model_dd], [fp_img, fp_bars, fp_insight])
761
 
762
+ # ── Tab 4 ────────────────────────────────────────────────────────────
763
  with gr.Tab("πŸ“Š Training Evidence"):
764
+ gr.HTML(_tab_header("πŸ“Š Training Evidence",
765
+ "6 plots generated from GRPO training β€” from overconfidence to precise calibration", "#ffd700"))
766
+ gr.HTML(_card(
767
+ "<div style='font-size:14px;font-weight:700;color:#00ffa3;margin-bottom:6px;'>β˜… Hero Plot β€” Reliability Diagram</div>"
768
+ "<div style='font-size:13px;color:#3a4a6a;line-height:1.6;'>"
769
+ "Untrained model (red): flat line far from diagonal β€” always overconfident. "
770
+ "ECHO trained (green): near-perfect calibration β€” hugs the diagonal."
771
+ "</div>",
772
+ "rgba(0,255,163,.15)"
773
  ))
 
 
 
 
 
 
 
 
 
 
 
774
  gr.Image(value=_img("reliability"), label="Reliability Diagram", height=380)
 
775
  with gr.Row():
776
  with gr.Column():
777
+ gr.HTML("<div style='font-size:13px;font-weight:600;color:#4488ff;margin:10px 0 4px;'>πŸ“ˆ Training Curves</div>")
778
+ gr.Image(value=_img("training"), label="Training Curves", height=290)
 
 
 
779
  with gr.Column():
780
+ gr.HTML("<div style='font-size:13px;font-weight:600;color:#a855f7;margin:10px 0 4px;'>🧬 Epistemic Fingerprint</div>")
781
+ gr.Image(value=_img("fingerprint"), label="Epistemic Fingerprint", height=290)
 
 
 
 
782
  with gr.Row():
783
  with gr.Column():
784
+ gr.HTML("<div style='font-size:13px;font-weight:600;color:#ffd700;margin:10px 0 4px;'>🌑️ Calibration Heatmap</div>")
785
+ gr.Image(value=_img("heatmap"), label="Calibration Heatmap", height=290)
 
 
 
786
  with gr.Column():
787
+ gr.HTML("<div style='font-size:13px;font-weight:600;color:#ff8c00;margin:10px 0 4px;'>πŸ“Š Confidence Distribution</div>")
788
+ gr.Image(value=_img("distribution"), label="Confidence Distribution", height=290)
789
+ gr.HTML("<div style='font-size:13px;font-weight:600;color:#ff4466;margin:10px 0 4px;'>🏒 Domain Comparison</div>")
790
+ gr.Image(value=_img("domain"), label="Domain Comparison", height=300)
 
 
 
 
 
 
 
 
791
  regen_btn = gr.Button("πŸ”„ Regenerate All Plots", variant="secondary")
792
+ regen_out = gr.HTML()
 
793
  def regen():
794
  from training.evaluate import make_synthetic_pair, compare_and_plot
795
+ b, a = make_synthetic_pair()
796
+ compare_and_plot(a, {"Untrained": b})
797
+ return _card("<span style='color:#00ffa3;font-weight:600;'>βœ… All 6 plots regenerated</span>")
798
+ regen_btn.click(regen, outputs=[regen_out])
799
+
800
+ # ── Tab 5 ────────────────────────────────────────────────────────────
801
+ with gr.Tab("πŸ† Evaluation"):
802
+ gr.HTML(_tab_header("πŸ† Official OpenEnv Evaluation",
803
+ "3 tasks Γ— 30 episodes = 90 episodes β€” validates ECHO meets all thresholds", "#ffd700"))
 
 
 
 
 
 
 
804
  gr.HTML("""
805
+ <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:10px;margin-bottom:8px;">
806
+ <div style="background:rgba(51,102,255,.06);border:1px solid rgba(51,102,255,.2);border-radius:8px;padding:13px 16px;">
807
+ <div style="color:#4488ff;font-weight:700;font-size:13px;font-family:Inter,sans-serif;">Task 1 β€” Easy</div>
808
+ <div style="color:#1a2a5a;font-size:12px;margin-top:4px;">ECE target: &lt; 0.15</div>
 
809
  </div>
810
+ <div style="background:rgba(255,215,0,.06);border:1px solid rgba(255,215,0,.2);border-radius:8px;padding:13px 16px;">
811
+ <div style="color:#ffd700;font-weight:700;font-size:13px;font-family:Inter,sans-serif;">Task 2 β€” Medium</div>
812
+ <div style="color:#2a2a00;font-size:12px;margin-top:4px;">ECE target: &lt; 0.20</div>
 
813
  </div>
814
+ <div style="background:rgba(168,85,247,.06);border:1px solid rgba(168,85,247,.2);border-radius:8px;padding:13px 16px;">
815
+ <div style="color:#a855f7;font-weight:700;font-size:13px;font-family:Inter,sans-serif;">Task 3 β€” Hard</div>
816
+ <div style="color:#1a0a3a;font-size:12px;margin-top:4px;">ECE target: &lt; 0.25</div>
 
817
  </div>
818
  </div>""")
819
+ eval_btn = gr.Button("πŸš€ Run Full Evaluation (90 episodes)", variant="primary")
820
  result_html = gr.HTML()
821
+ with gr.Accordion("πŸ“„ Raw JSON", open=False):
822
  json_out = gr.Code(language="json")
823
  eval_btn.click(run_evaluation, outputs=[result_html, json_out])
824
 
825
+ # ── Tab 6 ────────────────────────────────────────────────────────────
826
  with gr.Tab("⚑ Live Training"):
827
+ gr.HTML(_tab_header("⚑ Live GRPO Training",
828
+ "Watch ECE drop in real-time β€” dashed lines show Task 1 & 2 pass thresholds", "#4488ff"))
 
 
 
829
  with gr.Row():
830
+ lt_start = gr.Button("πŸš€ Start Live Training Demo", variant="primary", scale=2)
831
+ lt_stop = gr.Button("⏹ Stop", variant="stop", scale=1)
832
+ lt_status = gr.Textbox(label="Training Log",
833
+ value="Ready β€” click Start to simulate GRPO training.",
834
+ lines=2, interactive=False)
835
+ lt_plot = gr.Image(label="ECE During Training", type="filepath", height=380)
836
+ lt_prog = gr.Slider(0, 100, value=0, label="Progress (%)", interactive=False)
837
+ lt_start.click(start_live_training, outputs=[lt_status, lt_plot, lt_prog])
838
+ lt_stop.click(stop_live_training, outputs=[lt_status])
 
 
 
 
 
 
 
 
 
 
 
 
839
 
840
+ return demo, theme
841
 
842
 
843
def main():
    """Configure logging, build the app and serve it on ``cfg.GRADIO_PORT``."""
    logging.basicConfig(level=logging.INFO)
    demo, theme = build_app()
    demo.launch(
        server_name="0.0.0.0",
        server_port=cfg.GRADIO_PORT,
        share=False,
        show_error=True,
        css=_CSS,
        js=_JS,
        theme=theme,
    )
856
 
857