anugrahteesdollar committed on
Commit
505bf67
·
verified Β·
1 Parent(s): e1066a0

demo: add 5 quick test cases + grader breakdown panel + Show JSON

Browse files
Files changed (1) hide show
  1. space/env/gradio_demo.py +365 -32
space/env/gradio_demo.py CHANGED
@@ -25,6 +25,7 @@ numpy.
25
 
26
  from __future__ import annotations
27
 
 
28
  import logging
29
  from typing import Any, Dict, Iterator, List, Optional, Tuple
30
 
@@ -67,12 +68,164 @@ def _resolve_scenario(label_or_value: str) -> Dict[str, Any]:
67
  return {"scenario_name": value}
68
 
69
 
70
- AGENT_CHOICES = ["random", "heuristic", "oracle"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  # ── Helpers for rendering observations ──────────────────────────────────
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def _credit_progress_md(obs) -> str:
77
  used = max(0, obs.credits_total - obs.credits_remaining)
78
  total = max(1, obs.credits_total)
@@ -185,8 +338,13 @@ def _stream_baseline(
185
  seed: int,
186
  agent_name: str,
187
  max_steps: int = 30,
188
- ) -> Iterator[Tuple[str, str, str, str, str, str]]:
189
- """Run a full episode in-process; yield UI updates per step."""
 
 
 
 
 
190
  import random
191
 
192
  from models import ActionType
@@ -214,27 +372,79 @@ def _stream_baseline(
214
  _credit_progress_md(obs),
215
  _dossier_md(obs),
216
  "*(truth revealed when the episode ends)*",
 
 
217
  )
218
 
219
  steps = 0
220
  while not obs.done and steps < max_steps:
221
  if agent_name == "random":
222
  action = _random_step(obs, rng)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  else:
224
- # ``oracle`` and ``heuristic`` both run the standard
225
- # pipeline order; oracle additionally patches the
226
- # terminal step with the hidden ``correct_decision``.
 
 
227
  action = _heuristic_step(obs, history)
228
  if (
229
- agent_name == "oracle"
230
- and action.action_type == ActionType.SUBMIT_VALIDATION_REPORT
231
  and env._latent is not None
232
  ):
233
- action = action.model_copy(update={
234
- "final_decision": env._latent.target.correct_decision,
235
- "confidence": 0.85,
236
- "reasoning": "Oracle: submit correct decision (peeked latent).",
237
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  history.append(action.action_type)
239
  obs = env.step(action)
240
  rew = float(obs.reward or 0.0)
@@ -252,6 +462,8 @@ def _stream_baseline(
252
  _credit_progress_md(obs),
253
  _dossier_md(obs),
254
  _truth_md(env._latent, obs.done) if obs.done else "*(truth revealed when the episode ends)*",
 
 
255
  )
256
 
257
  log_lines.append("-" * 70)
@@ -268,13 +480,17 @@ def _stream_baseline(
268
  _credit_progress_md(obs),
269
  _dossier_md(obs),
270
  _truth_md(env._latent, True),
 
 
271
  )
272
 
273
 
274
  # ── Tab 2: build your own actions ───────────────────────────────────────
275
 
276
 
277
- def _new_episode(scenario_label: str, seed: int) -> Tuple[Any, Any, str, str, str, str, str, str]:
 
 
278
  from server.hackathon_environment import DrugTargetEnvironment
279
 
280
  env = DrugTargetEnvironment(**_resolve_scenario(scenario_label))
@@ -294,6 +510,8 @@ def _new_episode(scenario_label: str, seed: int) -> Tuple[Any, Any, str, str, st
294
  _credit_progress_md(obs), # credits
295
  _dossier_md(obs), # dossier
296
  "*(submit a `submit_validation_report` or run out of credits to reveal)*",
 
 
297
  )
298
 
299
 
@@ -306,7 +524,7 @@ def _submit_step(
306
  final_decision: str,
307
  confidence: float,
308
  reasoning: str,
309
- ) -> Tuple[Any, Any, str, str, str, str, str, str]:
310
  from models import ActionType, DrugTargetAction
311
 
312
  if env is None or obs is None:
@@ -317,6 +535,8 @@ def _submit_step(
317
  "*(no episode)*",
318
  "*(no episode)*",
319
  "*(no episode)*",
 
 
320
  )
321
 
322
  if obs.done:
@@ -328,6 +548,8 @@ def _submit_step(
328
  _credit_progress_md(obs),
329
  _dossier_md(obs),
330
  _truth_md(env._latent, True),
 
 
331
  )
332
 
333
  try:
@@ -341,6 +563,8 @@ def _submit_step(
341
  _credit_progress_md(obs),
342
  _dossier_md(obs),
343
  "*(truth shown at end of episode)*",
 
 
344
  )
345
 
346
  params: Dict[str, Any] = {}
@@ -393,6 +617,8 @@ def _submit_step(
393
  _credit_progress_md(new_obs),
394
  _dossier_md(new_obs),
395
  _truth_md(env._latent, new_obs.done),
 
 
396
  )
397
 
398
 
@@ -482,11 +708,15 @@ def build_gradio_demo() -> gr.Blocks:
482
  with gr.TabItem("β–Ά Watch baseline agent"):
483
  gr.Markdown(
484
  "Pick a scenario and seed, then click one of **Random / "
485
- "Heuristic / Oracle**. The agent will play a full episode "
486
- "and stream every action+reward into the log. The "
487
- "**Oracle** baseline is the default because it always "
488
- "submits the correct decision β€” the most reliable way "
489
- "to see DrugEnv 'work'."
 
 
 
 
490
  )
491
 
492
  with gr.Row():
@@ -499,9 +729,12 @@ def build_gradio_demo() -> gr.Blocks:
499
  seed_in = gr.Number(value=7, precision=0, label="Seed")
500
 
501
  with gr.Row():
502
- btn_random = gr.Button("β–Ά Run Random agent", variant="secondary")
503
- btn_heuristic = gr.Button("β–Ά Run Heuristic agent", variant="secondary")
504
- btn_oracle = gr.Button("β–Ά Run Oracle agent", variant="primary")
 
 
 
505
 
506
  with gr.Row():
507
  with gr.Column(scale=3):
@@ -518,6 +751,13 @@ def build_gradio_demo() -> gr.Blocks:
518
  "*(truth revealed when the episode ends)*",
519
  label="🎯 Hidden target profile (revealed at end of episode)",
520
  )
 
 
 
 
 
 
 
521
 
522
  def _run(scenario_label, seed, agent_name):
523
  yield from _stream_baseline(
@@ -526,7 +766,11 @@ def build_gradio_demo() -> gr.Blocks:
526
  agent_name,
527
  )
528
 
529
- outputs_b = [log_md, cum_reward_b, step_b, credits_b, dossier_b, truth_b]
 
 
 
 
530
  btn_random.click(
531
  lambda s, sd: _run(s, sd, "random"),
532
  inputs=[scenario_dd, seed_in],
@@ -542,6 +786,71 @@ def build_gradio_demo() -> gr.Blocks:
542
  inputs=[scenario_dd, seed_in],
543
  outputs=outputs_b,
544
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
 
546
  # ───────── Tab 2: Build your own actions ─────────
547
  with gr.TabItem("πŸ›  Build custom action"):
@@ -613,14 +922,27 @@ def build_gradio_demo() -> gr.Blocks:
613
  "*(truth revealed when the episode ends)*",
614
  label="🎯 Hidden target profile",
615
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
616
 
617
  btn_new.click(
618
  _new_episode,
619
  inputs=[scenario_dd2, seed_in2],
620
- outputs=[
621
- env_state, obs_state, status_md, cum_reward, step_idx,
622
- credits, dossier, truth,
623
- ],
624
  )
625
 
626
  btn_submit.click(
@@ -629,10 +951,21 @@ def build_gradio_demo() -> gr.Blocks:
629
  env_state, obs_state, action_type, database,
630
  include_allosteric, final_decision, confidence, reasoning,
631
  ],
632
- outputs=[
633
- env_state, obs_state, status_md, cum_reward, step_idx,
634
- credits, dossier, truth,
635
- ],
 
 
 
 
 
 
 
 
 
 
 
636
  )
637
 
638
  # ───────── Tab 3: Inspect hidden truth ─────────
 
25
 
26
  from __future__ import annotations
27
 
28
+ import json
29
  import logging
30
  from typing import Any, Dict, Iterator, List, Optional, Tuple
31
 
 
68
  return {"scenario_name": value}
69
 
70
 
71
# Agent names accepted by ``_stream_baseline``: three honest baselines
# plus three deliberately-bad agents used to demo the grader's penalties.
AGENT_CHOICES = [
    "random", "heuristic", "oracle",
    "antioracle", "lazy_antioracle", "spammer",
]
79
+
80
# ── Quick test cases (preset (scenario, seed, agent, why) tuples) ──────
#
# Picked so the demo audience can see, in <30 s each, that the grader
# actually grades and that wrong play loses points. The first three are
# "positive" (correct decision → high terminal reward); the last two are
# *deliberately* penalised so you can show the rule / decision-accuracy
# components firing.
#
# NOTE: the label/expectation strings previously contained mojibake
# (UTF-8 text decoded as cp1252, e.g. ``βœ…`` for ✅, ``Β·`` for ·,
# ``β†’`` for →, ``β‰ˆ`` for ≈); they are repaired here.
TEST_CASES: List[Dict[str, Any]] = [
    {
        "label": "✅ Easy GO · Oracle on EGFR / NSCLC",
        "scenario": "egfr_nsclc_viable",
        "seed": 7,
        "agent": "oracle",
        "expectation": (
            "Oracle peeks at the latent target and submits the correct "
            "**`go`** with calibrated confidence on a clear-positive "
            "scenario → big positive `term_decision_accuracy` and "
            "`term_evidence_coverage`. Total cum reward ≈ **+6**."
        ),
    },
    {
        "label": "✅ Easy NO_GO · Oracle on TP53 / solid tumours",
        "scenario": "tp53_solid_tumors_clear_fail",
        "seed": 7,
        "agent": "oracle",
        "expectation": (
            "Oracle submits the correct **`no_go`** on an obvious "
            "tumour-suppressor (undruggable) target → also a big "
            "positive terminal. Shows the grader rewards correct "
            "*negative* decisions, not just `go`s."
        ),
    },
    {
        "label": "✅ Borderline GO · Heuristic on KRAS G12C / PDAC",
        "scenario": "kras_pdac_borderline",
        "seed": 11,
        "agent": "heuristic",
        "expectation": (
            "Fixed-pipeline heuristic on a medium-difficulty borderline "
            "case. Coverage is good, decision is usually correct → "
            "moderate positive terminal. Useful baseline to compare "
            "against the two penalty cases below."
        ),
    },
    {
        "label": "❌ Penalty: redundancy + confident-wrong · Lazy anti-oracle on KRAS",
        "scenario": "kras_pdac_borderline",
        "seed": 11,
        "agent": "lazy_antioracle",
        "expectation": (
            "Spams 12 redundant `query_expression` calls (firing the "
            "`redundant_*` soft-rule penalty repeatedly) then submits "
            "the **opposite** of the correct decision with confidence "
            "0.95. The grader stacks three guards: redundancy step "
            "penalties, near-zero `term_evidence_coverage`, and "
            "`confident_wrong_answer_penalty = -0.9`. Cum total goes "
            "**clearly negative** — vs the heuristic's ≈ +6 on the same "
            "scenario."
        ),
    },
    {
        "label": "❌ Penalty: format farming, never submits · Spammer on KRAS",
        "scenario": "kras_pdac_borderline",
        "seed": 11,
        "agent": "spammer",
        "expectation": (
            "Repeats `query_expression` for 30 steps and never reaches "
            "`submit_validation_report`. Triggers "
            "`no_report_submitted_penalty`, `redundancy_frac → 1.0` "
            "(zero credit_efficiency), and zero novelty after the first "
            "step. The grader's per-step reward floor (`step_reward_clip "
            "= +0.3`) is what stops this strategy from outscoring real "
            "submissions."
        ),
    },
]
157
 
158
 
159
  # ── Helpers for rendering observations ──────────────────────────────────
160
 
161
 
162
+ def _to_json_dict(value: Any) -> Any:
163
+ """Best-effort recursive conversion of pydantic/dataclass/dict objects
164
+ into a JSON-serialisable dict for ``gr.JSON``."""
165
+ if value is None or isinstance(value, (str, int, float, bool)):
166
+ return value
167
+ if hasattr(value, "model_dump"):
168
+ try:
169
+ return value.model_dump(mode="json")
170
+ except Exception:
171
+ try:
172
+ return value.model_dump()
173
+ except Exception:
174
+ pass
175
+ if isinstance(value, dict):
176
+ return {str(k): _to_json_dict(v) for k, v in value.items()}
177
+ if isinstance(value, (list, tuple, set)):
178
+ return [_to_json_dict(v) for v in value]
179
+ try:
180
+ return json.loads(json.dumps(value, default=str))
181
+ except Exception:
182
+ return str(value)
183
+
184
+
185
+ def _grader_breakdown_md(obs, terminal_only: bool = False) -> str:
186
+ """Format the per-component reward breakdown for the side panel.
187
+
188
+ DrugEnv puts the decomposed RewardBreakdown into
189
+ ``obs.step_reward_breakdown`` β€” both the step components and the
190
+ terminal components (prefixed with ``term_``) when the episode ends.
191
+ """
192
+ if obs is None:
193
+ return "*(no episode)*"
194
+ bd: Dict[str, float] = dict(getattr(obs, "step_reward_breakdown", {}) or {})
195
+ if not bd:
196
+ return "*(no reward yet β€” take a step)*"
197
+
198
+ step_keys = [
199
+ "novelty", "reasoning_coherence", "credit_efficiency",
200
+ "shaping", "penalty", "total",
201
+ ]
202
+ term_keys = [
203
+ "decision_accuracy", "evidence_coverage", "credit_efficiency",
204
+ "reasoning_coherence", "penalty", "terminal", "total",
205
+ ]
206
+
207
+ def _fmt(v: float) -> str:
208
+ return f"`{v:+.3f}`"
209
+
210
+ lines: List[str] = []
211
+ if not terminal_only:
212
+ step_present = [k for k in step_keys if k in bd]
213
+ if step_present:
214
+ lines.append("**Step reward components**")
215
+ for k in step_present:
216
+ lines.append(f"- {k}: {_fmt(bd[k])}")
217
+
218
+ term_present = [k for k in term_keys if f"term_{k}" in bd]
219
+ if term_present:
220
+ lines.append("\n**Terminal reward components** *(only at episode end)*")
221
+ for k in term_present:
222
+ lines.append(f"- {k}: {_fmt(bd[f'term_{k}'])}")
223
+
224
+ if not lines:
225
+ return "*(no reward yet β€” take a step)*"
226
+ return "\n".join(lines)
227
+
228
+
229
  def _credit_progress_md(obs) -> str:
230
  used = max(0, obs.credits_total - obs.credits_remaining)
231
  total = max(1, obs.credits_total)
 
338
  seed: int,
339
  agent_name: str,
340
  max_steps: int = 30,
341
+ ) -> Iterator[Tuple[str, str, str, str, str, str, str, Dict[str, Any]]]:
342
+ """Run a full episode in-process; yield UI updates per step.
343
+
344
+ Yields an 8-tuple of UI-bound values:
345
+ ``(log_md, cum_reward, step_idx, credits_md, dossier_md, truth_md,
346
+ breakdown_md, obs_json)``.
347
+ """
348
  import random
349
 
350
  from models import ActionType
 
372
  _credit_progress_md(obs),
373
  _dossier_md(obs),
374
  "*(truth revealed when the episode ends)*",
375
+ "*(no reward yet β€” first step pending)*",
376
+ _to_json_dict(obs),
377
  )
378
 
379
  steps = 0
380
  while not obs.done and steps < max_steps:
381
  if agent_name == "random":
382
  action = _random_step(obs, rng)
383
+ elif agent_name == "lazy_antioracle":
384
+ # Run a small burst of redundant cheap queries (to rack up
385
+ # `redundant_*` soft violations and tank `credit_efficiency`),
386
+ # then submit the *opposite* of the correct decision with
387
+ # confidence 0.95 to fire ``confident_wrong_answer_penalty``.
388
+ # Combined effect: cum total goes clearly negative.
389
+ from training.training_script import build_drug_target_action
390
+
391
+ REDUNDANT_QUERIES = 12
392
+ if len(history) < REDUNDANT_QUERIES:
393
+ action = build_drug_target_action(
394
+ ActionType.QUERY_EXPRESSION, obs,
395
+ )
396
+ else:
397
+ action = build_drug_target_action(
398
+ ActionType.SUBMIT_VALIDATION_REPORT, obs,
399
+ )
400
+ if env._latent is not None:
401
+ correct = env._latent.target.correct_decision
402
+ wrong = "no_go" if correct == "go" else "go"
403
+ action = action.model_copy(update={
404
+ "final_decision": wrong,
405
+ "confidence": 0.95,
406
+ "reasoning": (
407
+ "Lazy anti-oracle: redundant queries + opposite "
408
+ "decision with high confidence to compound "
409
+ "redundancy and confident-wrong penalties."
410
+ ),
411
+ })
412
+ elif agent_name == "spammer":
413
+ # Repeat the cheapest action over and over without ever
414
+ # submitting. Triggers redundancy penalties + the
415
+ # ``no_report_submitted_penalty`` at terminal.
416
+ from training.training_script import build_drug_target_action
417
+
418
+ action = build_drug_target_action(ActionType.QUERY_EXPRESSION, obs)
419
  else:
420
+ # ``oracle``, ``antioracle``, and ``heuristic`` all run the
421
+ # standard pipeline order; oracle / antioracle additionally
422
+ # patch the terminal step (oracle = correct decision,
423
+ # antioracle = opposite decision with high confidence β€” to
424
+ # demo the overconfident-wrong penalty in the grader).
425
  action = _heuristic_step(obs, history)
426
  if (
427
+ action.action_type == ActionType.SUBMIT_VALIDATION_REPORT
 
428
  and env._latent is not None
429
  ):
430
+ if agent_name == "oracle":
431
+ action = action.model_copy(update={
432
+ "final_decision": env._latent.target.correct_decision,
433
+ "confidence": 0.85,
434
+ "reasoning": "Oracle: submit correct decision (peeked latent).",
435
+ })
436
+ elif agent_name == "antioracle":
437
+ correct = env._latent.target.correct_decision
438
+ wrong = "no_go" if correct == "go" else "go"
439
+ action = action.model_copy(update={
440
+ "final_decision": wrong,
441
+ "confidence": 0.95,
442
+ "reasoning": (
443
+ "Anti-oracle: submit deliberately wrong decision "
444
+ "with high confidence to trigger the "
445
+ "overconfident-wrong penalty."
446
+ ),
447
+ })
448
  history.append(action.action_type)
449
  obs = env.step(action)
450
  rew = float(obs.reward or 0.0)
 
462
  _credit_progress_md(obs),
463
  _dossier_md(obs),
464
  _truth_md(env._latent, obs.done) if obs.done else "*(truth revealed when the episode ends)*",
465
+ _grader_breakdown_md(obs, terminal_only=False),
466
+ _to_json_dict(obs),
467
  )
468
 
469
  log_lines.append("-" * 70)
 
480
  _credit_progress_md(obs),
481
  _dossier_md(obs),
482
  _truth_md(env._latent, True),
483
+ _grader_breakdown_md(obs, terminal_only=False),
484
+ _to_json_dict(obs),
485
  )
486
 
487
 
488
  # ── Tab 2: build your own actions ───────────────────────────────────────
489
 
490
 
491
+ def _new_episode(
492
+ scenario_label: str, seed: int,
493
+ ) -> Tuple[Any, Any, str, str, str, str, str, str, str, Dict[str, Any]]:
494
  from server.hackathon_environment import DrugTargetEnvironment
495
 
496
  env = DrugTargetEnvironment(**_resolve_scenario(scenario_label))
 
510
  _credit_progress_md(obs), # credits
511
  _dossier_md(obs), # dossier
512
  "*(submit a `submit_validation_report` or run out of credits to reveal)*",
513
+ "*(no reward yet β€” take a step)*", # breakdown_md
514
+ _to_json_dict(obs), # obs_json
515
  )
516
 
517
 
 
524
  final_decision: str,
525
  confidence: float,
526
  reasoning: str,
527
+ ) -> Tuple[Any, Any, str, str, str, str, str, str, str, Dict[str, Any]]:
528
  from models import ActionType, DrugTargetAction
529
 
530
  if env is None or obs is None:
 
535
  "*(no episode)*",
536
  "*(no episode)*",
537
  "*(no episode)*",
538
+ "*(no episode)*",
539
+ {},
540
  )
541
 
542
  if obs.done:
 
548
  _credit_progress_md(obs),
549
  _dossier_md(obs),
550
  _truth_md(env._latent, True),
551
+ _grader_breakdown_md(obs, terminal_only=False),
552
+ _to_json_dict(obs),
553
  )
554
 
555
  try:
 
563
  _credit_progress_md(obs),
564
  _dossier_md(obs),
565
  "*(truth shown at end of episode)*",
566
+ _grader_breakdown_md(obs, terminal_only=False),
567
+ _to_json_dict(obs),
568
  )
569
 
570
  params: Dict[str, Any] = {}
 
617
  _credit_progress_md(new_obs),
618
  _dossier_md(new_obs),
619
  _truth_md(env._latent, new_obs.done),
620
+ _grader_breakdown_md(new_obs, terminal_only=False),
621
+ _to_json_dict(new_obs),
622
  )
623
 
624
 
 
708
  with gr.TabItem("β–Ά Watch baseline agent"):
709
  gr.Markdown(
710
  "Pick a scenario and seed, then click one of **Random / "
711
+ "Heuristic / Oracle / Anti-oracle**. The agent will play "
712
+ "a full episode and stream every action+reward into the "
713
+ "log. The **Oracle** baseline submits the ground-truth "
714
+ "decision; the **Anti-oracle** submits the *opposite* "
715
+ "with high confidence β€” a quick way to see the grader's "
716
+ "overconfident-wrong penalty fire.\n\n"
717
+ "Or jump to **πŸ“‹ Quick test cases** below for one-click "
718
+ "presets that demonstrate both happy-path scoring and "
719
+ "two deliberately-penalised failure modes."
720
  )
721
 
722
  with gr.Row():
 
729
  seed_in = gr.Number(value=7, precision=0, label="Seed")
730
 
731
  with gr.Row():
732
+ btn_random = gr.Button("β–Ά Random", variant="secondary")
733
+ btn_heuristic = gr.Button("β–Ά Heuristic", variant="secondary")
734
+ btn_oracle = gr.Button("β–Ά Oracle (correct)", variant="primary")
735
+ btn_antioracle = gr.Button("β–Ά Anti-oracle (wrong)", variant="stop")
736
+ btn_lazy = gr.Button("β–Ά Lazy anti-oracle", variant="stop")
737
+ btn_spammer = gr.Button("β–Ά Spammer (no submit)", variant="stop")
738
 
739
  with gr.Row():
740
  with gr.Column(scale=3):
 
751
  "*(truth revealed when the episode ends)*",
752
  label="🎯 Hidden target profile (revealed at end of episode)",
753
  )
754
+ breakdown_b = gr.Markdown(
755
+ "*(no reward yet)*",
756
+ label="πŸ“Š Grader breakdown (per-component reward)",
757
+ )
758
+
759
+ with gr.Accordion("πŸ“‹ Raw observation JSON (latest step)", open=False):
760
+ obs_json_b = gr.JSON(value={}, label="ValidationObservation")
761
 
762
  def _run(scenario_label, seed, agent_name):
763
  yield from _stream_baseline(
 
766
  agent_name,
767
  )
768
 
769
+ outputs_b = [
770
+ log_md, cum_reward_b, step_b,
771
+ credits_b, dossier_b, truth_b,
772
+ breakdown_b, obs_json_b,
773
+ ]
774
  btn_random.click(
775
  lambda s, sd: _run(s, sd, "random"),
776
  inputs=[scenario_dd, seed_in],
 
786
  inputs=[scenario_dd, seed_in],
787
  outputs=outputs_b,
788
  )
789
+ btn_antioracle.click(
790
+ lambda s, sd: _run(s, sd, "antioracle"),
791
+ inputs=[scenario_dd, seed_in],
792
+ outputs=outputs_b,
793
+ )
794
+ btn_lazy.click(
795
+ lambda s, sd: _run(s, sd, "lazy_antioracle"),
796
+ inputs=[scenario_dd, seed_in],
797
+ outputs=outputs_b,
798
+ )
799
+ btn_spammer.click(
800
+ lambda s, sd: _run(s, sd, "spammer"),
801
+ inputs=[scenario_dd, seed_in],
802
+ outputs=outputs_b,
803
+ )
804
+
805
+ # ───── Quick test cases (one-click presets) ─────
806
+ gr.Markdown(
807
+ "---\n"
808
+ "### πŸ“‹ Quick test cases β€” demonstrating the grader\n"
809
+ "These five preset rollouts each take a few seconds. "
810
+ "The first three demonstrate **correct** play scoring "
811
+ "high; the last two are **deliberately penalised** so "
812
+ "you can watch the grader's `decision_accuracy`, "
813
+ "`evidence_coverage`, and `penalty` components fire."
814
+ )
815
+
816
+ def _tc_label(scenario_value: str) -> str:
817
+ """Map a scenario_name back to its dropdown label."""
818
+ for lab, val in SCENARIO_CHOICES:
819
+ if val == scenario_value:
820
+ return lab
821
+ return scenario_value
822
+
823
+ for tc in TEST_CASES:
824
+ with gr.Row():
825
+ with gr.Column(scale=2):
826
+ tc_btn = gr.Button(
827
+ tc["label"],
828
+ variant="primary"
829
+ if tc["agent"] in ("oracle", "heuristic")
830
+ else "stop",
831
+ )
832
+ with gr.Column(scale=5):
833
+ gr.Markdown(
834
+ f"*scenario=`{tc['scenario']}` Β· "
835
+ f"seed=`{tc['seed']}` Β· "
836
+ f"agent=`{tc['agent']}`* \n"
837
+ f"{tc['expectation']}"
838
+ )
839
+
840
+ def _make_runner(scenario_value: str, seed: int, agent_name: str):
841
+ scenario_label = _tc_label(scenario_value)
842
+
843
+ def _runner():
844
+ yield from _stream_baseline(
845
+ scenario_label, int(seed), agent_name,
846
+ )
847
+ return _runner
848
+
849
+ tc_btn.click(
850
+ _make_runner(tc["scenario"], tc["seed"], tc["agent"]),
851
+ inputs=None,
852
+ outputs=outputs_b,
853
+ )
854
 
855
  # ───────── Tab 2: Build your own actions ─────────
856
  with gr.TabItem("πŸ›  Build custom action"):
 
922
  "*(truth revealed when the episode ends)*",
923
  label="🎯 Hidden target profile",
924
  )
925
+ breakdown_md_2 = gr.Markdown(
926
+ "*(no reward yet)*",
927
+ label="πŸ“Š Grader breakdown",
928
+ )
929
+
930
+ with gr.Accordion("πŸ“‹ Raw observation JSON", open=False):
931
+ with gr.Row():
932
+ btn_show_json = gr.Button(
933
+ "πŸ“‹ Show observation JSON", variant="secondary",
934
+ )
935
+ obs_json_2 = gr.JSON(value={}, label="ValidationObservation")
936
+
937
+ tab2_outputs = [
938
+ env_state, obs_state, status_md, cum_reward, step_idx,
939
+ credits, dossier, truth, breakdown_md_2, obs_json_2,
940
+ ]
941
 
942
  btn_new.click(
943
  _new_episode,
944
  inputs=[scenario_dd2, seed_in2],
945
+ outputs=tab2_outputs,
 
 
 
946
  )
947
 
948
  btn_submit.click(
 
951
  env_state, obs_state, action_type, database,
952
  include_allosteric, final_decision, confidence, reasoning,
953
  ],
954
+ outputs=tab2_outputs,
955
+ )
956
+
957
+ # Manual "Show JSON" refresh β€” re-emit the current observation
958
+ # as JSON without advancing the env. Lets the user inspect the
959
+ # full ValidationObservation pydantic structure on demand.
960
+ def _show_obs_json(obs) -> Dict[str, Any]:
961
+ if obs is None:
962
+ return {"error": "no active episode β€” click 'πŸ”„ New episode' first"}
963
+ return _to_json_dict(obs)
964
+
965
+ btn_show_json.click(
966
+ _show_obs_json,
967
+ inputs=[obs_state],
968
+ outputs=[obs_json_2],
969
  )
970
 
971
  # ───────── Tab 3: Inspect hidden truth ─────────