anugrahteesdollar committed on
Commit
505bf67
·
verified Β·
1 Parent(s): e1066a0

demo: add 5 quick test cases + grader breakdown panel + Show JSON

Browse files
Files changed (1) hide show
  1. space/env/gradio_demo.py +365 -32
space/env/gradio_demo.py CHANGED
@@ -25,6 +25,7 @@ numpy.
25
 
26
  from __future__ import annotations
27
 
 
28
  import logging
29
  from typing import Any, Dict, Iterator, List, Optional, Tuple
30
 
@@ -67,12 +68,164 @@ def _resolve_scenario(label_or_value: str) -> Dict[str, Any]:
67
  return {"scenario_name": value}
68
 
69
 
70
- AGENT_CHOICES = ["random", "heuristic", "oracle"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
 
73
  # ── Helpers for rendering observations ──────────────────────────────────
74
 
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  def _credit_progress_md(obs) -> str:
77
  used = max(0, obs.credits_total - obs.credits_remaining)
78
  total = max(1, obs.credits_total)
@@ -185,8 +338,13 @@ def _stream_baseline(
185
  seed: int,
186
  agent_name: str,
187
  max_steps: int = 30,
188
- ) -> Iterator[Tuple[str, str, str, str, str, str]]:
189
- """Run a full episode in-process; yield UI updates per step."""
 
 
 
 
 
190
  import random
191
 
192
  from models import ActionType
@@ -214,27 +372,79 @@ def _stream_baseline(
214
  _credit_progress_md(obs),
215
  _dossier_md(obs),
216
  "*(truth revealed when the episode ends)*",
 
 
217
  )
218
 
219
  steps = 0
220
  while not obs.done and steps < max_steps:
221
  if agent_name == "random":
222
  action = _random_step(obs, rng)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  else:
224
- # ``oracle`` and ``heuristic`` both run the standard
225
- # pipeline order; oracle additionally patches the
226
- # terminal step with the hidden ``correct_decision``.
 
 
227
  action = _heuristic_step(obs, history)
228
  if (
229
- agent_name == "oracle"
230
- and action.action_type == ActionType.SUBMIT_VALIDATION_REPORT
231
  and env._latent is not None
232
  ):
233
- action = action.model_copy(update={
234
- "final_decision": env._latent.target.correct_decision,
235
- "confidence": 0.85,
236
- "reasoning": "Oracle: submit correct decision (peeked latent).",
237
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  history.append(action.action_type)
239
  obs = env.step(action)
240
  rew = float(obs.reward or 0.0)
@@ -252,6 +462,8 @@ def _stream_baseline(
252
  _credit_progress_md(obs),
253
  _dossier_md(obs),
254
  _truth_md(env._latent, obs.done) if obs.done else "*(truth revealed when the episode ends)*",
 
 
255
  )
256
 
257
  log_lines.append("-" * 70)
@@ -268,13 +480,17 @@ def _stream_baseline(
268
  _credit_progress_md(obs),
269
  _dossier_md(obs),
270
  _truth_md(env._latent, True),
 
 
271
  )
272
 
273
 
274
  # ── Tab 2: build your own actions ───────────────────────────────────────
275
 
276
 
277
- def _new_episode(scenario_label: str, seed: int) -> Tuple[Any, Any, str, str, str, str, str, str]:
 
 
278
  from server.hackathon_environment import DrugTargetEnvironment
279
 
280
  env = DrugTargetEnvironment(**_resolve_scenario(scenario_label))
@@ -294,6 +510,8 @@ def _new_episode(scenario_label: str, seed: int) -> Tuple[Any, Any, str, str, st
294
  _credit_progress_md(obs), # credits
295
  _dossier_md(obs), # dossier
296
  "*(submit a `submit_validation_report` or run out of credits to reveal)*",
 
 
297
  )
298
 
299
 
@@ -306,7 +524,7 @@ def _submit_step(
306
  final_decision: str,
307
  confidence: float,
308
  reasoning: str,
309
- ) -> Tuple[Any, Any, str, str, str, str, str, str]:
310
  from models import ActionType, DrugTargetAction
311
 
312
  if env is None or obs is None:
@@ -317,6 +535,8 @@ def _submit_step(
317
  "*(no episode)*",
318
  "*(no episode)*",
319
  "*(no episode)*",
 
 
320
  )
321
 
322
  if obs.done:
@@ -328,6 +548,8 @@ def _submit_step(
328
  _credit_progress_md(obs),
329
  _dossier_md(obs),
330
  _truth_md(env._latent, True),
 
 
331
  )
332
 
333
  try:
@@ -341,6 +563,8 @@ def _submit_step(
341
  _credit_progress_md(obs),
342
  _dossier_md(obs),
343
  "*(truth shown at end of episode)*",
 
 
344
  )
345
 
346
  params: Dict[str, Any] = {}
@@ -393,6 +617,8 @@ def _submit_step(
393
  _credit_progress_md(new_obs),
394
  _dossier_md(new_obs),
395
  _truth_md(env._latent, new_obs.done),
 
 
396
  )
397
 
398
 
@@ -482,11 +708,15 @@ def build_gradio_demo() -> gr.Blocks:
482
  with gr.TabItem("β–Ά Watch baseline agent"):
483
  gr.Markdown(
484
  "Pick a scenario and seed, then click one of **Random / "
485
- "Heuristic / Oracle**. The agent will play a full episode "
486
- "and stream every action+reward into the log. The "
487
- "**Oracle** baseline is the default because it always "
488
- "submits the correct decision β€” the most reliable way "
489
- "to see DrugEnv 'work'."
 
 
 
 
490
  )
491
 
492
  with gr.Row():
@@ -499,9 +729,12 @@ def build_gradio_demo() -> gr.Blocks:
499
  seed_in = gr.Number(value=7, precision=0, label="Seed")
500
 
501
  with gr.Row():
502
- btn_random = gr.Button("β–Ά Run Random agent", variant="secondary")
503
- btn_heuristic = gr.Button("β–Ά Run Heuristic agent", variant="secondary")
504
- btn_oracle = gr.Button("β–Ά Run Oracle agent", variant="primary")
 
 
 
505
 
506
  with gr.Row():
507
  with gr.Column(scale=3):
@@ -518,6 +751,13 @@ def build_gradio_demo() -> gr.Blocks:
518
  "*(truth revealed when the episode ends)*",
519
  label="🎯 Hidden target profile (revealed at end of episode)",
520
  )
 
 
 
 
 
 
 
521
 
522
  def _run(scenario_label, seed, agent_name):
523
  yield from _stream_baseline(
@@ -526,7 +766,11 @@ def build_gradio_demo() -> gr.Blocks:
526
  agent_name,
527
  )
528
 
529
- outputs_b = [log_md, cum_reward_b, step_b, credits_b, dossier_b, truth_b]
 
 
 
 
530
  btn_random.click(
531
  lambda s, sd: _run(s, sd, "random"),
532
  inputs=[scenario_dd, seed_in],
@@ -542,6 +786,71 @@ def build_gradio_demo() -> gr.Blocks:
542
  inputs=[scenario_dd, seed_in],
543
  outputs=outputs_b,
544
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
545
 
546
  # ───────── Tab 2: Build your own actions ─────────
547
  with gr.TabItem("πŸ›  Build custom action"):
@@ -613,14 +922,27 @@ def build_gradio_demo() -> gr.Blocks:
613
  "*(truth revealed when the episode ends)*",
614
  label="🎯 Hidden target profile",
615
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
616
 
617
  btn_new.click(
618
  _new_episode,
619
  inputs=[scenario_dd2, seed_in2],
620
- outputs=[
621
- env_state, obs_state, status_md, cum_reward, step_idx,
622
- credits, dossier, truth,
623
- ],
624
  )
625
 
626
  btn_submit.click(
@@ -629,10 +951,21 @@ def build_gradio_demo() -> gr.Blocks:
629
  env_state, obs_state, action_type, database,
630
  include_allosteric, final_decision, confidence, reasoning,
631
  ],
632
- outputs=[
633
- env_state, obs_state, status_md, cum_reward, step_idx,
634
- credits, dossier, truth,
635
- ],
 
 
 
 
 
 
 
 
 
 
 
636
  )
637
 
638
  # ───────── Tab 3: Inspect hidden truth ─────────
 
25
 
26
  from __future__ import annotations
27
 
28
+ import json
29
  import logging
30
  from typing import Any, Dict, Iterator, List, Optional, Tuple
31
 
 
68
  return {"scenario_name": value}
69
 
70
 
71
# Agent names accepted by ``_stream_baseline``: three honest baselines
# plus three deliberately-bad agents used to demo the grader's penalties.
AGENT_CHOICES = [
    "random", "heuristic", "oracle",
    "antioracle", "lazy_antioracle", "spammer",
]
79
+
80
# ── Quick test cases (preset (scenario, seed, agent, why) tuples) ──────
#
# Picked so the demo audience can see, in <30 s each, that the grader
# actually grades and that wrong play loses points. The first three are
# "positive" (correct decision → high terminal reward); the last two are
# *deliberately* penalised so you can show the rule / decision-accuracy
# components firing.
#
# NOTE: the label/expectation strings previously contained mojibake
# (UTF-8 text decoded as cp1252, e.g. ``βœ…`` for ✅, ``Β·`` for ·,
# ``β†’`` for →, ``β‰ˆ`` for ≈); they are repaired here.
TEST_CASES: List[Dict[str, Any]] = [
    {
        "label": "✅ Easy GO · Oracle on EGFR / NSCLC",
        "scenario": "egfr_nsclc_viable",
        "seed": 7,
        "agent": "oracle",
        "expectation": (
            "Oracle peeks at the latent target and submits the correct "
            "**`go`** with calibrated confidence on a clear-positive "
            "scenario → big positive `term_decision_accuracy` and "
            "`term_evidence_coverage`. Total cum reward ≈ **+6**."
        ),
    },
    {
        "label": "✅ Easy NO_GO · Oracle on TP53 / solid tumours",
        "scenario": "tp53_solid_tumors_clear_fail",
        "seed": 7,
        "agent": "oracle",
        "expectation": (
            "Oracle submits the correct **`no_go`** on an obvious "
            "tumour-suppressor (undruggable) target → also a big "
            "positive terminal. Shows the grader rewards correct "
            "*negative* decisions, not just `go`s."
        ),
    },
    {
        "label": "✅ Borderline GO · Heuristic on KRAS G12C / PDAC",
        "scenario": "kras_pdac_borderline",
        "seed": 11,
        "agent": "heuristic",
        "expectation": (
            "Fixed-pipeline heuristic on a medium-difficulty borderline "
            "case. Coverage is good, decision is usually correct → "
            "moderate positive terminal. Useful baseline to compare "
            "against the two penalty cases below."
        ),
    },
    {
        "label": "❌ Penalty: redundancy + confident-wrong · Lazy anti-oracle on KRAS",
        "scenario": "kras_pdac_borderline",
        "seed": 11,
        "agent": "lazy_antioracle",
        "expectation": (
            "Spams 12 redundant `query_expression` calls (firing the "
            "`redundant_*` soft-rule penalty repeatedly) then submits "
            "the **opposite** of the correct decision with confidence "
            "0.95. The grader stacks three guards: redundancy step "
            "penalties, near-zero `term_evidence_coverage`, and "
            "`confident_wrong_answer_penalty = -0.9`. Cum total goes "
            "**clearly negative** — vs the heuristic's ≈ +6 on the same "
            "scenario."
        ),
    },
    {
        "label": "❌ Penalty: format farming, never submits · Spammer on KRAS",
        "scenario": "kras_pdac_borderline",
        "seed": 11,
        "agent": "spammer",
        "expectation": (
            "Repeats `query_expression` for 30 steps and never reaches "
            "`submit_validation_report`. Triggers "
            "`no_report_submitted_penalty`, `redundancy_frac → 1.0` "
            "(zero credit_efficiency), and zero novelty after the first "
            "step. The grader's per-step reward floor (`step_reward_clip "
            "= +0.3`) is what stops this strategy from outscoring real "
            "submissions."
        ),
    },
]
157
 
158
 
159
  # ── Helpers for rendering observations ──────────────────────────────────
160
 
161
 
162
+ def _to_json_dict(value: Any) -> Any:
163
+ """Best-effort recursive conversion of pydantic/dataclass/dict objects
164
+ into a JSON-serialisable dict for ``gr.JSON``."""
165
+ if value is None or isinstance(value, (str, int, float, bool)):
166
+ return value
167
+ if hasattr(value, "model_dump"):
168
+ try:
169
+ return value.model_dump(mode="json")
170
+ except Exception:
171
+ try:
172
+ return value.model_dump()
173
+ except Exception:
174
+ pass
175
+ if isinstance(value, dict):
176
+ return {str(k): _to_json_dict(v) for k, v in value.items()}
177
+ if isinstance(value, (list, tuple, set)):
178
+ return [_to_json_dict(v) for v in value]
179
+ try:
180
+ return json.loads(json.dumps(value, default=str))
181
+ except Exception:
182
+ return str(value)
183
+
184
+
185
+ def _grader_breakdown_md(obs, terminal_only: bool = False) -> str:
186
+ """Format the per-component reward breakdown for the side panel.
187
+
188
+ DrugEnv puts the decomposed RewardBreakdown into
189
+ ``obs.step_reward_breakdown`` β€” both the step components and the
190
+ terminal components (prefixed with ``term_``) when the episode ends.
191
+ """
192
+ if obs is None:
193
+ return "*(no episode)*"
194
+ bd: Dict[str, float] = dict(getattr(obs, "step_reward_breakdown", {}) or {})
195
+ if not bd:
196
+ return "*(no reward yet β€” take a step)*"
197
+
198
+ step_keys = [
199
+ "novelty", "reasoning_coherence", "credit_efficiency",
200
+ "shaping", "penalty", "total",
201
+ ]
202
+ term_keys = [
203
+ "decision_accuracy", "evidence_coverage", "credit_efficiency",
204
+ "reasoning_coherence", "penalty", "terminal", "total",
205
+ ]
206
+
207
+ def _fmt(v: float) -> str:
208
+ return f"`{v:+.3f}`"
209
+
210
+ lines: List[str] = []
211
+ if not terminal_only:
212
+ step_present = [k for k in step_keys if k in bd]
213
+ if step_present:
214
+ lines.append("**Step reward components**")
215
+ for k in step_present:
216
+ lines.append(f"- {k}: {_fmt(bd[k])}")
217
+
218
+ term_present = [k for k in term_keys if f"term_{k}" in bd]
219
+ if term_present:
220
+ lines.append("\n**Terminal reward components** *(only at episode end)*")
221
+ for k in term_present:
222
+ lines.append(f"- {k}: {_fmt(bd[f'term_{k}'])}")
223
+
224
+ if not lines:
225
+ return "*(no reward yet β€” take a step)*"
226
+ return "\n".join(lines)
227
+
228
+
229
  def _credit_progress_md(obs) -> str:
230
  used = max(0, obs.credits_total - obs.credits_remaining)
231
  total = max(1, obs.credits_total)
 
338
  seed: int,
339
  agent_name: str,
340
  max_steps: int = 30,
341
+ ) -> Iterator[Tuple[str, str, str, str, str, str, str, Dict[str, Any]]]:
342
+ """Run a full episode in-process; yield UI updates per step.
343
+
344
+ Yields an 8-tuple of UI-bound values:
345
+ ``(log_md, cum_reward, step_idx, credits_md, dossier_md, truth_md,
346
+ breakdown_md, obs_json)``.
347
+ """
348
  import random
349
 
350
  from models import ActionType
 
372
  _credit_progress_md(obs),
373
  _dossier_md(obs),
374
  "*(truth revealed when the episode ends)*",
375
+ "*(no reward yet β€” first step pending)*",
376
+ _to_json_dict(obs),
377
  )
378
 
379
  steps = 0
380
  while not obs.done and steps < max_steps:
381
  if agent_name == "random":
382
  action = _random_step(obs, rng)
383
+ elif agent_name == "lazy_antioracle":
384
+ # Run a small burst of redundant cheap queries (to rack up
385
+ # `redundant_*` soft violations and tank `credit_efficiency`),
386
+ # then submit the *opposite* of the correct decision with
387
+ # confidence 0.95 to fire ``confident_wrong_answer_penalty``.
388
+ # Combined effect: cum total goes clearly negative.
389
+ from training.training_script import build_drug_target_action
390
+
391
+ REDUNDANT_QUERIES = 12
392
+ if len(history) < REDUNDANT_QUERIES:
393
+ action = build_drug_target_action(
394
+ ActionType.QUERY_EXPRESSION, obs,
395
+ )
396
+ else:
397
+ action = build_drug_target_action(
398
+ ActionType.SUBMIT_VALIDATION_REPORT, obs,
399
+ )
400
+ if env._latent is not None:
401
+ correct = env._latent.target.correct_decision
402
+ wrong = "no_go" if correct == "go" else "go"
403
+ action = action.model_copy(update={
404
+ "final_decision": wrong,
405
+ "confidence": 0.95,
406
+ "reasoning": (
407
+ "Lazy anti-oracle: redundant queries + opposite "
408
+ "decision with high confidence to compound "
409
+ "redundancy and confident-wrong penalties."
410
+ ),
411
+ })
412
+ elif agent_name == "spammer":
413
+ # Repeat the cheapest action over and over without ever
414
+ # submitting. Triggers redundancy penalties + the
415
+ # ``no_report_submitted_penalty`` at terminal.
416
+ from training.training_script import build_drug_target_action
417
+
418
+ action = build_drug_target_action(ActionType.QUERY_EXPRESSION, obs)
419
  else:
420
+ # ``oracle``, ``antioracle``, and ``heuristic`` all run the
421
+ # standard pipeline order; oracle / antioracle additionally
422
+ # patch the terminal step (oracle = correct decision,
423
+ # antioracle = opposite decision with high confidence β€” to
424
+ # demo the overconfident-wrong penalty in the grader).
425
  action = _heuristic_step(obs, history)
426
  if (
427
+ action.action_type == ActionType.SUBMIT_VALIDATION_REPORT
 
428
  and env._latent is not None
429
  ):
430
+ if agent_name == "oracle":
431
+ action = action.model_copy(update={
432
+ "final_decision": env._latent.target.correct_decision,
433
+ "confidence": 0.85,
434
+ "reasoning": "Oracle: submit correct decision (peeked latent).",
435
+ })
436
+ elif agent_name == "antioracle":
437
+ correct = env._latent.target.correct_decision
438
+ wrong = "no_go" if correct == "go" else "go"
439
+ action = action.model_copy(update={
440
+ "final_decision": wrong,
441
+ "confidence": 0.95,
442
+ "reasoning": (
443
+ "Anti-oracle: submit deliberately wrong decision "
444
+ "with high confidence to trigger the "
445
+ "overconfident-wrong penalty."
446
+ ),
447
+ })
448
  history.append(action.action_type)
449
  obs = env.step(action)
450
  rew = float(obs.reward or 0.0)
 
462
  _credit_progress_md(obs),
463
  _dossier_md(obs),
464
  _truth_md(env._latent, obs.done) if obs.done else "*(truth revealed when the episode ends)*",
465
+ _grader_breakdown_md(obs, terminal_only=False),
466
+ _to_json_dict(obs),
467
  )
468
 
469
  log_lines.append("-" * 70)
 
480
  _credit_progress_md(obs),
481
  _dossier_md(obs),
482
  _truth_md(env._latent, True),
483
+ _grader_breakdown_md(obs, terminal_only=False),
484
+ _to_json_dict(obs),
485
  )
486
 
487
 
488
  # ── Tab 2: build your own actions ───────────────────────────────────────
489
 
490
 
491
+ def _new_episode(
492
+ scenario_label: str, seed: int,
493
+ ) -> Tuple[Any, Any, str, str, str, str, str, str, str, Dict[str, Any]]:
494
  from server.hackathon_environment import DrugTargetEnvironment
495
 
496
  env = DrugTargetEnvironment(**_resolve_scenario(scenario_label))
 
510
  _credit_progress_md(obs), # credits
511
  _dossier_md(obs), # dossier
512
  "*(submit a `submit_validation_report` or run out of credits to reveal)*",
513
+ "*(no reward yet β€” take a step)*", # breakdown_md
514
+ _to_json_dict(obs), # obs_json
515
  )
516
 
517
 
 
524
  final_decision: str,
525
  confidence: float,
526
  reasoning: str,
527
+ ) -> Tuple[Any, Any, str, str, str, str, str, str, str, Dict[str, Any]]:
528
  from models import ActionType, DrugTargetAction
529
 
530
  if env is None or obs is None:
 
535
  "*(no episode)*",
536
  "*(no episode)*",
537
  "*(no episode)*",
538
+ "*(no episode)*",
539
+ {},
540
  )
541
 
542
  if obs.done:
 
548
  _credit_progress_md(obs),
549
  _dossier_md(obs),
550
  _truth_md(env._latent, True),
551
+ _grader_breakdown_md(obs, terminal_only=False),
552
+ _to_json_dict(obs),
553
  )
554
 
555
  try:
 
563
  _credit_progress_md(obs),
564
  _dossier_md(obs),
565
  "*(truth shown at end of episode)*",
566
+ _grader_breakdown_md(obs, terminal_only=False),
567
+ _to_json_dict(obs),
568
  )
569
 
570
  params: Dict[str, Any] = {}
 
617
  _credit_progress_md(new_obs),
618
  _dossier_md(new_obs),
619
  _truth_md(env._latent, new_obs.done),
620
+ _grader_breakdown_md(new_obs, terminal_only=False),
621
+ _to_json_dict(new_obs),
622
  )
623
 
624
 
 
708
  with gr.TabItem("β–Ά Watch baseline agent"):
709
  gr.Markdown(
710
  "Pick a scenario and seed, then click one of **Random / "
711
+ "Heuristic / Oracle / Anti-oracle**. The agent will play "
712
+ "a full episode and stream every action+reward into the "
713
+ "log. The **Oracle** baseline submits the ground-truth "
714
+ "decision; the **Anti-oracle** submits the *opposite* "
715
+ "with high confidence β€” a quick way to see the grader's "
716
+ "overconfident-wrong penalty fire.\n\n"
717
+ "Or jump to **πŸ“‹ Quick test cases** below for one-click "
718
+ "presets that demonstrate both happy-path scoring and "
719
+ "two deliberately-penalised failure modes."
720
  )
721
 
722
  with gr.Row():
 
729
  seed_in = gr.Number(value=7, precision=0, label="Seed")
730
 
731
  with gr.Row():
732
+ btn_random = gr.Button("β–Ά Random", variant="secondary")
733
+ btn_heuristic = gr.Button("β–Ά Heuristic", variant="secondary")
734
+ btn_oracle = gr.Button("β–Ά Oracle (correct)", variant="primary")
735
+ btn_antioracle = gr.Button("β–Ά Anti-oracle (wrong)", variant="stop")
736
+ btn_lazy = gr.Button("β–Ά Lazy anti-oracle", variant="stop")
737
+ btn_spammer = gr.Button("β–Ά Spammer (no submit)", variant="stop")
738
 
739
  with gr.Row():
740
  with gr.Column(scale=3):
 
751
  "*(truth revealed when the episode ends)*",
752
  label="🎯 Hidden target profile (revealed at end of episode)",
753
  )
754
+ breakdown_b = gr.Markdown(
755
+ "*(no reward yet)*",
756
+ label="πŸ“Š Grader breakdown (per-component reward)",
757
+ )
758
+
759
+ with gr.Accordion("πŸ“‹ Raw observation JSON (latest step)", open=False):
760
+ obs_json_b = gr.JSON(value={}, label="ValidationObservation")
761
 
762
  def _run(scenario_label, seed, agent_name):
763
  yield from _stream_baseline(
 
766
  agent_name,
767
  )
768
 
769
+ outputs_b = [
770
+ log_md, cum_reward_b, step_b,
771
+ credits_b, dossier_b, truth_b,
772
+ breakdown_b, obs_json_b,
773
+ ]
774
  btn_random.click(
775
  lambda s, sd: _run(s, sd, "random"),
776
  inputs=[scenario_dd, seed_in],
 
786
  inputs=[scenario_dd, seed_in],
787
  outputs=outputs_b,
788
  )
789
+ btn_antioracle.click(
790
+ lambda s, sd: _run(s, sd, "antioracle"),
791
+ inputs=[scenario_dd, seed_in],
792
+ outputs=outputs_b,
793
+ )
794
+ btn_lazy.click(
795
+ lambda s, sd: _run(s, sd, "lazy_antioracle"),
796
+ inputs=[scenario_dd, seed_in],
797
+ outputs=outputs_b,
798
+ )
799
+ btn_spammer.click(
800
+ lambda s, sd: _run(s, sd, "spammer"),
801
+ inputs=[scenario_dd, seed_in],
802
+ outputs=outputs_b,
803
+ )
804
+
805
+ # ───── Quick test cases (one-click presets) ─────
806
+ gr.Markdown(
807
+ "---\n"
808
+ "### πŸ“‹ Quick test cases β€” demonstrating the grader\n"
809
+ "These five preset rollouts each take a few seconds. "
810
+ "The first three demonstrate **correct** play scoring "
811
+ "high; the last two are **deliberately penalised** so "
812
+ "you can watch the grader's `decision_accuracy`, "
813
+ "`evidence_coverage`, and `penalty` components fire."
814
+ )
815
+
816
+ def _tc_label(scenario_value: str) -> str:
817
+ """Map a scenario_name back to its dropdown label."""
818
+ for lab, val in SCENARIO_CHOICES:
819
+ if val == scenario_value:
820
+ return lab
821
+ return scenario_value
822
+
823
+ for tc in TEST_CASES:
824
+ with gr.Row():
825
+ with gr.Column(scale=2):
826
+ tc_btn = gr.Button(
827
+ tc["label"],
828
+ variant="primary"
829
+ if tc["agent"] in ("oracle", "heuristic")
830
+ else "stop",
831
+ )
832
+ with gr.Column(scale=5):
833
+ gr.Markdown(
834
+ f"*scenario=`{tc['scenario']}` Β· "
835
+ f"seed=`{tc['seed']}` Β· "
836
+ f"agent=`{tc['agent']}`* \n"
837
+ f"{tc['expectation']}"
838
+ )
839
+
840
+ def _make_runner(scenario_value: str, seed: int, agent_name: str):
841
+ scenario_label = _tc_label(scenario_value)
842
+
843
+ def _runner():
844
+ yield from _stream_baseline(
845
+ scenario_label, int(seed), agent_name,
846
+ )
847
+ return _runner
848
+
849
+ tc_btn.click(
850
+ _make_runner(tc["scenario"], tc["seed"], tc["agent"]),
851
+ inputs=None,
852
+ outputs=outputs_b,
853
+ )
854
 
855
  # ───────── Tab 2: Build your own actions ─────────
856
  with gr.TabItem("πŸ›  Build custom action"):
 
922
  "*(truth revealed when the episode ends)*",
923
  label="🎯 Hidden target profile",
924
  )
925
+ breakdown_md_2 = gr.Markdown(
926
+ "*(no reward yet)*",
927
+ label="πŸ“Š Grader breakdown",
928
+ )
929
+
930
+ with gr.Accordion("πŸ“‹ Raw observation JSON", open=False):
931
+ with gr.Row():
932
+ btn_show_json = gr.Button(
933
+ "πŸ“‹ Show observation JSON", variant="secondary",
934
+ )
935
+ obs_json_2 = gr.JSON(value={}, label="ValidationObservation")
936
+
937
+ tab2_outputs = [
938
+ env_state, obs_state, status_md, cum_reward, step_idx,
939
+ credits, dossier, truth, breakdown_md_2, obs_json_2,
940
+ ]
941
 
942
  btn_new.click(
943
  _new_episode,
944
  inputs=[scenario_dd2, seed_in2],
945
+ outputs=tab2_outputs,
 
 
 
946
  )
947
 
948
  btn_submit.click(
 
951
  env_state, obs_state, action_type, database,
952
  include_allosteric, final_decision, confidence, reasoning,
953
  ],
954
+ outputs=tab2_outputs,
955
+ )
956
+
957
+ # Manual "Show JSON" refresh β€” re-emit the current observation
958
+ # as JSON without advancing the env. Lets the user inspect the
959
+ # full ValidationObservation pydantic structure on demand.
960
+ def _show_obs_json(obs) -> Dict[str, Any]:
961
+ if obs is None:
962
+ return {"error": "no active episode β€” click 'πŸ”„ New episode' first"}
963
+ return _to_json_dict(obs)
964
+
965
+ btn_show_json.click(
966
+ _show_obs_json,
967
+ inputs=[obs_state],
968
+ outputs=[obs_json_2],
969
  )
970
 
971
  # ───────── Tab 3: Inspect hidden truth ─────────