TheJackBright committed (verified) · Commit e21fe7d · 1 Parent(s): 21c7db9

Deploy PolyGuard OpenEnv Space (polyguard-rl @ 52ecdc8)

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .env.example +8 -0
  2. .gitattributes +1 -0
  3. Dockerfile +3 -3
  4. Dockerfile.space +2 -3
  5. README.md +40 -4
  6. app/api/routes.py +12 -0
  7. app/api/schemas.py +8 -0
  8. app/tools/__init__.py +1 -0
  9. app/tools/medication_alternatives.py +463 -0
  10. app/ui/frontend/src/App.tsx +14 -0
  11. app/ui/frontend/src/components/AlternativeMedicineSearch.tsx +215 -0
  12. app/ui/frontend/src/styles/theme.css +102 -0
  13. docker/space/README.md +28 -10
  14. docs/DEMO_RECORDING_SCRIPT.md +493 -0
  15. docs/deployment.md +1 -1
  16. docs/final_submission_audit.md +1 -1
  17. docs/idea_document_traceability.md +57 -0
  18. docs/participant_guide_traceability.md +1 -1
  19. docs/ui.md +4 -0
  20. scripts/deploy_space_api.py +25 -3
  21. scripts/install_hf_active_bundle.py +18 -7
  22. submission_bundle/README.md +13 -0
  23. submission_bundle/grpo_training_cycle/docs_results/README.md +22 -0
  24. submission_bundle/grpo_training_cycle/docs_results/acceptance_gate.json +11 -0
  25. submission_bundle/grpo_training_cycle/docs_results/avg_process_fidelity.png +0 -0
  26. submission_bundle/grpo_training_cycle/docs_results/avg_reward.png +0 -0
  27. submission_bundle/grpo_training_cycle/docs_results/baselines.json +119 -0
  28. submission_bundle/grpo_training_cycle/docs_results/benchmark_report.json +52 -0
  29. submission_bundle/grpo_training_cycle/docs_results/benchmark_report.txt +52 -0
  30. submission_bundle/grpo_training_cycle/docs_results/dose_train.json +6 -0
  31. submission_bundle/grpo_training_cycle/docs_results/dosing_grpo.json +28 -0
  32. submission_bundle/grpo_training_cycle/docs_results/frontier_ready.json +8 -0
  33. submission_bundle/grpo_training_cycle/docs_results/graph_train.json +5 -0
  34. submission_bundle/grpo_training_cycle/docs_results/grpo_ablation_report.json +149 -0
  35. submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run.json +42 -0
  36. submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run_auto.json +39 -0
  37. submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run_fallback_check.json +39 -0
  38. submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run_smoke.json +39 -0
  39. submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run_strict_check.json +39 -0
  40. submission_bundle/grpo_training_cycle/docs_results/hf_space_verification.json +29 -0
  41. submission_bundle/grpo_training_cycle/docs_results/hf_training_status.json +123 -0
  42. submission_bundle/grpo_training_cycle/docs_results/improvement_report.json +19 -0
  43. submission_bundle/grpo_training_cycle/docs_results/improvement_report_benchmark.json +19 -0
  44. submission_bundle/grpo_training_cycle/docs_results/inference_benchmark.json +22 -0
  45. submission_bundle/grpo_training_cycle/docs_results/legality_rate.png +0 -0
  46. submission_bundle/grpo_training_cycle/docs_results/planner_grpo.json +28 -0
  47. submission_bundle/grpo_training_cycle/docs_results/plot_index.json +9 -0
  48. submission_bundle/grpo_training_cycle/docs_results/policy_stack_avg_reward.png +0 -0
  49. submission_bundle/grpo_training_cycle/docs_results/postsave_inference.json +43 -0
  50. submission_bundle/grpo_training_cycle/docs_results/risk_train.json +6 -0
.env.example CHANGED
@@ -20,3 +20,11 @@ POLYGUARD_FRONTIER_MODEL=Qwen/Qwen2.5-7B-Instruct
  POLYGUARD_ALLOW_WEB_FETCH=false
  POLYGUARD_REWARD_MIN=0.001
  POLYGUARD_REWARD_MAX=0.999
+
+ # --- Medication alternatives tool (FDA openFDA + optional external CDS) ---
+ # Optional: higher openFDA rate limits — request a key at https://open.fda.gov/apis/authentication/
+ # POLYGUARD_OPENFDA_API_KEY=
+ # Optional: POST { "drug_names": ["..."] } to your service; Bearer token if required (Tally/Vellum/custom).
+ # Never commit real tokens; set in Space secrets or local .env only.
+ # POLYGUARD_MED_TOOL_URL=
+ # POLYGUARD_MED_TOOL_TOKEN=
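For reference, the external hook behind `POLYGUARD_MED_TOOL_URL` only has to honor a small contract: accept `POST {"drug_names": [...]}` (optionally with a Bearer token) and return a JSON object whose `suggestions` list holds strings or dicts with at least `display_name`, which is what `_external_suggestions` in `app/tools/medication_alternatives.py` parses. A minimal illustrative stub (a hypothetical service, not part of this commit) could look like:

```python
# hypothetical_cds_stub.py — illustrative only; any service matching this request/response shape works.
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel

app = FastAPI()


class ToolRequest(BaseModel):
    drug_names: list[str]


@app.post("/alternatives")
def alternatives(payload: ToolRequest, authorization: str | None = Header(default=None)) -> dict:
    # If POLYGUARD_MED_TOOL_TOKEN is set, PolyGuard sends "Authorization: Bearer <token>".
    if authorization is not None and not authorization.startswith("Bearer "):
        raise HTTPException(status_code=401, detail="expected bearer token")
    # Plain strings or dicts with "display_name" are both accepted by the caller.
    return {"suggestions": [{"display_name": f"alternative-for-{name}"} for name in payload.drug_names]}
```

Point `POLYGUARD_MED_TOOL_URL` at the deployed route and keep `POLYGUARD_MED_TOOL_TOKEN` in Space secrets or a local `.env` only, as the comments above say.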
.gitattributes CHANGED
@@ -41,3 +41,4 @@ docs/results/submission_evidence/qwen_0_5b_1_5b/reward_component_bars.png filter
  docs/results/submission_evidence/qwen_0_5b_1_5b_3b/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
  docs/results/submission_evidence_qwen_0_5b_1_5b/charts/generated/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
  docs/results/submission_evidence_qwen_0_5b_1_5b_3b/charts/generated/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
+ submission_bundle/qwen_completed_runs/charts/generated/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -1,6 +1,6 @@
- # Hugging Face Space: single-port edge (nginx) + OpenEnv (8100) + API (8200) + static UI.
- # Build from repository root: docker build -f Dockerfile.space -t polyguard-space .
- # Cheap tier: use Space "CPU basic"; first boot downloads ~1.1GB model bundle.
+ # Hugging Face Space: nginx on PORT (7860) + OpenEnv (8100) + API (8200) + Vite-built UI.
+ # Build: docker build -t polyguard-space .
+ # HF Spaces use this file by default when "Dockerfile path" is unset; keep this as the demo image.

  FROM node:20-bookworm-slim AS frontend
  WORKDIR /build
Dockerfile.space CHANGED
@@ -1,6 +1,5 @@
- # Hugging Face Space: single-port edge (nginx) + OpenEnv (8100) + API (8200) + static UI.
- # Build from repository root: docker build -f Dockerfile.space -t polyguard-space .
- # Cheap tier: use Space "CPU basic"; first boot downloads ~1.1GB model bundle.
+ # Same image as ./Dockerfile; use this path in HF Space settings if "Dockerfile path"
+ # must be explicit (e.g. Dockerfile.space). Keep in sync with Dockerfile.

  FROM node:20-bookworm-slim AS frontend
  WORKDIR /build
README.md CHANGED
@@ -1,12 +1,48 @@
  ---
  title: PolyGuard OpenEnv
- emoji: 🛡️
  colorFrom: blue
- colorTo: purple
+ colorTo: green
  sdk: docker
  app_port: 7860
  pinned: false
- license: mit
  ---

- Full-stack **PolyGuard** workbench: OpenEnv (WebSocket), FastAPI, and React UI behind nginx on `PORT`. Uses **CPU basic**; first cold start downloads the public [usable model bundle](https://huggingface.co/TheJackBright/polyguard-openenv-training-full-artifacts/tree/main/usable_model_bundles/local-qwen-0-5b-active-smoke) (~1.1 GB). See `docker/space/README.md` for details.
+ # PolyGuard (OpenEnv implementation package)
+
+ Run all CLI commands from this directory (`cd polyguard-rl`). The repository root [`README.md`](../README.md) carries the same submission narrative with paths adjusted for viewers landing on the GitHub repo home page.
+
+ ## Submission Links
+
+ - GitHub Repo URL: [https://github.com/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK](https://github.com/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK)
+ - HF Space URL: [https://huggingface.co/spaces/TheJackBright/polyguard-openenv](https://huggingface.co/spaces/TheJackBright/polyguard-openenv)
+ - Colab Notebook URL: [https://colab.research.google.com/github/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK/blob/master/polyguard-rl/PolyGuard_SFT_GRPO_One_Run_Runner.ipynb](https://colab.research.google.com/github/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK/blob/master/polyguard-rl/PolyGuard_SFT_GRPO_One_Run_Runner.ipynb) (see also `notebooks/09_training_loop.ipynb` for a modular training walkthrough)
+ - YouTube Video URL: not used for this submission; see Hugging Face Blog URL below.
+ - Hugging Face Blog URL: [https://huggingface.co/blog/TheJackBright/polyguard-openenv](https://huggingface.co/blog/TheJackBright/polyguard-openenv) *(publish `docs/hf_blog_draft.md` or replace with a live story URL)*
+
+ ## Problem Statement
+
+ Polypharmacy decisions are long-horizon, partially observable, and safety-critical. PolyGuard is a research environment where an LLM agent selects constrained clinical actions, receives verifier-backed reward, and improves via SFT + GRPO—not generic open-ended chat fine-tuning.
+
+ ## Environment
+
+ `PolyGuardEnv` exposes OpenEnv-style HTTP/WebSocket endpoints (`/reset`, `/step`, `/state`, `/metadata`, `/schema`, `/mcp`, `/health`, `/ws`). Sub-environments include DDI, bandit mining, regimen risk, precision dosing, longitudinal deprescribing, web-search missing data, alternative suggestion, and new-drug decomposition. See `openenv.yaml`, `app/env/env_core.py`, `app/env/fastapi_app.py`, and `docs/environment_design.md`.
+
+ ## Agent Capabilities
+
+ Medication reconciliation, evidence retrieval, graph safety, dosing guardrails, candidate generation, supervisor routing, planner/critic stack, explanations, and contextual bandit ranking for ablations (`app/agents/`, `docs/agents.md`).
+
+ ## Tasks
+
+ DDI risk reduction, safe adds/substitutions, regimen optimization, taper/deprescribing sequences, precision dosing, missing-data recovery, and new-drug decomposition (`data/scenarios/`, `app/env/catalog.py`).
+
+ ## Reward Model / Evaluation Logic
+
+ Thirteen verifier-backed reward components roll up into four primary channels (`safety_legality`, `clinical_improvement`, `dosing_quality`, `process_integrity`), clamped to `[0.001, 0.999]`, with anti-cheat and timeout logic (`app/env/reward_router.py`, `app/env/anti_cheat.py`, `docs/reward_design.md`).
+
+ ## Training And Post-Training Strategy
+
+ Build corpora (`scripts/bootstrap_data.py`, `scripts/build_training_corpus.py`), SFT with TRL (`scripts/train_sft_trl.py`), GRPO with environment reward (`scripts/train_grpo_trl.py`), merge adapters (`scripts/merge_adapters_safe.py`), validate inference (`scripts/test_inference_postsave.py`), evaluate and plot (`scripts/evaluate_*.py`, `docs/results/`). Optional HF GPU training: `scripts/deploy_training_space.py`. Full commands: repository root [`README.md`](../README.md) or `docs/training.md`.
+
+ ## Documentation index
+
+ - [Architecture](docs/architecture.md) · [Environment](docs/environment_design.md) · [Rewards](docs/reward_design.md) · [Training](docs/training.md) · [Evaluation](docs/evaluation.md) · [Deployment](docs/deployment.md) · [Datasets](docs/datasets.md) · [Participant guide traceability](docs/participant_guide_traceability.md) · [Idea doc vs implementation](docs/idea_document_traceability.md) · [**Space UI demo script**](docs/DEMO_RECORDING_SCRIPT.md)
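Reviewer note: the Environment section above names the OpenEnv-style endpoints but not their payloads. As a rough orientation only (the reset body `{difficulty, sub_environment}` and the `candidate_action_set` / `candidate_id` field names are taken from the demo script added later in this commit; the authoritative schemas are in `app/env/fastapi_app.py` and `app/api/schemas.py`), a reset-and-inspect round trip against a locally running Space image might look like:

```python
# Sketch only — endpoint shapes beyond /env/reset with {difficulty, sub_environment}
# are assumptions to verify against the repo before relying on them.
import requests

BASE = "http://localhost:7860/api"  # nginx routes /api/* to the product API on 8200

# Reset an episode the same way the UI does in Advanced mode.
payload = {"difficulty": "easy", "sub_environment": "DDI"}
reset = requests.post(f"{BASE}/env/reset", json=payload, timeout=60).json()

# The reset response may wrap the observation; handle both shapes defensively.
obs = reset.get("observation", reset)
for cand in obs.get("candidate_action_set", []):
    # "candidate_id" is shown in the UI's Candidate Actions panel; other keys may vary.
    print(cand.get("candidate_id"), cand)
```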
app/api/routes.py CHANGED
@@ -5,9 +5,11 @@ from __future__ import annotations
  from fastapi import APIRouter, Depends, HTTPException

  from app.api.dependencies import get_service
+ from app.tools.medication_alternatives import build_alternatives_response
  from app.api.schemas import (
      BatchInferRequest,
      EvidenceQueryRequest,
+     MedicationAlternativesRequest,
      OrchestrateRequest,
      ResetRequest,
      StepCandidateRequest,
@@ -137,3 +139,13 @@ def cases_search(q: str, service: APIService = Depends(get_service)) -> list[dict]:
  @router.post("/evidence/query")
  def evidence_query(payload: EvidenceQueryRequest, service: APIService = Depends(get_service)) -> list[dict]:
      return service.evidence_query(query=payload.query, top_k=payload.top_k)
+
+
+ @router.post("/tools/medication_alternatives")
+ def medication_alternatives(payload: MedicationAlternativesRequest) -> dict:
+     """OpenFDA class neighbors + optional external POST (env: POLYGUARD_MED_TOOL_URL / TOKEN)."""
+     return build_alternatives_response(
+         query_drug=payload.query_drug,
+         regimen_drugs=payload.regimen_drugs,
+         max_suggestions=payload.max_suggestions,
+     )
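The new route takes the `MedicationAlternativesRequest` fields defined in the next file (`query_drug`, `regimen_drugs`, `max_suggestions`) and returns the dict built by `build_alternatives_response`. A quick smoke call against a running instance (the `/api` prefix assumes the nginx routing described in `docker/space/README.md`) might look like:

```python
# Illustrative request only; response keys follow build_alternatives_response
# (focus_drug, therapeutic_class, suggestions, source, disclaimer, errors).
import requests

resp = requests.post(
    "http://localhost:7860/api/tools/medication_alternatives",
    json={
        "query_drug": "diazepam",
        "regimen_drugs": ["diazepam", "ibuprofen"],
        "max_suggestions": 5,
    },
    timeout=60,
)
resp.raise_for_status()
data = resp.json()
print(data["therapeutic_class"], [s["display_name"] for s in data["suggestions"]])
```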
app/api/schemas.py CHANGED
@@ -55,3 +55,11 @@ class BatchInferRequest(StrictSchema):
  class EvidenceQueryRequest(StrictSchema):
      query: str
      top_k: int = 5
+
+
+ class MedicationAlternativesRequest(StrictSchema):
+     """FDA / external tool: suggest other labeled drugs in a similar pharmacologic class."""
+
+     query_drug: Optional[str] = None
+     regimen_drugs: list[str] = Field(default_factory=list)
+     max_suggestions: int = Field(default=10, ge=1, le=25)
app/tools/__init__.py ADDED
@@ -0,0 +1 @@
+ """Optional product tools (FDA search, external CDS hooks)."""
app/tools/medication_alternatives.py ADDED
@@ -0,0 +1,463 @@
1
+ """OpenFDA-backed medication class search + optional external HTTP tool.
2
+
3
+ Secrets (OpenFDA key, Tally/Vellum/custom bearer tokens) must come from env only.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import os
10
+ import re
11
+ from typing import Any
12
+ from urllib.parse import quote
13
+
14
+ import requests
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ OPENFDA_LABEL = "https://api.fda.gov/drug/label.json"
19
+ _DEFAULT_DISCLAIMER = (
20
+ "Research aid only — not medical advice. FDA labels may be incomplete; verify in approved prescribing information."
21
+ )
22
+
23
+
24
+ def _openfda_key_suffix() -> str:
25
+ key = os.getenv("POLYGUARD_OPENFDA_API_KEY", "").strip()
26
+ if not key:
27
+ return ""
28
+ return f"&api_key={quote(key, safe='')}"
29
+
30
+
31
+ def _fda_get(search: str, limit: int) -> dict[str, Any] | None:
32
+ """GET openFDA label.json; returns parsed JSON or None on failure."""
33
+ q = quote(search, safe="")
34
+ url = f"{OPENFDA_LABEL}?search={q}&limit={int(limit)}{_openfda_key_suffix()}"
35
+ try:
36
+ resp = requests.get(url, timeout=14)
37
+ if resp.status_code != 200:
38
+ logger.warning("openfda_http_%s: %s", resp.status_code, resp.text[:200])
39
+ return None
40
+ return resp.json()
41
+ except Exception as exc: # noqa: BLE001
42
+ logger.warning("openfda_request_failed: %s", exc)
43
+ return None
44
+
45
+
46
+ def _first_openfda(payload: dict[str, Any] | None) -> dict[str, Any]:
47
+ if not payload or "results" not in payload:
48
+ return {}
49
+ results = payload.get("results")
50
+ if not isinstance(results, list) or not results:
51
+ return {}
52
+ first = results[0]
53
+ return first if isinstance(first, dict) else {}
54
+
55
+
56
+ def _openfda_block(label: dict[str, Any]) -> dict[str, Any]:
57
+ block = label.get("openfda")
58
+ return block if isinstance(block, dict) else {}
59
+
60
+
61
+ def _listify(value: Any) -> list[str]:
62
+ if value is None:
63
+ return []
64
+ if isinstance(value, str):
65
+ return [value]
66
+ if isinstance(value, list):
67
+ return [str(x).strip() for x in value if str(x).strip()]
68
+ return [str(value).strip()]
69
+
70
+
71
+ def _snippet(text: Any, max_len: int = 380) -> str | None:
72
+ if not text:
73
+ return None
74
+ if isinstance(text, list):
75
+ text = " ".join(str(x) for x in text[:6])
76
+ s = re.sub(r"\s+", " ", str(text)).strip()
77
+ if len(s) <= max_len:
78
+ return s
79
+ return s[: max_len - 1] + "…"
80
+
81
+
82
+ def _label_link(set_id: str | None) -> str | None:
83
+ if not set_id:
84
+ return None
85
+ return f"https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid={set_id}"
86
+
87
+
88
+ # Keywords from free text / simulator tokens → openFDA pharm_class_epc strings (exact or prefix).
89
+ _KEYWORD_EPCS: tuple[tuple[str, tuple[str, ...]], ...] = (
90
+ ("benzodiazepine", ("Benzodiazepine", "Benzodiazepine Sedative")),
91
+ ("benzo", ("Benzodiazepine",)),
92
+ ("nsaid", ("Nonsteroidal Anti-inflammatory Drug",)),
93
+ ("opioid", ("Opioid Agonist", "Full Opioid Agonists")),
94
+ ("statin", ("HMG-CoA Reductase Inhibitor",)),
95
+ ("beta blocker", ("beta-Adrenergic Blocker",)),
96
+ ("betablocker", ("beta-Adrenergic Blocker",)),
97
+ ("ace inhibitor", ("Angiotensin-converting Enzyme Inhibitor",)),
98
+ ("arb", ("Angiotensin II Receptor Blocker",)),
99
+ ("ppi", ("Proton Pump Inhibitor",)),
100
+ ("ssri", ("Selective Serotonin Reuptake Inhibitor",)),
101
+ # Anticoagulant / antiplatelet (simulator warfarin_like → warfarin)
102
+ ("warfarin", ("Vitamin K Antagonist",)),
103
+ ("heparin", ("Thrombin Inhibitor", "Factor Xa Inhibitor")),
104
+ )
105
+
106
+
107
+ def _normalize_simulator_query(q: str) -> str:
108
+ """Strip simulator suffixes and underscores so benzodiazepine_like → benzodiazepine."""
109
+ raw = q.strip().lower()[:120]
110
+ if not raw:
111
+ return ""
112
+ for suf in ("_like", "_analog", "_analogue", "_class", "_group", "_category"):
113
+ if raw.endswith(suf):
114
+ raw = raw[: -len(suf)].strip("_").strip()
115
+ return raw.replace("_", " ").strip()
116
+
117
+
118
+ def _class_search_variants(focus: str) -> list[str]:
119
+ """Ordered strings to try as openFDA pharm_class_epc (exact quoted) or wildcard body."""
120
+ raw = _normalize_simulator_query(focus)
121
+ if not raw:
122
+ return []
123
+ out: list[str] = []
124
+ seen: set[str] = set()
125
+
126
+ def add(s: str) -> None:
127
+ t = s.strip()
128
+ if len(t) < 3:
129
+ return
130
+ k = t.lower()
131
+ if k in seen:
132
+ return
133
+ seen.add(k)
134
+ out.append(t)
135
+
136
+ compact = raw.replace(" ", "")
137
+ # Prefer canonical FDA class strings before raw lowercase (better labels + display).
138
+ for kw, epcs in _KEYWORD_EPCS:
139
+ if kw in compact or kw in raw:
140
+ for e in epcs:
141
+ add(e)
142
+ add(raw)
143
+ first = raw.split()[0]
144
+ if first != raw:
145
+ add(first)
146
+ if raw and " " not in raw and raw.isalpha():
147
+ add(raw[0].upper() + raw[1:])
148
+ return out
149
+
150
+
151
+ def _resolve_focus_drug(query_drug: str | None, regimen_drugs: list[str]) -> str:
152
+ """Prefer explicit query_drug from client; do not silently use regimen[0] when multiple rows exist."""
153
+ q = (query_drug or "").strip()
154
+ if q:
155
+ return q
156
+ if len(regimen_drugs) == 1:
157
+ t = str(regimen_drugs[0]).strip()
158
+ return t
159
+ # Multiple regimen drugs but no focus: caller should send query_drug (frontend bug otherwise).
160
+ return ""
161
+
162
+
163
+ def _escape_fda_term(term: str) -> str:
164
+ """Remove characters that break openFDA quoted search."""
165
+ return re.sub(r'["\\]', " ", term).strip()[:100]
166
+
167
+
168
+ def _search_label_for_name(name: str) -> dict[str, Any]:
169
+ """Search brand, generic, or active substance on SPL labels."""
170
+ n = _escape_fda_term(name.strip()[:80])
171
+ if not n:
172
+ return {}
173
+ data_g = _fda_get(f'openfda.generic_name:"{n}"', limit=3)
174
+ if data_g and data_g.get("results"):
175
+ return _first_openfda(data_g)
176
+ data_b = _fda_get(f'openfda.brand_name:"{n}"', limit=3)
177
+ if data_b and data_b.get("results"):
178
+ return _first_openfda(data_b)
179
+ # Active ingredient / substance (helps real drug stems)
180
+ data_s = _fda_get(f'openfda.substance_name:"{n}"', limit=3)
181
+ if data_s and data_s.get("results"):
182
+ return _first_openfda(data_s)
183
+ data_a = _fda_get(f'openfda.active_ingredient:"{n}"', limit=3)
184
+ return _first_openfda(data_a) if data_a else {}
185
+
186
+
187
+ def _suggestions_by_class_probe(
188
+ field: str,
189
+ class_value: str,
190
+ exclude: set[str],
191
+ max_suggestions: int,
192
+ ) -> list[dict[str, Any]]:
193
+ rows = _suggestions_for_class(field, class_value, exclude, max_suggestions)
194
+ if rows:
195
+ return rows
196
+ # Wildcard: openFDA supports *suffix / prefix* on some fields
197
+ body = _escape_fda_term(class_value).lower()
198
+ if len(body) >= 4:
199
+ wild = _fda_get(f"openfda.{field}:*{body}*", limit=min(40, max(10, max_suggestions * 4)))
200
+ if wild and wild.get("results"):
201
+ # Reusing _suggestions_for_class here would require synthesizing a narrowed class string, which is awkward; parse the rows manually.
202
+ out: list[dict[str, Any]] = []
203
+ seen: set[str] = set()
204
+ for row in wild.get("results", []):
205
+ if not isinstance(row, dict):
206
+ continue
207
+ of = _openfda_block(row)
208
+ brands = _listify(of.get("brand_name"))
209
+ generics = _listify(of.get("generic_name"))
210
+ display = (brands[0] if brands else None) or (generics[0] if generics else None)
211
+ if not display:
212
+ continue
213
+ key = display.lower()
214
+ if key in seen or key in exclude:
215
+ continue
216
+ seen.add(key)
217
+ ar = row.get("adverse_reactions")
218
+ ar_text = ar[0] if isinstance(ar, list) and ar else ar
219
+ set_id = None
220
+ if isinstance(of.get("spl_set_id"), list) and of["spl_set_id"]:
221
+ set_id = str(of["spl_set_id"][0])
222
+ elif of.get("spl_set_id"):
223
+ set_id = str(of["spl_set_id"])
224
+ out.append(
225
+ {
226
+ "display_name": display,
227
+ "generic_names": generics[:4],
228
+ "brand_names": brands[:4],
229
+ "routes": _listify(of.get("route"))[:4],
230
+ "adverse_reactions_snippet": _snippet(ar_text),
231
+ "label_link": _label_link(set_id),
232
+ "source_detail": f"openfda.{field}.wildcard",
233
+ },
234
+ )
235
+ if len(out) >= max_suggestions:
236
+ break
237
+ return out
238
+ return []
239
+
240
+
241
+ def _pick_pharm_class(openfda_block: dict[str, Any]) -> tuple[str | None, str | None]:
242
+ for key in ("pharm_class_epc", "pharm_class_cs", "pharm_class_moa"):
243
+ for item in _listify(openfda_block.get(key)):
244
+ if len(item) > 3:
245
+ return key, item
246
+ return None, None
247
+
248
+
249
+ def _suggestions_for_class(
250
+ field: str,
251
+ pharm_class: str,
252
+ exclude: set[str],
253
+ max_suggestions: int,
254
+ ) -> list[dict[str, Any]]:
255
+ """List other drugs sharing FDA pharmacologic class on label."""
256
+ pc = pharm_class.strip()[:120]
257
+ if not pc or not field:
258
+ return []
259
+ search = f'openfda.{field}:"{pc}"'
260
+ data = _fda_get(search, limit=min(50, max(10, max_suggestions * 4)))
261
+ if not data or not data.get("results"):
262
+ return []
263
+
264
+ out: list[dict[str, Any]] = []
265
+ seen: set[str] = set()
266
+ for row in data.get("results", []):
267
+ if not isinstance(row, dict):
268
+ continue
269
+ of = _openfda_block(row)
270
+ brands = _listify(of.get("brand_name"))
271
+ generics = _listify(of.get("generic_name"))
272
+ display = (brands[0] if brands else None) or (generics[0] if generics else None)
273
+ if not display:
274
+ continue
275
+ key = display.lower()
276
+ if key in seen:
277
+ continue
278
+ if key in exclude:
279
+ continue
280
+ seen.add(key)
281
+ ar = row.get("adverse_reactions")
282
+ if isinstance(ar, list) and ar:
283
+ ar_text = ar[0]
284
+ else:
285
+ ar_text = ar
286
+ set_id = None
287
+ if isinstance(of.get("spl_set_id"), list) and of["spl_set_id"]:
288
+ set_id = str(of["spl_set_id"][0])
289
+ elif of.get("spl_set_id"):
290
+ set_id = str(of["spl_set_id"])
291
+ out.append(
292
+ {
293
+ "display_name": display,
294
+ "generic_names": generics[:4],
295
+ "brand_names": brands[:4],
296
+ "routes": _listify(of.get("route"))[:4],
297
+ "adverse_reactions_snippet": _snippet(ar_text),
298
+ "label_link": _label_link(set_id),
299
+ "source_detail": f"openfda.{field}",
300
+ }
301
+ )
302
+ if len(out) >= max_suggestions:
303
+ break
304
+ return out
305
+
306
+
307
+ def _external_suggestions(drug_names: list[str]) -> list[dict[str, Any]] | None:
308
+ url = os.getenv("POLYGUARD_MED_TOOL_URL", "").strip()
309
+ if not url:
310
+ return None
311
+ headers: dict[str, str] = {"Content-Type": "application/json"}
312
+ token = os.getenv("POLYGUARD_MED_TOOL_TOKEN", "").strip()
313
+ if token:
314
+ headers["Authorization"] = f"Bearer {token}"
315
+ try:
316
+ resp = requests.post(
317
+ url,
318
+ json={"drug_names": drug_names},
319
+ headers=headers,
320
+ timeout=18,
321
+ )
322
+ if resp.status_code >= 400:
323
+ logger.warning("med_tool_http_%s", resp.status_code)
324
+ return []
325
+ payload = resp.json()
326
+ except Exception as exc: # noqa: BLE001
327
+ logger.warning("med_tool_request_failed: %s", exc)
328
+ return []
329
+ if not isinstance(payload, dict):
330
+ return []
331
+ raw = payload.get("suggestions")
332
+ if not isinstance(raw, list):
333
+ return []
334
+ cleaned: list[dict[str, Any]] = []
335
+ for item in raw:
336
+ if isinstance(item, dict) and item.get("display_name"):
337
+ row = dict(item)
338
+ row["source_detail"] = str(row.get("source_detail") or "external_tool")
339
+ cleaned.append(row)
340
+ elif isinstance(item, str) and item.strip():
341
+ cleaned.append(
342
+ {
343
+ "display_name": item.strip(),
344
+ "generic_names": [],
345
+ "brand_names": [],
346
+ "routes": [],
347
+ "adverse_reactions_snippet": None,
348
+ "label_link": None,
349
+ "source_detail": "external_tool",
350
+ }
351
+ )
352
+ return cleaned
353
+
354
+
355
+ def build_alternatives_response(
356
+ query_drug: str | None,
357
+ regimen_drugs: list[str],
358
+ max_suggestions: int,
359
+ ) -> dict[str, Any]:
360
+ errors: list[str] = []
361
+ regimen_clean = [str(x).strip() for x in regimen_drugs if str(x).strip()][:40]
362
+ focus = _resolve_focus_drug(query_drug, regimen_clean)
363
+ exclude = {x.lower() for x in regimen_clean}
364
+ if focus:
365
+ exclude.add(focus.lower())
366
+
367
+ external_rows: list[dict[str, Any]] = []
368
+ ext = _external_suggestions([focus] if focus else regimen_clean[:5])
369
+ if ext is not None:
370
+ external_rows = ext
371
+
372
+ if not focus and not regimen_clean:
373
+ return {
374
+ "focus_drug": "",
375
+ "therapeutic_class": None,
376
+ "suggestions": external_rows,
377
+ "source": "external" if external_rows else "none",
378
+ "disclaimer": _DEFAULT_DISCLAIMER,
379
+ "errors": ["Enter a drug name or load drugs from the current episode."],
380
+ }
381
+
382
+ if not focus and regimen_clean:
383
+ return {
384
+ "focus_drug": "",
385
+ "therapeutic_class": None,
386
+ "therapeutic_class_field": None,
387
+ "suggestions": external_rows,
388
+ "source": "external" if external_rows else "none",
389
+ "disclaimer": _DEFAULT_DISCLAIMER,
390
+ "errors": [
391
+ "Several medications are on this regimen; pick a focus row in the UI (or pass query_drug). "
392
+ "The server does not guess the first medication anymore.",
393
+ ],
394
+ }
395
+
396
+ # SPL name/substance search: normalize simulator tokens first (benzodiazepine_like → benzodiazepine).
397
+ lookup = _normalize_simulator_query(focus) or focus.strip()
398
+ label = _search_label_for_name(lookup)
399
+ ofb = _openfda_block(label)
400
+ pharm_field, pharm = _pick_pharm_class(ofb)
401
+
402
+ openfda_rows: list[dict[str, Any]] = []
403
+ if pharm and pharm_field:
404
+ openfda_rows = _suggestions_for_class(pharm_field, pharm, exclude, max_suggestions)
405
+ if not openfda_rows:
406
+ # Simulator tokens (e.g. benzodiazepine_like) or class keywords: try FDA class directly.
407
+ for cand in _class_search_variants(focus):
408
+ rows = _suggestions_by_class_probe("pharm_class_epc", cand, exclude, max_suggestions)
409
+ if rows:
410
+ pharm_field, pharm = "pharm_class_epc", cand
411
+ openfda_rows = rows
412
+ break
413
+ if not openfda_rows:
414
+ for cand in _class_search_variants(focus):
415
+ rows = _suggestions_by_class_probe("pharm_class_cs", cand, exclude, max_suggestions)
416
+ if rows:
417
+ pharm_field, pharm = "pharm_class_cs", cand
418
+ openfda_rows = rows
419
+ break
420
+
421
+ if not openfda_rows:
422
+ if not (pharm and pharm_field):
423
+ errors.append(
424
+ "Could not match this text to an FDA SPL (generic/brand/substance) or pharmacologic class. "
425
+ "Try a generic name (e.g. diazepam), a class keyword (e.g. benzodiazepine), or load from episode.",
426
+ )
427
+ elif not external_rows:
428
+ errors.append(
429
+ "No labeled products returned for this query (try another spelling or a broader class keyword).",
430
+ )
431
+
432
+ merged: list[dict[str, Any]] = []
433
+ seen_keys: set[str] = set()
434
+ for row in external_rows + openfda_rows:
435
+ display = str(row.get("display_name", "")).strip()
436
+ if not display:
437
+ continue
438
+ generics = [str(g).lower() for g in (row.get("generic_names") or []) if g]
439
+ dedupe_key = generics[0] if generics else display.lower()
440
+ if dedupe_key in seen_keys:
441
+ continue
442
+ seen_keys.add(dedupe_key)
443
+ merged.append(row)
444
+ if len(merged) >= max_suggestions:
445
+ break
446
+
447
+ source = "openfda"
448
+ if external_rows and openfda_rows:
449
+ source = "mixed"
450
+ elif external_rows and not openfda_rows:
451
+ source = "external"
452
+ elif not external_rows and not openfda_rows:
453
+ source = "none"
454
+
455
+ return {
456
+ "focus_drug": focus,
457
+ "therapeutic_class": pharm,
458
+ "therapeutic_class_field": pharm_field,
459
+ "suggestions": merged,
460
+ "source": source,
461
+ "disclaimer": _DEFAULT_DISCLAIMER,
462
+ "errors": errors,
463
+ }
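Because `build_alternatives_response` does live openFDA lookups (plus the optional external POST), it can be exercised without the API layer at all. A minimal local check, assuming network access to `api.fda.gov` and, optionally, `POLYGUARD_OPENFDA_API_KEY` exported:

```python
# Direct module smoke check — no FastAPI server needed; makes real HTTP calls to openFDA.
from app.tools.medication_alternatives import build_alternatives_response

out = build_alternatives_response(
    query_drug="benzodiazepine_like",  # simulator token; normalized to "benzodiazepine"
    regimen_drugs=["benzodiazepine_like", "ibuprofen"],
    max_suggestions=5,
)
print(out["source"], out["therapeutic_class"])
for row in out["suggestions"]:
    print("-", row["display_name"], row.get("label_link"))
```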
app/ui/frontend/src/App.tsx CHANGED
@@ -20,6 +20,7 @@ import type {
    StepResponse,
    TaskPreset,
  } from "./lib/types";
+ import AlternativeMedicineSearch from "./components/AlternativeMedicineSearch";
  import MetaverseBackdrop from "./components/MetaverseBackdrop";

  type WorkbenchMode = "agent" | "env";
@@ -887,6 +888,18 @@ export default function App() {
    const activeInfo = mode === "agent" ? agentInfo : envInfo;
    const activeTerminationReason = shortValue(activeInfo?.termination_reason);
    const terminationReason = activeTerminationReason !== "-" ? activeTerminationReason : null;
+   const regimenForAltTool = useMemo(() => {
+     const meds = activeObservation?.medication_table ?? [];
+     const names: string[] = [];
+     for (const row of meds) {
+       const v = row.drug ?? row.drug_id ?? row.name;
+       if (typeof v === "string" && v.trim()) {
+         names.push(v.trim());
+       }
+     }
+     return names;
+   }, [activeObservation]);
+
    const heroStats: Array<[string, string]> = [
      ["Runtime", mode === "agent" ? "Agent Workbench" : "Env Explorer"],
      ["Scenario", taskLabel(taskId, catalog.task_presets)],
@@ -1164,6 +1177,7 @@
        <DetailPanel title="Explanation" data={mode === "agent" ? explanation : null} />
        <DetailPanel title="Evidence" data={mode === "agent" ? (isRecord(evidence) || Array.isArray(evidence) ? evidence : null) : null} />
        <EventLog events={events} error={error} />
+       <AlternativeMedicineSearch regimenDrugNames={regimenForAltTool} />
      </main>
      <QTips
        open={tipsOpen}
app/ui/frontend/src/components/AlternativeMedicineSearch.tsx ADDED
@@ -0,0 +1,215 @@
1
+ import { useCallback, useEffect, useState } from "react";
2
+ import { API_BASE } from "../lib/constants";
3
+
4
+ type Suggestion = {
5
+ display_name: string;
6
+ generic_names?: string[];
7
+ brand_names?: string[];
8
+ routes?: string[];
9
+ adverse_reactions_snippet?: string | null;
10
+ label_link?: string | null;
11
+ source_detail?: string;
12
+ };
13
+
14
+ type AlternativesResponse = {
15
+ focus_drug: string;
16
+ therapeutic_class: string | null;
17
+ therapeutic_class_field?: string | null;
18
+ suggestions: Suggestion[];
19
+ source: string;
20
+ disclaimer: string;
21
+ errors: string[];
22
+ };
23
+
24
+ type Props = {
25
+ regimenDrugNames: string[];
26
+ };
27
+
28
+ export default function AlternativeMedicineSearch({ regimenDrugNames }: Props) {
29
+ const [query, setQuery] = useState("");
30
+ /** Which row in the current episode regimen drives the FDA "focus" drug (not always the first). */
31
+ const [regimenFocusIndex, setRegimenFocusIndex] = useState(0);
32
+ /** False after user types in the box; true when select/load sets query so regimen row wins on submit. */
33
+ const [focusFromRegimenSelect, setFocusFromRegimenSelect] = useState(true);
34
+ const [loading, setLoading] = useState(false);
35
+ const [error, setError] = useState<string | null>(null);
36
+ const [result, setResult] = useState<AlternativesResponse | null>(null);
37
+
38
+ useEffect(() => {
39
+ if (regimenDrugNames.length === 0) {
40
+ setRegimenFocusIndex(0);
41
+ return;
42
+ }
43
+ setRegimenFocusIndex((prev) => (prev >= regimenDrugNames.length ? 0 : prev));
44
+ }, [regimenDrugNames]);
45
+
46
+ const runSearch = useCallback(
47
+ async (queryDrug: string | undefined, regimen: string[]) => {
48
+ setLoading(true);
49
+ setError(null);
50
+ try {
51
+ const res = await fetch(`${API_BASE}/tools/medication_alternatives`, {
52
+ method: "POST",
53
+ headers: { "Content-Type": "application/json" },
54
+ body: JSON.stringify({
55
+ query_drug: queryDrug?.trim() || null,
56
+ regimen_drugs: regimen,
57
+ max_suggestions: 7,
58
+ }),
59
+ });
60
+ if (!res.ok) {
61
+ const t = await res.text();
62
+ throw new Error(t.slice(0, 200) || `HTTP ${res.status}`);
63
+ }
64
+ setResult((await res.json()) as AlternativesResponse);
65
+ } catch (e) {
66
+ setResult(null);
67
+ setError(e instanceof Error ? e.message : "Request failed");
68
+ } finally {
69
+ setLoading(false);
70
+ }
71
+ },
72
+ [],
73
+ );
74
+
75
+ const safeRegimenIndex =
76
+ regimenDrugNames.length > 0
77
+ ? Math.min(Math.max(regimenFocusIndex, 0), regimenDrugNames.length - 1)
78
+ : 0;
79
+
80
+ /** Never send null focus when a regimen exists — avoids API defaulting to regimen[0] (always benzo if first). */
81
+ const resolvedFocusDrug = (): string | undefined => {
82
+ const typed = query.trim();
83
+ const fromList = regimenDrugNames[safeRegimenIndex]?.trim() ?? "";
84
+ if (focusFromRegimenSelect && regimenDrugNames.length > 0) {
85
+ return fromList || typed || undefined;
86
+ }
87
+ return typed || fromList || undefined;
88
+ };
89
+
90
+ const onSubmit = () => {
91
+ void runSearch(resolvedFocusDrug(), regimenDrugNames);
92
+ };
93
+
94
+ const onLoadRegimen = () => {
95
+ const names = regimenDrugNames.length ? regimenDrugNames : [];
96
+ if (!names.length) {
97
+ setError("Reset an episode first so the regimen list is available.");
98
+ return;
99
+ }
100
+ const idx = Math.min(Math.max(regimenFocusIndex, 0), names.length - 1);
101
+ const focus = names[idx] ?? "";
102
+ setRegimenFocusIndex(idx);
103
+ setQuery(focus);
104
+ setFocusFromRegimenSelect(true);
105
+ void runSearch(focus, names);
106
+ };
107
+
108
+ const onRegimenSelectChange = (index: number) => {
109
+ setRegimenFocusIndex(index);
110
+ const name = regimenDrugNames[index]?.trim() ?? "";
111
+ setQuery(name);
112
+ setFocusFromRegimenSelect(true);
113
+ };
114
+
115
+ return (
116
+ <section
117
+ className="panel-surface alt-med-tool panel-wide"
118
+ aria-label="FDA alternatives: openFDA class neighbors"
119
+ >
120
+ <div className="panel-heading">
121
+ <h2>FDA alternatives</h2>
122
+ <span>Tool</span>
123
+ </div>
124
+ {regimenDrugNames.length > 0 ? (
125
+ <label className="field alt-med-tool-regimen-select">
126
+ <span>Focus drug from current regimen</span>
127
+ <select
128
+ aria-label="Select regimen drug for alternatives search"
129
+ value={safeRegimenIndex}
130
+ onChange={(e) => onRegimenSelectChange(Number(e.target.value))}
131
+ >
132
+ {regimenDrugNames.map((name, i) => (
133
+ <option key={`${name}-${i}`} value={i}>
134
+ {name}
135
+ </option>
136
+ ))}
137
+ </select>
138
+ </label>
139
+ ) : null}
140
+ <div className="alt-med-tool-row">
141
+ <label className="field alt-med-tool-field">
142
+ <span>Drug name (focus)</span>
143
+ <input
144
+ value={query}
145
+ onChange={(e) => {
146
+ setQuery(e.target.value);
147
+ setFocusFromRegimenSelect(false);
148
+ }}
149
+ placeholder="e.g. diazepam, ibuprofen, benzodiazepine (or simulator token like benzodiazepine_like)"
150
+ autoComplete="off"
151
+ />
152
+ </label>
153
+ <div className="alt-med-tool-actions">
154
+ <button type="button" className="secondary" onClick={onLoadRegimen} disabled={loading}>
155
+ Load from episode
156
+ </button>
157
+ <button type="button" onClick={onSubmit} disabled={loading}>
158
+ {loading ? "Searching…" : "Suggest alternatives"}
159
+ </button>
160
+ </div>
161
+ </div>
162
+ <p className="muted small-print alt-med-tool-hint">
163
+ Pick a regimen row, then search. Up to 7 results — scroll the list below.
164
+ </p>
165
+ {error && <div className="error-banner">{error}</div>}
166
+ {result && (
167
+ <div className="alt-med-tool-results">
168
+ {result.errors?.length ? (
169
+ <ul className="alt-med-tool-errors">
170
+ {result.errors.map((msg) => (
171
+ <li key={msg}>{msg}</li>
172
+ ))}
173
+ </ul>
174
+ ) : null}
175
+ <p className="muted">
176
+ <strong>Focus:</strong> {result.focus_drug || "—"} · <strong>Class:</strong>{" "}
177
+ {result.therapeutic_class ?? "—"}{" "}
178
+ {result.therapeutic_class_field ? <span>({result.therapeutic_class_field})</span> : null} ·{" "}
179
+ <strong>Source:</strong> {result.source}
180
+ </p>
181
+ <div className="alt-med-suggestions-scroll" role="region" aria-label="FDA alternative suggestions, scrollable">
182
+ <ul className="alt-med-suggestion-list">
183
+ {result.suggestions?.length ? (
184
+ result.suggestions.map((s, idx) => (
185
+ <li key={`${s.display_name}-${idx}`} className="alt-med-suggestion">
186
+ <div>
187
+ <strong>{s.display_name}</strong>
188
+ <span className="muted"> · {s.source_detail ?? "openfda"}</span>
189
+ </div>
190
+ {s.routes?.length ? (
191
+ <div className="muted small-print">Route: {s.routes.join(", ")}</div>
192
+ ) : null}
193
+ {s.generic_names?.length ? (
194
+ <div className="muted small-print">Generic: {s.generic_names.join(", ")}</div>
195
+ ) : null}
196
+ {s.adverse_reactions_snippet ? (
197
+ <div className="alt-med-ar">ADR label excerpt: {s.adverse_reactions_snippet}</div>
198
+ ) : null}
199
+ {s.label_link ? (
200
+ <a className="alt-med-link" href={s.label_link} target="_blank" rel="noreferrer">
201
+ DailyMed / label
202
+ </a>
203
+ ) : null}
204
+ </li>
205
+ ))
206
+ ) : (
207
+ <li className="muted">No suggestions yet — try another spelling or load from episode.</li>
208
+ )}
209
+ </ul>
210
+ </div>
211
+ </div>
212
+ )}
213
+ </section>
214
+ );
215
+ }
app/ui/frontend/src/styles/theme.css CHANGED
@@ -1138,6 +1138,108 @@ td {
1138
  }
1139
  }
1140
 
1141
+ .small-print {
1142
+ font-size: 0.78rem;
1143
+ line-height: 1.35;
1144
+ }
1145
+
1146
+ .alt-med-tool {
1147
+ margin-top: 10px;
1148
+ border: 1px dashed rgba(155, 124, 255, 0.35);
1149
+ background: rgba(8, 11, 27, 0.55);
1150
+ }
1151
+
1152
+ .alt-med-tool .panel-heading h2 {
1153
+ font-size: 1.05rem;
1154
+ }
1155
+
1156
+ .alt-med-tool-regimen-select {
1157
+ margin: 0 0 10px;
1158
+ max-width: min(520px, 100%);
1159
+ }
1160
+
1161
+ .alt-med-tool-regimen-select select {
1162
+ width: 100%;
1163
+ }
1164
+
1165
+ .alt-med-tool-hint {
1166
+ margin: 8px 0 0;
1167
+ max-width: 960px;
1168
+ }
1169
+
1170
+ .alt-med-tool-row {
1171
+ display: flex;
1172
+ flex-wrap: wrap;
1173
+ gap: 12px;
1174
+ align-items: flex-end;
1175
+ }
1176
+
1177
+ .alt-med-tool-field {
1178
+ flex: 1 1 220px;
1179
+ margin: 0;
1180
+ }
1181
+
1182
+ .alt-med-tool-actions {
1183
+ display: flex;
1184
+ flex-wrap: wrap;
1185
+ gap: 8px;
1186
+ }
1187
+
1188
+ .alt-med-tool-results {
1189
+ margin-top: 12px;
1190
+ }
1191
+
1192
+ .alt-med-tool-errors {
1193
+ color: var(--warning);
1194
+ font-size: 0.85rem;
1195
+ }
1196
+
1197
+ .alt-med-suggestions-scroll {
1198
+ margin-top: 8px;
1199
+ max-height: 17.5rem;
1200
+ overflow-y: auto;
1201
+ overflow-x: hidden;
1202
+ padding-right: 4px;
1203
+ border-radius: 12px;
1204
+ border: 1px solid var(--line-soft);
1205
+ background: rgba(5, 8, 20, 0.35);
1206
+ }
1207
+
1208
+ .alt-med-suggestion-list {
1209
+ list-style: none;
1210
+ margin: 0;
1211
+ padding: 8px;
1212
+ display: flex;
1213
+ flex-direction: column;
1214
+ gap: 6px;
1215
+ }
1216
+
1217
+ .alt-med-suggestion {
1218
+ padding: 8px 10px;
1219
+ border-radius: 10px;
1220
+ border: 1px solid var(--line-soft);
1221
+ background: rgba(13, 16, 35, 0.45);
1222
+ flex-shrink: 0;
1223
+ }
1224
+
1225
+ .alt-med-ar {
1226
+ margin-top: 4px;
1227
+ font-size: 0.76rem;
1228
+ color: var(--muted);
1229
+ line-height: 1.35;
1230
+ display: -webkit-box;
1231
+ -webkit-box-orient: vertical;
1232
+ -webkit-line-clamp: 2;
1233
+ overflow: hidden;
1234
+ }
1235
+
1236
+ .alt-med-link {
1237
+ display: inline-block;
1238
+ margin-top: 6px;
1239
+ font-size: 0.82rem;
1240
+ color: var(--accent-2);
1241
+ }
1242
+
1243
  ::-webkit-scrollbar {
1244
  width: 7px;
1245
  height: 7px;
docker/space/README.md CHANGED
@@ -12,28 +12,46 @@ Never commit or paste Hugging Face tokens into chat or the repo. If a token was

  ```bash
  cd polyguard-rl
- docker build -f Dockerfile.space -t polyguard-space .
+ docker build -t polyguard-space .
  ```

- 3. Push the Space repo (HF expects `Dockerfile` at root). Either:
-
- - **Option A:** In the Space repo on Hub, set **Build Dockerfile path** to `Dockerfile.space` if the UI allows, **or** copy/rename: `cp Dockerfile.space Dockerfile` in the branch you push.
-
- - **Option B:** Make this `polyguard-rl` folder the Space git root and add a symlink or duplicate `Dockerfile` pointing to the same content as `Dockerfile.space`.
+ 3. Push the Space repo. The root **`Dockerfile`** is the full demo (Vite UI + nginx + API + OpenEnv). Hugging Face uses it automatically when **Dockerfile path** is empty. If your Space was created earlier with a different Dockerfile, trigger **Factory reboot** after pushing so the new image builds.

  4. Commit and push to the Space repository. HF builds the image on their builders (you do not need to `docker push` to Docker Hub for standard Spaces).

+ ## FDA panel / latest UI missing on the live Space
+
+ Pushing code to GitHub alone does **not** refresh `huggingface.co/spaces/...` unless that Space is connected to the same repo **and** rebuilds from the branch that has your UI (for example `fda` vs `main`). This repo’s usual demo path is **upload via Hub API**:
+
+ ```bash
+ cd polyguard-rl
+ export HF_TOKEN="hf_..." # write token; never commit it
+ uv run python scripts/deploy_space_api.py --repo-id TheJackBright/polyguard-openenv
+ ```
+
+ Wait for **Build** in the Space logs to finish, then use **Factory reboot** or a hard browser refresh if the page still looks old. **Dockerfile path** should be empty (default `Dockerfile`) or `Dockerfile` / `Dockerfile.space`. If the Space uses the **full monorepo** as its Git root, set Dockerfile path to the repo-root `Dockerfile` or to `polyguard-rl/Dockerfile`.
+
  ## Runtime

  - **Port:** Space sets `PORT` (default `7860`). Nginx listens on `PORT` and routes `/api/*` → API, `/ws` → OpenEnv WebSocket, `/` → built React app.
- - **First boot:** If `checkpoints/active/grpo_adapter` is missing, `entrypoint.sh` runs `scripts/install_hf_active_bundle.py` (downloads the public bundle; slow on first start).
+ - **First boot:** If `checkpoints/active/grpo_adapter` is missing, `entrypoint.sh` runs `scripts/install_hf_active_bundle.py`. That pulls `TheJackBright/polyguard-openenv-training-full-artifacts` (slow, ~1.1 GB).
  - **CORS:** Set via `POLYGUARD_ALLOW_HF_SPACE_CORS=true` (default in the Space Dockerfile).

- ## Optional secrets
+ ## If logs show `401` / `RepositoryNotFoundError` on startup
+
+ The artifact **model repo is private, gated, or requires a license click**, so anonymous downloads are blocked. The UI can still “work” using the **heuristic ranker** and public base models, but **your trained bundle is not installed**.
+
+ **Fix (pick one):**
+
+ 1. **Space secret (recommended):** Space → **Settings** → **Secrets** → add **`HF_TOKEN`** = a [read token](https://huggingface.co/settings/tokens) that can access `polyguard-openenv-training-full-artifacts`. Restart the Space.
+ 2. **Hub settings:** Make that model repo **public**, or ensure **gated** access allows the token you use in (1).
+ 3. **Ignore:** Leave as-is if ranker-only behavior is enough for the demo.
+
+ ## Secrets

- | Name | Use |
- |-----------|-----|
- | `HF_TOKEN` | Private model or artifact repo; `huggingface_hub` picks it up automatically when set in the Space environment. |
+ | Name | Use |
+ |------------|-----|
+ | `HF_TOKEN` | **Required** if the artifact repo is not anonymously readable; `huggingface_hub` reads it automatically. |

  ## Local smoke (same as Space)
docs/DEMO_RECORDING_SCRIPT.md ADDED
@@ -0,0 +1,493 @@
1
+ # PolyGuard Space UI — demo recording script (shot-by-shot)
2
+
3
+ Use this document while screen-recording the Hugging Face Space (or local Docker). Target length: **8–14 minutes** for a full pass, or **3–5 minutes** for a highlights reel.
4
+
5
+ ---
6
+
7
+ ## Before you hit record
8
+
9
+ 1. **Open the Space** in a clean browser profile or incognito (fewer extensions → fewer glitches).
10
+ 2. **Set resolution**: 1920×1080 or 1440×900; browser zoom **100%**.
11
+ 3. **Fullscreen** the Space iframe or use HF “Open in new tab” so the URL bar shows the Space domain.
12
+ 4. **Wait for cold start**: first load may download the model bundle (several minutes). The **Event Log** and **Model Truth** panel will tell you if the policy failed to load (heuristic fallback is still usable for env steps).
13
+ 5. **Optional**: hide mouse cursor in OBS if you prefer; otherwise move slowly and pause **2 seconds** on each panel after major clicks.
14
+
15
+ **Primary Space (product):** `https://huggingface.co/spaces/TheJackBright/polyguard-openenv`
16
+ Runtime: nginx fronts the **product API** (default `8200`) and **OpenEnv service** (`8100`); see `docker/space/entrypoint.sh`.
17
+
18
+ ---
19
+
20
+ ## Where the model lives (Qwen and artifacts)
21
+
22
+ This matters for what you say on camera.
23
+
24
+ | Location | What it is |
25
+ | --- | --- |
26
+ | **On the Space container** | Working directory `/app` (see `entrypoint.sh`: `cd /app`). |
27
+ | **Downloaded bundle** | If `checkpoints/active/grpo_adapter/adapter_config.json` is missing at boot, `scripts/install_hf_active_bundle.py` pulls the **HF usable model bundle** into `checkpoints/active/`. |
28
+ | **Typical layout after install** | `checkpoints/active/active_model_manifest.json` — which artifact is active (often **GRPO adapter** on top of base). |
29
+ | **Weights** | `checkpoints/active/grpo_adapter/` (LoRA/PEFT), optionally `checkpoints/active/merged/` (full merged weights), `checkpoints/active/sft_adapter/`. |
30
+ | **Base model name** | Usually **`Qwen/Qwen2.5-0.5B-Instruct`** as the Transformers base for adapters (set via env e.g. `POLYGUARD_HF_MODEL`). |
31
+
32
+ **What the UI proves:** the **Model Truth** panel calls **`GET /policy/model_status`** (product API). It shows `model_id` / `base_model`, `run_id`, `preferred_artifact` / `loaded_source`, and availability flags. Say on camera: *“This is live from the API, not hard-coded in the frontend.”*
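To prove the same point from a terminal while recording, you can hit that endpoint directly. A sketch only: the `/api` prefix and the field names below assume the nginx routing and the Model Truth table later in this script; verify against `app/api/routes.py` before quoting values on camera.

```python
# Quick sanity check of the endpoint the Model Truth panel reads.
import requests

# Use the local Docker run here; substitute the Space's direct URL when recording against the Hub.
status = requests.get("http://localhost:7860/api/policy/model_status", timeout=60).json()
print(status.get("model_id") or status.get("base_model"), status.get("run_id"), status.get("loaded_source"))
```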
33
+
34
+ ---
35
+
36
+ ## UI map (what appears on screen)
37
+
38
+ | Region | Purpose |
39
+ | --- | --- |
40
+ | **Hero** (“PolyGuard neural safety cockpit”) | Marketing copy + quick stats. |
41
+ | **Top bar** | **Agent Workbench** vs **Env Explorer**, **Task** dropdown, **Reset Episode**, **Q Tips**. |
42
+ | **Status chips** | “Live” / model line; in Env mode one chip reads **ws env** (WebSocket to OpenEnv). |
43
+ | **Model Truth** | Qwen / artifact / run / availability. |
44
+ | **Advanced strip** | Only if Task = **Advanced** — pick raw `difficulty` + `sub_environment`. |
45
+ | **Episode Overview** | Mode, task, difficulty, environment, step budget, last reward, patient id, **Patient Summary**, **Risk Delta**. |
46
+ | **Candidate Actions** | Legal moves: `candidate_id`, action type, target/replacement, estimated safety delta (or **Blocked**). |
47
+ | **Action Console** | Confidence, rationale, **Submit** vs **Run Agent** (Agent mode only for Run Agent). |
48
+ | **Reward Channels** | Bars for total + primary + component scores (see below). |
49
+ | **Current Medications** | Cards from observation. |
50
+ | **Action History / Warnings** | Step trace and env warnings. |
51
+ | **Decision / Explanation / Evidence** | **Agent mode only** (filled after API steps that return those fields). |
52
+ | **Event Log** | Human-readable trace of resets, steps, rewards, errors. |
53
+
54
+ ---
55
+
56
+ ## Feature encyclopedia — every panel, branch, and agent
57
+
58
+ Use this section as a **script appendix** or **judge handout**. It mirrors the React workbench in `app/ui/frontend/src/App.tsx`, the API in `app/api/`, and the orchestrator in `app/agents/orchestrator.py`.
59
+
60
+ ### A. How the Space is wired (under the hood)
61
+
62
+ | Piece | Role |
63
+ | --- | --- |
64
+ | **Browser → nginx** | HF Space exposes one origin; nginx routes paths. |
65
+ | **Product API** | Vite uses `API_BASE` (default **`/api`**). FastAPI serves catalog, reset, step_candidate, orchestrate, model_status, reward_breakdown, etc. |
66
+ | **OpenEnv HTTP/WS** | `ENV_BASE` defaults to **same origin** on Spaces (not localhost). Web UI opens **`ws(s)://<origin>/ws`** for Env Explorer. |
67
+ | **Two Python processes** | `entrypoint.sh` starts **uvicorn** for `app.env.fastapi_app` (env, port **8100**) and **uvicorn** for `app.api` (product API, port **8200**). Agent mode reset/step still use the **API’s** in-process `PolyGuardEnv`; Env mode uses the **separate** env service over WebSocket. |
68
+ | **Important** | Agent and Env UIs maintain **separate React state** (`agentObservation` vs `envObservation`). Toggling mode **clears the Event Log** and clears the inactive branch’s episode state so you always know which backend path you are exercising. |
69
+
70
+ ### B. Hero (“PolyGuard neural safety cockpit”)
71
+
72
+ | Stat | Source | What to say on camera |
73
+ | --- | --- | --- |
74
+ | **Runtime** | `mode === "agent"` → “Agent Workbench”; else “Env Explorer”. | “This is which transport I am using right now.” |
75
+ | **Scenario** | Human label for current `taskId` from catalog presets or Advanced. | “Which curriculum preset is bound to difficulty + sub-environment.” |
76
+ | **Candidates** | `candidate_action_set.length` from the **active** observation. | “How many legal moves the env is offering after the last reset/step.” |
77
+ | **Reward** | Last scalar reward for the active branch (`null` → shown as `-`). | “Verifier scalar after the last step in this mode only.” |
78
+
79
+ ### C. Top bar — every control
80
+
81
+ | Control | Behavior |
82
+ | --- | --- |
83
+ | **Agent Workbench** | Sets `mode` to `agent`. Clears env state, event log, error; clears agent panels if switching from env (see `handleModeChange`). |
84
+ | **Env Explorer** | Sets `mode` to `env`. Clears agent-specific observation/reward/decision/evidence. |
85
+ | **Task** `<select>` | Options: each **task preset** from `GET /env/catalog` (`task_presets`), plus **Advanced**. Changing a preset updates internal `difficulty` + `sub_environment` to match the preset. |
86
+ | **Reset Episode** | **Agent:** `POST /env/reset` with body from preset (`{ task_id }`) or `{ difficulty, sub_environment }`. Refreshes **Model Truth** first. Clears reward breakdown, decision, explanation, evidence, sets default candidate. **Env:** WebSocket `reset` with `{ difficulty, sub_environment }` only (no `task_id` in WS path—preset is flattened to those two fields). **Always** clears `events` at start of reset handler, then appends one “Reset … in agent/env” line. |
87
+ | **Q Tips** | Opens modal walkthrough; highlights DOM nodes with `[data-guide="…"]`. **Skip** stores `polyguard.qtips.v2.seen` in localStorage so first visit auto-opens tips. |
88
+ | **Status chips** | First chip: **Live** if observation loaded and not done, else **Complete** / **Ready**. Second chip: in Agent mode, derived from **`modelSignal()`** (Qwen verified or not); in Env mode shows **`ws env`**. |
89
+
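To make the **Reset Episode** row concrete, here is a minimal TypeScript sketch of the Agent-mode reset call. It assumes only what the table states (path `/api/env/reset`, preset body `{ task_id }` or Advanced body `{ difficulty, sub_environment }`); the response typing, helper names, and error handling are illustrative, not the exact `App.tsx` code.

```ts
// Minimal sketch of the Agent-mode reset call; not the real App.tsx implementation.
const API_BASE = "/api"; // default per the wiring table above

type ResetBody =
  | { task_id: string } // preset path, e.g. "easy_screening"
  | { difficulty: "easy" | "medium" | "hard"; sub_environment: string }; // Advanced path

async function resetEpisode(body: ResetBody): Promise<unknown> {
  const res = await fetch(`${API_BASE}/env/reset`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
  });
  if (!res.ok) throw new Error(`reset failed: ${res.status}`);
  return res.json(); // new observation: candidate_action_set, patient_summary, step budget, ...
}

// Usage: preset vs Advanced
void resetEpisode({ task_id: "easy_screening" });
void resetEpisode({ difficulty: "hard", sub_environment: "PRECISION_DOSING" });
```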
90
+ ### D. Model Truth panel — field by field
91
+
92
+ Data from **`GET /policy/model_status`** (`PolicyProviderRouter` / `active_model_status`).
93
+
94
+ | Field in UI | Typical meaning |
95
+ | --- | --- |
96
+ | **Heading label** | “Qwen 0.5B active” only when Space config matches a strict check (enabled + active + availability + model id regex for **Qwen2.5-0.5B-Instruct**); else “Qwen not verified” or Ollama-specific text if Ollama wins locally. |
97
+ | **Detail paragraph** | Human sentence: model name, artifact, `run_id`, optional **load_error**. |
98
+ | **Model** | `model_id` or `base_model` — HF id of the loaded or configured base. |
99
+ | **Run** | `run_id` from manifest / sweep activation (which training bundle). |
100
+ | **Artifact** | `loaded_source` or `preferred_artifact` — e.g. **`grpo_adapter`**, **`merged`**, **`sft_adapter`**. |
101
+ | **Availability** | Key/value pairs from `availability` dict (which load stages succeeded). |
102
+
103
+ **Ollama branch (local dev):** If `status.ollama.enabled && available`, the UI labels **Ollama Qwen active** and mentions `POLYGUARD_PROVIDER_PREFERENCE` order. Spaces Dockerfile sets **`POLYGUARD_ENABLE_OLLAMA=false`** by default.
104
+
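A hedged sketch of a `modelSignal()`-style check against `GET /policy/model_status` follows. Field names mirror the table above; the flat JSON layout, the exact regex, and the "any availability stage succeeded" rule are assumptions rather than copies of `provider_runtime.py`.

```ts
// Hedged sketch of the strict "Qwen verified" check described above (assumed field layout).
interface ModelStatus {
  enabled?: boolean;
  active?: boolean;
  model_id?: string;
  base_model?: string;
  run_id?: string;
  loaded_source?: string; // e.g. "grpo_adapter", "merged", "sft_adapter"
  availability?: Record<string, boolean>;
  load_error?: string;
  ollama?: { enabled?: boolean; available?: boolean };
}

function qwenVerified(status: ModelStatus): boolean {
  const id = status.model_id ?? status.base_model ?? "";
  // Assumption: at least one availability stage must report success.
  const availabilityOk = Object.values(status.availability ?? {}).some(Boolean);
  return Boolean(status.enabled && status.active && availabilityOk && /Qwen2\.5-0\.5B-Instruct/i.test(id));
}

async function fetchModelSignal(): Promise<string> {
  const status: ModelStatus = await (await fetch("/api/policy/model_status")).json();
  if (status.ollama?.enabled && status.ollama.available) return "Ollama Qwen active"; // local-dev branch
  return qwenVerified(status) ? "Qwen 0.5B active" : "Qwen not verified";
}
```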
105
+ ### E. Advanced strip (Task = Advanced)
106
+
107
+ Only rendered when `taskId === "advanced"`. Two selects:
108
+
109
+ 1. **Difficulty:** `easy` \| `medium` \| `hard` — passed to reset as `difficulty`.
110
+ 2. **Environment:** every string in `catalog.sub_environments` (DDI, BANDIT_MINING, REGIMEN_RISK, PRECISION_DOSING, LONGITUDINAL_DEPRESCRIBING, WEB_SEARCH_MISSING_DATA, ALTERNATIVE_SUGGESTION, NEW_DRUG_DECOMPOSITION).
111
+
112
+ **What each sub-environment stresses (one line each):**
113
+
114
+ | Sub-environment | What the episode emphasizes |
115
+ | --- | --- |
116
+ | **DDI** | Drug–drug interaction exposure and pair risk. |
117
+ | **BANDIT_MINING** | Policy / bandit exploration style scenario (see preset “Bandit Mining”). |
118
+ | **REGIMEN_RISK** | Overall regimen burden and safety tradeoffs. |
119
+ | **PRECISION_DOSING** | Dose buckets, organ-sensitive flags in observation. |
120
+ | **LONGITUDINAL_DEPRESCRIBING** | Multi-step taper / stop sequences over time. |
121
+ | **WEB_SEARCH_MISSING_DATA** | Rewards process fidelity for evidence-fetch actions. |
122
+ | **ALTERNATIVE_SUGGESTION** | Substitution / alternative action types rewarded more. |
123
+ | **NEW_DRUG_DECOMPOSITION** | Hard track: decompose novel drug string into components. |
124
+
125
+ ### F. Episode Overview — every KPI and subsection
126
+
127
+ **KPI grid (always eight rows):**
128
+
129
+ | KPI | Source |
130
+ | --- | --- |
131
+ | **Mode** | Literal “Agent Workbench” or “Env Explorer”. |
132
+ | **Task** | Preset label or “Advanced”. |
133
+ | **Difficulty** | `observation.deterministic_contract.difficulty` or `-`. |
134
+ | **Environment** | `deterministic_contract.sub_environment` or `observation.sub_environment`. |
135
+ | **Step Budget** | `observation.step_budget_remaining`. |
136
+ | **Last Reward** | Active branch’s last reward (after reset, Agent clears to `-` until first step). |
137
+ | **Patient** | `patient_summary.patient_id` or `patient_summary.id`. |
138
+ | **Status** | Complete if `done`, else Live if observation exists, else Ready. |
139
+
140
+ **Patient Summary `<dl>`:** First **8** keys of `observation.patient_summary` (keys humanized: underscores → spaces, title case). Typical keys include demographics, allergies, high-level clinical flags—whatever the backend puts on `PolyGuardObservation`.
141
+
142
+ **Risk Delta `<dl>`:** First **8** entries of `observation.burden_score_summary` — burden-related scalars the env uses for reward deltas.
143
+
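The "first 8 keys, humanized" rendering used by both Patient Summary and Risk Delta can be summarized in a few lines. The humanize rule (underscores to spaces, title case) and the 8-entry cap come from the text above; the helper names and example values are illustrative, not the real `App.tsx` identifiers.

```ts
// Sketch of the "first 8 keys, humanized" rendering for Patient Summary / Risk Delta.
function humanizeKey(key: string): string {
  return key
    .split("_")
    .map((part) => (part ? part[0].toUpperCase() + part.slice(1) : part))
    .join(" ");
}

function firstEightEntries(summary: Record<string, unknown>): Array<[string, unknown]> {
  return Object.entries(summary)
    .slice(0, 8) // only the first eight keys get rendered
    .map(([k, v]): [string, unknown] => [humanizeKey(k), v]);
}

// Hypothetical input: { patient_id: "p_001", egfr_band: "45-59" }
// → [["Patient Id", "p_001"], ["Egfr Band", "45-59"]]
```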
144
+ ### G. Candidate Actions list — each column
145
+
146
+ Each row is one **`CandidateAction`** from `candidate_action_set`.
147
+
148
+ | Column / concept | Meaning |
149
+ | --- | --- |
150
+ | **`candidate_id`** | Stable id (e.g. `cand_…`) — must match when submitting. |
151
+ | **Action label** | Humanized `action_type` (STOP_DRUG, SUBSTITUTE_WITHIN_CLASS, …). |
152
+ | **Third column** | `target_drug` **or** `replacement_drug` **or** `mode` — whichever is most informative. |
153
+ | **Right column** | `estimated_safety_delta` formatted to 3 decimals, or **Blocked** if `legality_precheck === false`. |
154
+ | **Disabled rows** | You cannot select illegal candidates; click does nothing. |
155
+ | **Default selection** | **Agent:** first candidate in list. **Env:** first **legal** candidate that is not `KEEP_REGIMEN` and not `REQUEST_*`, else first legal non–KEEP_REGIMEN, else first in list (`defaultCandidateForMode`). |
156
+
157
+ **Hidden fields you can mention if showing JSON elsewhere:** `dose_bucket`, `taper_days`, `monitoring_plan`, `evidence_query`, `new_drug_name`, `candidate_components`, `uncertainty_score`, `rationale_tags`, `required_monitoring`, `burden_delta`, `disease_stability_estimate`.
158
+
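Returning to the **Default selection** row above, the `defaultCandidateForMode` rule can be written as a small pure function. This is a minimal sketch: the interface below is a reduced stand-in for the real candidate type in `lib/types.ts`, keeping only the fields the rule needs.

```ts
// Sketch of the default-selection rule from the table above (defaultCandidateForMode).
interface Candidate {
  candidate_id: string;
  action_type: string; // e.g. "KEEP_REGIMEN", "STOP_DRUG", "REQUEST_REVIEW"
  legality_precheck?: boolean;
}

function defaultCandidate(mode: "agent" | "env", candidates: Candidate[]): Candidate | undefined {
  if (mode === "agent") return candidates[0]; // Agent: first candidate in the list

  const legal = candidates.filter((c) => c.legality_precheck !== false);
  return (
    // Env: first legal candidate that is neither KEEP_REGIMEN nor a REQUEST_* action ...
    legal.find((c) => c.action_type !== "KEEP_REGIMEN" && !c.action_type.startsWith("REQUEST_")) ??
    // ... else first legal non-KEEP_REGIMEN ...
    legal.find((c) => c.action_type !== "KEEP_REGIMEN") ??
    // ... else fall back to the first candidate in the list.
    candidates[0]
  );
}
```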
159
+ ### H. Action Console — every input and button
160
+
161
+ | UI element | Effect |
162
+ | --- | --- |
163
+ | **Type / Mode / Target / Replacement / Dose / Uncertainty** | Read-only snapshot of the **currently selected** candidate. |
164
+ | **Confidence** | Number input **0.001–0.999** step 0.001; sent as `confidence` on **Submit Candidate** (Agent) or embedded in WS payload (Env). |
165
+ | **Rationale** | Free text → `rationale_brief` / rationale on the action. |
166
+ | **Submit Candidate** (Agent) | Calls `POST /env/step_candidate` with `{ candidate_id, confidence, rationale_brief }`. API finds matching legal action and calls `env.step`. |
167
+ | **Submit Env Step** (Env) | Same confidence/rationale + full action payload built by `buildActionPayload` → WS `step`. |
168
+ | **Run Agent** | **Only when** `mode === "agent"` **and** observation exists **and** not `done`. Calls `POST /agents/orchestrate` with empty JSON body. **Disabled** in Env mode. |
169
+ | **Done notice** | If `done`, shows which mode completed and `termination_reason` from `info` if present. Primary button becomes **Reset Episode** (shortcut). |
170
+
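For reference, here is a minimal sketch of the **Submit Candidate** call described above. The body fields come straight from the table (`candidate_id`, `confidence`, `rationale_brief`); the function name and response typing are illustrative only.

```ts
// Sketch of the Agent-mode Submit Candidate request (POST /env/step_candidate).
async function submitCandidate(candidateId: string, confidence: number, rationale: string) {
  const res = await fetch("/api/env/step_candidate", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      candidate_id: candidateId,
      confidence, // the UI constrains this to 0.001–0.999
      rationale_brief: rationale,
    }),
  });
  if (!res.ok) throw new Error(`step_candidate failed: ${res.status}`);
  // Standard step payload: observation, reward, done, info (info.reward_breakdown when present).
  return res.json();
}
```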
171
+ ### I. Reward Channels — every bar (exact keys)
172
+
173
+ The UI renders **exactly these keys** in order (`REWARD_KEYS` in `App.tsx` — **14** rows):
174
+
175
+ | # | Key | Role |
176
+ | --- | --- | --- |
177
+ | 1 | `total_reward` | Weighted aggregate of component scores (`aggregate_rewards` in `reward_scaling.py`). |
178
+ | 2 | `primary_safety_legality` | Roll-up: legality, candidate alignment, anti-cheat, uncertainty calibration (`reward_router.compute_primary_reward_channels`). |
179
+ | 3 | `primary_clinical_improvement` | Roll-up: safety delta, burden improvement, disease stability. |
180
+ | 4 | `primary_dosing_quality` | Roll-up: dosing quality + abstention quality. |
181
+ | 5 | `primary_process_integrity` | Roll-up: format compliance, efficiency, process fidelity, explanation grounding. |
182
+ | 6 | `legality_score` | Action legal per safety verifier. |
183
+ | 7 | `safety_delta_score` | Movement on severe DDI / risk proxy vs pre-step state. |
184
+ | 8 | `burden_improvement_score` | Medication burden before vs after. |
185
+ | 9 | `disease_stability_score` | Stability heuristic vs disruptive action types. |
186
+ | 10 | `dosing_quality_score` | Dose-mode and bucket appropriateness. |
187
+ | 11 | `process_fidelity_score` | Follows intended workflow for sub-environment (e.g. fetch evidence when required). |
188
+ | 12 | `explanation_grounding_score` | Rationale present / grounded. |
189
+ | 13 | `anti_cheat_score` | Collapses when anti-cheat triggers. |
190
+ | 14 | `uncertainty_calibration_score` | Confidence vs uncertainty alignment. |
191
+
192
+ **Note:** `total_reward` is row 1; rows 2–5 are **primary** channels; rows 6–14 are **exposed component** scores. Other components (`format_compliance_score`, `efficiency_score`, `candidate_alignment_score`, `abstention_quality_score`) still exist **in the backend** `RewardBreakdown` and feed primaries + total, but this UI **does not** give them their own bar rows.
193
+
194
+ Bars show **`-`** when the value is missing (no step yet or breakdown not returned). Bar width = value × 100% with value clamped to `[0.001, 0.999]`.
195
+
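A tiny sketch of how each bar is rendered, per the note above: missing values print `-`, present values are clamped to `[0.001, 0.999]` and drawn as a percentage width. The function names are illustrative.

```ts
// Sketch of per-bar rendering: format the label, clamp the width.
function formatRewardValue(value: number | null | undefined): string {
  return value == null ? "-" : value.toFixed(3); // 3 decimals, "-" when no step / no breakdown yet
}

function barWidthPercent(value: number | null | undefined): number {
  if (value == null) return 0; // no bar when there is no value yet
  const clamped = Math.min(0.999, Math.max(0.001, value)); // clamp to the stated bounds
  return clamped * 100; // CSS width in percent
}
```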
196
+ **Breakdown source:** In Agent mode, the UI prefers `info.reward_breakdown` after a step and may also call **`GET /env/reward_breakdown`**. In Env mode, it uses `info.reward_breakdown` from the WebSocket step packet; if empty, the UI clears the reward panel.
197
+
198
+ ### J. Current Medications cards
199
+
200
+ Built from `observation.medication_table[]`. Each card:
201
+
202
+ - **Title:** `drug` / `drug_id` / `name`.
203
+ - **High-risk ribbon:** if `high_risk` or `is_high_risk_elderly` or Beers / warning flags.
204
+ - **Body:** `indication` or `class_name` or `atc_class`.
205
+ - **Meta row:** dose bucket or mg dose; taper vs `monitoring` or `route`.
206
+
207
+ ### K. Action History vs Warnings
208
+
209
+ | Panel | Source |
210
+ | --- | --- |
211
+ | **Action History** | `observation.action_history` — each item shows step index and `action_type` / `candidate_id` / reward snippet. |
212
+ | **Warnings** | `observation.warning_summary` — list of human-readable env warnings (DDIs, constraints, etc.). |
213
+
214
+ ### L. Decision / Explanation / Evidence (Agent only)
215
+
216
+ Rendered as JSON `<pre>` blocks:
217
+
218
+ | Title | When populated | Content origin |
219
+ | --- | --- | --- |
220
+ | **Decision** | Agent mode only. | **`final_action`** on the packet. For **`step_candidate`**, the API returns the standard **step** payload — **typically no `final_action` field**, so this panel may stay **empty after manual submit**. For **`orchestrate`**, **`final_action`** is the **`PolyGuardAction`** after critic (what actually hit `env.step`). |
221
+ | **Explanation** | Agent mode only. | **`explanation`** — output of **`ExplainerAgent`** after the step (`orchestrate` returns it). Usually **empty** after raw `step_candidate` unless API adds it. |
222
+ | **Evidence** | Agent mode only. | **`evidence`** key on packet. **`orchestrate`** returns **`evidence_out`** from **`EvidenceAgent.run(state)`** (retrieval / web-fallback bundle). **`step_candidate`** does not attach orchestrator evidence — panel often **empty** on manual clicks. |
223
+
224
+ **Demo takeaway:** Tell viewers: *“To populate Decision / Explanation / Evidence in the UI, use **Run Agent** (orchestrate). Manual **Submit Candidate** updates the env and rewards but does not replay the full multi-agent JSON into those three panels.”*
225
+
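To show where those three panels get their data, here is a hedged sketch of the **Run Agent** request and the packet fields the UI reads. The field names (`final_action`, `explanation`, `evidence`) match the table above; the TypeScript shape and error handling are assumptions for illustration.

```ts
// Sketch of the orchestrate call behind "Run Agent" and the fields the panels read.
interface OrchestratePacket {
  final_action?: unknown; // PolyGuardAction chosen after the critic → Decision panel
  explanation?: unknown;  // ExplainerAgent output → Explanation panel
  evidence?: unknown;     // EvidenceAgent bundle → Evidence panel
  observation?: unknown;
  reward?: number;
  done?: boolean;
  info?: Record<string, unknown>;
}

async function runAgent(): Promise<OrchestratePacket> {
  const res = await fetch("/api/agents/orchestrate", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({}), // empty JSON body, as noted in the Action Console table
  });
  if (!res.ok) throw new Error(`orchestrate failed: ${res.status}`);
  return res.json();
}
```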
226
+ ### M. Event Log vs Q Tips
227
+
228
+ | Feature | Behavior |
229
+ | --- | --- |
230
+ | **Event Log** | Prepends timestamped strings: resets, each step’s reward line, errors. **Capped** at 24 lines. Cleared when you click **Reset Episode** (handler starts with `setEvents([])` then appends) — *not* the same as mode switch clearing. |
231
+ | **Q Tips** | 10-step overlay; does not mutate env. |
232
+
233
+ ### N. Orchestrator — every agent in order (`Run Agent`)
234
+
235
+ When **`POST /agents/orchestrate`** runs, `Orchestrator.run_step` executes:
236
+
237
+ | Step | Agent class | What it does (operator language) |
238
+ | --- | --- | --- |
239
+ | 1 | **`MedRecAgent`** | Summarizes current medication list / reconciliation view for downstream modules. Output key: `medrec`. |
240
+ | 2 | **`EvidenceAgent`** | Retrieves **local evidence** (and optional web fallback) for missing or thin context. Shown in UI **`evidence`** when orchestrating. |
241
+ | 3 | **`GraphSafetyAgent`** | Graph-style **DDI / duplicate therapy** style signals. Output: `graph`. |
242
+ | 4 | **`DosingAgent`** | Flags **dose-sensitive** windows and dosing opportunities. Feeds **`dosing_active`** into supervisor. |
243
+ | 5 | **`CandidateAgent`** | Wraps env **candidate builder** — produces the legal `CandidateAction` list. |
244
+ | 6 | **`SupervisorAgent`** | Chooses planner **mode**: regimen vs dose vs **REVIEW** (conservative routing). |
245
+ | 7 | **Contextual bandit** | **`ContextualBanditPolicy`** (LinUCB or Thompson sampling via `POLYGUARD_BANDIT_ALGO`) proposes **top-k** (`POLYGUARD_BANDIT_TOP_K`) candidates for the planner to consider. |
246
+ | 8 | **`PlannerAgent`** | Calls **`PolicyProviderRouter.select_candidate`** — this is where **Transformers + Qwen + PEFT** (or Ollama, or **safety ranker fallback**) picks a **`candidate_id`** and rationale. |
247
+ | 9 | **`CriticAgent`** | Safety veto / repair. May replace proposed action with a safer **`final_action`**. |
248
+ | 10 | **Replan / debate** (optional) | If `coordination_mode` is `replan_on_veto` or `lightweight_debate` and critic rejects, planner may rerun on **review** candidates; `debate_rounds` increments. |
249
+ | 11 | **`PolyGuardEnv.step`** | Commits **`final_action`**, returns `observation`, `reward`, `done`, `info`. |
250
+ | 12 | **Bandit `update`** | If the chosen candidate was in the bandit pool, **updates bandit statistics with the reward** (learning signal for next orchestrate). |
251
+ | 13 | **`ExplainerAgent`** | Builds **`explanation`** object for audit / UI. |
252
+
253
+ **Environment variables (mention for power users):**
254
+
255
+ | Variable | Effect |
256
+ | --- | --- |
257
+ | **`POLYGUARD_POLICY_STACK`** | `llm+bandit` (default): planner sees **bandit-shortlisted** candidates. `llm-only`: all supervisor-filtered candidates. `bandit-only`: **no LLM** — first bandit pick with fixed rationale. |
258
+ | **`POLYGUARD_BANDIT_*`** | Algorithm, alpha, epsilon, seed, top-k. |
259
+ | **`POLYGUARD_PROVIDER_PREFERENCE`** | e.g. `transformers` vs `ollama` order. |
260
+ | **`POLYGUARD_ENABLE_ACTIVE_MODEL`** | Must be true on Space for bundle path; **`POLYGUARD_HF_MODEL`** sets base id for adapters. |
261
+
262
+ ### O. Qwen and fallbacks (planner path)
263
+
264
+ `PolicyProviderRouter` (`app/models/policy/provider_runtime.py`):
265
+
266
+ 1. Builds a **JSON instruction** listing candidates and asks for `candidate_id=…; rationale=…`.
267
+ 2. Tries providers in **`POLYGUARD_PROVIDER_PREFERENCE`** (default **Transformers** on Space).
268
+ 3. Parses model text for a legal `candidate_id`; on failure uses **`safety_ranker`** deterministic ordering.
269
+
270
+ **So:** Even without Qwen load, **Run Agent** still completes using **ranker / bandit** — mention that if Model Truth is red.
271
+
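Step 3 above (parse or fall back) is easy to picture with a small sketch. The regex, helper names, and the shape of the safety-ranker fallback are assumptions; the real logic lives in `provider_runtime.py` and the safety ranker.

```ts
// Hedged sketch of "parse the model reply, else fall back to deterministic ordering".
function pickCandidate(
  modelText: string,
  legalIds: string[],
  safetyRankerOrder: string[], // deterministic fallback ordering (assumed shape)
): { candidateId: string; rationale: string } {
  const match = modelText.match(/candidate_id\s*=\s*([\w-]+)\s*;\s*rationale\s*=\s*(.*)/i);
  if (match && match[1] && legalIds.includes(match[1])) {
    return { candidateId: match[1], rationale: (match[2] ?? "").trim() };
  }
  // Fallback: first legal candidate in the safety-ranker order, else first legal id.
  const fallback = safetyRankerOrder.find((id) => legalIds.includes(id)) ?? legalIds[0];
  return { candidateId: fallback ?? "", rationale: "fallback: safety ranker ordering" };
}
```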
272
+ ### P. Full observation contract (API / types)
273
+
274
+ The TypeScript type `EnvObservation` (`lib/types.ts`) lists fields the backend **may** send. The main workbench **highlights** patient summary, medication table, candidates, burden summary, action history, warnings, step budget, and sub-environment. **Not all fields get their own panel** — if you open browser DevTools → Network → `reset` / `step` response, you can narrate extras:
275
+
276
+ | Field | Typical use |
277
+ | --- | --- |
278
+ | `comorbidity_summary` | Comorbidity list for the patient. |
279
+ | `organ_function_summary` | eGFR / hepatic flags for dosing scenarios. |
280
+ | `labs_vitals_summary` | Labs relevant to risk scoring. |
281
+ | `graph_safety_summary` | Aggregated graph / DDI context. |
282
+ | `precision_dosing_flags` | Tags when sub-environment is dosing-heavy. |
283
+ | `unresolved_conflicts` | Specialist conflict strings. |
284
+ | `abstention_indicators` | When the env suggests review / abstain. |
285
+ | `deterministic_contract` | Difficulty + sub-environment + scenario id contract for reproducibility. |
286
+
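For narration, a reduced sketch of the observation shape helps. Field names below come from this document; the types are loose stand-ins, and the real definitions live in `lib/types.ts`.

```ts
// Reduced, hedged stand-in for EnvObservation (not the real lib/types.ts definition).
interface EnvObservationSketch {
  patient_summary?: Record<string, unknown>;
  medication_table?: Array<Record<string, unknown>>;
  candidate_action_set?: Array<Record<string, unknown>>;
  burden_score_summary?: Record<string, unknown>;
  action_history?: Array<Record<string, unknown>>;
  warning_summary?: string[];
  step_budget_remaining?: number;
  sub_environment?: string;
  deterministic_contract?: { difficulty?: string; sub_environment?: string; scenario_id?: string };
  // Extras you can narrate from DevTools → Network (no dedicated panel):
  comorbidity_summary?: unknown;
  organ_function_summary?: unknown;
  labs_vitals_summary?: unknown;
  graph_safety_summary?: unknown;
  precision_dosing_flags?: unknown;
  unresolved_conflicts?: unknown;
  abstention_indicators?: unknown;
}
```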
287
+ ### Q. Q Tips — copy for each slide (matches `GUIDE_STEPS`)
288
+
289
+ | # | Title | Body (read aloud or paraphrase) |
290
+ | --- | --- | --- |
291
+ | 1 | Start here | PolyGuard is an interactive OpenEnv workbench; top bar picks runtime, scenario, reset. |
292
+ | 2 | Choose the runtime | Agent Workbench = REST API + reward breakdown + Qwen path; Env Explorer = WebSocket to OpenEnv. |
293
+ | 3 | Pick a scenario | Presets load real patient/regimen state from backend. |
294
+ | 4 | Check the model truth | `/policy/model_status`; Qwen only “verified” when API says adapters live. |
295
+ | 5 | Read the episode state | Task, patient, step budget, reward, risk delta from latest env response. |
296
+ | 6 | Review legal actions | Candidate rows = legal moves; inspect safety delta and mode. |
297
+ | 7 | Submit or ask the agent | Submit Candidate vs Run Agent; check model panel before claiming LLM. |
298
+ | 8 | Inspect reward channels | Real scorer output per channel; empty = no step yet. |
299
+ | 9 | Track regimen changes | Medication cards + history + warnings = not canned. |
300
+ | 10 | Follow the run | Event log shows resets, steps, rewards, errors plainly. |
301
+
302
+ ---
303
+
304
+ ## Agent Workbench vs Env Explorer (say this exactly on camera)
305
+
306
+ | | **Agent Workbench** | **Env Explorer** |
307
+ | --- | --- | --- |
308
+ | **Reset** | `POST /env/reset` with task preset (e.g. `{ "task_id": "easy_screening" }`) via product API. | WebSocket `reset` message to OpenEnv **`/ws`** with `{ difficulty, sub_environment }`. |
309
+ | **Submit** | `POST /env/step_candidate` — product API resolves `candidate_id` + your confidence + rationale into a full action and steps the **same** in-process `PolyGuardEnv`. | WebSocket `step` — payload built from selected candidate; talks **directly** to OpenEnv service. |
310
+ | **Run Agent** | **`POST /agents/orchestrate`** — runs the full **orchestrator** (med rec, evidence, graph, dosing, candidates, supervisor, bandit, **planner/LLM**, critic, env step, explainer). | Button **disabled** — there is no orchestrator path over raw WS-only mode in this UI. |
311
+ | **Decision / Explanation / Evidence panels** | **Populated** after orchestrate or after steps that echo `final_action` / `explanation` / `evidence` (orchestrate returns rich `evidence` from `EvidenceAgent` pipeline). | **Always empty** in the UI by design — those panels are `null` in Env mode (`App.tsx` only passes agent-mode state to DetailPanels). |
312
+ | **Reward breakdown** | From step `info.reward_breakdown` or fallback `GET /env/reward_breakdown`. | From WS step packet `info.reward_breakdown` when present. |
313
+ | **Switching mode** | Clears the **Event Log** and resets the other mode’s transient state — mention that so viewers don’t think it’s a bug. | Same. |
314
+
315
+ **One-liner for judges:** *“Agent Workbench is the full product API plus optional LLM-orchestrated policy; Env Explorer is the raw OpenEnv WebSocket contract for the same underlying environment.”*
316
+
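For completeness, a hedged sketch of the Env Explorer transport: one WebSocket to `<origin>/ws`, a `reset` with `{ difficulty, sub_environment }`, then `step` messages built from the selected candidate. The reset and step fields come from the comparison table above; the exact message envelope (the `type` field and payload keys) is an assumption, not the real Web UI wire format.

```ts
// Hedged sketch of the Env Explorer WebSocket path (message envelope is assumed).
function openEnvSocket(): WebSocket {
  const scheme = window.location.protocol === "https:" ? "wss" : "ws";
  return new WebSocket(`${scheme}://${window.location.host}/ws`); // same-origin /ws per the wiring table
}

function sendReset(ws: WebSocket, difficulty: string, subEnvironment: string): void {
  ws.send(JSON.stringify({ type: "reset", difficulty, sub_environment: subEnvironment }));
}

function sendStep(ws: WebSocket, actionPayload: Record<string, unknown>): void {
  // actionPayload stands in for what buildActionPayload assembles from candidate + confidence + rationale.
  ws.send(JSON.stringify({ type: "step", action: actionPayload }));
}
```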
317
+ ---
318
+
319
+ ## Reward channels — what they mean and how they’re computed (talk track)
320
+
321
+ Rewards are **verifier-backed**, **bounded** to roughly **`[0.001, 0.999]`** (3 decimal places in UI).
322
+
323
+ ### Four primary channels (high level)
324
+
325
+ These are **averages of component groups** (`app/env/reward_router.py` — `compute_primary_reward_channels`):
326
+
327
+ 1. **`primary_safety_legality`** — legality, candidate id alignment, anti-cheat, uncertainty calibration.
328
+ 2. **`primary_clinical_improvement`** — safety delta vs severe pairs, burden improvement, disease stability.
329
+ 3. **`primary_dosing_quality`** — dosing quality + abstention (e.g. appropriate review requests under uncertainty).
330
+ 4. **`primary_process_integrity`** — format compliance, efficiency (step budget), process fidelity, explanation grounding.
331
+
332
+ ### Components (examples — `compute_reward_breakdown`)
333
+
334
+ The environment builds scores such as:
335
+
336
+ - **`legality_score`**: high if the action is legal per safety report.
337
+ - **`safety_delta_score` / `burden_improvement_score`**: from **before/after** burden and severe DDI pair counts (`_delta_to_reward`).
338
+ - **`anti_cheat_score`**: collapses if anti-cheat flags the trajectory.
339
+ - **`uncertainty_calibration_score`**: penalizes overconfidence vs modeled uncertainty.
340
+ - **Sub-environment tweaks**: e.g. `WEB_SEARCH_MISSING_DATA` boosts process fidelity when using `FETCH_EXTERNAL_EVIDENCE`; `NEW_DRUG_DECOMPOSITION` rewards decomposition actions with components.
341
+
342
+ Then components are **scaled/clamped**, **primary channels** recomputed, and **`total_reward`** = weighted aggregate (`aggregate_rewards`).
343
+
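A minimal sketch of that aggregation step, assuming the four-channel groupings listed above: primary channels as plain averages of their component groups, and `total_reward` as a weighted aggregate of component scores. The weights here are placeholders, not the values in `configs/rewards.yaml`, and the function names are illustrative.

```ts
// Illustrative aggregation sketch (placeholder weights, groupings from the list above).
type Components = Record<string, number>;

const CHANNEL_GROUPS: Record<string, string[]> = {
  primary_safety_legality: ["legality_score", "candidate_alignment_score", "anti_cheat_score", "uncertainty_calibration_score"],
  primary_clinical_improvement: ["safety_delta_score", "burden_improvement_score", "disease_stability_score"],
  primary_dosing_quality: ["dosing_quality_score", "abstention_quality_score"],
  primary_process_integrity: ["format_compliance_score", "efficiency_score", "process_fidelity_score", "explanation_grounding_score"],
};

function primaryChannels(components: Components): Components {
  const out: Components = {};
  for (const [channel, keys] of Object.entries(CHANNEL_GROUPS)) {
    const values = keys.map((k) => components[k] ?? 0);
    out[channel] = values.reduce((a, b) => a + b, 0) / values.length; // plain average per group
  }
  return out;
}

function totalReward(components: Components, weights: Components): number {
  const keys = Object.keys(components);
  const raw = keys.reduce(
    (sum, k) => sum + (components[k] ?? 0) * (weights[k] ?? 1 / keys.length), // placeholder equal weights
    0,
  );
  return Math.min(0.999, Math.max(0.001, raw)); // bounded like the [0.001, 0.999] range the UI shows
}
```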
344
+ **Demo line:** *“Bars update only after a real step — empty fields mean we haven’t stepped yet, not fake filler.”*
345
+
346
+ ---
347
+
348
+ ## Built-in **Q Tips** (on-screen tour)
349
+
350
+ Click **Q Tips** in the top bar. The app cycles **10 slides** (`App.tsx` → `GUIDE_STEPS`):
351
+
352
+ 1. Start here — top bar, scenarios, reset.
353
+ 2. Choose the runtime — Agent vs Env.
354
+ 3. Pick a scenario — presets load real patient/regimen state.
355
+ 4. Check the model truth — `/policy/model_status`.
356
+ 5. Read episode state — overview + patient summary.
357
+ 6. Review legal actions — candidates.
358
+ 7. Submit or ask the agent — Submit vs Run Agent.
359
+ 8. Inspect reward channels.
360
+ 9. Medications + history/warnings.
361
+ 10. Event log — errors and connectivity.
362
+
363
+ **Recording tip:** Record **Q Tips** once in full voiceover (“I’ll use the in-app tour…”) then dismiss and do the live walkthrough below.
364
+
365
+ ---
366
+
367
+ ## Shot-by-shot recording script
368
+
369
+ ### Scene 0 — Intro (30–45 s)
370
+
371
+ **Action:** Scroll slightly so hero + top bar are visible.
372
+ **Say:** *“This is PolyGuard on Hugging Face Spaces: an OpenEnv workbench for polypharmacy safety. The backend runs a real `PolyGuardEnv` with verifiable rewards; the UI can drive it through the product API or raw OpenEnv WebSockets.”*
373
+
374
+ ---
375
+
376
+ ### Scene 1 — Model Truth (45–60 s)
377
+
378
+ **Action:** Stay on **Agent Workbench**. Click nothing yet; point at **Model Truth**.
379
+ **Say:** *“Model Truth is live from `/policy/model_status`. Here we see the base model—typically Qwen 2.5 0.5B Instruct—which artifact is loaded—often the GRPO adapter—and the run id. On Spaces, weights are under `/app/checkpoints/active` after the bundle installer runs.”*
380
+
381
+ **If panel shows unavailable:** *“Cold start or CPU load can delay the bundle; the environment still works for manual candidate submission; Run Agent may fall back to non-LLM routing depending on config.”*
382
+
383
+ ---
384
+
385
+ ### Scene 2 — Easy task, manual submit (Agent) (90–120 s)
386
+
387
+ **Action:** Task → **Easy Screening** (DDI, easy). **Reset Episode.**
388
+ **Say:** *“Easy Screening fixes difficulty easy and sub-environment DDI—drug–drug interaction screening.”*
389
+
390
+ **Action:** Pan **Episode Overview** — read **Patient Summary** and **Risk Delta** aloud briefly.
391
+ **Say:** *“This patient block and risk delta come straight from the observation object.”*
392
+
393
+ **Action:** **Candidate Actions** — click 2–3 rows; show **Blocked** vs legal. Select a **legal** row.
394
+ **Say:** *“Candidates are legal moves from the env; illegal rows are disabled.”*
395
+
396
+ **Action:** **Action Console** — tweak **Confidence** and **Rationale** slightly. Click **Submit Candidate**.
397
+ **Say:** *“Submit Candidate hits `/env/step_candidate` with my chosen legal action, confidence, and rationale.”*
398
+
399
+ **Action:** After response, pause on **Reward Channels** and **Last Reward** in overview.
400
+ **Say:** *“These bars are the verifier breakdown; total reward is the scalar GRPO-style signal we train on.”*
401
+
402
+ **Action:** **Action History** — show one new line. **Event Log** — show the new reward line.
403
+ **Say:** *“History and event log give an audit trail—not a canned animation.”*
404
+
405
+ ---
406
+
407
+ ### Scene 3 — Run Agent (orchestrator + LLM path) (90–120 s)
408
+
409
+ **Prerequisite:** Prefer recording when Model Truth shows **enabled** and **active** with Qwen artifacts.
410
+
411
+ **Action:** **Reset Episode** again (same or different task). Click **Run Agent**. Wait for completion.
412
+ **Say:** *“Run Agent calls `/agents/orchestrate`. That runs med reconciliation, evidence retrieval, graph safety, dosing hints, candidate generation, supervisor mode, a contextual bandit shortlist, then the planner—here that’s where the loaded Qwen policy can choose among candidates—the critic veto, environment step, and explainer.”*
413
+
414
+ **Action:** Scroll to **Decision**, **Explanation**, **Evidence** JSON panels.
415
+ **Say:** *“These three panels are only populated in Agent Workbench mode. Env Explorer deliberately hides them because the raw WebSocket client doesn’t run the full orchestrator response bundle.”*
416
+
417
+ **Action:** Point at **Evidence** — mention structured retriever output vs empty object if task didn’t fetch.
418
+ **Say:** *“Evidence is whatever the evidence agent produced for this state—grounding for clinician trust.”*
419
+
420
+ ---
421
+
422
+ ### Scene 4 — Env Explorer contrast (60–90 s)
423
+
424
+ **Action:** Click **Env Explorer**. **Reset Episode** (same task: Easy Screening).
425
+ **Say:** *“Now the UI resets over WebSocket `reset` to the OpenEnv service on port 8100—same scenarios, different transport.”*
426
+
427
+ **Action:** Select a candidate, **Submit Env Step**.
428
+ **Say:** *“Submit Env Step sends a WebSocket `step` with the action payload—no `/agents/orchestrate`.”*
429
+
430
+ **Action:** Scroll to **Decision / Explanation / Evidence** — show they stay **empty** or “No data.”
431
+ **Say:** *“This is intentional: I’m proving the low-level env API, not the full agent stack.”*
432
+
433
+ **Action:** **Event Log** — note new lines tagged from env step.
434
+
435
+ ---
436
+
437
+ ### Scene 5 — Task variety (2–3 minutes, optional montage)
438
+
439
+ For each preset, do **Reset** + **one** legal **Submit** (Agent mode is enough):
440
+
441
+ | Task | Difficulty | Sub-environment | What to say |
442
+ | --- | --- | --- | --- |
443
+ | **Easy Screening** | easy | DDI | “Fast DDI-focused episode.” |
444
+ | **Budgeted Screening** | medium | REGIMEN_RISK | “More steps, regimen-risk tradeoffs.” |
445
+ | **Complex Tradeoff** | hard | REGIMEN_RISK | “Harder patient draw, tighter budgets.” |
446
+ | **Bandit Mining** | hard | BANDIT_MINING | “Bandit-style policy mining scenario.” |
447
+
448
+ **Action:** Switch Task to **Advanced**. Set e.g. **hard** + **PRECISION_DOSING**. Reset.
449
+ **Say:** *“Advanced exposes every sub-environment enum the backend supports—precision dosing, deprescribing, web-search missing data, alternatives, new-drug decomposition.”*
450
+
451
+ ---
452
+
453
+ ### Scene 6 — Medications + warnings (45 s)
454
+
455
+ **Action:** After any step with regimen change, show **Current Medications** cards (high-risk styling).
456
+ **Say:** *“Cards mirror `medication_table` from the observation; warnings list is explicit env output.”*
457
+
458
+ ---
459
+
460
+ ### Scene 7 — Closing (30 s)
461
+
462
+ **Say:** *“That’s the full loop: HF Space hosts OpenEnv + API, Qwen adapters live under checkpoints/active, Agent Workbench demonstrates orchestrated LLM decisions with evidence and explanations, and Env Explorer proves the same environment over raw WebSockets for OpenEnv compatibility.”*
463
+
464
+ ---
465
+
466
+ ## OBS / QuickTime checklist
467
+
468
+ - [ ] Capture **system audio** if you add voiceover in post; or record mic in OBS.
469
+ - [ ] **1920×1080**, 30 fps (or 60 if you want smooth cursor).
470
+ - [ ] **2 s pause** after each button click before scrolling away.
471
+ - [ ] If Space sleeps, **mouse jiggle** or refresh before recording.
472
+ - [ ] Export **MP4 H.264** for YouTube / HF dataset card.
473
+
474
+ ---
475
+
476
+ ## Quick troubleshooting on camera (if something breaks)
477
+
478
+ | Symptom | What to say / do |
479
+ | --- | --- |
480
+ | WebSocket errors in Event Log | “Env service reconnect—refresh page; WS URL is derived from the Space origin.” |
481
+ | Run Agent fails | “Check Model Truth—model may still be downloading or Ollama disabled on Space.” |
482
+ | Reward bars all dash | “No step yet—reset and submit once.” |
483
+ | Candidates empty | “Reset episode—env didn’t initialize.” |
484
+
485
+ ---
486
+
487
+ ## Related docs
488
+
489
+ - [UI overview](ui.md)
490
+ - [Deployment](deployment.md)
491
+ - [Environment design](environment_design.md)
492
+ - [Reward design](reward_design.md)
493
+ - [Architecture](architecture.md)
docs/deployment.md CHANGED
@@ -30,7 +30,7 @@ uv run python -c "from huggingface_hub import HfApi; print(HfApi().space_info('$
30
  openenv validate --url "https://thejackbright-polyguard-openenv.hf.space"
31
  ```
32
 
33
- `scripts/deploy_space_api.py` is the preferred deployment path for this repo because it uploads a valid Docker Space README frontmatter bundle through `huggingface_hub.HfApi`. `scripts/deploy_space.sh` remains available, but the current OpenEnv CLI path may fail with invalid generated `colorFrom`/`colorTo` metadata.
34
 
35
  Useful `scripts/deploy_space.sh` flags:
36
 
 
30
  openenv validate --url "https://thejackbright-polyguard-openenv.hf.space"
31
  ```
32
 
33
+ `scripts/deploy_space_api.py` is the preferred deployment path for this repo because it uploads a valid Docker Space README frontmatter bundle through `huggingface_hub.HfApi`. `scripts/deploy_space.sh` remains available, but the current OpenEnv CLI path may fail with invalid generated `colorFrom`/`colorTo` metadata. Pushing to GitHub alone does not change the Hub Space unless that Space is configured to rebuild from that repo and branch; run the deploy script (with `HF_TOKEN`) after UI or API changes so the Docker image rebuilds. See `docker/space/README.md` for Dockerfile path, monorepo layout, and cache/reboot notes.
34
 
35
  Useful `scripts/deploy_space.sh` flags:
36
 
docs/final_submission_audit.md CHANGED
@@ -23,7 +23,7 @@ The only known judge-facing blocker is external storytelling: the README blog UR
23
  | Product/demo | Implemented | FastAPI product API, React/Vite workbench, policy lab, training monitor, replay, dosing, and safety views. |
24
  | Results and plots | Implemented | Tracked `docs/results/*.json` and PNG plots, including SFT baseline sweep evidence and top-level environment-backed GRPO evidence. |
25
  | HF Space deployment | Implemented | Public Space is running on CPU basic, Space metadata is available, and tracked `docs/results/hf_space_verification.json` reports OpenEnv validation passed. |
26
- | Colab notebook | Implemented | README links `notebooks/09_training_loop.ipynb` through Colab. |
27
  | Story artifact | Pending external publication | `docs/hf_blog_draft.md` exists, but the README blog URL returns 404 until published. |
28
  | Full public per-model GRPO sweep | Not claimed | Current public/tracked evidence is a 3-model SFT-baseline sweep plus a top-level GRPO run. Private training artifact repos require auth and must be mirrored before being used as public evidence. |
29
 
 
23
  | Product/demo | Implemented | FastAPI product API, React/Vite workbench, policy lab, training monitor, replay, dosing, and safety views. |
24
  | Results and plots | Implemented | Tracked `docs/results/*.json` and PNG plots, including SFT baseline sweep evidence and top-level environment-backed GRPO evidence. |
25
  | HF Space deployment | Implemented | Public Space is running on CPU basic, Space metadata is available, and tracked `docs/results/hf_space_verification.json` reports OpenEnv validation passed. |
26
+ | Colab notebook | Implemented | README Colab URL targets `PolyGuard_SFT_GRPO_One_Run_Runner.ipynb`; `notebooks/09_training_loop.ipynb` is the modular alternative. |
27
  | Story artifact | Pending external publication | `docs/hf_blog_draft.md` exists, but the README blog URL returns 404 until published. |
28
  | Full public per-model GRPO sweep | Not claimed | Current public/tracked evidence is a 3-model SFT-baseline sweep plus a top-level GRPO run. Private training artifact repos require auth and must be mirrored before being used as public evidence. |
29
 
docs/idea_document_traceability.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Idea document and participant guide — implementation map
2
+
3
+ This ties your polypharmacy / OpenEnv design notes and typical hackathon submission requirements to files in this repository.
4
+
5
+ ## Submission narrative (required bullets)
6
+
7
+ | Requirement | Status | Where |
8
+ | --- | --- | --- |
9
+ | Problem statement | Documented + implemented | Root [`README.md`](../../README.md), `polyguard-rl/README.md`, `docs/safety.md` |
10
+ | Environment (agent operates here) | Implemented | `PolyGuardEnv`, `app/env/env_core.py`, `app/env/fastapi_app.py`, `openenv.yaml`, `server/app.py` |
11
+ | Agent capabilities | Implemented | `app/agents/`, `docs/agents.md` |
12
+ | Tasks | Implemented | Scenario JSONL under `data/scenarios/`, presets in `app/env/catalog.py` |
13
+ | Reward / evaluation logic | Implemented | `app/env/reward_router.py`, `app/env/verifier.py`, `configs/rewards.yaml`, `docs/reward_design.md`, `docs/evaluation.md` |
14
+ | Post-training / self-improvement | Implemented | `scripts/train_sft_trl.py`, `scripts/train_grpo_trl.py`, `app/training/grpo_trl.py`, `docs/training.md` |
15
+
16
+ ## Your “Plan” sections vs codebase
17
+
18
+ | Plan item | Status | Notes |
19
+ | --- | --- | --- |
20
+ | OpenEnv `reset` / `step` / `state`, timeouts, safety | Done | `env_core.py`, `fastapi_app.py`, max steps per sub-env, `anti_cheat.py` |
21
+ | Local + remote execution | Done | Local FastAPI + `docker-compose.yml`, HF Space via `scripts/deploy_space_api.py`, `Dockerfile.space`, `docker/space/` |
22
+ | Specific envs: DDI, bandit mining, regimen risk | Done | `SubEnvironment` enum, transitions in `app/env/transition.py` |
23
+ | Precision dosing, deprescribing, web search, alternatives, new drug (hard) | Done | Matching enum values + scenario tracks; “new drug” is `NEW_DRUG_DECOMPOSITION` |
24
+ | Multiple reward functions + anti-hacking | Done | 13 components → 4 channels; anti-cheat and tests in `tests/` |
25
+ | TRL + Unsloth, metrics, generations | Done | TRL scripts + reports; Unsloth optional (`--use-unsloth`); `app/training/metrics.py` |
26
+ | Post-training + inference | Done | merge + `test_inference_postsave.py`, active manifest / API path |
27
+ | Product / Space demo, UI | Done | FastAPI `app/api/`, React `app/ui/frontend/`, Space deployment scripts |
28
+ | Benchmarks + plots + sample generations | Done | `scripts/evaluate_*.py`, `docs/results/`, `scripts/generate_submission_evidence.py` |
29
+ | Deploy: OpenEnv, container, HF Space | Done | See `docs/deployment.md` |
30
+ | Easy / medium / hard | Done | `scenarios_easy.jsonl`, `scenarios_medium.jsonl`, `scenarios_hard.jsonl` |
31
+
32
+ ## Themes (world modeling, multi-agent, self-improvement)
33
+
34
+ | Theme | Status | Notes |
35
+ | --- | --- | --- |
36
+ | World modeling / professional tasks | Primary fit | Stateful regimen, verifiers, tool-like actions |
37
+ | Multi-agent | Partial | Supervisor/orchestrator and policy stack (`app/agents/orchestrator.py`, `supervisor_agent.py`); not a separate multi-player env |
38
+ | Self-improving systems | Via GRPO | Environment-backed RLVR-style training, not online self-play |
39
+
40
+ ## “What to submit” checklist
41
+
42
+ | Deliverable | Status |
43
+ | --- | --- |
44
+ | GitHub repo + URLs in README | Root + `polyguard-rl/README.md` |
45
+ | HF Space URL | In README |
46
+ | Points from doc | `docs/participant_guide_traceability.md`, this file |
47
+ | Colab | `PolyGuard_SFT_GRPO_One_Run_Runner.ipynb`, `notebooks/09_training_loop.ipynb` |
48
+ | Video or blog | README links blog; **publish** draft in `docs/hf_blog_draft.md` or swap URL |
49
+
50
+ ## Future ideas from your notes (not claimed as done)
51
+
52
+ - Medicine images / barcodes: listed under Future Work in README.
53
+ - Web search agents: sub-env `WEB_SEARCH_MISSING_DATA` exists; “full web agent product” is beyond current scope.
54
+
55
+ ## Fresh clone reminder
56
+
57
+ Generated data and many `outputs/` reports are produced by scripts (see `scripts/bootstrap_data.py` and `REQUIRED_ARTIFACTS` in `scripts/acceptance_gate.py`). Run the bootstrap/build pipeline before expecting strict `POLYGUARD_ENFORCE_SUBMISSION_LINKS=true` acceptance to pass on an empty workspace.
docs/participant_guide_traceability.md CHANGED
@@ -18,7 +18,7 @@ This audit maps the hackathon guide, FAQ, and judging criteria to concrete PolyG
18
  | Export adapters safely and test inference | `scripts/merge_adapters_safe.py` and `scripts/test_inference_postsave.py` |
19
  | Show results with plots and reports | `docs/results/*.json`, tracked reward/process/legal/success/sweep plot PNGs, a 3-model SFT-baseline sweep, and a top-level environment-backed GRPO run |
20
  | Host the environment on Hugging Face Spaces | `scripts/deploy_space_api.py`, `scripts/deploy_space.sh`, Docker runtime, `docs/results/hf_space_verification.json`, and live Space health/metadata checks |
21
- | Include a Colab training notebook | `notebooks/09_training_loop.ipynb` |
22
  | Link story material from README | README links the selected Hugging Face blog/story URL; publish it before final hand-in if the external URL is still 404 |
23
 
24
  ## Current Evidence Status
 
18
  | Export adapters safely and test inference | `scripts/merge_adapters_safe.py` and `scripts/test_inference_postsave.py` |
19
  | Show results with plots and reports | `docs/results/*.json`, tracked reward/process/legal/success/sweep plot PNGs, a 3-model SFT-baseline sweep, and a top-level environment-backed GRPO run |
20
  | Host the environment on Hugging Face Spaces | `scripts/deploy_space_api.py`, `scripts/deploy_space.sh`, Docker runtime, `docs/results/hf_space_verification.json`, and live Space health/metadata checks |
21
+ | Include a Colab training notebook | [`PolyGuard_SFT_GRPO_One_Run_Runner.ipynb`](../PolyGuard_SFT_GRPO_One_Run_Runner.ipynb) (README Colab link) and [`notebooks/09_training_loop.ipynb`](../notebooks/09_training_loop.ipynb) (modular walkthrough) |
22
  | Link story material from README | README links the selected Hugging Face blog/story URL; publish it before final hand-in if the external URL is still 404 |
23
 
24
  ## Current Evidence Status
docs/ui.md CHANGED
@@ -1,3 +1,7 @@
1
  # UI
2
 
3
  React/Vite workbench provides operator-oriented pages for patient workbench, replay, policy lab, dosing, safety, and training monitoring.
 
 
 
 
 
1
  # UI
2
 
3
  React/Vite workbench provides operator-oriented pages for patient workbench, replay, policy lab, dosing, safety, and training monitoring.
4
+
5
+ For a **panel-by-panel and agent-by-agent** reference (Model Truth, Agent vs Env, Qwen, orchestrator stack, every reward bar, Q Tips copy), see [DEMO_RECORDING_SCRIPT.md](DEMO_RECORDING_SCRIPT.md) § *Feature encyclopedia*.
6
+
7
+ The **FDA alternatives** panel (last block in the Home workbench, full width) calls `POST /tools/medication_alternatives` (up to 7 rows, scrollable list). Optional environment variables: `POLYGUARD_OPENFDA_API_KEY`, plus `POLYGUARD_MED_TOOL_URL` and `POLYGUARD_MED_TOOL_TOKEN` (server-side only).
scripts/deploy_space_api.py CHANGED
@@ -8,8 +8,9 @@ still shipping the same OpenEnv/FastAPI runtime.
8
  from __future__ import annotations
9
 
10
  import argparse
11
- from pathlib import Path
12
  import shutil
 
 
13
 
14
  from huggingface_hub import HfApi
15
 
@@ -17,6 +18,18 @@ from huggingface_hub import HfApi
17
  ROOT = Path(__file__).resolve().parents[1]
18
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def parse_args() -> argparse.Namespace:
21
  parser = argparse.ArgumentParser(description="Deploy PolyGuard OpenEnv Space with valid HF metadata.")
22
  parser.add_argument("--repo-id", default="TheJackBright/polyguard-openenv")
@@ -64,7 +77,8 @@ def build_bundle(bundle_dir: Path) -> None:
64
  "colorFrom: blue",
65
  "colorTo: green",
66
  "sdk: docker",
67
- "app_port: 8100",
 
68
  "pinned: false",
69
  "---",
70
  "",
@@ -83,6 +97,9 @@ def main() -> None:
83
  print(f"bundle_dir={bundle_dir}")
84
  return
85
 
 
 
 
86
  api = HfApi()
87
  api.create_repo(
88
  repo_id=args.repo_id,
@@ -95,7 +112,7 @@ def main() -> None:
95
  repo_id=args.repo_id,
96
  repo_type="space",
97
  folder_path=str(bundle_dir),
98
- commit_message="Deploy PolyGuard OpenEnv Space",
99
  ignore_patterns=[
100
  ".git/*",
101
  ".venv/*",
@@ -109,6 +126,11 @@ def main() -> None:
109
  print(f"space_url=https://huggingface.co/spaces/{args.repo_id}")
110
  print(f"runtime_url=https://{args.repo_id.replace('/', '-').lower()}.hf.space")
111
  print(f"bundle_dir={bundle_dir}")
 
 
 
 
 
112
 
113
 
114
  if __name__ == "__main__":
 
8
  from __future__ import annotations
9
 
10
  import argparse
 
11
  import shutil
12
+ import subprocess
13
+ from pathlib import Path
14
 
15
  from huggingface_hub import HfApi
16
 
 
18
  ROOT = Path(__file__).resolve().parents[1]
19
 
20
 
21
+ def _git_revision() -> str:
22
+ try:
23
+ return subprocess.check_output(
24
+ ["git", "rev-parse", "--short", "HEAD"],
25
+ cwd=str(ROOT),
26
+ stderr=subprocess.DEVNULL,
27
+ text=True,
28
+ ).strip()
29
+ except (subprocess.CalledProcessError, FileNotFoundError, OSError):
30
+ return "unknown"
31
+
32
+
33
  def parse_args() -> argparse.Namespace:
34
  parser = argparse.ArgumentParser(description="Deploy PolyGuard OpenEnv Space with valid HF metadata.")
35
  parser.add_argument("--repo-id", default="TheJackBright/polyguard-openenv")
 
77
  "colorFrom: blue",
78
  "colorTo: green",
79
  "sdk: docker",
80
+ # Must match nginx / EXPOSE in Dockerfile (Space UI + /api proxy); 8100/8200 are loopback-only.
81
+ "app_port: 7860",
82
  "pinned: false",
83
  "---",
84
  "",
 
97
  print(f"bundle_dir={bundle_dir}")
98
  return
99
 
100
+ rev = _git_revision()
101
+ commit_message = f"Deploy PolyGuard OpenEnv Space (polyguard-rl @ {rev})"
102
+
103
  api = HfApi()
104
  api.create_repo(
105
  repo_id=args.repo_id,
 
112
  repo_id=args.repo_id,
113
  repo_type="space",
114
  folder_path=str(bundle_dir),
115
+ commit_message=commit_message,
116
  ignore_patterns=[
117
  ".git/*",
118
  ".venv/*",
 
126
  print(f"space_url=https://huggingface.co/spaces/{args.repo_id}")
127
  print(f"runtime_url=https://{args.repo_id.replace('/', '-').lower()}.hf.space")
128
  print(f"bundle_dir={bundle_dir}")
129
+ print(f"deployed_src_revision={rev}")
130
+ print(
131
+ "If the live UI still looks old: open the Space → Settings → Factory reboot, "
132
+ "or hard-refresh the browser (Vite hashes usually bust cache after rebuild)."
133
+ )
134
 
135
 
136
  if __name__ == "__main__":
scripts/install_hf_active_bundle.py CHANGED
@@ -77,13 +77,24 @@ def main() -> None:
77
  allow = f"{args.bundle_path.strip('/')}/**"
78
 
79
  print(f"Downloading snapshot of {args.repo_id}@{args.revision} (pattern {allow}) …", flush=True)
80
- snapshot_download(
81
- repo_id=args.repo_id,
82
- repo_type="model",
83
- revision=args.revision,
84
- local_dir=str(snap_root),
85
- allow_patterns=[allow],
86
- )
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  bundle_root = snap_root / args.bundle_path
89
  ckpt_src = bundle_root / "checkpoints"
 
77
  allow = f"{args.bundle_path.strip('/')}/**"
78
 
79
  print(f"Downloading snapshot of {args.repo_id}@{args.revision} (pattern {allow}) …", flush=True)
80
+ try:
81
+ snapshot_download(
82
+ repo_id=args.repo_id,
83
+ repo_type="model",
84
+ revision=args.revision,
85
+ local_dir=str(snap_root),
86
+ allow_patterns=[allow],
87
+ )
88
+ except Exception as exc:
89
+ err = f"{type(exc).__name__}: {exc}"
90
+ hint = (
91
+ "\n[install_hf_active_bundle] Hub returned an error (401/404 often means the artifact repo is private or gated).\n"
92
+ " • Hugging Face Space: Space Settings → Secrets → add HF_TOKEN (read access to that model repo).\n"
93
+ " • Or change the repo to public / accept the license on the model card while logged in.\n"
94
+ " • Without a successful download, POLYGUARD falls back to heuristics / ranker (no local GRPO weights).\n"
95
+ )
96
+ print(f"{hint} • Raw error: {err}\n", flush=True)
97
+ raise SystemExit(1) from exc
98
 
99
  bundle_root = snap_root / args.bundle_path
100
  ckpt_src = bundle_root / "checkpoints"
submission_bundle/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PolyGuard Submission Bundle
2
+
3
+ This folder contains push-friendly training evidence from the Hugging Face runs.
4
+
5
+ Included:
6
+ - `grpo_training_cycle/`: GRPO run plots, reports, and docs result JSONs.
7
+ - `sft_baseline/`: SFT baseline sweep plots, reports, and docs result JSONs.
8
+ - `local_results/`: Current local result mirrors from `outputs/plots`, `outputs/reports`, and `docs/results`.
9
+ - `notebooks/`: Project notebooks, including `09_training_loop.ipynb`.
10
+
11
+ Not included:
12
+ - Model weights, adapters, checkpoints, optimizer states, or tokenizer/model binary payloads.
13
+ - These were intentionally removed after download cancellation so the repository can be pushed without large model artifacts.
submission_bundle/grpo_training_cycle/docs_results/README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Result Artifacts
2
+
3
+ These tracked files mirror the latest local smoke/evaluation artifacts so the README can show stable evidence even though `outputs/` and `checkpoints/` are intentionally git-ignored.
4
+
5
+ Current status:
6
+
7
+ - OpenEnv structure/runtime validation passes locally.
8
+ - Test suite passes locally.
9
+ - Frontend production build passes locally.
10
+ - SFT and GRPO artifacts in this folder are non-fallback TRL Transformers evidence from a tiny local compliance run.
11
+ - `postsave_inference.json` loads the merged artifact rather than the fallback policy.
12
+ - `improvement_report.json` shows positive average-reward improvement against the no-change baseline.
13
+ - `hf_space_verification.json` records a live Hugging Face Space validation pass.
14
+
15
+ For a stronger final pitch, replace these artifacts after a larger Colab/HF GPU run:
16
+
17
+ - `sft_trl_run.json`
18
+ - `grpo_trl_run.json`
19
+ - `postsave_inference.json`
20
+ - `improvement_report.json`
21
+ - all plot PNGs
22
+ - `hf_space_verification.json`
submission_bundle/grpo_training_cycle/docs_results/acceptance_gate.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "missing_files": [],
3
+ "missing_artifacts": [],
4
+ "missing_readme_markers": [],
5
+ "missing_readme_links": [],
6
+ "strict_submission_links": true,
7
+ "missing_submission_env": [],
8
+ "strict_submission_failures": [],
9
+ "submission_ready": true,
10
+ "status": "ok"
11
+ }
submission_bundle/grpo_training_cycle/docs_results/avg_process_fidelity.png ADDED
submission_bundle/grpo_training_cycle/docs_results/avg_reward.png ADDED
submission_bundle/grpo_training_cycle/docs_results/baselines.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "no_change": {
3
+ "mode": "REGIMEN_OPT",
4
+ "action_type": "KEEP_REGIMEN",
5
+ "target_drug": null,
6
+ "replacement_drug": null,
7
+ "dose_bucket": "NA",
8
+ "taper_days": null,
9
+ "monitoring_plan": null,
10
+ "evidence_query": null,
11
+ "new_drug_name": null,
12
+ "candidate_components": [],
13
+ "candidate_id": "cand_01",
14
+ "confidence": 0.8,
15
+ "rationale_brief": "Baseline no-change policy."
16
+ },
17
+ "rules_only": {
18
+ "mode": "REGIMEN_OPT",
19
+ "action_type": "SUBSTITUTE_WITHIN_CLASS",
20
+ "target_drug": "opioid_like",
21
+ "replacement_drug": "non_opioid_analgesic",
22
+ "dose_bucket": "NA",
23
+ "taper_days": null,
24
+ "monitoring_plan": null,
25
+ "evidence_query": null,
26
+ "new_drug_name": null,
27
+ "candidate_components": [],
28
+ "candidate_id": "cand_04",
29
+ "confidence": 0.75,
30
+ "rationale_brief": "Rules-only selected top legal candidate."
31
+ },
32
+ "greedy": {
33
+ "mode": "REGIMEN_OPT",
34
+ "action_type": "SUBSTITUTE_WITHIN_CLASS",
35
+ "target_drug": "opioid_like",
36
+ "replacement_drug": "non_opioid_analgesic",
37
+ "dose_bucket": "NA",
38
+ "taper_days": null,
39
+ "monitoring_plan": null,
40
+ "evidence_query": null,
41
+ "new_drug_name": null,
42
+ "candidate_components": [],
43
+ "candidate_id": "cand_04",
44
+ "confidence": 0.72,
45
+ "rationale_brief": "Greedy safety/burden improvement baseline."
46
+ },
47
+ "contextual_bandit": {
48
+ "mode": "REGIMEN_OPT",
49
+ "action_type": "SUBSTITUTE_WITHIN_CLASS",
50
+ "target_drug": "opioid_like",
51
+ "replacement_drug": "non_opioid_analgesic",
52
+ "dose_bucket": "NA",
53
+ "taper_days": null,
54
+ "monitoring_plan": null,
55
+ "evidence_query": null,
56
+ "new_drug_name": null,
57
+ "candidate_components": [],
58
+ "candidate_id": "cand_04",
59
+ "confidence": 0.68,
60
+ "rationale_brief": "Contextual bandit selected candidate."
61
+ },
62
+ "contextual_bandit_topk": [
63
+ {
64
+ "candidate_id": "cand_09",
65
+ "score": 1.1532307878304324,
66
+ "exploration_bonus": 1.1532307878304324,
67
+ "algorithm": "linucb"
68
+ },
69
+ {
70
+ "candidate_id": "cand_10",
71
+ "score": 1.1489735636645433,
72
+ "exploration_bonus": 1.1489735636645433,
73
+ "algorithm": "linucb"
74
+ },
75
+ {
76
+ "candidate_id": "cand_08",
77
+ "score": 1.1447401451857973,
78
+ "exploration_bonus": 1.1447401451857973,
79
+ "algorithm": "linucb"
80
+ }
81
+ ],
82
+ "beam_search": {
83
+ "mode": "REGIMEN_OPT",
84
+ "action_type": "SUBSTITUTE_WITHIN_CLASS",
85
+ "target_drug": "opioid_like",
86
+ "replacement_drug": "non_opioid_analgesic",
87
+ "dose_bucket": "NA",
88
+ "taper_days": null,
89
+ "monitoring_plan": null,
90
+ "evidence_query": null,
91
+ "new_drug_name": null,
92
+ "candidate_components": [],
93
+ "candidate_id": "cand_04",
94
+ "confidence": 0.74,
95
+ "rationale_brief": "Beam-search(3) top candidate."
96
+ },
97
+ "baseline_policy": "no_change_candidate",
98
+ "episodes": 8,
99
+ "avg_reward": 0.747,
100
+ "legality_rate": 1.0,
101
+ "success_rate": 0.0,
102
+ "policy_stack_ablations": {
103
+ "bandit-only": {
104
+ "avg_reward": 0.7616666666666667,
105
+ "legality_rate": 1.0,
106
+ "steps": 3.0
107
+ },
108
+ "llm-only": {
109
+ "avg_reward": 0.7753333333333333,
110
+ "legality_rate": 1.0,
111
+ "steps": 3.0
112
+ },
113
+ "llm+bandit": {
114
+ "avg_reward": 0.7753333333333333,
115
+ "legality_rate": 1.0,
116
+ "steps": 3.0
117
+ }
118
+ }
119
+ }
submission_bundle/grpo_training_cycle/docs_results/benchmark_report.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "offline_policy_eval": {
3
+ "avg_reward": 0.772833,
4
+ "legal_rate": 1.0,
5
+ "success_rate": 0.0
6
+ },
7
+ "safety_eval": {
8
+ "severe_violation_rate": 0.0,
9
+ "illegal_step_rate": 0.0
10
+ },
11
+ "dosing_eval": {
12
+ "target_attainment": 0.75,
13
+ "toxicity_avoidance": 1.0
14
+ },
15
+ "robustness_eval": {
16
+ "missing_labs_safety_rate": 0.666667,
17
+ "noisy_dose_info_safety_rate": 1.0,
18
+ "conflicting_meds_safety_rate": 1.0,
19
+ "alias_noise_safety_rate": 1.0,
20
+ "hidden_duplicate_detection_rate": 1.0,
21
+ "wrong_candidate_id_resilience": 1.0,
22
+ "stale_evidence_safety_rate": 1.0,
23
+ "delayed_ade_manifestation_safety_rate": 1.0
24
+ },
25
+ "calibration_eval": {
26
+ "ece_proxy": 0.08625
27
+ },
28
+ "abstention_eval": {
29
+ "appropriate_abstention_rate": 0.0
30
+ },
31
+ "process_eval": {
32
+ "process_fidelity": 0.92,
33
+ "avg_invalid_actions": 0.333333
34
+ },
35
+ "subgroup_eval": {
36
+ "renal_compromise": {
37
+ "avg_reward": 0.774,
38
+ "legal_rate": 1.0
39
+ },
40
+ "hepatic_compromise": {
41
+ "avg_reward": 0.779333,
42
+ "legal_rate": 1.0
43
+ },
44
+ "frail": {
45
+ "avg_reward": 0.781667,
46
+ "legal_rate": 1.0
47
+ }
48
+ },
49
+ "explainability_eval": {
50
+ "grounding_rate": 0.8
51
+ }
52
+ }
submission_bundle/grpo_training_cycle/docs_results/benchmark_report.txt ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "offline_policy_eval": {
3
+ "avg_reward": 0.772833,
4
+ "legal_rate": 1.0,
5
+ "success_rate": 0.0
6
+ },
7
+ "safety_eval": {
8
+ "severe_violation_rate": 0.0,
9
+ "illegal_step_rate": 0.0
10
+ },
11
+ "dosing_eval": {
12
+ "target_attainment": 0.75,
13
+ "toxicity_avoidance": 1.0
14
+ },
15
+ "robustness_eval": {
16
+ "missing_labs_safety_rate": 0.666667,
17
+ "noisy_dose_info_safety_rate": 1.0,
18
+ "conflicting_meds_safety_rate": 1.0,
19
+ "alias_noise_safety_rate": 1.0,
20
+ "hidden_duplicate_detection_rate": 1.0,
21
+ "wrong_candidate_id_resilience": 1.0,
22
+ "stale_evidence_safety_rate": 1.0,
23
+ "delayed_ade_manifestation_safety_rate": 1.0
24
+ },
25
+ "calibration_eval": {
26
+ "ece_proxy": 0.08625
27
+ },
28
+ "abstention_eval": {
29
+ "appropriate_abstention_rate": 0.0
30
+ },
31
+ "process_eval": {
32
+ "process_fidelity": 0.92,
33
+ "avg_invalid_actions": 0.333333
34
+ },
35
+ "subgroup_eval": {
36
+ "renal_compromise": {
37
+ "avg_reward": 0.774,
38
+ "legal_rate": 1.0
39
+ },
40
+ "hepatic_compromise": {
41
+ "avg_reward": 0.779333,
42
+ "legal_rate": 1.0
43
+ },
44
+ "frail": {
45
+ "avg_reward": 0.781667,
46
+ "legal_rate": 1.0
47
+ }
48
+ },
49
+ "explainability_eval": {
50
+ "grounding_rate": 0.8
51
+ }
52
+ }
submission_bundle/grpo_training_cycle/docs_results/dose_train.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "dataset_size": 120.0,
3
+ "status": "trained",
4
+ "train_mae": 0.0025,
5
+ "model_path": "outputs/models/dose_model.pkl"
6
+ }
submission_bundle/grpo_training_cycle/docs_results/dosing_grpo.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "avg_reward": 0.7785555555555557,
3
+ "legality_rate": 1.0,
4
+ "severe_violation_rate": 0.0,
5
+ "abstention_rate": 0.0,
6
+ "avg_episode_length": 2.0,
7
+ "success_rate": 0.0,
8
+ "avg_burden_delta": 0.0,
9
+ "avg_safety_delta": 0.5,
10
+ "avg_dosing_quality": 0.75,
11
+ "avg_process_fidelity": 0.9200000000000002,
12
+ "exploit_detection_count": 3.0,
13
+ "reward_columns": {
14
+ "format_compliance_score": 0.999,
15
+ "candidate_alignment_score": 0.999,
16
+ "legality_score": 0.999,
17
+ "safety_delta_score": 0.5,
18
+ "burden_improvement_score": 0.5,
19
+ "disease_stability_score": 0.9000000000000001,
20
+ "dosing_quality_score": 0.75,
21
+ "abstention_quality_score": 0.56,
22
+ "efficiency_score": 0.77,
23
+ "process_fidelity_score": 0.9200000000000002,
24
+ "explanation_grounding_score": 0.7999999999999999,
25
+ "anti_cheat_score": 0.6663333333333333,
26
+ "uncertainty_calibration_score": 0.87
27
+ }
28
+ }
submission_bundle/grpo_training_cycle/docs_results/frontier_ready.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "frontier_models": [
3
+ "qwen2.5:7b-instruct",
4
+ "qwen2.5:14b-instruct"
5
+ ],
6
+ "deployment_mode": "hf_or_vllm_ready",
7
+ "notes": "Baseline complete; ready for larger model sweep."
8
+ }
submission_bundle/grpo_training_cycle/docs_results/graph_train.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 180,
3
+ "status": "trained",
4
+ "model_path": "/Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl/outputs/models/graph_model.pkl"
5
+ }
submission_bundle/grpo_training_cycle/docs_results/grpo_ablation_report.json ADDED
@@ -0,0 +1,149 @@
+ {
+   "status": "ok",
+   "ablations": {
+     "bandit_only": {
+       "avg_reward": 0.779625,
+       "legality_rate": 1.0,
+       "severe_violation_rate": 0.0,
+       "abstention_rate": 0.0,
+       "avg_episode_length": 2.8125,
+       "success_rate": 0.0,
+       "avg_burden_delta": 0.0,
+       "avg_safety_delta": 0.483125,
+       "avg_dosing_quality": 0.75,
+       "avg_process_fidelity": 0.9056250000000008,
+       "exploit_detection_count": 2.0,
+       "timeout_rate": 0.0,
+       "failure_visible_rate": 0.0625,
+       "avg_invalid_actions": 0.0625,
+       "reward_columns": {
+         "format_compliance_score": 0.9989999999999996,
+         "candidate_alignment_score": 0.9989999999999996,
+         "legality_score": 0.9989999999999996,
+         "safety_delta_score": 0.483125,
+         "burden_improvement_score": 0.5,
+         "disease_stability_score": 0.8999999999999995,
+         "dosing_quality_score": 0.75,
+         "abstention_quality_score": 0.5600000000000002,
+         "efficiency_score": 0.5855625,
+         "process_fidelity_score": 0.9056250000000008,
+         "explanation_grounding_score": 0.8000000000000004,
+         "anti_cheat_score": 0.9366249999999997,
+         "uncertainty_calibration_score": 0.8531250000000004
+       },
+       "primary_reward_channels": {
+         "safety_legality": 0.9469062499999998,
+         "clinical_improvement": 0.6273749999999997,
+         "dosing_quality": 0.6550000000000001,
+         "process_integrity": 0.8225937500000001
+       },
+       "policy_stack": "bandit-only",
+       "failure_mining": {
+         "total_rows": 32,
+         "failure_rows": 2,
+         "top_failure_reasons": [
+           {
+             "reason": "repeated_action_loop",
+             "count": 2
+           }
+         ]
+       }
+     },
+     "llm_only": {
+       "avg_reward": 0.7723913043478261,
+       "legality_rate": 1.0,
+       "severe_violation_rate": 0.0,
+       "abstention_rate": 0.0,
+       "avg_episode_length": 1.9565217391304348,
+       "success_rate": 0.0,
+       "avg_burden_delta": 0.0,
+       "avg_safety_delta": 0.4882608695652174,
+       "avg_dosing_quality": 0.75,
+       "avg_process_fidelity": 0.9000000000000005,
+       "exploit_detection_count": 7.0,
+       "timeout_rate": 0.0,
+       "failure_visible_rate": 0.30434782608695654,
+       "avg_invalid_actions": 0.30434782608695654,
+       "reward_columns": {
+         "format_compliance_score": 0.9989999999999999,
+         "candidate_alignment_score": 0.9989999999999999,
+         "legality_score": 0.9989999999999999,
+         "safety_delta_score": 0.4882608695652174,
+         "burden_improvement_score": 0.5,
+         "disease_stability_score": 0.8999999999999998,
+         "dosing_quality_score": 0.75,
+         "abstention_quality_score": 0.5600000000000004,
+         "efficiency_score": 0.7027826086956522,
+         "process_fidelity_score": 0.9000000000000005,
+         "explanation_grounding_score": 0.8000000000000003,
+         "anti_cheat_score": 0.6952608695652175,
+         "uncertainty_calibration_score": 0.8482608695652176
+       },
+       "primary_reward_channels": {
+         "safety_legality": 0.8853478260869562,
+         "clinical_improvement": 0.6290869565217388,
+         "dosing_quality": 0.6549999999999998,
+         "process_integrity": 0.8504782608695656
+       },
+       "policy_stack": "llm-only",
+       "failure_mining": {
+         "total_rows": 23,
+         "failure_rows": 7,
+         "top_failure_reasons": [
+           {
+             "reason": "repeated_action_loop",
+             "count": 7
+           }
+         ]
+       }
+     },
+     "llm_bandit": {
+       "avg_reward": 0.7647391304347826,
+       "legality_rate": 1.0,
+       "severe_violation_rate": 0.0,
+       "abstention_rate": 0.0,
+       "avg_episode_length": 1.9565217391304348,
+       "success_rate": 0.0,
+       "avg_burden_delta": 0.0,
+       "avg_safety_delta": 0.48982608695652174,
+       "avg_dosing_quality": 0.717391304347826,
+       "avg_process_fidelity": 0.9000000000000005,
+       "exploit_detection_count": 7.0,
+       "timeout_rate": 0.0,
+       "failure_visible_rate": 0.30434782608695654,
+       "avg_invalid_actions": 0.30434782608695654,
+       "reward_columns": {
+         "format_compliance_score": 0.9989999999999999,
+         "candidate_alignment_score": 0.9989999999999999,
+         "legality_score": 0.9989999999999999,
+         "safety_delta_score": 0.48982608695652174,
+         "burden_improvement_score": 0.5043478260869565,
+         "disease_stability_score": 0.8582608695652173,
+         "dosing_quality_score": 0.717391304347826,
+         "abstention_quality_score": 0.5600000000000004,
+         "efficiency_score": 0.7027826086956522,
+         "process_fidelity_score": 0.9000000000000005,
+         "explanation_grounding_score": 0.8000000000000003,
+         "anti_cheat_score": 0.6952608695652175,
+         "uncertainty_calibration_score": 0.8126086956521739
+       },
+       "primary_reward_channels": {
+         "safety_legality": 0.8765217391304347,
+         "clinical_improvement": 0.6171739130434781,
+         "dosing_quality": 0.6386956521739129,
+         "process_integrity": 0.8504782608695656
+       },
+       "policy_stack": "llm+bandit",
+       "failure_mining": {
+         "total_rows": 23,
+         "failure_rows": 7,
+         "top_failure_reasons": [
+           {
+             "reason": "repeated_action_loop",
+             "count": 7
+           }
+         ]
+       }
+     }
+   }
+ }
submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "status": "ok",
+   "backend": "trl_transformers",
+   "model_id": "Qwen/Qwen2.5-0.5B-Instruct",
+   "records": 2000,
+   "prompts_path": "/app/data/processed/training_corpus_grpo_prompts.jsonl",
+   "reward_summary": {
+     "count": 4000,
+     "avg_reward": 0.782178,
+     "avg_reward_components": {
+       "format_compliance_score": 0.999,
+       "candidate_alignment_score": 0.999,
+       "legality_score": 0.985277,
+       "safety_delta_score": 0.496104,
+       "burden_improvement_score": 0.494346,
+       "disease_stability_score": 0.8912,
+       "dosing_quality_score": 0.511938,
+       "abstention_quality_score": 0.56,
+       "efficiency_score": 0.84942,
+       "process_fidelity_score": 0.905268,
+       "explanation_grounding_score": 0.800248,
+       "anti_cheat_score": 0.48004,
+       "uncertainty_calibration_score": 0.730195
+     },
+     "avg_primary_reward_channels": {
+       "safety_legality": 0.798661,
+       "clinical_improvement": 0.62689,
+       "dosing_quality": 0.535969,
+       "process_integrity": 0.888448
+     }
+   },
+   "reward_log": "/app/checkpoints/grpo_reward_components.jsonl",
+   "train_metrics": {
+     "train_runtime": 6960.8084,
+     "train_samples_per_second": 0.287,
+     "train_steps_per_second": 0.287,
+     "total_flos": 0.0,
+     "train_loss": 2.3633859725151752e-06
+   },
+   "artifact_path": "/app/checkpoints/grpo_adapter",
+   "unsloth_available": false
+ }
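
A quick way to sanity-check a run report like `grpo_trl_run.json` is to load it and rank the averaged reward components, which makes weak channels (here, `anti_cheat_score` at roughly 0.48) easy to spot. This is a minimal illustrative sketch, not part of the pipeline; the file path and the "three weakest components" cut-off are assumptions.

```python
import json
from pathlib import Path

# Assumed location once the submission bundle is unpacked locally.
REPORT = Path("submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run.json")

report = json.loads(REPORT.read_text())
summary = report["reward_summary"]

print(f"backend={report['backend']} model={report['model_id']}")
print(f"avg_reward={summary['avg_reward']:.3f} over {summary['count']} scored completions")

# Rank reward components ascending to surface the weakest channels.
components = sorted(summary["avg_reward_components"].items(), key=lambda kv: kv[1])
for name, value in components[:3]:
    print(f"  weakest: {name} = {value:.3f}")
```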
submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run_auto.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "status": "fallback",
+   "backend": "env_reward_fallback",
+   "model_id": "Qwen/Qwen2.5-1.5B-Instruct",
+   "records": 2,
+   "prompts_path": "/Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl/data/processed/training_corpus_grpo_prompts.jsonl",
+   "reward_summary": {
+     "count": 2,
+     "avg_reward": 0.798,
+     "avg_reward_components": {
+       "format_compliance_score": 0.999,
+       "candidate_alignment_score": 0.999,
+       "legality_score": 0.999,
+       "safety_delta_score": 0.671,
+       "burden_improvement_score": 0.525,
+       "disease_stability_score": 0.74,
+       "dosing_quality_score": 0.5,
+       "abstention_quality_score": 0.56,
+       "efficiency_score": 0.857,
+       "process_fidelity_score": 0.92,
+       "explanation_grounding_score": 0.8,
+       "anti_cheat_score": 0.5,
+       "uncertainty_calibration_score": 0.74
+     },
+     "avg_primary_reward_channels": {
+       "safety_legality": 0.8095,
+       "clinical_improvement": 0.645,
+       "dosing_quality": 0.53,
+       "process_integrity": 0.894
+     }
+   },
+   "reward_log": "/Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl/checkpoints/grpo_reward_components.jsonl",
+   "train_metrics": {
+     "steps_executed": 2.0
+   },
+   "artifact_path": "",
+   "unsloth_available": false,
+   "trl_runtime_error": "We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.\nCheck your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'."
+ }
submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run_fallback_check.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "status": "fallback",
+   "backend": "env_reward_fallback",
+   "model_id": "Qwen/Qwen2.5-1.5B-Instruct",
+   "records": 1,
+   "prompts_path": "/Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl/data/processed/training_corpus_grpo_prompts.jsonl",
+   "reward_summary": {
+     "count": 1,
+     "avg_reward": 0.764,
+     "avg_reward_components": {
+       "format_compliance_score": 0.999,
+       "candidate_alignment_score": 0.999,
+       "legality_score": 0.999,
+       "safety_delta_score": 0.5,
+       "burden_improvement_score": 0.5,
+       "disease_stability_score": 0.9,
+       "dosing_quality_score": 0.5,
+       "abstention_quality_score": 0.56,
+       "efficiency_score": 0.857,
+       "process_fidelity_score": 0.92,
+       "explanation_grounding_score": 0.8,
+       "anti_cheat_score": 0.001,
+       "uncertainty_calibration_score": 0.7
+     },
+     "avg_primary_reward_channels": {
+       "safety_legality": 0.675,
+       "clinical_improvement": 0.633,
+       "dosing_quality": 0.53,
+       "process_integrity": 0.894
+     }
+   },
+   "reward_log": "/Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl/checkpoints/grpo_reward_components.jsonl",
+   "train_metrics": {
+     "steps_executed": 1.0
+   },
+   "artifact_path": "",
+   "unsloth_available": false,
+   "trl_runtime_error": "forced_fallback"
+ }
submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run_smoke.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "status": "fallback",
+   "backend": "env_reward_fallback",
+   "model_id": "Qwen/Qwen2.5-1.5B-Instruct",
+   "records": 1,
+   "prompts_path": "/Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl/data/processed/training_corpus_grpo_prompts.jsonl",
+   "reward_summary": {
+     "count": 1,
+     "avg_reward": 0.764,
+     "avg_reward_components": {
+       "format_compliance_score": 0.999,
+       "candidate_alignment_score": 0.999,
+       "legality_score": 0.999,
+       "safety_delta_score": 0.5,
+       "burden_improvement_score": 0.5,
+       "disease_stability_score": 0.9,
+       "dosing_quality_score": 0.5,
+       "abstention_quality_score": 0.56,
+       "efficiency_score": 0.857,
+       "process_fidelity_score": 0.92,
+       "explanation_grounding_score": 0.8,
+       "anti_cheat_score": 0.001,
+       "uncertainty_calibration_score": 0.7
+     },
+     "avg_primary_reward_channels": {
+       "safety_legality": 0.675,
+       "clinical_improvement": 0.633,
+       "dosing_quality": 0.53,
+       "process_integrity": 0.894
+     }
+   },
+   "reward_log": "/Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl/checkpoints/grpo_reward_components.jsonl",
+   "train_metrics": {
+     "steps_executed": 1.0
+   },
+   "artifact_path": "",
+   "unsloth_available": false,
+   "trl_runtime_error": "forced_fallback"
+ }
submission_bundle/grpo_training_cycle/docs_results/grpo_trl_run_strict_check.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "status": "fallback",
+   "backend": "env_reward_fallback",
+   "model_id": "Qwen/Qwen2.5-1.5B-Instruct",
+   "records": 1,
+   "prompts_path": "/Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl/data/processed/training_corpus_grpo_prompts.jsonl",
+   "reward_summary": {
+     "count": 1,
+     "avg_reward": 0.764,
+     "avg_reward_components": {
+       "format_compliance_score": 0.999,
+       "candidate_alignment_score": 0.999,
+       "legality_score": 0.999,
+       "safety_delta_score": 0.5,
+       "burden_improvement_score": 0.5,
+       "disease_stability_score": 0.9,
+       "dosing_quality_score": 0.5,
+       "abstention_quality_score": 0.56,
+       "efficiency_score": 0.857,
+       "process_fidelity_score": 0.92,
+       "explanation_grounding_score": 0.8,
+       "anti_cheat_score": 0.001,
+       "uncertainty_calibration_score": 0.7
+     },
+     "avg_primary_reward_channels": {
+       "safety_legality": 0.675,
+       "clinical_improvement": 0.633,
+       "dosing_quality": 0.53,
+       "process_integrity": 0.894
+     }
+   },
+   "reward_log": "/Users/daver/Desktop/Meta_Pytorch_OpenEnv_Scaler/polyguard-rl/checkpoints/grpo_reward_components.jsonl",
+   "train_metrics": {
+     "steps_executed": 1.0
+   },
+   "artifact_path": "",
+   "unsloth_available": false,
+   "trl_runtime_error": "forced_fallback"
+ }
submission_bundle/grpo_training_cycle/docs_results/hf_space_verification.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "passed": true,
+   "status": "running",
+   "checked_on": "2026-04-26",
+   "repo_id": "TheJackBright/polyguard-openenv",
+   "space_url": "https://huggingface.co/spaces/TheJackBright/polyguard-openenv",
+   "runtime_url": "https://thejackbright-polyguard-openenv.hf.space",
+   "space_sha": "877add7878fbdf2011ed3d5d378cdca5fe7bac4b",
+   "space_private": false,
+   "runtime": {
+     "stage": "RUNNING",
+     "hardware": {
+       "current": "cpu-basic",
+       "requested": "cpu-basic"
+     },
+     "replicas": {
+       "current": 1,
+       "requested": 1
+     },
+     "domain": "thejackbright-polyguard-openenv.hf.space"
+   },
+   "openenv_validation": {
+     "command": "uv run openenv validate --url https://thejackbright-polyguard-openenv.hf.space",
+     "passed": true,
+     "passed_count": 6,
+     "total_count": 6,
+     "failed_criteria": []
+   }
+ }
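
The verification artifact above can be reproduced by checking the Space runtime and re-running the recorded `openenv validate` command. A minimal sketch follows; it assumes the installed `huggingface_hub` version exposes `HfApi.get_space_runtime`, and simply shells out to the same CLI invocation stored in the artifact.

```python
import subprocess
from huggingface_hub import HfApi

REPO_ID = "TheJackBright/polyguard-openenv"
RUNTIME_URL = "https://thejackbright-polyguard-openenv.hf.space"

# Confirm the Space is up; expected stage is RUNNING on cpu-basic hardware.
runtime = HfApi().get_space_runtime(REPO_ID)
print("stage:", runtime.stage)

# Re-run the validation command recorded in hf_space_verification.json.
subprocess.run(
    ["uv", "run", "openenv", "validate", "--url", RUNTIME_URL],
    check=True,
)
```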
submission_bundle/grpo_training_cycle/docs_results/hf_training_status.json ADDED
@@ -0,0 +1,123 @@
+ {
+   "status": "running",
+   "started_at": 1777161126.3536248,
+   "finished_at": null,
+   "commands": [
+     {
+       "args": [
+         "python",
+         "scripts/bootstrap_data.py"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 0.821
+     },
+     {
+       "args": [
+         "python",
+         "scripts/build_training_corpus.py",
+         "--profile",
+         "massive",
+         "--with-local",
+         "--with-synthetic",
+         "--with-hf"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 4.367
+     },
+     {
+       "args": [
+         "python",
+         "scripts/train_sft_trl.py",
+         "--model-id",
+         "Qwen/Qwen2.5-0.5B-Instruct",
+         "--dataset-path",
+         "data/processed/training_corpus_sft.json",
+         "--epochs",
+         "1",
+         "--max-steps",
+         "20",
+         "--batch-size",
+         "2",
+         "--max-seq-len",
+         "512",
+         "--use-unsloth"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 24.564
+     },
+     {
+       "args": [
+         "reuse_artifact",
+         "grpo_adapter",
+         "/app/checkpoints/grpo_adapter"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 0.0
+     },
+     {
+       "args": [
+         "python",
+         "scripts/merge_adapters_safe.py",
+         "--adapter-dir",
+         "checkpoints/sft_adapter",
+         "--output-dir",
+         "checkpoints/merged"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 9.014
+     },
+     {
+       "args": [
+         "python",
+         "scripts/test_inference_postsave.py",
+         "--samples",
+         "3",
+         "--base-model",
+         "Qwen/Qwen2.5-0.5B-Instruct"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 14.811
+     },
+     {
+       "args": [
+         "python",
+         "scripts/evaluate_policy_ablations.py",
+         "--episodes",
+         "8"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 4.458
+     },
+     {
+       "args": [
+         "python",
+         "scripts/evaluate_baselines.py"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 4.603
+     },
+     {
+       "args": [
+         "python",
+         "scripts/evaluate_all.py"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 4.271
+     },
+     {
+       "args": [
+         "python",
+         "scripts/evaluate_compare_runs.py",
+         "--baseline",
+         "outputs/reports/baselines.json",
+         "--candidate",
+         "outputs/reports/benchmark_report.json",
+         "--output",
+         "outputs/reports/improvement_report.json"
+       ],
+       "returncode": 0,
+       "elapsed_seconds": 0.037
+     }
+   ],
+   "artifact_repo_id": "TheJackBright/polyguard-openenv-training-artifacts"
+ }
submission_bundle/grpo_training_cycle/docs_results/improvement_report.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "status": "ok",
+   "baseline": "outputs/reports/baselines.json",
+   "candidate": "outputs/reports/benchmark_report.json",
+   "deltas": {
+     "avg_reward": 0.025833,
+     "legality_rate": 0.0,
+     "success_rate": 0.0,
+     "avg_process_fidelity": 0.92,
+     "timeout_rate": 0.0,
+     "failure_visible_rate": 0.0
+   },
+   "gate": {
+     "avg_reward_up": true,
+     "legality_up": true,
+     "success_up": true
+   },
+   "improved": true
+ }
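
The `gate` and `improved` fields in this report, together with the benchmark variant below, suggest an all-checks-must-pass rule over the deltas. The sketch below re-derives that verdict from a report file; the exact thresholds (strictly positive `avg_reward` delta, non-negative legality and success deltas) are inferred from these two artifacts, and the authoritative logic lives in scripts/evaluate_compare_runs.py.

```python
import json

def improvement_gate(report_path: str) -> bool:
    """Recompute the improvement verdict from an improvement report's deltas.

    Thresholds here are an assumption inferred from the bundled reports; the
    pipeline's own gate is defined in scripts/evaluate_compare_runs.py.
    """
    deltas = json.load(open(report_path))["deltas"]
    gate = {
        "avg_reward_up": deltas["avg_reward"] > 0,
        "legality_up": deltas["legality_rate"] >= 0,
        "success_up": deltas["success_rate"] >= 0,
    }
    return all(gate.values())
```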
submission_bundle/grpo_training_cycle/docs_results/improvement_report_benchmark.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "status": "ok",
+   "baseline": "outputs/reports/baselines.json",
+   "candidate": "outputs/reports/benchmark_report.json",
+   "deltas": {
+     "avg_reward": -0.0025,
+     "legality_rate": 0.0,
+     "success_rate": 0.0,
+     "avg_process_fidelity": 0.92,
+     "timeout_rate": 0.0,
+     "failure_visible_rate": 0.0
+   },
+   "gate": {
+     "avg_reward_up": false,
+     "legality_up": true,
+     "success_up": true
+   },
+   "improved": false
+ }
submission_bundle/grpo_training_cycle/docs_results/inference_benchmark.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "status": "ok",
+   "runs": [
+     {
+       "run": 0,
+       "provider": "transformers",
+       "candidate_id": "cand_04",
+       "latency_ms": 2313.731,
+       "rationale": "Transformers fallback selected cand_04 via local ranker."
+     },
+     {
+       "run": 1,
+       "provider": "transformers",
+       "candidate_id": "cand_02",
+       "latency_ms": 0.012,
+       "rationale": "Transformers fallback selected cand_02 via local ranker."
+     }
+   ],
+   "avg_latency_ms": 1156.872,
+   "provider_requested": "transformers",
+   "model": "Qwen/Qwen2.5-0.5B-Instruct"
+ }
submission_bundle/grpo_training_cycle/docs_results/legality_rate.png ADDED
submission_bundle/grpo_training_cycle/docs_results/planner_grpo.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "avg_reward": 0.77625,
+   "legality_rate": 1.0,
+   "severe_violation_rate": 0.0,
+   "abstention_rate": 0.0,
+   "avg_episode_length": 2.0,
+   "success_rate": 0.0,
+   "avg_burden_delta": 0.0,
+   "avg_safety_delta": 0.5,
+   "avg_dosing_quality": 0.75,
+   "avg_process_fidelity": 0.92,
+   "exploit_detection_count": 4.0,
+   "reward_columns": {
+     "format_compliance_score": 0.9990000000000001,
+     "candidate_alignment_score": 0.9990000000000001,
+     "legality_score": 0.9990000000000001,
+     "safety_delta_score": 0.5,
+     "burden_improvement_score": 0.5,
+     "disease_stability_score": 0.9000000000000002,
+     "dosing_quality_score": 0.75,
+     "abstention_quality_score": 0.5600000000000002,
+     "efficiency_score": 0.73,
+     "process_fidelity_score": 0.92,
+     "explanation_grounding_score": 0.7999999999999999,
+     "anti_cheat_score": 0.6663333333333333,
+     "uncertainty_calibration_score": 0.8699999999999998
+   }
+ }
submission_bundle/grpo_training_cycle/docs_results/plot_index.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "plots": [
+     "/app/outputs/plots/avg_reward.png",
+     "/app/outputs/plots/legality_rate.png",
+     "/app/outputs/plots/success_rate.png",
+     "/app/outputs/plots/avg_process_fidelity.png",
+     "/app/outputs/plots/policy_stack_avg_reward.png"
+   ]
+ }
submission_bundle/grpo_training_cycle/docs_results/policy_stack_avg_reward.png ADDED
submission_bundle/grpo_training_cycle/docs_results/postsave_inference.json ADDED
@@ -0,0 +1,43 @@
+ {
+   "status": "ok",
+   "model_source": "merged",
+   "model_load_error": "",
+   "samples": 3,
+   "valid_rate": 1.0,
+   "avg_env_reward": 0.712333,
+   "results": [
+     {
+       "idx": 0,
+       "prompt": "{\"instruction\": \"Choose one candidate_id and justify briefly.\", \"patient_id\": \"patient_8000\", \"candidate_ids\": [\"cand_01\", \"cand_02\", \"cand_03\", \"cand_04\", \"cand_05\", \"cand_08\", \"cand_09\", \"cand_10\"], \"format\": \"candidate_id=<cand_xx>; rationale=<text>\"}",
+       "completion": "}\n\nTo choose a candidate ID, I will follow these steps:\n\n1. **Review the list of candidates**: The provided list includes six candidate IDs: `cand_01`, `cand_02`, `cand_03`, `cand_04`, `cand_05`, `cand_08`, `cand_09`, and",
+       "candidate_id": "cand_01",
+       "selected_candidate": "cand_01",
+       "env_reward": 0.717,
+       "done": true,
+       "valid": true,
+       "termination_reason": "exploit_detection"
+     },
+     {
+       "idx": 1,
+       "prompt": "{\"instruction\": \"Choose one candidate_id and justify briefly.\", \"patient_id\": \"patient_8001\", \"candidate_ids\": [\"cand_01\", \"cand_02\", \"cand_03\", \"cand_05\", \"cand_08\", \"cand_09\", \"cand_10\"], \"format\": \"candidate_id=<cand_xx>; rationale=<text>\"}",
+       "completion": "}\n\nSure, I can help you choose a candidate ID based on the information provided. However, I need to know which of the given candidates is your patient ID so that I can provide a suitable rationale for choosing that specific candidate.\n\nPlease let me know which candidate ID you are referring to, and I will proceed with selecting the appropriate candidate ID and providing the rationale accordingly. If you don't have a",
+       "candidate_id": null,
+       "selected_candidate": "cand_01",
+       "env_reward": 0.703,
+       "done": true,
+       "valid": true,
+       "termination_reason": "exploit_detection"
+     },
+     {
+       "idx": 2,
+       "prompt": "{\"instruction\": \"Choose one candidate_id and justify briefly.\", \"patient_id\": \"patient_8002\", \"candidate_ids\": [\"cand_07\", \"cand_01\", \"cand_02\", \"cand_03\", \"cand_04\", \"cand_08\", \"cand_09\", \"cand_10\"], \"format\": \"candidate_id=<cand_xx>; rationale=<text>\"}",
+       "completion": "}\n\nTo choose a candidate ID for the patient with the identifier `patient_8002`, I will follow these steps:\n\n1. **Identify the Patient Identifier**: The patient's identifier is `patient_8002`.\n\n2. **List of Candidate IDs**: The list of candidate IDs provided is:\n - cand_07\n - cand_01\n -",
+       "candidate_id": "cand_07",
+       "selected_candidate": "cand_01",
+       "env_reward": 0.717,
+       "done": true,
+       "valid": true,
+       "termination_reason": "exploit_detection"
+     }
+   ]
+ }
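
The post-save prompts request answers in the form "candidate_id=<cand_xx>; rationale=<text>", yet the sampled completions above rarely comply, so some fallback extraction is clearly needed. The sketch below is an illustrative parser (the regex and loose fallback are assumptions, not the project's actual extraction code).

```python
import re
from typing import Optional

CANDIDATE_RE = re.compile(r"candidate_id\s*=\s*(cand_\d+)")

def extract_candidate(completion: str) -> Optional[str]:
    """Pull a cand_xx identifier out of a model completion.

    First try the requested 'candidate_id=<cand_xx>' format, then fall back
    to the first bare cand_xx token anywhere in the text.
    """
    match = CANDIDATE_RE.search(completion)
    if match:
        return match.group(1)
    loose = re.search(r"cand_\d+", completion)
    return loose.group(0) if loose else None
```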
submission_bundle/grpo_training_cycle/docs_results/risk_train.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "dataset_size": 180.0,
+   "status": "trained",
+   "train_mae": 0.0033,
+   "model_path": "outputs/models/tabular_risk.pkl"
+ }