Deploy PolyGuard workbench from master
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +15 -0
- .gitignore +2 -0
- Dockerfile +3 -3
- Dockerfile.space +3 -2
- README.md +26 -6
- README_HF_SPACE.md +0 -12
- app/ui/frontend/dist/assets/index-DV0STDGE.css +0 -1
- app/ui/frontend/dist/assets/index-DgY-oaWG.js +0 -0
- app/ui/frontend/dist/index.html +0 -13
- checkpoints/README.md +0 -23
- docker/space/README.md +5 -13
- docs/DEMO_RECORDING_SCRIPT.md +2 -2
- app/ui/frontend/dist/blackhole.webm → docs/UI Images/1.jpeg +2 -2
- docs/UI Images/2.jpeg +3 -0
- docs/UI Images/3.jpeg +3 -0
- docs/UI Images/4.jpeg +3 -0
- docs/UI Images/5.jpeg +3 -0
- docs/assets/diagrams/data_training_pipeline.png +3 -0
- docs/assets/diagrams/deployment_topology.png +3 -0
- docs/assets/diagrams/episode_state_machine.png +0 -0
- docs/assets/diagrams/evidence_generation_flow.png +0 -0
- docs/assets/diagrams/frontend_runtime_surface.png +3 -0
- docs/assets/diagrams/multi_agent_orchestration.png +3 -0
- docs/assets/diagrams/reward_decomposition.png +3 -0
- docs/assets/diagrams/runtime_step_flow.png +0 -0
- docs/assets/diagrams/system_architecture.png +3 -0
- docs/deployment.md +3 -3
- docs/evaluation.md +3 -1
- docs/final_submission_audit.md +0 -42
- docs/hf_blog_draft.md +0 -17
- docs/mathematics.md +1045 -0
- docs/participant_guide_traceability.md +1 -1
- docs/results/README.md +27 -13
- docs/results/anti_cheat_failure_rates.png +0 -0
- docs/results/final_submission_evidence/README.md +77 -0
- docs/results/final_submission_evidence/charts/all/anti_cheat_failure_rates.png +0 -0
- docs/results/final_submission_evidence/charts/all/avg_reward.png +0 -0
- docs/results/final_submission_evidence/charts/all/basic_llm_vs_full_pipeline_latency.png +0 -0
- docs/results/final_submission_evidence/charts/all/basic_llm_vs_full_pipeline_legality.png +0 -0
- docs/results/final_submission_evidence/charts/all/basic_llm_vs_full_pipeline_reward.png +0 -0
- docs/results/final_submission_evidence/charts/all/basic_llm_vs_full_pipeline_reward_delta_by_seed.png +0 -0
- docs/results/final_submission_evidence/charts/all/grpo_reward_curves.png +0 -0
- docs/results/final_submission_evidence/charts/all/inference_latency_validity.png +0 -0
- docs/results/final_submission_evidence/charts/all/inference_validity_reward.png +0 -0
- docs/results/final_submission_evidence/charts/all/legality_rate.png +0 -0
- docs/results/final_submission_evidence/charts/all/policy_ablation_avg_reward.png +0 -0
- docs/results/final_submission_evidence/charts/all/policy_ablation_exploit_detection.png +0 -0
- docs/results/final_submission_evidence/charts/all/policy_ablation_legality.png +0 -0
- docs/results/final_submission_evidence/charts/all/policy_stack_avg_reward.png +0 -0
- docs/results/final_submission_evidence/charts/all/primary_reward_channel_bars.png +0 -0
.gitattributes
CHANGED
|
@@ -42,3 +42,18 @@ docs/results/submission_evidence/qwen_0_5b_1_5b_3b/reward_component_bars.png fil
|
|
| 42 |
docs/results/submission_evidence_qwen_0_5b_1_5b/charts/generated/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
docs/results/submission_evidence_qwen_0_5b_1_5b_3b/charts/generated/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
submission_bundle/qwen_completed_runs/charts/generated/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
docs/results/submission_evidence_qwen_0_5b_1_5b/charts/generated/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
docs/results/submission_evidence_qwen_0_5b_1_5b_3b/charts/generated/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
submission_bundle/qwen_completed_runs/charts/generated/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
docs/UI[[:space:]]Images/1.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
docs/UI[[:space:]]Images/2.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
docs/UI[[:space:]]Images/3.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
docs/UI[[:space:]]Images/4.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
docs/UI[[:space:]]Images/5.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
docs/assets/diagrams/data_training_pipeline.png filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
docs/assets/diagrams/deployment_topology.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
docs/assets/diagrams/frontend_runtime_surface.png filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
docs/assets/diagrams/multi_agent_orchestration.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
docs/assets/diagrams/reward_decomposition.png filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
docs/assets/diagrams/system_architecture.png filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
docs/results/final_submission_evidence/charts/curated/reward_and_safety/reward_component_bars.png filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
docs/results/final_submission_evidence/charts/curated/training/qwen_3b_grpo_reward_curve.png filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
docs/results/final_submission_evidence/charts/frontpage/04_reward_components.png filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
docs/results/final_submission_evidence/charts/frontpage/09_qwen_3b_grpo_reward_curve.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
|
@@ -27,3 +27,5 @@ data/retrieval_index/*
|
|
| 27 |
!data/**/.gitkeep
|
| 28 |
app/ui/frontend/.vite/
|
| 29 |
/demo.md
|
|
|
|
|
|
|
|
|
| 27 |
!data/**/.gitkeep
|
| 28 |
app/ui/frontend/.vite/
|
| 29 |
/demo.md
|
| 30 |
+
docs/hf_blog_draft.md
|
| 31 |
+
docs/submission_gap_review.md
|
Dockerfile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
-
# Hugging Face Space:
|
| 2 |
-
# Build: docker build -t polyguard-space .
|
| 3 |
-
#
|
| 4 |
|
| 5 |
FROM node:20-bookworm-slim AS frontend
|
| 6 |
WORKDIR /build
|
|
|
|
| 1 |
+
# Hugging Face Space: single-port edge (nginx) + OpenEnv (8100) + API (8200) + static UI.
|
| 2 |
+
# Build from repository root: docker build -f Dockerfile.space -t polyguard-space .
|
| 3 |
+
# Cheap tier: use Space "CPU basic"; first boot downloads ~1.1GB model bundle.
|
| 4 |
|
| 5 |
FROM node:20-bookworm-slim AS frontend
|
| 6 |
WORKDIR /build
|
Dockerfile.space
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
-
#
|
| 2 |
-
#
|
|
|
|
| 3 |
|
| 4 |
FROM node:20-bookworm-slim AS frontend
|
| 5 |
WORKDIR /build
|
|
|
|
| 1 |
+
# Hugging Face Space: single-port edge (nginx) + OpenEnv (8100) + API (8200) + static UI.
|
| 2 |
+
# Build from repository root: docker build -f Dockerfile.space -t polyguard-space .
|
| 3 |
+
# Cheap tier: use Space "CPU basic"; first boot downloads ~1.1GB model bundle.
|
| 4 |
|
| 5 |
FROM node:20-bookworm-slim AS frontend
|
| 6 |
WORKDIR /build
|
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: PolyGuard OpenEnv
|
| 3 |
colorFrom: blue
|
| 4 |
colorTo: green
|
| 5 |
sdk: docker
|
|
@@ -14,10 +14,20 @@ Run all CLI commands from this directory (`cd polyguard-rl`). The repository roo
|
|
| 14 |
## Submission Links
|
| 15 |
|
| 16 |
- GitHub Repo URL: [https://github.com/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK](https://github.com/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK)
|
| 17 |
-
- HF Space URL: [https://huggingface.co/spaces/TheJackBright/polyguard-openenv](https://huggingface.co/spaces/TheJackBright/polyguard-openenv)
|
| 18 |
- Colab Notebook URL: [https://colab.research.google.com/github/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK/blob/master/polyguard-rl/PolyGuard_SFT_GRPO_One_Run_Runner.ipynb](https://colab.research.google.com/github/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK/blob/master/polyguard-rl/PolyGuard_SFT_GRPO_One_Run_Runner.ipynb) (see also `notebooks/09_training_loop.ipynb` for a modular training walkthrough)
|
| 19 |
-
- YouTube Video URL: not used for this submission;
|
| 20 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
## Problem Statement
|
| 23 |
|
|
@@ -41,8 +51,18 @@ Thirteen verifier-backed reward components roll up into four primary channels (`
|
|
| 41 |
|
| 42 |
## Training And Post-Training Strategy
|
| 43 |
|
| 44 |
-
Build corpora (`scripts/bootstrap_data.py`, `scripts/build_training_corpus.py`), SFT with TRL (`scripts/train_sft_trl.py`), GRPO with environment reward (`scripts/train_grpo_trl.py`), merge adapters (`scripts/merge_adapters_safe.py`), validate inference (`scripts/test_inference_postsave.py`), evaluate and plot (`scripts/evaluate_*.py`, `docs/results/`). Optional HF GPU training
|
| 45 |
|
| 46 |
## Documentation index
|
| 47 |
|
| 48 |
-
- [Architecture](docs/architecture.md)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: PolyGuard OpenEnv Workbench
|
| 3 |
colorFrom: blue
|
| 4 |
colorTo: green
|
| 5 |
sdk: docker
|
|
|
|
| 14 |
## Submission Links
|
| 15 |
|
| 16 |
- GitHub Repo URL: [https://github.com/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK](https://github.com/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK)
|
| 17 |
+
- HF Space URL: [https://huggingface.co/spaces/TheJackBright/polyguard-openenv-workbench](https://huggingface.co/spaces/TheJackBright/polyguard-openenv-workbench)
|
| 18 |
- Colab Notebook URL: [https://colab.research.google.com/github/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK/blob/master/polyguard-rl/PolyGuard_SFT_GRPO_One_Run_Runner.ipynb](https://colab.research.google.com/github/Vishwa-docs/Meta_Pytorch_OpenEnv_Scaler_VK/blob/master/polyguard-rl/PolyGuard_SFT_GRPO_One_Run_Runner.ipynb) (see also `notebooks/09_training_loop.ipynb` for a modular training walkthrough)
|
| 19 |
+
- YouTube Video URL: not used for this submission; the repository root README is the story artifact.
|
| 20 |
+
- Story artifact: the repository root [`README.md`](../README.md) is the final blog-style narrative and evidence map.
|
| 21 |
+
|
| 22 |
+
## Shared Environment, Logs, And Scripts
|
| 23 |
+
|
| 24 |
+
The required environment files, training logs, and training scripts are shared
|
| 25 |
+
in the repo and indexed in [Submission Artifact Index](docs/submission_artifacts.md).
|
| 26 |
+
|
| 27 |
+
- Environment/runtime: `openenv.yaml`, `pyproject.toml`, `uv.lock`, `requirements*.txt`, `Dockerfile*`, `app/env/`, `server/app.py`, and `app/hf_space/Dockerfile`.
|
| 28 |
+
- Training scripts/notebooks: `PolyGuard_SFT_GRPO_One_Run_Runner.ipynb`, `notebooks/09_training_loop.ipynb`, `scripts/train_sft_trl.py`, `scripts/train_grpo_trl.py`, `scripts/deploy_training_space.py`, `app/hf_space/training_runner.py`, and `app/training/`.
|
| 29 |
+
- Training logs/results: `docs/results/final_submission_evidence/reports/`, `docs/results/sweeps/`, `docs/results/submission_evidence_qwen_0_5b_1_5b_3b/reports/`, and `docs/results/qwen_completed_runs/reports/`.
|
| 30 |
+
- Final downloadable artifact Space: [https://huggingface.co/spaces/adithya9903/polyguard-openenv-final-artifacts](https://huggingface.co/spaces/adithya9903/polyguard-openenv-final-artifacts).
|
| 31 |
|
| 32 |
## Problem Statement
|
| 33 |
|
|
|
|
| 51 |
|
| 52 |
## Training And Post-Training Strategy
|
| 53 |
|
| 54 |
+
Build corpora (`scripts/bootstrap_data.py`, `scripts/build_training_corpus.py`), SFT with TRL (`scripts/train_sft_trl.py`), GRPO with environment reward (`scripts/train_grpo_trl.py`), merge adapters (`scripts/merge_adapters_safe.py`), validate inference (`scripts/test_inference_postsave.py`), evaluate and plot (`scripts/evaluate_*.py`, `docs/results/`). Optional HF GPU training uses `scripts/deploy_training_space.py`; public review should start with the repository root [`README.md`](../README.md), then `docs/training.md` for implementation notes.
|
| 55 |
|
| 56 |
## Documentation index
|
| 57 |
|
| 58 |
+
- [Architecture](docs/architecture.md)
|
| 59 |
+
- [Environment](docs/environment_design.md)
|
| 60 |
+
- [Rewards](docs/reward_design.md)
|
| 61 |
+
- [Training](docs/training.md)
|
| 62 |
+
- [Evaluation](docs/evaluation.md)
|
| 63 |
+
- [Deployment](docs/deployment.md)
|
| 64 |
+
- [Datasets](docs/datasets.md)
|
| 65 |
+
- [Participant guide traceability](docs/participant_guide_traceability.md)
|
| 66 |
+
- [Idea doc vs implementation](docs/idea_document_traceability.md)
|
| 67 |
+
- [Submission artifact index](docs/submission_artifacts.md)
|
| 68 |
+
- [**Space UI demo script**](docs/DEMO_RECORDING_SCRIPT.md)
|
README_HF_SPACE.md
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: PolyGuard OpenEnv
|
| 3 |
-
emoji: 🛡️
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: purple
|
| 6 |
-
sdk: docker
|
| 7 |
-
app_port: 7860
|
| 8 |
-
pinned: false
|
| 9 |
-
license: mit
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Full-stack **PolyGuard** workbench: OpenEnv (WebSocket), FastAPI, and React UI behind nginx on `PORT`. Uses **CPU basic**; first cold start downloads the public [usable model bundle](https://huggingface.co/TheJackBright/polyguard-openenv-training-full-artifacts/tree/main/usable_model_bundles/local-qwen-0-5b-active-smoke) (~1.1 GB). See `docker/space/README.md` for details.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/ui/frontend/dist/assets/index-DV0STDGE.css
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
@import"https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:wght@400;500;600;700&family=JetBrains+Mono:wght@500;700&family=Space+Grotesk:wght@500;600;700&display=swap";:root{--bg: #03030b;--surface: rgba(13, 16, 35, .62);--surface-2: rgba(19, 24, 51, .58);--surface-3: rgba(35, 26, 72, .68);--ink: #f6f7ff;--muted: #a6a9c8;--line: rgba(197, 187, 255, .22);--line-soft: rgba(189, 178, 255, .14);--accent: #9b7cff;--accent-2: #28e8ff;--accent-3: #ff4fd8;--warning: #d29922;--critical: #f85149;--glass: rgba(8, 11, 25, .58);--shadow: 0 24px 80px rgba(0, 0, 0, .42), inset 0 1px 0 rgba(255, 255, 255, .08);--glow: 0 0 34px rgba(155, 124, 255, .22), 0 0 64px rgba(40, 232, 255, .08);color-scheme:dark}*{box-sizing:border-box}html,body,#root{margin:0;min-height:100%;background:var(--bg);color:var(--ink);font-family:IBM Plex Sans,system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,sans-serif}body{min-width:320px;overflow-x:hidden;background:radial-gradient(circle at 50% -10%,rgba(106,68,255,.28),transparent 34rem),radial-gradient(circle at 85% 12%,rgba(255,79,216,.12),transparent 30rem),#02020a}button,select,input{min-height:40px;border:1px solid var(--line);border-radius:14px;background:#080b1bc7;color:var(--ink);font:inherit}button{width:auto;padding:9px 14px;background:linear-gradient(180deg,rgba(255,255,255,.22),transparent),linear-gradient(135deg,var(--accent),var(--accent-2));border-color:transparent;color:#030414;font-weight:700;cursor:pointer;box-shadow:0 10px 30px #5b5cff52,inset 0 0 18px #ffffff2e;transition:background .14s ease,border-color .14s ease,box-shadow .14s ease,transform .12s ease}button:hover:not(:disabled){background:linear-gradient(180deg,rgba(255,255,255,.28),transparent),linear-gradient(135deg,#b49bff,#5ef5ff);box-shadow:0 14px 44px #28e8ff42,inset 0 0 22px #ffffff38;transform:translateY(-1px)}button.secondary,.mode-toggle button{background:#9b7cff1f;border-color:#9b7cff4d;color:var(--accent);box-shadow:inset 0 0 16px 
#bf97ff1f}button.secondary:hover:not(:disabled),.mode-toggle button:hover:not(:disabled){background:#9b7cff33}button:disabled{cursor:not-allowed;opacity:.48;transform:none}select,input{width:100%;padding:8px 11px;-webkit-backdrop-filter:blur(12px);backdrop-filter:blur(12px)}select{color-scheme:dark}select:focus,input:focus,button:focus{outline:2px solid rgba(40,232,255,.38);outline-offset:2px}pre{margin:0;max-height:260px;overflow:auto;font-family:JetBrains Mono,ui-monospace,SFMono-Regular,Menlo,Monaco,monospace;font-size:.76rem;line-height:1.55;white-space:pre-wrap;word-break:break-word}table{width:100%;border-collapse:collapse}th,td{padding:8px 10px;border-bottom:1px solid var(--line-soft);text-align:left;font-size:.84rem}.workbench-shell{position:relative;min-height:100vh;isolation:isolate;overflow:hidden;padding:20px;background:linear-gradient(180deg,#090b2338,#03030be0 44rem),var(--bg)}.workbench-container{position:relative;z-index:2;width:min(1440px,100%);margin:0 auto}.metaverse-backdrop{position:fixed;top:0;right:0;bottom:0;left:0;z-index:0;overflow:hidden;pointer-events:none}.blackhole-video{position:absolute;top:-32vh;left:50%;width:min(1300px,148vw);min-width:760px;height:74vh;opacity:.78;mix-blend-mode:screen;object-fit:cover;transform:translate(-50%) rotate(180deg);filter:saturate(1.18) contrast(1.08)}.stars-canvas{position:absolute;top:0;right:0;bottom:0;left:0;z-index:1;opacity:.86}.stars-canvas canvas{display:block}.nebula-orb{position:absolute;border-radius:999px;filter:blur(18px);mix-blend-mode:screen}.orb-one{right:-8rem;top:14rem;width:28rem;height:28rem;background:radial-gradient(circle,rgba(255,79,216,.24),transparent 68%)}.orb-two{left:-10rem;bottom:0;width:34rem;height:34rem;background:radial-gradient(circle,rgba(40,232,255,.18),transparent 70%)}.nebula-grid{position:absolute;top:0;right:0;bottom:0;left:0;background-image:linear-gradient(rgba(255,255,255,.035) 1px,transparent 1px),linear-gradient(90deg,rgba(255,255,255,.035) 1px,transparent 
1px);background-size:72px 72px;-webkit-mask-image:linear-gradient(to bottom,transparent,black 18%,transparent 86%);mask-image:linear-gradient(to bottom,transparent,black 18%,transparent 86%);opacity:.36;transform:perspective(900px) rotateX(60deg) translateY(12rem);transform-origin:center bottom}.cosmic-vignette{position:absolute;top:0;right:0;bottom:0;left:0;z-index:2;background:radial-gradient(circle at 50% 0%,transparent 0,rgba(3,3,11,.1) 26rem,rgba(3,3,11,.86) 62rem),linear-gradient(180deg,#03030b0a,#03030be6 76%)}.metaverse-hero{position:relative;display:grid;grid-template-columns:minmax(0,1.3fr) minmax(300px,.72fr);align-items:end;gap:22px;margin:18px 0 14px;overflow:hidden;padding:28px}.metaverse-hero:before{content:"";position:absolute;top:-1px;right:-1px;bottom:-1px;left:-1px;z-index:-1;background:radial-gradient(circle at 16% 10%,rgba(155,124,255,.26),transparent 28rem),radial-gradient(circle at 80% 0%,rgba(40,232,255,.18),transparent 24rem)}.hero-copy{min-width:0}.welcome-box{display:inline-flex;align-items:center;width:max-content;max-width:100%;gap:9px;isolation:isolate;overflow:hidden;margin-bottom:18px;border:1px solid rgba(185,157,255,.45);border-radius:999px;padding:8px 12px;background:#712fff1a;box-shadow:inset 0 -7px 11px #a48fff1f,0 0 28px #9b7cff24;-webkit-backdrop-filter:blur(10px);backdrop-filter:blur(10px)}.spark-glyph,.welcome-text{color:var(--accent);font-size:.78rem;font-weight:900;letter-spacing:.12em;text-transform:uppercase}.welcome-text{background:linear-gradient(0deg,#ffffff6b,#ffffff6b),linear-gradient(90deg,#e59cff,#ba9cff 48%,#8ff6ff);-webkit-background-clip:text;background-clip:text;-webkit-text-fill-color:transparent}.metaverse-hero h2{max-width:900px;margin:0;color:var(--ink);font-family:Space Grotesk,IBM Plex Sans,system-ui,sans-serif;font-size:clamp(2.4rem,6vw,5.7rem);line-height:.92;letter-spacing:-.07em}.metaverse-hero h2 span{display:inline;background:linear-gradient(90deg,#b49bff,#5ef5ff 
52%,#ff7ce7);-webkit-background-clip:text;background-clip:text;-webkit-text-fill-color:transparent}.metaverse-hero p{max-width:760px;margin:18px 0 0;color:#c5c8df;font-size:1rem;line-height:1.7}.hero-stat-grid{display:grid;grid-template-columns:repeat(2,minmax(0,1fr));gap:10px}.hero-stat-grid div{min-width:0;border:1px solid var(--line-soft);border-radius:18px;background:#090d1f8f;padding:14px;box-shadow:inset 0 1px #ffffff14;-webkit-backdrop-filter:blur(16px);backdrop-filter:blur(16px)}.hero-stat-grid span{display:block;color:var(--muted);font-size:.7rem;font-weight:900;letter-spacing:.08em;text-transform:uppercase}.hero-stat-grid strong{display:block;margin-top:7px;overflow:hidden;color:var(--ink);font-family:Space Grotesk,IBM Plex Sans,sans-serif;font-size:1.05rem;text-overflow:ellipsis;white-space:nowrap}.panel-surface,.panel{border:1px solid var(--line);border-radius:24px;background:var(--surface);box-shadow:var(--shadow);backdrop-filter:blur(22px) saturate(1.25);-webkit-backdrop-filter:blur(22px) saturate(1.25)}.topbar{display:grid;grid-template-columns:minmax(220px,1fr) auto auto minmax(320px,.9fr);align-items:center;gap:14px;padding:16px}.title-wrap{min-width:0}.title-wrap h1,.page h1{margin:0;color:var(--ink);font-family:Space Grotesk,IBM Plex Sans,sans-serif;font-size:1.5rem;line-height:1.1;font-weight:800;letter-spacing:-.04em}.title-wrap p,.muted{margin:4px 0 0;color:var(--muted);font-size:.88rem}.mode-toggle{display:grid;grid-template-columns:repeat(2,minmax(126px,1fr));gap:6px;padding:4px;border:1px solid var(--line);border-radius:18px;background:#050814b3;box-shadow:inset 0 0 24px #9b7cff14}.mode-toggle button{min-height:34px;padding:6px 10px;border-radius:14px;box-shadow:none}.mode-toggle button.active{background:linear-gradient(135deg,var(--accent),var(--accent-2));color:#030414;box-shadow:0 10px 28px 
#28e8ff2e}.topbar-status,.topbar-actions,.button-row{display:flex;align-items:center;justify-content:flex-end;flex-wrap:wrap;gap:8px}.topbar-actions{display:grid;grid-template-columns:minmax(170px,1fr) auto}.qtip-trigger{min-height:32px;padding:6px 11px}.status-chip,.panel-heading span,.med-card-header span{display:inline-flex;align-items:center;min-height:28px;border:1px solid var(--line);border-radius:999px;padding:4px 10px;background:#0c1023b8;color:var(--muted);font-size:.72rem;font-weight:800;letter-spacing:.04em;text-transform:uppercase;white-space:nowrap}.status-chip.live{border-color:#28e8ff70;background:#28e8ff1f;color:#78f6ff;box-shadow:0 0 18px #28e8ff24}.status-chip.idle{border-color:#9aa6b247}.advanced-strip{display:grid;grid-template-columns:minmax(160px,.4fr) minmax(260px,1fr);gap:12px;margin-top:12px;padding:14px}.model-truth{margin-top:12px;padding:14px}.model-truth.verified{border-color:#28e8ff80}.model-truth.unverified{border-color:#ffd35c70}.model-truth p{margin:0 0 12px;color:var(--muted);font-size:.88rem;line-height:1.5}.model-truth-grid{display:grid;grid-template-columns:repeat(4,minmax(0,1fr));gap:10px}.model-truth-grid div{min-width:0;border:1px solid var(--line-soft);border-radius:18px;background:var(--surface-2);padding:10px}.model-truth-grid span{color:var(--muted);font-size:.7rem;font-weight:800;letter-spacing:.05em;text-transform:uppercase}.model-truth-grid strong{display:block;margin-top:5px;color:var(--ink);font-size:.86rem;line-height:1.35;overflow-wrap:anywhere}.field{display:flex;min-width:0;flex-direction:column;gap:6px}.field span,.kpi-grid span,.action-detail-grid span,.compact-defs dt{color:var(--muted);font-size:.72rem;font-weight:800;letter-spacing:.05em;text-transform:uppercase}.workbench-layout{display:grid;grid-template-columns:minmax(320px,1.05fr) minmax(320px,.95fr);gap:16px;margin-top:16px;align-items:start}.panel-wide{grid-column:1 / 
-1}.panel-scroll{min-height:348px;padding:16px}.panel-heading{display:flex;align-items:center;justify-content:space-between;gap:10px;margin-bottom:12px}.inline-heading{margin-bottom:10px}.panel-heading h2,.panel h3,.history-grid h2{margin:0;color:#d8d6ff;font-family:Space Grotesk,IBM Plex Sans,sans-serif;font-size:.82rem;font-weight:800;letter-spacing:.08em;text-transform:uppercase}.panel-surface:not(.topbar,.advanced-strip,.metaverse-hero){padding:16px}.kpi-grid,.action-detail-grid{display:grid;grid-template-columns:repeat(4,minmax(120px,1fr));gap:10px}.kpi-grid div,.action-detail-grid div{min-width:0;min-height:72px;border:1px solid var(--line-soft);border-radius:18px;background:var(--surface-2);padding:12px;box-shadow:inset 0 1px #ffffff0f}.kpi-grid strong,.action-detail-grid strong,.compact-defs dd{display:block;margin-top:6px;color:var(--ink);font-family:Space Grotesk,IBM Plex Sans,sans-serif;font-size:.96rem;line-height:1.25;overflow-wrap:anywhere}.overview-lower{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-top:16px}.overview-lower h3{margin:0 0 8px;color:var(--muted);font-size:.78rem;letter-spacing:.05em;text-transform:uppercase}.compact-defs{display:grid;grid-template-columns:repeat(2,minmax(0,1fr));gap:8px;margin:0}.compact-defs div{min-width:0;border:1px solid var(--line-soft);border-radius:16px;background:#080c1d9e;padding:10px}.compact-defs dd{margin-left:0;font-size:.86rem}.candidate-list,.history-list,.reward-bars,.event-log{display:flex;flex-direction:column;gap:8px;max-height:292px;overflow:auto;padding-right:2px}.candidate-row{display:grid;grid-template-columns:minmax(150px,1fr) minmax(90px,.65fr) 64px;width:100%;min-height:58px;align-items:center;gap:8px;border-color:var(--line-soft);background:var(--surface-2);color:var(--ink);text-align:left;box-shadow:none}.candidate-row:hover:not(:disabled){border-color:#28e8ff52;background:var(--surface-3);box-shadow:inset 0 0 24px 
#28e8ff14}.candidate-row.selected{border-color:#28e8ffb8;background:linear-gradient(90deg,#28e8ff29,#9b7cff14),#0b1023b8;box-shadow:inset 3px 0 0 var(--accent-2),0 0 26px #28e8ff1a}.candidate-row.illegal{border-color:#ffd35c38;background:#221b317a;color:#f6f7ff94}.candidate-row.illegal strong{color:#f7d878}.candidate-row span{min-width:0;overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.candidate-row strong{display:block;color:#90f8ff;font-size:.82rem}.action-console{min-height:348px}.action-detail-grid{grid-template-columns:repeat(2,minmax(0,1fr));margin-bottom:12px}.action-console .field{margin-bottom:10px}.console-notice{margin:0 0 12px;border:1px solid rgba(255,211,92,.34);border-radius:16px;background:#ffd35c1a;color:#f7d878;padding:10px 12px;font-size:.84rem;line-height:1.45}.console-notice strong{color:#fff4b8}.button-row{justify-content:flex-start}.reward-row{display:grid;grid-template-columns:minmax(150px,.9fr) minmax(110px,1fr) 56px;align-items:center;gap:8px;font-size:.8rem}.reward-row span{min-width:0;overflow:hidden;color:var(--muted);text-overflow:ellipsis;white-space:nowrap}.reward-row strong{color:var(--ink);font-family:JetBrains Mono,ui-monospace,monospace;font-size:.76rem;text-align:right}.reward-track{height:7px;overflow:hidden;border-radius:999px;background:#040712db}.reward-fill{height:100%;border-radius:inherit;background:linear-gradient(90deg,var(--accent-3),var(--accent),var(--accent-2));box-shadow:0 0 16px #28e8ff5c;transition:width .22s ease}.med-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(210px,1fr));gap:10px}.med-card{min-width:0;border:1px solid var(--line-soft);border-radius:18px;background:var(--surface-2);padding:12px;box-shadow:inset 0 1px #ffffff0f}.med-card.high-risk{border-color:#ff4fd86b;box-shadow:0 0 22px #ff4fd814,inset 0 1px #ffffff0f}.med-card-header{display:flex;align-items:center;justify-content:space-between;gap:8px}.med-card-header 
strong{min-width:0;overflow:hidden;color:var(--ink);text-overflow:ellipsis;white-space:nowrap}.med-card-header span{border-color:#ff4fd86b;background:#ff4fd81f;color:#ff9dea;font-size:.64rem}.med-card p,.med-meta{margin:6px 0 0;color:var(--muted);font-size:.84rem}.med-meta{display:flex;flex-wrap:wrap;gap:8px}.med-meta span{color:#8ff6ff}.history-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px}.history-item,.event-log div{border:1px solid var(--line-soft);border-radius:16px;background:var(--surface-2);padding:10px 12px;color:var(--ink);font-size:.84rem;overflow-wrap:anywhere}.history-item strong{display:block;margin-bottom:4px}.history-item span{color:var(--muted)}.history-item.warning{border-color:#d2992252;color:#f0c36a}.detail-panel{min-height:220px}.event-panel{margin-bottom:22px}.event-log{max-height:210px;font-family:JetBrains Mono,ui-monospace,monospace}.error-banner{margin-bottom:10px;border:1px solid rgba(248,81,73,.36);border-radius:16px;background:#f851491f;color:#ff8b85;padding:10px 12px;font-weight:800}.qtip-overlay{position:fixed;top:0;right:0;bottom:0;left:0;z-index:1000;pointer-events:none}.qtip-dim{position:absolute;top:0;right:0;bottom:0;left:0;background:#03030bb8;-webkit-backdrop-filter:blur(4px);backdrop-filter:blur(4px);pointer-events:auto}.qtip-ring{position:fixed;z-index:1001;border:2px solid var(--accent-2);border-radius:20px;box-shadow:0 0 0 4px #28e8ff29,0 0 38px #28e8ff4d;pointer-events:none;transition:top .18s ease,left .18s ease,width .18s ease,height .18s ease}.qtip-card{position:fixed;top:var(--tip-top, 18px);left:var(--tip-left, 18px);z-index:1002;width:min(374px,calc(100vw - 28px));padding:18px;pointer-events:auto;animation:qtipIn .16s ease-out}.qtip-header{display:flex;align-items:center;justify-content:space-between;gap:12px;margin-bottom:10px}.qtip-header span,.qtip-header strong{color:var(--accent);font-size:.72rem;font-weight:900;letter-spacing:.08em;text-transform:uppercase}.qtip-card h2{margin:0 0 
8px;color:var(--ink);font-size:1.05rem;letter-spacing:0}.qtip-card p{margin:0;color:var(--muted);font-size:.9rem;line-height:1.55}.qtip-actions{display:flex;justify-content:flex-end;gap:8px;margin-top:16px}@keyframes qtipIn{0%{opacity:0;transform:translateY(6px)}to{opacity:1;transform:translateY(0)}}.page{padding:20px}.grid,.grid-mini{display:grid;grid-template-columns:repeat(2,minmax(240px,1fr));gap:12px}.list{margin:0;padding-left:18px}.kpi{margin:0;font-size:1.6rem;font-weight:800}.hero-line{width:280px;max-width:100%;height:4px;margin:14px 0;border-radius:999px;background:linear-gradient(90deg,var(--accent),var(--accent-2))}.actions{display:flex;flex-wrap:wrap;gap:8px}@media (max-width: 1180px){.metaverse-hero{grid-template-columns:1fr}.topbar{grid-template-columns:1fr;align-items:stretch}.topbar-status,.topbar-actions{justify-content:flex-start}.workbench-layout,.overview-lower,.history-grid{grid-template-columns:1fr}.panel-wide{grid-column:auto}}@media (max-width: 760px){.workbench-shell{padding:10px}.blackhole-video{top:-20vh;min-width:620px;height:54vh}.metaverse-hero{margin-top:8px;padding:18px}.metaverse-hero h2{font-size:clamp(2rem,13vw,3.4rem);letter-spacing:-.055em}.hero-stat-grid{grid-template-columns:1fr}.topbar,.panel-surface:not(.topbar,.advanced-strip,.metaverse-hero),.advanced-strip{padding:12px}.mode-toggle,.topbar-actions,.advanced-strip,.model-truth-grid,.kpi-grid,.action-detail-grid,.compact-defs,.grid,.grid-mini{grid-template-columns:1fr}.topbar-actions button,.button-row button,.qtip-actions button{width:100%}.qtip-card{inset:auto 10px 14px 10px;width:auto}.qtip-actions{flex-direction:column}.qtip-ring{display:none}.candidate-row,.reward-row{grid-template-columns:1fr}.candidate-row span,.reward-row span{white-space:normal}.reward-row 
strong{text-align:left}.panel-scroll,.action-console,.detail-panel{min-height:auto}.candidate-list,.history-list,.reward-bars,.event-log{max-height:none}}::-webkit-scrollbar{width:7px;height:7px}::-webkit-scrollbar-track{background:transparent}::-webkit-scrollbar-thumb{border-radius:999px;background:#9aa6b257}
|
|
|
|
|
|
app/ui/frontend/dist/assets/index-DgY-oaWG.js
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/ui/frontend/dist/index.html
DELETED
|
@@ -1,13 +0,0 @@
|
|
| 1 |
-
<!doctype html>
|
| 2 |
-
<html lang="en">
|
| 3 |
-
<head>
|
| 4 |
-
<meta charset="UTF-8" />
|
| 5 |
-
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
-
<title>POLYGUARD-RL Workbench</title>
|
| 7 |
-
<script type="module" crossorigin src="/assets/index-DgY-oaWG.js"></script>
|
| 8 |
-
<link rel="stylesheet" crossorigin href="/assets/index-DV0STDGE.css">
|
| 9 |
-
</head>
|
| 10 |
-
<body>
|
| 11 |
-
<div id="root"></div>
|
| 12 |
-
</body>
|
| 13 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/README.md
DELETED
|
@@ -1,23 +0,0 @@
|
|
| 1 |
-
# Local checkpoints (not in Git)
|
| 2 |
-
|
| 3 |
-
Trained weights live here so clones stay small. After cloning, install the published bundle:
|
| 4 |
-
|
| 5 |
-
```bash
|
| 6 |
-
cd polyguard-rl
|
| 7 |
-
python scripts/install_hf_active_bundle.py
|
| 8 |
-
```
|
| 9 |
-
|
| 10 |
-
That creates **`active/`** with:
|
| 11 |
-
|
| 12 |
-
| Path | Contents |
|
| 13 |
-
|------|----------|
|
| 14 |
-
| `active/active_model_manifest.json` | Which artifact to load (GRPO vs merged vs SFT) |
|
| 15 |
-
| `active/grpo_adapter/` | PEFT GRPO adapter (+ tokenizer files) |
|
| 16 |
-
| `active/merged/` | Full merged Qwen 0.5B weights (~1 GB) |
|
| 17 |
-
| `active/sft_adapter/` | SFT LoRA fallback |
|
| 18 |
-
|
| 19 |
-
A Hub cache copy may also appear under `.hf_bundles/` (safe to delete after a successful install).
|
| 20 |
-
|
| 21 |
-
Enable in `.env`: `POLYGUARD_ENABLE_ACTIVE_MODEL=true` and `POLYGUARD_HF_MODEL=Qwen/Qwen2.5-0.5B-Instruct` (base for the adapter path).
|
| 22 |
-
|
| 23 |
-
**If this folder looks empty in the editor:** run the install command above; then confirm with `ls active/`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docker/space/README.md
CHANGED
|
@@ -12,24 +12,16 @@ Never commit or paste Hugging Face tokens into chat or the repo. If a token was
|
|
| 12 |
|
| 13 |
```bash
|
| 14 |
cd polyguard-rl
|
| 15 |
-
docker build -t polyguard-space .
|
| 16 |
```
|
| 17 |
|
| 18 |
-
3. Push the Space repo
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
## FDA panel / latest UI missing on the live Space
|
| 23 |
-
|
| 24 |
-
Pushing code to GitHub alone does **not** refresh `huggingface.co/spaces/...` unless that Space is connected to the same repo **and** rebuilds from the branch that has your UI (for example `fda` vs `main`). This repo’s usual demo path is **upload via Hub API**:
|
| 25 |
|
| 26 |
-
```
|
| 27 |
-
cd polyguard-rl
|
| 28 |
-
export HF_TOKEN="hf_..." # write token; never commit it
|
| 29 |
-
uv run python scripts/deploy_space_api.py --repo-id TheJackBright/polyguard-openenv
|
| 30 |
-
```
|
| 31 |
|
| 32 |
-
|
| 33 |
|
| 34 |
## Runtime
|
| 35 |
|
|
|
|
| 12 |
|
| 13 |
```bash
|
| 14 |
cd polyguard-rl
|
| 15 |
+
docker build -f Dockerfile.space -t polyguard-space .
|
| 16 |
```
|
| 17 |
|
| 18 |
+
3. Push the Space repo (HF expects `Dockerfile` at root). Either:
|
| 19 |
|
| 20 |
+
- **Option A:** In the Space repo on Hub, set **Build → Dockerfile path** to `Dockerfile.space` if the UI allows, **or** copy/rename: `cp Dockerfile.space Dockerfile` in the branch you push.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
- **Option B:** Make this `polyguard-rl` folder the Space git root and add a symlink or duplicate `Dockerfile` pointing to the same content as `Dockerfile.space`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
4. Commit and push to the Space repository. HF builds the image on their builders (you do not need to `docker push` to Docker Hub for standard Spaces).
|
| 25 |
|
| 26 |
## Runtime
|
| 27 |
|
docs/DEMO_RECORDING_SCRIPT.md
CHANGED
|
@@ -12,7 +12,7 @@ Use this document while screen-recording the Hugging Face Space (or local Docker
|
|
| 12 |
4. **Wait for cold start**: first load may download the model bundle (several minutes). The **Event Log** and **Model Truth** panel will tell you if the policy failed to load (heuristic fallback is still usable for env steps).
|
| 13 |
5. **Optional**: hide mouse cursor in OBS if you prefer; otherwise move slowly and pause **2 seconds** on each panel after major clicks.
|
| 14 |
|
| 15 |
-
**Primary Space (product):** `https://huggingface.co/spaces/TheJackBright/polyguard-openenv`
|
| 16 |
Runtime: nginx fronts the **product API** (default `8200`) and **OpenEnv service** (`8100`); see `docker/space/entrypoint.sh`.
|
| 17 |
|
| 18 |
---
|
|
@@ -391,7 +391,7 @@ Click **Q Tips** in the top bar. The app cycles **10 slides** (`App.tsx` → `GU
|
|
| 391 |
**Say:** *“This patient block and risk delta come straight from the observation object.”*
|
| 392 |
|
| 393 |
**Action:** **Candidate Actions** — click 2–3 rows; show **Blocked** vs legal. Select a **legal** row.
|
| 394 |
-
**Say:** *“Candidates are legal moves from the env; illegal rows are disabled
|
| 395 |
|
| 396 |
**Action:** **Action Console** — tweak **Confidence** and **Rationale** slightly. Click **Submit Candidate**.
|
| 397 |
**Say:** *“Submit Candidate hits `/env/step_candidate` with my chosen legal action, confidence, and rationale.”*
|
|
|
|
| 12 |
4. **Wait for cold start**: first load may download the model bundle (several minutes). The **Event Log** and **Model Truth** panel will tell you if the policy failed to load (heuristic fallback is still usable for env steps).
|
| 13 |
5. **Optional**: hide mouse cursor in OBS if you prefer; otherwise move slowly and pause **2 seconds** on each panel after major clicks.
|
| 14 |
|
| 15 |
+
**Primary Space (product):** `https://huggingface.co/spaces/TheJackBright/polyguard-openenv-workbench`
|
| 16 |
Runtime: nginx fronts the **product API** (default `8200`) and **OpenEnv service** (`8100`); see `docker/space/entrypoint.sh`.
|
| 17 |
|
| 18 |
---
|
|
|
|
| 391 |
**Say:** *“This patient block and risk delta come straight from the observation object.”*
|
| 392 |
|
| 393 |
**Action:** **Candidate Actions** — click 2–3 rows; show **Blocked** vs legal. Select a **legal** row.
|
| 394 |
+
**Say:** *“Candidates are legal moves from the env; illegal rows are disabled.”*
|
| 395 |
|
| 396 |
**Action:** **Action Console** — tweak **Confidence** and **Rationale** slightly. Click **Submit Candidate**.
|
| 397 |
**Say:** *“Submit Candidate hits `/env/step_candidate` with my chosen legal action, confidence, and rationale.”*
|
app/ui/frontend/dist/blackhole.webm → docs/UI Images/1.jpeg
RENAMED
|
File without changes
|
docs/UI Images/2.jpeg
ADDED
|
Git LFS Details
|
docs/UI Images/3.jpeg
ADDED
|
Git LFS Details
|
docs/UI Images/4.jpeg
ADDED
|
Git LFS Details
|
docs/UI Images/5.jpeg
ADDED
|
Git LFS Details
|
docs/assets/diagrams/data_training_pipeline.png
ADDED
|
Git LFS Details
|
docs/assets/diagrams/deployment_topology.png
ADDED
|
Git LFS Details
|
docs/assets/diagrams/episode_state_machine.png
ADDED
|
docs/assets/diagrams/evidence_generation_flow.png
ADDED
|
docs/assets/diagrams/frontend_runtime_surface.png
ADDED
|
Git LFS Details
|
docs/assets/diagrams/multi_agent_orchestration.png
ADDED
|
Git LFS Details
|
docs/assets/diagrams/reward_decomposition.png
ADDED
|
Git LFS Details
|
docs/assets/diagrams/runtime_step_flow.png
ADDED
|
docs/assets/diagrams/system_architecture.png
ADDED
|
Git LFS Details
|
docs/deployment.md
CHANGED
|
@@ -24,13 +24,13 @@ The global `hf` command on this workstation currently fails because its installe
|
|
| 24 |
## Hugging Face Space Deployment
|
| 25 |
|
| 26 |
```bash
|
| 27 |
-
export HF_SPACE_REPO_ID="TheJackBright/polyguard-openenv"
|
| 28 |
uv run python scripts/deploy_space_api.py --repo-id "$HF_SPACE_REPO_ID"
|
| 29 |
uv run python -c "from huggingface_hub import HfApi; print(HfApi().space_info('$HF_SPACE_REPO_ID').id)"
|
| 30 |
-
openenv validate --url "https://thejackbright-polyguard-openenv.hf.space"
|
| 31 |
```
|
| 32 |
|
| 33 |
-
`scripts/deploy_space_api.py` is the preferred deployment path for this repo because it uploads a valid Docker Space README frontmatter bundle through `huggingface_hub.HfApi`. `scripts/deploy_space.sh` remains available, but the current OpenEnv CLI path may fail with invalid generated `colorFrom`/`colorTo` metadata.
|
| 34 |
|
| 35 |
Useful `scripts/deploy_space.sh` flags:
|
| 36 |
|
|
|
|
| 24 |
## Hugging Face Space Deployment
|
| 25 |
|
| 26 |
```bash
|
| 27 |
+
export HF_SPACE_REPO_ID="TheJackBright/polyguard-openenv-workbench"
|
| 28 |
uv run python scripts/deploy_space_api.py --repo-id "$HF_SPACE_REPO_ID"
|
| 29 |
uv run python -c "from huggingface_hub import HfApi; print(HfApi().space_info('$HF_SPACE_REPO_ID').id)"
|
| 30 |
+
openenv validate --url "https://thejackbright-polyguard-openenv-workbench.hf.space"
|
| 31 |
```
|
| 32 |
|
| 33 |
+
`scripts/deploy_space_api.py` is the preferred deployment path for this repo because it uploads a valid Docker Space README frontmatter bundle through `huggingface_hub.HfApi`. `scripts/deploy_space.sh` remains available, but the current OpenEnv CLI path may fail with invalid generated `colorFrom`/`colorTo` metadata.
|
| 34 |
|
| 35 |
Useful `scripts/deploy_space.sh` flags:
|
| 36 |
|
docs/evaluation.md
CHANGED
|
@@ -40,4 +40,6 @@ Final comparison must show positive or non-regressing behavior on:
|
|
| 40 |
- timeout rate
|
| 41 |
- failure visibility
|
| 42 |
|
| 43 |
-
|
|
|
|
|
|
|
|
|
| 40 |
- timeout rate
|
| 41 |
- failure visibility
|
| 42 |
|
| 43 |
+
Older smoke artifacts are retained for auditability, but final claims should use
|
| 44 |
+
the curated bundle under `docs/results/final_submission_evidence/`. The root
|
| 45 |
+
repository README is the canonical narrative and evidence map.
|
docs/final_submission_audit.md
DELETED
|
@@ -1,42 +0,0 @@
|
|
| 1 |
-
# Final Submission Audit
|
| 2 |
-
|
| 3 |
-
Audit date: April 26, 2026.
|
| 4 |
-
|
| 5 |
-
## Status Summary
|
| 6 |
-
|
| 7 |
-
PolyGuard implements the participant-guide stack from dataset acquisition through OpenEnv environment, rewards, SFT, GRPO, inference, UI/API product, evaluation, and Hugging Face Space deployment. The public environment Space is live at `https://huggingface.co/spaces/TheJackBright/polyguard-openenv` and the runtime health endpoint returned `{"status":"healthy"}` during this audit.
|
| 8 |
-
|
| 9 |
-
The only known judge-facing blocker is external storytelling: the README blog URL `https://huggingface.co/blog/TheJackBright/polyguard-openenv` currently returns 404 until `docs/hf_blog_draft.md` is published there or the README is updated with a real YouTube/slide/blog URL.
|
| 10 |
-
|
| 11 |
-
## Requirement Matrix
|
| 12 |
-
|
| 13 |
-
| Requirement area | Status | Evidence |
|
| 14 |
-
| --- | --- | --- |
|
| 15 |
-
| Problem statement and theme fit | Implemented | README describes safe long-horizon polypharmacy action selection under World Modeling / Professional Tasks. |
|
| 16 |
-
| OpenEnv environment | Implemented | `openenv.yaml`, `PolyGuardEnv`, FastAPI `/reset`, `/step`, `/state`, `/metadata`, `/schema`, `/mcp`, and `/ws`; `uv run openenv validate .` passes. |
|
| 17 |
-
| Dataset acquisition and preprocessing | Implemented | `scripts/bootstrap_data.py`, `scripts/ingest_open_drug_sources.py`, `scripts/build_training_corpus.py`, `data/processed/*`, `data/scenarios/*`, and `docs/dataset_report.md`. |
|
| 18 |
-
| Easy/medium/hard curriculum | Implemented | Scenario JSON/JSONL sets plus task presets exposed through `/env/catalog`. |
|
| 19 |
-
| Rewards and anti-hacking | Implemented | 13 reward components, 4 primary channels, bounded reward scaling, timeout handling, `app/env/anti_cheat.py`, and reward/anti-cheat tests. |
|
| 20 |
-
| Training loop | Implemented | `scripts/train_sft_trl.py`, `scripts/train_grpo_trl.py`, `app/training/grpo_trl.py`, and `app/hf_space/training_runner.py`. |
|
| 21 |
-
| TRL / Unsloth stack | Implemented with fallback reality documented | TRL path is active and reports `trl_transformers`; Unsloth is wired as optional but was unavailable in current reports. |
|
| 22 |
-
| Post-training export and inference | Implemented | `scripts/merge_adapters_safe.py`, `scripts/test_inference_postsave.py`, active model manifest, and API/UI model status path. |
|
| 23 |
-
| Product/demo | Implemented | FastAPI product API, React/Vite workbench, policy lab, training monitor, replay, dosing, and safety views. |
|
| 24 |
-
| Results and plots | Implemented | Tracked `docs/results/*.json` and PNG plots, including SFT baseline sweep evidence and top-level environment-backed GRPO evidence. |
|
| 25 |
-
| HF Space deployment | Implemented | Public Space is running on CPU basic, Space metadata is available, and tracked `docs/results/hf_space_verification.json` reports OpenEnv validation passed. |
|
| 26 |
-
| Colab notebook | Implemented | README Colab URL targets `PolyGuard_SFT_GRPO_One_Run_Runner.ipynb`; `notebooks/09_training_loop.ipynb` is the modular alternative. |
|
| 27 |
-
| Story artifact | Pending external publication | `docs/hf_blog_draft.md` exists, but the README blog URL returns 404 until published. |
|
| 28 |
-
| Full public per-model GRPO sweep | Not claimed | Current public/tracked evidence is a 3-model SFT-baseline sweep plus a top-level GRPO run. Private training artifact repos require auth and must be mirrored before being used as public evidence. |
|
| 29 |
-
|
| 30 |
-
## Fresh Verification
|
| 31 |
-
|
| 32 |
-
- `uv run pytest`: 49 tests passed.
|
| 33 |
-
- `uv run openenv validate .`: local OpenEnv validation passed.
|
| 34 |
-
- `POLYGUARD_ENFORCE_SUBMISSION_LINKS=true uv run python scripts/acceptance_gate.py`: strict gate passed.
|
| 35 |
-
- `curl -s https://thejackbright-polyguard-openenv.hf.space/health`: returned `{"status":"healthy"}`.
|
| 36 |
-
- `curl -s https://thejackbright-polyguard-openenv.hf.space/metadata`: returned PolyGuard OpenEnv metadata with reward range `[0.001, 0.999]`.
|
| 37 |
-
|
| 38 |
-
## Submission Notes
|
| 39 |
-
|
| 40 |
-
- Publish the Hugging Face blog draft or replace the story URL before final hand-in.
|
| 41 |
-
- Run `uv run python scripts/validate_submission_links.py` after publication to catch broken README URLs.
|
| 42 |
-
- Do not add private HF artifact repos as judge-facing links unless they are made public or their outputs are mirrored into the repository/Space documentation.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/hf_blog_draft.md
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
# PolyGuard OpenEnv Blog Draft
|
| 2 |
-
|
| 3 |
-
PolyGuard turns polypharmacy safety into an OpenEnv-compatible reinforcement-learning environment. The agent sees a partially observable patient/regimen state, chooses constrained medication actions, and receives verifier-backed feedback over legality, safety, dosing quality, process fidelity, explanation grounding, uncertainty calibration, and anti-cheat checks.
|
| 4 |
-
|
| 5 |
-
The environment targets the World Modeling / Professional Tasks theme. Medication optimization is not a one-shot answer task: safe action selection depends on state, evidence, comorbidities, labs, drug-drug interactions, uncertainty, and rollback behavior when an action is unsafe.
|
| 6 |
-
|
| 7 |
-
The demo includes:
|
| 8 |
-
|
| 9 |
-
- Easy, medium, and hard task presets over DDI screening, regimen risk, bandit mining, precision dosing, deprescribing, missing-data search, alternatives, and new-drug decomposition.
|
| 10 |
-
- A React workbench for reset/step interaction, clickable candidates, task/environment selection, reward bars, action history, and event traces.
|
| 11 |
-
- A TRL SFT warm start and GRPO loop using environment-backed rewards.
|
| 12 |
-
- Post-save inference checks from exported artifacts.
|
| 13 |
-
- Baseline comparison and plots committed under `docs/results/`.
|
| 14 |
-
|
| 15 |
-
The current local compliance run uses a tiny model so the full pipeline can be verified quickly. For the final pitch, rerun the same notebook on GPU with the Qwen model and Unsloth enabled, then replace the result artifacts with the stronger run.
|
| 16 |
-
|
| 17 |
-
Key result to show: the current benchmark report improves average reward over the no-change baseline while preserving legality. The reward design is intentionally decomposed into multiple independent checks to reduce reward hacking and make failures visible.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/mathematics.md
ADDED
|
@@ -0,0 +1,1045 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Mathematics Behind PolyGuard Agents
|
| 2 |
+
|
| 3 |
+
This note is the expert-facing mathematical map of PolyGuard: what the
|
| 4 |
+
agents optimize, how actions are constrained, how reward is computed, and why
|
| 5 |
+
the training stack uses SFT plus environment-verified GRPO instead of an
|
| 6 |
+
unconstrained chat policy. It expands the shorter `docs/math.md`.
|
| 7 |
+
|
| 8 |
+
Source-of-truth implementation files:
|
| 9 |
+
|
| 10 |
+
- `app/env/env_core.py`: reset, observation, step, traces, OpenEnv state.
|
| 11 |
+
- `app/models/policy/candidate_builder.py`: constrained candidate set.
|
| 12 |
+
- `app/env/verifier.py`: hard legality and safety verifier.
|
| 13 |
+
- `app/env/transition.py`: state transition dynamics.
|
| 14 |
+
- `app/env/reward_router.py`: reward decomposition and aggregation.
|
| 15 |
+
- `app/env/reward_scaling.py`: strict reward normalization.
|
| 16 |
+
- `app/env/anti_cheat.py`: reward-hacking guards.
|
| 17 |
+
- `app/agents/orchestrator.py`: multi-agent policy stack.
|
| 18 |
+
- `app/models/baselines/contextual_bandit_policy.py`: LinUCB/Thompson co-policy.
|
| 19 |
+
- `app/training/sft_trl.py`: supervised warm start.
|
| 20 |
+
- `app/training/grpo_trl.py`: TRL GRPO with environment reward verification.
|
| 21 |
+
|
| 22 |
+
## 1. Problem Formulation
|
| 23 |
+
|
| 24 |
+
PolyGuard is best read as a finite-horizon constrained POMDP:
|
| 25 |
+
|
| 26 |
+
```text
|
| 27 |
+
M = (S, A, O, T, R, H, C)
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
where:
|
| 31 |
+
|
| 32 |
+
- `S` is the latent patient/regimen state.
|
| 33 |
+
- `A` is the set of medication actions expressible by `PolyGuardAction`.
|
| 34 |
+
- `O` is the observation emitted to the agent.
|
| 35 |
+
- `T(s' | s, a)` is the simulator transition.
|
| 36 |
+
- `R(s, a, s')` is the verifier-backed reward.
|
| 37 |
+
- `H` is the episode horizon, derived from sub-environment difficulty.
|
| 38 |
+
- `C(s, a)` is the hard clinical/safety constraint predicate.
|
| 39 |
+
|
| 40 |
+
The policy objective is:
|
| 41 |
+
|
| 42 |
+
```text
|
| 43 |
+
maximize_pi E_pi [ sum_{t=0}^{H-1} R(s_t, a_t, s_{t+1}) ]
|
| 44 |
+
subject to C(s_t, a_t) = 1 whenever possible
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
There is no explicit discount factor in the runtime. Time preference enters
|
| 48 |
+
through the finite horizon and the efficiency reward:
|
| 49 |
+
|
| 50 |
+
```text
|
| 51 |
+
efficiency_t = q(1 - step_count_t / (max_steps + 1))
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
where `q` is PolyGuard's reward clamp and quantizer:
|
| 55 |
+
|
| 56 |
+
```text
|
| 57 |
+
q(x) = round(clip(x, 0.001, 0.999), 3)
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
Why this framing: medication optimization is partially observable, long
|
| 61 |
+
horizon, and safety constrained. A free-form language model objective would
|
| 62 |
+
allow plausible but illegal actions. PolyGuard instead learns inside a small
|
| 63 |
+
legal action set with explicit reward columns, so failures remain auditable.
|
| 64 |
+
|
| 65 |
+
## 2. State, Observation, And Partial Observability
|
| 66 |
+
|
| 67 |
+
The latent state `s_t` is represented by `PolyGuardState`:
|
| 68 |
+
|
| 69 |
+
```text
|
| 70 |
+
s_t = (
|
| 71 |
+
patient profile,
|
| 72 |
+
active decision mode,
|
| 73 |
+
step count,
|
| 74 |
+
max steps,
|
| 75 |
+
risk summary,
|
| 76 |
+
burden score,
|
| 77 |
+
precision dosing flags,
|
| 78 |
+
unresolved conflicts,
|
| 79 |
+
action history,
|
| 80 |
+
cumulative reward,
|
| 81 |
+
done flag
|
| 82 |
+
)
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
At reset, the initial risk summary is:
|
| 86 |
+
|
| 87 |
+
```text
|
| 88 |
+
polypharmacy_count = number_of_medications
|
| 89 |
+
burden_score = min(1, number_of_medications / 12)
|
| 90 |
+
severe_pair_count = number_of_contraindicated_pairs
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
The agent does not receive all latent simulator internals. The observation
|
| 94 |
+
`o_t = O(s_t)` exposes a controlled view:
|
| 95 |
+
|
| 96 |
+
```text
|
| 97 |
+
o_t = (
|
| 98 |
+
patient summary,
|
| 99 |
+
medication table,
|
| 100 |
+
comorbidities,
|
| 101 |
+
organ function and labs/vitals,
|
| 102 |
+
graph safety summary,
|
| 103 |
+
burden summary,
|
| 104 |
+
precision dosing flags,
|
| 105 |
+
unresolved conflicts,
|
| 106 |
+
candidate action set,
|
| 107 |
+
step budget,
|
| 108 |
+
action history,
|
| 109 |
+
warnings,
|
| 110 |
+
abstention indicators
|
| 111 |
+
)
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
Uncertainty is a simple observable proxy:
|
| 115 |
+
|
| 116 |
+
```text
|
| 117 |
+
missing = I[egfr missing] + I[ast missing] + I[alt missing]
|
| 118 |
+
base_uncertainty = missing / 3
|
| 119 |
+
conflict_penalty = min(0.3, 0.1 * number_of_unresolved_conflicts)
|
| 120 |
+
u_t = clip(base_uncertainty + conflict_penalty, 0, 1)
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
The environment recommends abstention/review when:
|
| 124 |
+
|
| 125 |
+
```text
|
| 126 |
+
u_t > 0.65
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
The supervisor uses a stricter routing threshold:
|
| 130 |
+
|
| 131 |
+
```text
|
| 132 |
+
mode_t = REVIEW if u_t > 0.72
|
| 133 |
+
mode_t = DOSE_OPT if sub_environment = PRECISION_DOSING or dosing is active
|
| 134 |
+
mode_t = REGIMEN_OPT otherwise
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
Why this choice: the observation keeps the agent honest. Missing labs and
|
| 138 |
+
conflicts are not hidden from reward, but they are presented as uncertainty
|
| 139 |
+
signals that should change policy behavior rather than invite overconfident
|
| 140 |
+
recommendations.
|
| 141 |
+
|
| 142 |
+
## 3. Constrained Action Model
|
| 143 |
+
|
| 144 |
+
The runtime action is a strict `PolyGuardAction`:
|
| 145 |
+
|
| 146 |
+
```text
|
| 147 |
+
a_t = (
|
| 148 |
+
mode,
|
| 149 |
+
action_type,
|
| 150 |
+
target_drug,
|
| 151 |
+
replacement_drug,
|
| 152 |
+
dose_bucket,
|
| 153 |
+
taper_days,
|
| 154 |
+
monitoring_plan,
|
| 155 |
+
evidence_query,
|
| 156 |
+
new_drug_name,
|
| 157 |
+
candidate_components,
|
| 158 |
+
candidate_id,
|
| 159 |
+
confidence,
|
| 160 |
+
rationale_brief
|
| 161 |
+
)
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
The environment first builds a candidate set:
|
| 165 |
+
|
| 166 |
+
```text
|
| 167 |
+
C_t = B(s_t)
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
where `B` is `build_candidates`. Candidate generation is rule-seeded and
|
| 171 |
+
bounded:
|
| 172 |
+
|
| 173 |
+
```text
|
| 174 |
+
3 <= |C_t| <= 10
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
Each candidate carries proxy features:
|
| 178 |
+
|
| 179 |
+
```text
|
| 180 |
+
c = (
|
| 181 |
+
candidate_id,
|
| 182 |
+
mode,
|
| 183 |
+
action_type,
|
| 184 |
+
estimated_safety_delta,
|
| 185 |
+
burden_delta,
|
| 186 |
+
disease_stability_estimate,
|
| 187 |
+
uncertainty_score,
|
| 188 |
+
legality_precheck,
|
| 189 |
+
rationale_tags
|
| 190 |
+
)
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
The legal candidate set is:
|
| 194 |
+
|
| 195 |
+
```text
|
| 196 |
+
L_t = { c in C_t : verifier(s_t, c).legal = true }
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
Policy selection is candidate selection, not arbitrary action synthesis:
|
| 200 |
+
|
| 201 |
+
```text
|
| 202 |
+
a_t = to_action(c_t), c_t in C_t
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
The action type space is intentionally small:
|
| 206 |
+
|
| 207 |
+
```text
|
| 208 |
+
KEEP_REGIMEN
|
| 209 |
+
STOP_DRUG
|
| 210 |
+
SUBSTITUTE_WITHIN_CLASS
|
| 211 |
+
RECOMMEND_ALTERNATIVE
|
| 212 |
+
REDUCE_DOSE_BUCKET
|
| 213 |
+
INCREASE_DOSE_BUCKET
|
| 214 |
+
TAPER_INITIATE
|
| 215 |
+
TAPER_CONTINUE
|
| 216 |
+
DOSE_HOLD
|
| 217 |
+
ORDER_MONITORING_AND_WAIT
|
| 218 |
+
FETCH_EXTERNAL_EVIDENCE
|
| 219 |
+
DECOMPOSE_NEW_DRUG
|
| 220 |
+
REQUEST_SPECIALIST_REVIEW
|
| 221 |
+
REQUEST_PHARMACIST_REVIEW
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
Why this choice: most safety failures in clinical LLM tasks come from an
|
| 225 |
+
unbounded output space. PolyGuard makes the LLM solve ranking and explanation
|
| 226 |
+
inside a constrained action manifold, then lets the verifier and transition
|
| 227 |
+
system enforce semantics.
|
| 228 |
+
|
| 229 |
+
## 4. Hard Legality Constraints
|
| 230 |
+
|
| 231 |
+
The verifier computes:
|
| 232 |
+
|
| 233 |
+
```text
|
| 234 |
+
V(s_t, a_t) = (legal, violations, severity, fallback)
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
Examples of hard constraints:
|
| 238 |
+
|
| 239 |
+
- The target drug must exist in the current regimen when required.
|
| 240 |
+
- Substitutions and alternatives must be drawn from allowed substitution rules.
|
| 241 |
+
- Evidence-fetch URLs must be allowlisted.
|
| 242 |
+
- New-drug decomposition must include a new drug and components.
|
| 243 |
+
- Abrupt stopping is illegal when taper rules require tapering.
|
| 244 |
+
- Renal/hepatic unsafe dose escalation is illegal.
|
| 245 |
+
- Duplicate therapy and contraindicated substitutions are illegal.
|
| 246 |
+
- Monitoring/hold actions require a monitoring plan.
|
| 247 |
+
- Destabilizing deprescribing patterns are illegal.
|
| 248 |
+
|
| 249 |
+
The environment step uses a two-gate transition:
|
| 250 |
+
|
| 251 |
+
```text
|
| 252 |
+
if V(s_t, a_t).legal and not anti_cheat(s_t, a_t):
|
| 253 |
+
s_{t+1} = T(s_t, a_t)
|
| 254 |
+
else:
|
| 255 |
+
s_{t+1} = rollback_state_with_failed_action_record(s_t, a_t)
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
Even blocked actions advance the step count and become visible in
|
| 259 |
+
`action_history`, `failure_reasons`, `invalid_action_count`, and trace logs.
|
| 260 |
+
|
| 261 |
+
Why this choice: legality is a constraint, not a soft preference. The reward
|
| 262 |
+
still exposes illegal behavior numerically, but illegal behavior is prevented
|
| 263 |
+
from mutating patient state.
|
| 264 |
+
|
| 265 |
+
## 5. Transition Dynamics
|
| 266 |
+
|
| 267 |
+
The transition function mutates the regimen and derived risk state. Important
|
| 268 |
+
deterministic transitions include:
|
| 269 |
+
|
| 270 |
+
```text
|
| 271 |
+
STOP_DRUG:
|
| 272 |
+
medications' = medications without target_drug
|
| 273 |
+
|
| 274 |
+
SUBSTITUTE_WITHIN_CLASS or RECOMMEND_ALTERNATIVE:
|
| 275 |
+
target_drug' = replacement_drug
|
| 276 |
+
|
| 277 |
+
REDUCE_DOSE_BUCKET / INCREASE_DOSE_BUCKET:
|
| 278 |
+
dose_bucket moves one level over [LOW, MEDIUM, HIGH]
|
| 279 |
+
|
| 280 |
+
DOSE_HOLD:
|
| 281 |
+
dose_bucket' = HOLD
|
| 282 |
+
|
| 283 |
+
ORDER_MONITORING_AND_WAIT:
|
| 284 |
+
optional hold + unresolved review conflicts cleared
|
| 285 |
+
|
| 286 |
+
REQUEST_*_REVIEW:
|
| 287 |
+
active_mode' = REVIEW
|
| 288 |
+
unresolved_conflicts append review marker
|
| 289 |
+
|
| 290 |
+
FETCH_EXTERNAL_EVIDENCE:
|
| 291 |
+
external mention/component counts update
|
| 292 |
+
missing-data conflicts can be cleared
|
| 293 |
+
|
| 294 |
+
DECOMPOSE_NEW_DRUG:
|
| 295 |
+
component count and unknown-risk flags update
|
| 296 |
+
```
|
| 297 |
+
|
| 298 |
+
After any applied transition, burden is recomputed with dose weights:
|
| 299 |
+
|
| 300 |
+
```text
|
| 301 |
+
w(LOW) = 0.70
|
| 302 |
+
w(MEDIUM) = 1.00
|
| 303 |
+
w(HIGH) = 1.25
|
| 304 |
+
w(HOLD) = 0.45
|
| 305 |
+
w(NA) = 1.00
|
| 306 |
+
|
| 307 |
+
burden_{t+1} = clip( ( sum_{m in medications_{t+1}} w(dose_bucket_m) ) / 12, 0, 1 )
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
The severe-pair count is recomputed from known contraindicated pairs:
|
| 311 |
+
|
| 312 |
+
```text
|
| 313 |
+
severe_pair_count_{t+1} =
|
| 314 |
+
|{(i, j): i < j and contraindicated(drug_i, drug_j)}|
|
| 315 |
+
```
|
| 316 |
+
|
| 317 |
+
Why this choice: transitions are intentionally deterministic and inspectable.
|
| 318 |
+
That makes reward debugging and training reproducibility easier than a hidden
|
| 319 |
+
black-box clinical simulator.
|
| 320 |
+
|
| 321 |
+
## 6. Multi-Agent Factorization
|
| 322 |
+
|
| 323 |
+
PolyGuard's "agents" are a policy factorization, not independent RL learners
|
| 324 |
+
with separate private rewards. Each module emits features, candidates, gates,
|
| 325 |
+
or explanations consumed by the next stage:
|
| 326 |
+
|
| 327 |
+
```text
|
| 328 |
+
MedRec -> Evidence -> GraphSafety -> Dosing -> Candidate
|
| 329 |
+
-> Supervisor -> Planner -> Critic -> Env -> Explainer
|
| 330 |
+
```
|
| 331 |
+
|
| 332 |
+
The orchestrated policy can be written:
|
| 333 |
+
|
| 334 |
+
```text
|
| 335 |
+
pi(a | o) =
|
| 336 |
+
pi_critic(
|
| 337 |
+
pi_planner(
|
| 338 |
+
top_k_bandit(
|
| 339 |
+
pi_supervisor(
|
| 340 |
+
features_medrec, features_evidence, features_graph, features_dosing, candidates
|
| 341 |
+
)
|
| 342 |
+
)
|
| 343 |
+
)
|
| 344 |
+
)
|
| 345 |
+
```
|
| 346 |
+
|
| 347 |
+
More concretely:
|
| 348 |
+
|
| 349 |
+
```text
|
| 350 |
+
z_medrec = f_medrec(s_t)
|
| 351 |
+
z_evid = f_evidence(s_t)
|
| 352 |
+
z_graph = f_graph(s_t)
|
| 353 |
+
z_dose = f_dosing(s_t)
|
| 354 |
+
C_t = f_candidate(s_t)
|
| 355 |
+
m_t = f_supervisor(s_t, z_dose)
|
| 356 |
+
K_t = f_bandit(C_t, m_t)
|
| 357 |
+
a_hat_t = f_planner(K_t, m_t, provider_prompt)
|
| 358 |
+
a_t = f_critic(s_t, a_hat_t)
|
| 359 |
+
```
|
| 360 |
+
|
| 361 |
+
Coordination modes change the graph behavior:
|
| 362 |
+
|
| 363 |
+
- `sequential_pipeline`: one pass through the stack.
|
| 364 |
+
- `supervisor_routed`: filters candidates by macro mode.
|
| 365 |
+
- `replan_on_veto`: replans into review mode when the critic rejects.
|
| 366 |
+
- `lightweight_debate`: allows a small debate/replan signal around vetoes.
|
| 367 |
+
|
| 368 |
+
Why this choice: the decomposition creates audit points. Experts can inspect
|
| 369 |
+
whether a failure came from candidate construction, uncertainty routing,
|
| 370 |
+
planner choice, critic behavior, transition logic, or reward shaping.
|
| 371 |
+
|
| 372 |
+
## 7. Graph Safety Mathematics
|
| 373 |
+
|
| 374 |
+
The graph safety module summarizes regimen risk. In the no-artifact fallback,
|
| 375 |
+
the encoder maps a regimen to a 24-dimensional vector:
|
| 376 |
+
|
| 377 |
+
```text
|
| 378 |
+
g = encode_regimen(drugs) in R^24
|
| 379 |
+
```
|
| 380 |
+
|
| 381 |
+
The vector includes hashed drug identity features, drug-class counts,
|
| 382 |
+
side-effect tag load, medication count, contraindicated-pair count, and flags
|
| 383 |
+
for sedative, anticoagulant, and glucose-lowering classes.
|
| 384 |
+
|
| 385 |
+
Pairwise DDI severity is:
|
| 386 |
+
|
| 387 |
+
```text
|
| 388 |
+
score_pair(a, b) =
|
| 389 |
+
0.95 if contraindicated(a, b)
|
| 390 |
+
0.15 otherwise
|
| 391 |
+
```
|
| 392 |
+
|
| 393 |
+
Fallback severe-alert probability is:
|
| 394 |
+
|
| 395 |
+
```text
|
| 396 |
+
p_severe = min(0.99, 0.10 + 0.30 * number_of_risky_pairs)
|
| 397 |
+
```
|
| 398 |
+
|
| 399 |
+
Side-effect probabilities normalize ontology tag counts:
|
| 400 |
+
|
| 401 |
+
```text
|
| 402 |
+
p(tag) = count(tag across regimen) / sum_tag count(tag)
|
| 403 |
+
```
|
| 404 |
+
|
| 405 |
+
If a trained graph artifact exists, learned heads may override the fallback
|
| 406 |
+
severe-alert and side-effect estimates.
|
| 407 |
+
|
| 408 |
+
Why this choice: the graph model supplies dense safety features while the
|
| 409 |
+
verifier still enforces hard contraindication rules. Learned risk can help
|
| 410 |
+
ranking, but it is not trusted as the only safety barrier.
|
| 411 |
+
|
| 412 |
+
## 8. Dosing Mathematics
|
| 413 |
+
|
| 414 |
+
Dose-sensitive drugs are currently selected from sensitive classes:
|
| 415 |
+
|
| 416 |
+
```text
|
| 417 |
+
{anticoagulant, sedative, glucose_lowering}
|
| 418 |
+
```
|
| 419 |
+
|
| 420 |
+
Dose features include interaction load and organ stress:
|
| 421 |
+
|
| 422 |
+
```text
|
| 423 |
+
interaction_load = min(1, number_of_medications / 12)
|
| 424 |
+
|
| 425 |
+
organ_stress = min(
|
| 426 |
+
1,
|
| 427 |
+
max(0, (35 - egfr) / 35)
|
| 428 |
+
+ max(0, (ast - 80) / 80)
|
| 429 |
+
+ max(0, (alt - 80) / 80)
|
| 430 |
+
)
|
| 431 |
+
```
|
| 432 |
+
|
| 433 |
+
The surrogate PK/PD state is:
|
| 434 |
+
|
| 435 |
+
```text
|
| 436 |
+
x = (
|
| 437 |
+
effect_level,
|
| 438 |
+
toxicity_level,
|
| 439 |
+
underdose_risk,
|
| 440 |
+
organ_stress,
|
| 441 |
+
interaction_load
|
| 442 |
+
)
|
| 443 |
+
```
|
| 444 |
+
|
| 445 |
+
Initial proxies:
|
| 446 |
+
|
| 447 |
+
```text
|
| 448 |
+
effect_0 = min(1, 0.35 + 0.45 * adherence)
|
| 449 |
+
toxicity_0 = min(1, 0.08 + 0.40 * organ_stress)
|
| 450 |
+
underdose_0 = max(0, 1 - effect_0)
|
| 451 |
+
```
|
| 452 |
+
|
| 453 |
+
For a dose change `d`:
|
| 454 |
+
|
| 455 |
+
```text
|
| 456 |
+
effective_delta = d * (1 - min(0.6, 0.4 * organ_stress))
|
| 457 |
+
|
| 458 |
+
effect' =
|
| 459 |
+
clip(effect + 0.28 * effective_delta - 0.05 * interaction_load, 0, 1)
|
| 460 |
+
|
| 461 |
+
toxicity_gain =
|
| 462 |
+
max(0, d) * (0.35 + 0.25 * organ_stress + 0.20 * interaction_load)
|
| 463 |
+
|
| 464 |
+
toxicity' =
|
| 465 |
+
clip(0.85 * toxicity + toxicity_gain, 0, 1)
|
| 466 |
+
|
| 467 |
+
underdose' =
|
| 468 |
+
clip(1 - effect' + 0.15 * max(0, -d), 0, 1)
|
| 469 |
+
```
|
| 470 |
+
|
| 471 |
+
Dosing quality proxies:
|
| 472 |
+
|
| 473 |
+
```text
|
| 474 |
+
target_attainment = clip(1 - |effect_level - 0.62|, 0, 1)
|
| 475 |
+
toxicity_proxy = min(1, toxicity + 0.20 * organ_stress + 0.12 * interaction_load)
|
| 476 |
+
underdose_proxy = min(1, underdose_risk + max(0, 0.30 - effect_level))
|
| 477 |
+
measurement_need = max(toxicity_proxy, underdose_proxy)
|
| 478 |
+
```
|
| 479 |
+
|
| 480 |
+
The runtime reward currently uses a coarse dose-mode reward:
|
| 481 |
+
|
| 482 |
+
```text
|
| 483 |
+
dosing_quality_score = 0.75 if action.mode = DOSE_OPT else 0.50
|
| 484 |
+
```
|
| 485 |
+
|
| 486 |
+
The detailed PK/PD analysis is still useful because it influences the agent
|
| 487 |
+
stack and evaluation, even when the scalar reward channel remains deliberately
|
| 488 |
+
simple.
|
| 489 |
+
|
| 490 |
+
Why this choice: dose optimization needs its own state features, but dense
|
| 491 |
+
dosing reward must not overpower legality and safety in early RL training.
|
| 492 |
+
|
| 493 |
+
## 9. Contextual Bandit Co-Policy
|
| 494 |
+
|
| 495 |
+
The bandit proposes a top-k shortlist before the planner finalizes an action.
|
| 496 |
+
Each candidate becomes an 8-dimensional feature vector:
|
| 497 |
+
|
| 498 |
+
```text
|
| 499 |
+
x(c) = [
|
| 500 |
+
1,
|
| 501 |
+
I[legality_precheck],
|
| 502 |
+
estimated_safety_delta,
|
| 503 |
+
burden_delta,
|
| 504 |
+
disease_stability_estimate,
|
| 505 |
+
1 - uncertainty_score,
|
| 506 |
+
I[mode = DOSE_OPT],
|
| 507 |
+
I[mode = REVIEW]
|
| 508 |
+
]
|
| 509 |
+
```
|
| 510 |
+
|
| 511 |
+
An arm is keyed by macro mode and action type:
|
| 512 |
+
|
| 513 |
+
```text
|
| 514 |
+
arm(c) = mode(c) || ":" || action_type(c)
|
| 515 |
+
```
|
| 516 |
+
|
| 517 |
+
### LinUCB
|
| 518 |
+
|
| 519 |
+
For each arm `a`, PolyGuard maintains:
|
| 520 |
+
|
| 521 |
+
```text
|
| 522 |
+
A_a = I + sum x x^T
|
| 523 |
+
b_a = sum r x
|
| 524 |
+
theta_a = A_a^{-1} b_a
|
| 525 |
+
```
|
| 526 |
+
|
| 527 |
+
The score is:
|
| 528 |
+
|
| 529 |
+
```text
|
| 530 |
+
score_a(x) =
|
| 531 |
+
theta_a^T x + alpha * sqrt(x^T A_a^{-1} x)
|
| 532 |
+
```
|
| 533 |
+
|
| 534 |
+
where the default `alpha` is read from `POLYGUARD_BANDIT_ALPHA`, defaulting to
|
| 535 |
+
`0.55`.
|
| 536 |
+
|
| 537 |
+
### Thompson Sampling Variant
|
| 538 |
+
|
| 539 |
+
The alternate score is:
|
| 540 |
+
|
| 541 |
+
```text
|
| 542 |
+
score_a(x) = theta_a^T x + eta,   where eta ~ Normal(0, alpha)
|
| 543 |
+
```
|
| 544 |
+
|
| 545 |
+
The absolute sampled noise is logged as the exploration bonus.
|
| 546 |
+
|
| 547 |
+
### Explicit Exploration
|
| 548 |
+
|
| 549 |
+
With probability `epsilon`, default `0.1`, the policy swaps the top candidate
|
| 550 |
+
with another candidate in the sorted list:
|
| 551 |
+
|
| 552 |
+
```text
|
| 553 |
+
if Uniform(0, 1) < epsilon:
|
| 554 |
+
swap(scored[0], scored[random_non_top_index])
|
| 555 |
+
```
|
| 556 |
+
|
| 557 |
+
After the environment step:
|
| 558 |
+
|
| 559 |
+
```text
|
| 560 |
+
A_a <- A_a + x x^T
|
| 561 |
+
b_a <- b_a + r x
|
| 562 |
+
```
|
| 563 |
+
|
| 564 |
+
Why this choice: the bandit gives a sample-efficient, inspectable exploration
|
| 565 |
+
layer. It can improve candidate ordering without allowing the LLM to leave the
|
| 566 |
+
safe candidate space.
|
| 567 |
+
|
| 568 |
+
## 10. Planner Policy
|
| 569 |
+
|
| 570 |
+
The planner receives candidates, a supervisor mode, and optional provider
|
| 571 |
+
context. It filters candidates by mode when possible:
|
| 572 |
+
|
| 573 |
+
```text
|
| 574 |
+
C_t^m = { c in C_t : mode(c) = m_t }
|
| 575 |
+
```
|
| 576 |
+
|
| 577 |
+
Then the provider selects a candidate id:
|
| 578 |
+
|
| 579 |
+
```text
|
| 580 |
+
y_t ~ pi_theta(. | prompt(C_t^m, o_t))
|
| 581 |
+
candidate_id = parse(y_t)
|
| 582 |
+
a_hat_t = to_action(candidate_id)
|
| 583 |
+
```
|
| 584 |
+
|
| 585 |
+
If an active Transformers/adapter artifact is available, the model generates a
|
| 586 |
+
completion and the runtime extracts a provided `cand_NN`. If no active artifact
|
| 587 |
+
is available or loading fails, the deterministic safety ranker chooses:
|
| 588 |
+
|
| 589 |
+
```text
|
| 590 |
+
argmax_c (legality_precheck(c), estimated_safety_delta(c), -uncertainty_score(c))
|
| 591 |
+
```
|
| 592 |
+
|
| 593 |
+
The planner confidence is:
|
| 594 |
+
|
| 595 |
+
```text
|
| 596 |
+
confidence = max(0.45, 1 - uncertainty_score(candidate))
|
| 597 |
+
```
|
| 598 |
+
|
| 599 |
+
Why this choice: the learned policy is used where language models are useful:
|
| 600 |
+
contextual judgment over a compact set plus rationale generation. Ranking
|
| 601 |
+
fallbacks keep the product path deterministic and testable when model artifacts
|
| 602 |
+
are unavailable.
|
| 603 |
+
|
| 604 |
+
## 11. Critic And Safety Veto
|
| 605 |
+
|
| 606 |
+
The critic re-runs the verifier:
|
| 607 |
+
|
| 608 |
+
```text
|
| 609 |
+
report = V(s_t, a_hat_t)
|
| 610 |
+
```
|
| 611 |
+
|
| 612 |
+
If the report is legal:
|
| 613 |
+
|
| 614 |
+
```text
|
| 615 |
+
a_t = a_hat_t
|
| 616 |
+
```
|
| 617 |
+
|
| 618 |
+
Otherwise, the critic returns a review-style fallback action. The environment
|
| 619 |
+
still subjects that final action to the same legality and anti-cheat gates, so
|
| 620 |
+
critic output is not privileged over the environment.
|
| 621 |
+
|
| 622 |
+
Why this choice: the planner is allowed to be probabilistic, but state mutation
|
| 623 |
+
is not. The critic provides an additional audit point before the environment
|
| 624 |
+
transition.
|
| 625 |
+
|
| 626 |
+
## 12. Anti-Cheat And Reward-Hacking Guards
|
| 627 |
+
|
| 628 |
+
The anti-cheat detector computes an exploit predicate:
|
| 629 |
+
|
| 630 |
+
```text
|
| 631 |
+
E(s_t, a_t) in {0, 1}
|
| 632 |
+
```
|
| 633 |
+
|
| 634 |
+
It fires on:
|
| 635 |
+
|
| 636 |
+
- repeated candidate loops over the last `MAX_REPEATED_ACTIONS = 3` actions;
|
| 637 |
+
- excessive keep-regimen behavior after at least 3 actions;
|
| 638 |
+
- excessive review behavior after at least 3 actions;
|
| 639 |
+
- malformed candidate ids;
|
| 640 |
+
- candidate ids outside the legal candidate set;
|
| 641 |
+
- repeated no-op retries after failed actions;
|
| 642 |
+
- parser exploit patterns in rationale text;
|
| 643 |
+
- repeated no-op behavior on a hidden high-risk DDI holdout pair.
|
| 644 |
+
|
| 645 |
+
The configured ratio thresholds are:
|
| 646 |
+
|
| 647 |
+
```text
|
| 648 |
+
MAX_KEEP_REGIMEN_RATIO = 0.6
|
| 649 |
+
MAX_REVIEW_RATIO = 0.5
|
| 650 |
+
```
|
| 651 |
+
|
| 652 |
+
Reward impact:
|
| 653 |
+
|
| 654 |
+
```text
|
| 655 |
+
anti_cheat_score = 0.001 if E(s_t, a_t) else 0.999
|
| 656 |
+
```
|
| 657 |
+
|
| 658 |
+
Termination impact:
|
| 659 |
+
|
| 660 |
+
```text
|
| 661 |
+
done = true, reason = "exploit_detection" if E(s_t, a_t)
|
| 662 |
+
```
|
| 663 |
+
|
| 664 |
+
Why this choice: RL policies exploit reward functions. PolyGuard makes common
|
| 665 |
+
shortcuts explicit, penalized, and visible in traces instead of treating them
|
| 666 |
+
as silent bad luck.
|
| 667 |
+
|
| 668 |
+
## 13. Reward Components
|
| 669 |
+
|
| 670 |
+
PolyGuard computes 13 reward columns. Every component is clamped by `q`.
|
| 671 |
+
|
| 672 |
+
Let:
|
| 673 |
+
|
| 674 |
+
```text
|
| 675 |
+
u_t = overall uncertainty
|
| 676 |
+
legal = V(s_t, a_t).legal
|
| 677 |
+
exploit = E(s_t, a_t)
|
| 678 |
+
pre_burden, post_burden = burden before/after step
|
| 679 |
+
pre_pairs, post_pairs = severe-pair count before/after step
|
| 680 |
+
```
|
| 681 |
+
|
| 682 |
+
Risk-like deltas become rewards through:
|
| 683 |
+
|
| 684 |
+
```text
|
| 685 |
+
delta_reward(pre, post) = q(0.5 + 0.6 * (pre - post))
|
| 686 |
+
```
|
| 687 |
+
|
| 688 |
+
So:
|
| 689 |
+
|
| 690 |
+
```text
|
| 691 |
+
burden_reward = delta_reward(pre_burden, post_burden)
|
| 692 |
+
pair_reward = delta_reward(pre_pairs, post_pairs)
|
| 693 |
+
|
| 694 |
+
safety_delta_score =
|
| 695 |
+
q(0.65 * pair_reward + 0.35 * burden_reward) if legal
|
| 696 |
+
0.001 otherwise
|
| 697 |
+
```
|
| 698 |
+
|
| 699 |
+
The current component formulas are:
|
| 700 |
+
|
| 701 |
+
| Component | Formula |
|
| 702 |
+
| --- | --- |
|
| 703 |
+
| `format_compliance_score` | `0.999` after schema validation |
|
| 704 |
+
| `candidate_alignment_score` | `0.999` if `candidate_id` starts with `cand_`, else `0.001` |
|
| 705 |
+
| `legality_score` | `0.999` if legal, else `0.001` |
|
| 706 |
+
| `safety_delta_score` | weighted pair/burden improvement if legal, else `0.001` |
|
| 707 |
+
| `burden_improvement_score` | `burden_reward` if legal, else `0.001` |
|
| 708 |
+
| `disease_stability_score` | `0.90` except `STOP_DRUG` or `INCREASE_DOSE_BUCKET`, which use `0.58` |
|
| 709 |
+
| `dosing_quality_score` | `0.75` if action mode is `DOSE_OPT`, else `0.50` |
|
| 710 |
+
| `abstention_quality_score` | `0.82` for review action with `u_t > 0.6`, else `0.56` |
|
| 711 |
+
| `efficiency_score` | `q(1 - step_count / (max_steps + 1))` |
|
| 712 |
+
| `process_fidelity_score` | `0.92` if legal, else `0.08` |
|
| 713 |
+
| `explanation_grounding_score` | `0.80` if rationale exists, else `0.20` |
|
| 714 |
+
| `anti_cheat_score` | `0.001` if exploit detected, else `0.999` |
|
| 715 |
+
| `uncertainty_calibration_score` | `q(1 - |confidence - (1 - u_t)|)` |
|
| 716 |
+
|
| 717 |
+
Sub-environment modifiers:
|
| 718 |
+
|
| 719 |
+
```text
|
| 720 |
+
WEB_SEARCH_MISSING_DATA:
|
| 721 |
+
FETCH_EXTERNAL_EVIDENCE:
|
| 722 |
+
process_fidelity_score >= 0.90
|
| 723 |
+
explanation_grounding_score >= 0.85
|
| 724 |
+
otherwise:
|
| 725 |
+
process_fidelity_score *= 0.75
|
| 726 |
+
|
| 727 |
+
ALTERNATIVE_SUGGESTION:
|
| 728 |
+
RECOMMEND_ALTERNATIVE or SUBSTITUTE_WITHIN_CLASS:
|
| 729 |
+
safety_delta_score >= 0.88
|
| 730 |
+
burden_improvement_score >= 0.76
|
| 731 |
+
otherwise:
|
| 732 |
+
safety_delta_score *= 0.82
|
| 733 |
+
|
| 734 |
+
NEW_DRUG_DECOMPOSITION:
|
| 735 |
+
DECOMPOSE_NEW_DRUG with components:
|
| 736 |
+
explanation_grounding_score >= 0.90
|
| 737 |
+
process_fidelity_score >= 0.88
|
| 738 |
+
uncertainty_calibration_score >= 0.82
|
| 739 |
+
otherwise:
|
| 740 |
+
explanation_grounding_score *= 0.70
|
| 741 |
+
```
|
| 742 |
+
|
| 743 |
+
Why this choice: dense reward reduces sparse-credit problems, but the columns
|
| 744 |
+
are semantically separated so experts can detect when total reward improves
|
| 745 |
+
for the wrong reason.
|
| 746 |
+
|
| 747 |
+
## 14. Primary Reward Channels
|
| 748 |
+
|
| 749 |
+
The 13 columns roll up into four primary channels:
|
| 750 |
+
|
| 751 |
+
```text
|
| 752 |
+
safety_legality =
|
| 753 |
+
avg(
|
| 754 |
+
legality_score,
|
| 755 |
+
candidate_alignment_score,
|
| 756 |
+
anti_cheat_score,
|
| 757 |
+
uncertainty_calibration_score
|
| 758 |
+
)
|
| 759 |
+
|
| 760 |
+
clinical_improvement =
|
| 761 |
+
avg(
|
| 762 |
+
safety_delta_score,
|
| 763 |
+
burden_improvement_score,
|
| 764 |
+
disease_stability_score
|
| 765 |
+
)
|
| 766 |
+
|
| 767 |
+
dosing_quality =
|
| 768 |
+
avg(
|
| 769 |
+
dosing_quality_score,
|
| 770 |
+
abstention_quality_score
|
| 771 |
+
)
|
| 772 |
+
|
| 773 |
+
process_integrity =
|
| 774 |
+
avg(
|
| 775 |
+
format_compliance_score,
|
| 776 |
+
efficiency_score,
|
| 777 |
+
process_fidelity_score,
|
| 778 |
+
explanation_grounding_score
|
| 779 |
+
)
|
| 780 |
+
```
|
| 781 |
+
|
| 782 |
+
Each average is clamped through `q`. These channels are emitted in
|
| 783 |
+
`info.primary_reward_channels`, GRPO logs, reports, plots, and ablation
|
| 784 |
+
summaries.
|
| 785 |
+
|
| 786 |
+
Why this choice: primary channels make the reward legible to judges and domain
|
| 787 |
+
experts without hiding the lower-level reward columns needed for debugging.
|
| 788 |
+
|
| 789 |
+
## 15. Total Reward
|
| 790 |
+
|
| 791 |
+
The scalar environment reward is a weighted average:
|
| 792 |
+
|
| 793 |
+
```text
|
| 794 |
+
R_env(s_t, a_t, s_{t+1}) =
|
| 795 |
+
q( sum_i w_i c_i / sum_i w_i )
|
| 796 |
+
```
|
| 797 |
+
|
| 798 |
+
Current weights sum to 1:
|
| 799 |
+
|
| 800 |
+
| Component | Weight |
|
| 801 |
+
| --- | ---: |
|
| 802 |
+
| `format_compliance_score` | `0.08` |
|
| 803 |
+
| `candidate_alignment_score` | `0.08` |
|
| 804 |
+
| `legality_score` | `0.12` |
|
| 805 |
+
| `safety_delta_score` | `0.15` |
|
| 806 |
+
| `burden_improvement_score` | `0.08` |
|
| 807 |
+
| `disease_stability_score` | `0.10` |
|
| 808 |
+
| `dosing_quality_score` | `0.08` |
|
| 809 |
+
| `abstention_quality_score` | `0.06` |
|
| 810 |
+
| `efficiency_score` | `0.06` |
|
| 811 |
+
| `process_fidelity_score` | `0.06` |
|
| 812 |
+
| `explanation_grounding_score` | `0.03` |
|
| 813 |
+
| `anti_cheat_score` | `0.06` |
|
| 814 |
+
| `uncertainty_calibration_score` | `0.04` |
|
| 815 |
+
|
| 816 |
+
Safety-related terms have the largest combined mass:
|
| 817 |
+
|
| 818 |
+
```text
|
| 819 |
+
legality + safety_delta + burden + disease_stability + anti_cheat
|
| 820 |
+
= 0.12 + 0.15 + 0.08 + 0.10 + 0.06
|
| 821 |
+
= 0.51
|
| 822 |
+
```
|
| 823 |
+
|
| 824 |
+
That does not include candidate alignment or calibration, which also affect
|
| 825 |
+
safety behavior.
|
| 826 |
+
|
| 827 |
+
Why this choice: the scalar reward is needed by RL algorithms, but the weights
|
| 828 |
+
make safety and clinical improvement dominate style, speed, and explanation.
|
| 829 |
+
|
| 830 |
+
## 16. Episode Termination
|
| 831 |
+
|
| 832 |
+
Termination is deterministic:
|
| 833 |
+
|
| 834 |
+
```text
|
| 835 |
+
done = true if:
|
| 836 |
+
exploit_detected
|
| 837 |
+
or step_count >= max_steps
|
| 838 |
+
or at least 3 recent invalid actions
|
| 839 |
+
or severe_pair_count >= 2 after enough steps
|
| 840 |
+
or burden_score > 0.92 after step 2
|
| 841 |
+
or burden_score < 0.25 and no unresolved conflicts
|
| 842 |
+
or wall-clock/step timeout
|
| 843 |
+
```
|
| 844 |
+
|
| 845 |
+
The main success-like terminal condition is:
|
| 846 |
+
|
| 847 |
+
```text
|
| 848 |
+
safe_resolution:
|
| 849 |
+
burden_score < 0.25 and unresolved_conflicts = empty
|
| 850 |
+
```
|
| 851 |
+
|
| 852 |
+
Why this choice: the environment needs both positive endings and explicit
|
| 853 |
+
failure endings. Otherwise an RL policy could learn to loop, delay, or avoid
|
| 854 |
+
difficult decisions.
|
| 855 |
+
|
| 856 |
+
## 17. SFT Warm Start
|
| 857 |
+
|
| 858 |
+
SFT trains the model to emit the target candidate id for curated examples. A
|
| 859 |
+
record is serialized as:
|
| 860 |
+
|
| 861 |
+
```text
|
| 862 |
+
{
|
| 863 |
+
instruction: "Select the safest legal medication action candidate_id.",
|
| 864 |
+
medications: ...,
|
| 865 |
+
candidates: ...,
|
| 866 |
+
answer: target_candidate_id
|
| 867 |
+
}
|
| 868 |
+
```
|
| 869 |
+
|
| 870 |
+
The mathematical objective is standard token-level negative log likelihood:
|
| 871 |
+
|
| 872 |
+
```text
|
| 873 |
+
L_SFT(theta) =
|
| 874 |
+
- sum_{(x, y*) in D} log pi_theta(y* | x)
|
| 875 |
+
```
|
| 876 |
+
|
| 877 |
+
where `y*` includes the target candidate id.
|
| 878 |
+
|
| 879 |
+
Why this choice: SFT gives the policy the output format and obvious clinical
|
| 880 |
+
priors before RL. Without SFT, GRPO would spend too much budget learning to
|
| 881 |
+
name a valid candidate id.
|
| 882 |
+
|
| 883 |
+
## 18. GRPO With Environment-Backed Reward
|
| 884 |
+
|
| 885 |
+
GRPO prompts are built from patient/candidate records. For each prompt, the
|
| 886 |
+
model emits one or more completions containing a candidate id:
|
| 887 |
+
|
| 888 |
+
```text
|
| 889 |
+
y_i ~ pi_theta(. | x), i = 1..G
|
| 890 |
+
```
|
| 891 |
+
|
| 892 |
+
The environment verifier parses each completion, resets a deterministic
|
| 893 |
+
PolyGuard environment using the recorded seed/difficulty/sub-environment, maps
|
| 894 |
+
the candidate id to an action, takes one environment step, and returns a reward.
|
| 895 |
+
|
| 896 |
+
The training reward used by the GRPO reward function is:
|
| 897 |
+
|
| 898 |
+
```text
|
| 899 |
+
legal_bonus = 0.95 if action is legal else 0.05
|
| 900 |
+
|
| 901 |
+
R_GRPO =
|
| 902 |
+
q(0.80 * R_env + 0.20 * legal_bonus)
|
| 903 |
+
```
|
| 904 |
+
|
| 905 |
+
The reward function logs:
|
| 906 |
+
|
| 907 |
+
```text
|
| 908 |
+
generated_candidate_id
|
| 909 |
+
selected_candidate_id
|
| 910 |
+
legal
|
| 911 |
+
reward
|
| 912 |
+
reward_breakdown
|
| 913 |
+
primary_reward_channels
|
| 914 |
+
termination_reason
|
| 915 |
+
```
|
| 916 |
+
|
| 917 |
+
Conceptually, group-relative policy optimization forms a within-prompt
|
| 918 |
+
advantage:
|
| 919 |
+
|
| 920 |
+
```text
|
| 921 |
+
A_i = (R_i - mean_j R_j) / (std_j R_j + epsilon)
|
| 922 |
+
```
|
| 923 |
+
|
| 924 |
+
and updates the policy with a clipped policy-ratio objective:
|
| 925 |
+
|
| 926 |
+
```text
|
| 927 |
+
rho_i(theta) = pi_theta(y_i | x) / pi_old(y_i | x)
|
| 928 |
+
|
| 929 |
+
J_GRPO(theta) =
|
| 930 |
+
E[ (1/G) * sum_i min(
|
| 931 |
+
rho_i(theta) * A_i,
|
| 932 |
+
clip(rho_i(theta), 1 - eps, 1 + eps) * A_i
|
| 933 |
+
)
|
| 934 |
+
- beta * KL(pi_theta || pi_ref)
|
| 935 |
+
]
|
| 936 |
+
```
|
| 937 |
+
|
| 938 |
+
The exact optimizer mechanics are owned by TRL's `GRPOTrainer`; PolyGuard's
|
| 939 |
+
critical contribution is the reward function that executes verifier-backed
|
| 940 |
+
environment transitions instead of scoring completions with a text-only judge.
|
| 941 |
+
|
| 942 |
+
Why this choice: GRPO avoids training a separate value model, works naturally
|
| 943 |
+
with multiple completions per prompt, and lets the environment supply rewards
|
| 944 |
+
that are grounded in legality, transition effects, and anti-cheat checks.
|
| 945 |
+
|
| 946 |
+
## 19. Evaluation Metrics
|
| 947 |
+
|
| 948 |
+
Rollout metrics are sample means over environment steps or episodes:
|
| 949 |
+
|
| 950 |
+
```text
|
| 951 |
+
avg_reward = mean_t R_t
|
| 952 |
+
legality_rate = mean_t I[action_t legal]
|
| 953 |
+
success_rate = mean_episode I[termination_reason = safe_resolution]
|
| 954 |
+
abstention_rate = mean_t I[action_type starts with REQUEST_]
|
| 955 |
+
timeout_rate = timeout_count / number_of_rewards
|
| 956 |
+
```
|
| 957 |
+
|
| 958 |
+
Reward components and primary channels are averaged column-wise:
|
| 959 |
+
|
| 960 |
+
```text
|
| 961 |
+
avg_component_k = mean_t c_{t,k}
|
| 962 |
+
avg_channel_j = mean_t channel_{t,j}
|
| 963 |
+
```
|
| 964 |
+
|
| 965 |
+
Policy-stack ablations compare:
|
| 966 |
+
|
| 967 |
+
```text
|
| 968 |
+
bandit-only
|
| 969 |
+
llm-only
|
| 970 |
+
llm+bandit
|
| 971 |
+
```
|
| 972 |
+
|
| 973 |
+
Baselines include:
|
| 974 |
+
|
| 975 |
+
```text
|
| 976 |
+
no-change:
|
| 977 |
+
always KEEP_REGIMEN
|
| 978 |
+
|
| 979 |
+
rules-only:
|
| 980 |
+
argmax_c (legality_precheck, estimated_safety_delta)
|
| 981 |
+
|
| 982 |
+
greedy:
|
| 983 |
+
argmax_c (estimated_safety_delta, burden_delta)
|
| 984 |
+
```
|
| 985 |
+
|
| 986 |
+
Why this choice: average reward alone is not trustworthy. PolyGuard also
|
| 987 |
+
reports legality, success, process fidelity, anti-cheat counts, invalid
|
| 988 |
+
actions, timeouts, and failure visibility.
|
| 989 |
+
|
| 990 |
+
## 20. What Experts Should Watch
|
| 991 |
+
|
| 992 |
+
High-quality behavior should show:
|
| 993 |
+
|
| 994 |
+
- High legality without collapsing into review-only actions.
|
| 995 |
+
- Lower severe-pair and burden metrics over transitions.
|
| 996 |
+
- Good uncertainty calibration: confidence near `1 - uncertainty`.
|
| 997 |
+
- High process fidelity in special sub-environments.
|
| 998 |
+
- Low exploit detection and low invalid-action counts.
|
| 999 |
+
- GRPO reward improvements that are visible in primary channels, not just in
|
| 1000 |
+
one easy component.
|
| 1001 |
+
|
| 1002 |
+
Potential failure signatures:
|
| 1003 |
+
|
| 1004 |
+
- Reward rises while `safety_legality` falls.
|
| 1005 |
+
- `abstention_quality_score` rises with review abuse.
|
| 1006 |
+
- Candidate alignment is high but `candidate_not_in_legal_set` appears in
|
| 1007 |
+
anti-cheat logs.
|
| 1008 |
+
- Dosing mode is selected often without better target/toxicity metrics.
|
| 1009 |
+
- The policy exploits deterministic first-candidate fallbacks instead of
|
| 1010 |
+
actually emitting candidate ids.
|
| 1011 |
+
|
| 1012 |
+
The intended expert reading is therefore not "the scalar reward went up".
|
| 1013 |
+
The intended reading is:
|
| 1014 |
+
|
| 1015 |
+
```text
|
| 1016 |
+
policy improved iff
|
| 1017 |
+
scalar reward improves
|
| 1018 |
+
and safety_legality does not regress
|
| 1019 |
+
and clinical_improvement improves or stays justified
|
| 1020 |
+
and process_integrity remains high
|
| 1021 |
+
and anti-cheat/failure logs remain acceptable
|
| 1022 |
+
```
|
| 1023 |
+
|
| 1024 |
+
## 21. Design Summary
|
| 1025 |
+
|
| 1026 |
+
PolyGuard chooses:
|
| 1027 |
+
|
| 1028 |
+
- A constrained POMDP/CMDP framing because free-form medication actions are
|
| 1029 |
+
unsafe and hard to evaluate.
|
| 1030 |
+
- A hierarchical multi-agent policy because clinical medication decisions have
|
| 1031 |
+
separable routing, candidate generation, critique, and explanation stages.
|
| 1032 |
+
- A contextual bandit shortlist because it is transparent, online-updateable,
|
| 1033 |
+
and sample efficient.
|
| 1034 |
+
- SFT first because candidate-id format and clinical priors should not be
|
| 1035 |
+
discovered from sparse RL reward.
|
| 1036 |
+
- GRPO next because group-relative rewards fit verifier-backed completion
|
| 1037 |
+
scoring without a separate critic/value model.
|
| 1038 |
+
- Decomposed reward because safety-critical RL must be debuggable by reward
|
| 1039 |
+
channel, not only by total return.
|
| 1040 |
+
- Hard verifier gates because some actions should be impossible to apply even
|
| 1041 |
+
when a learned policy assigns them high probability.
|
| 1042 |
+
|
| 1043 |
+
This is a research environment and simulator. The mathematics describes how
|
| 1044 |
+
PolyGuard trains and evaluates agents inside this controlled OpenEnv setting;
|
| 1045 |
+
it is not a clinical decision rule for patient care.
|
docs/participant_guide_traceability.md
CHANGED
|
@@ -27,7 +27,7 @@ This audit maps the hackathon guide, FAQ, and judging criteria to concrete PolyG
|
|
| 27 |
- Current tracked reports include a non-fallback SFT run, a top-level non-fallback GRPO run, post-save inference, improvement reports, anti-hacking reports, and a 3-model SFT-baseline sweep.
|
| 28 |
- The optional private remote artifact pull checks reward bounds, reward precision, missing charts, GRPO adapter paths, and the anti-hacking/overfit report. Do not describe private artifacts as public judge-facing links unless mirrored.
|
| 29 |
- The strict submission gate passes as of April 26, 2026, but it validates link presence/shape, not live HTTP status.
|
| 30 |
-
- The live public Space target is `TheJackBright/polyguard-openenv`; `/health`
|
| 31 |
|
| 32 |
## Remaining Human-Owned External Step
|
| 33 |
|
|
|
|
| 27 |
- Current tracked reports include a non-fallback SFT run, a top-level non-fallback GRPO run, post-save inference, improvement reports, anti-hacking reports, and a 3-model SFT-baseline sweep.
|
| 28 |
- The optional private remote artifact pull checks reward bounds, reward precision, missing charts, GRPO adapter paths, and the anti-hacking/overfit report. Do not describe private artifacts as public judge-facing links unless mirrored.
|
| 29 |
- The strict submission gate passes as of April 26, 2026, but it validates link presence/shape, not live HTTP status.
|
| 30 |
+
- The live public Space target is `TheJackBright/polyguard-openenv-workbench`; `/health` is validated through the deployed workbench runtime.
|
| 31 |
|
| 32 |
## Remaining Human-Owned External Step
|
| 33 |
|
docs/results/README.md
CHANGED
|
@@ -1,24 +1,38 @@
|
|
| 1 |
# Result Artifacts
|
| 2 |
|
| 3 |
-
These tracked files mirror
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
Current status:
|
| 6 |
|
| 7 |
- OpenEnv structure/runtime validation passes locally.
|
| 8 |
- Test suite passes locally.
|
| 9 |
- Frontend production build passes locally.
|
| 10 |
-
-
|
| 11 |
-
- `
|
| 12 |
-
- `
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
- `hf_space_verification.json` records a live Hugging Face Space validation pass.
|
| 14 |
-
- `active_model_manifest.json` records the currently activated local product model. As of April 26, 2026 this points at the local Qwen 0.5B smoke artifact while the full remote Qwen sweep continues.
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
- `grpo_trl_run.json`
|
| 20 |
-
- `postsave_inference.json`
|
| 21 |
-
- `improvement_report.json`
|
| 22 |
-
- all plot PNGs
|
| 23 |
-
- `hf_space_verification.json`
|
| 24 |
-
- `active_model_manifest.json`
|
|
|
|
| 1 |
# Result Artifacts
|
| 2 |
|
| 3 |
+
These tracked files mirror local smoke/evaluation artifacts and the final curated submission evidence even though `outputs/` and `checkpoints/` are intentionally git-ignored.
|
| 4 |
+
|
| 5 |
+
The shared environment files, training scripts/notebooks, and training logs are
|
| 6 |
+
indexed in `../submission_artifacts.md`.
|
| 7 |
|
| 8 |
Current status:
|
| 9 |
|
| 10 |
- OpenEnv structure/runtime validation passes locally.
|
| 11 |
- Test suite passes locally.
|
| 12 |
- Frontend production build passes locally.
|
| 13 |
+
- `final_submission_evidence/` is the current evidence bundle with curated charts, action traces, final reports, and the public HF artifact Space manifest.
|
| 14 |
+
- `final_submission_evidence/charts/curated/` is the visually reviewed, non-redundant viewing layer used by the root README.
|
| 15 |
+
- `final_submission_evidence/charts/all/` keeps the full chart pool.
|
| 16 |
+
- `final_submission_evidence/charts/stale_superseded/` documents older 0.5B/1.5B-only charts and smoke-run mirrors that are retained for auditability.
|
| 17 |
+
- Final artifact Space: https://huggingface.co/spaces/adithya9903/polyguard-openenv-final-artifacts
|
| 18 |
+
- Qwen 3B SFT/GRPO adapter files and checkpoint tree are available through the final artifact Space; Qwen 0.5B and 1.5B currently have reports/history/post-save SFT evidence but no adapter directories in the checked mirrors.
|
| 19 |
+
- For the older smoke path, `postsave_inference.json` confirms the merged artifact is loaded rather than the fallback policy.
|
| 20 |
+
- `improvement_report.json` shows positive average-reward improvement against the no-change baseline for the older smoke path.
|
| 21 |
- `hf_space_verification.json` records a live Hugging Face Space validation pass.
|
|
|
|
| 22 |
|
| 23 |
+
Best current evidence:
|
| 24 |
+
|
| 25 |
+
- `final_submission_evidence/charts/curated/training/sft_loss_curves_all_models.png`
|
| 26 |
+
- `final_submission_evidence/charts/curated/training/qwen_3b_grpo_reward_curve.png`
|
| 27 |
+
- `final_submission_evidence/charts/curated/training/qwen_3b_grpo_loss_curve.png`
|
| 28 |
+
- `final_submission_evidence/charts/curated/model_comparison/sft_vs_grpo_reward_by_model.png`
|
| 29 |
+
- `final_submission_evidence/charts/curated/model_comparison/qwen_model_grpo_reward.png`
|
| 30 |
+
- `final_submission_evidence/charts/curated/product_over_basic_llm/basic_llm_vs_full_pipeline_reward.png`
|
| 31 |
+
- `final_submission_evidence/charts/curated/product_over_basic_llm/reward_delta_by_seed.png`
|
| 32 |
+
- `final_submission_evidence/charts/curated/reward_and_safety/reward_component_bars.png`
|
| 33 |
+
- `final_submission_evidence/charts/curated/inference/inference_validity_reward.png`
|
| 34 |
+
- `final_submission_evidence/reports/basic_llm_vs_polyguard_report.json`
|
| 35 |
+
- `final_submission_evidence/reports/action_traces.jsonl`
|
| 36 |
+
- `final_submission_evidence/manifest.json`
|
| 37 |
|
| 38 |
+
Older smoke artifacts remain here for auditability and regression checks. The root compatibility charts such as `avg_reward.png` and `policy_stack_avg_reward.png` are intentionally left in place because local gates still check them.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/results/anti_cheat_failure_rates.png
CHANGED
|
|
docs/results/final_submission_evidence/README.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PolyGuard Final Submission Evidence
|
| 2 |
+
|
| 3 |
+
This folder is the current curated evidence set for the final submission. It
|
| 4 |
+
replaces the earlier Qwen 0.5B/1.5B-only view with a single location for the
|
| 5 |
+
best charts, reports, action traces, and model-artifact availability.
|
| 6 |
+
|
| 7 |
+
## Hugging Face Artifact Space
|
| 8 |
+
|
| 9 |
+
- Space: [adithya9903/polyguard-openenv-final-artifacts](https://huggingface.co/spaces/adithya9903/polyguard-openenv-final-artifacts)
|
| 10 |
+
- The root repository README is the primary public narrative. This folder is the
|
| 11 |
+
supporting local mirror for charts, reports, traces, and artifact availability.
|
| 12 |
+
|
| 13 |
+
## Shared Environment, Logs, And Scripts
|
| 14 |
+
|
| 15 |
+
The full index for shared environment files, training scripts, notebooks, and
|
| 16 |
+
training logs is [Submission Artifact Index](../../submission_artifacts.md).
|
| 17 |
+
|
| 18 |
+
- Environment/runtime: `openenv.yaml`, `pyproject.toml`, `uv.lock`, `requirements*.txt`, `Dockerfile*`, `app/env/`, `server/app.py`, and `app/hf_space/Dockerfile`.
|
| 19 |
+
- Training scripts/notebooks: `PolyGuard_SFT_GRPO_One_Run_Runner.ipynb`, `notebooks/09_training_loop.ipynb`, `scripts/train_sft_trl.py`, `scripts/train_grpo_trl.py`, `scripts/deploy_training_space.py`, and `app/hf_space/training_runner.py`.
|
| 20 |
+
- Training logs/results: this folder's `reports/`, `docs/results/sweeps/`, and `docs/results/submission_evidence_qwen_0_5b_1_5b_3b/reports/`.
|
| 21 |
+
|
| 22 |
+
## Artifact Availability
|
| 23 |
+
|
| 24 |
+
| Model | SFT adapter | GRPO adapter | Checkpoints | Reports | Status |
|
| 25 |
+
| --- | --- | --- | --- | --- | --- |
|
| 26 |
+
| Qwen 0.5B | missing | missing | missing | yes | reports_only_or_partial |
|
| 27 |
+
| Qwen 1.5B | missing | missing | missing | yes | reports_only_or_partial |
|
| 28 |
+
| Qwen 3B | yes | yes | yes | yes | complete |
|
| 29 |
+
|
| 30 |
+
Qwen 0.5B and 1.5B currently have SFT histories/reports and post-save SFT
|
| 31 |
+
evidence in this repository, but no downloadable SFT/GRPO adapter directories
|
| 32 |
+
were present in the local checkout or authenticated artifact repos at packaging
|
| 33 |
+
time. Qwen 3B has both SFT and GRPO adapters, checkpoint metadata/intermediate
|
| 34 |
+
checkpoints, GRPO history, post-save GRPO inference, and policy ablation
|
| 35 |
+
evidence.
|
| 36 |
+
|
| 37 |
+
## Chart Organization
|
| 38 |
+
|
| 39 |
+
- `charts/curated/` is the visually reviewed, non-redundant submission view.
|
| 40 |
+
- `charts/all/` is the full chart pool, including individual run curves and diagnostics.
|
| 41 |
+
- `charts/frontpage/` is kept as the earlier compact compatibility set.
|
| 42 |
+
- `charts/stale_superseded/` documents older 0.5B/1.5B-only charts and smoke-run mirrors.
|
| 43 |
+
|
| 44 |
+
Recommended README charts:
|
| 45 |
+
|
| 46 |
+
- `charts/curated/training/sft_loss_curves_all_models.png`
|
| 47 |
+
- `charts/curated/training/qwen_3b_grpo_reward_curve.png`
|
| 48 |
+
- `charts/curated/training/qwen_3b_grpo_loss_curve.png`
|
| 49 |
+
- `charts/curated/model_comparison/sft_vs_grpo_reward_by_model.png`
|
| 50 |
+
- `charts/curated/model_comparison/qwen_model_grpo_reward.png`
|
| 51 |
+
- `charts/curated/product_over_basic_llm/basic_llm_vs_full_pipeline_reward.png`
|
| 52 |
+
- `charts/curated/product_over_basic_llm/reward_delta_by_seed.png`
|
| 53 |
+
- `charts/curated/reward_and_safety/reward_component_bars.png`
|
| 54 |
+
- `charts/curated/inference/inference_validity_reward.png`
|
| 55 |
+
|
| 56 |
+
## Improvement Evidence
|
| 57 |
+
|
| 58 |
+
- Basic LLM proxy vs full PolyGuard pipeline reward delta:
|
| 59 |
+
`+0.043` average reward in favor of the full pipeline.
|
| 60 |
+
- Full pipeline legality rate: `1.0`.
|
| 61 |
+
- Basic LLM failure/exploit rate: `0.25`.
|
| 62 |
+
- Full pipeline failure/exploit rate: `0.0`.
|
| 63 |
+
|
| 64 |
+
Reward values in the tracked API/reports remain numeric and clamped to
|
| 65 |
+
`[0.001, 0.999]` at three decimal precision.
|
| 66 |
+
|
| 67 |
+
## Visual Review Notes
|
| 68 |
+
|
| 69 |
+
The README uses the clearest training, comparison, and product-lift charts. A
|
| 70 |
+
few diagnostics are intentionally kept out of the top-level README: the
|
| 71 |
+
train-vs-holdout gap plot is effectively zero-gap/blank, the anti-cheat chart
|
| 72 |
+
is audit-oriented, and policy-ablation reward is supplemental because the
|
| 73 |
+
product-over-basic-LLM charts communicate the improvement more directly.
|
| 74 |
+
|
| 75 |
+
See `charts/curated/README.md` for the full curated index and
|
| 76 |
+
`charts/stale_superseded/README.md` for superseded 0.5B/1.5B-only charts and
|
| 77 |
+
smoke mirrors.
|
docs/results/final_submission_evidence/charts/all/anti_cheat_failure_rates.png
ADDED
|
docs/results/final_submission_evidence/charts/all/avg_reward.png
ADDED
|
docs/results/final_submission_evidence/charts/all/basic_llm_vs_full_pipeline_latency.png
ADDED
|
docs/results/final_submission_evidence/charts/all/basic_llm_vs_full_pipeline_legality.png
ADDED
|
docs/results/final_submission_evidence/charts/all/basic_llm_vs_full_pipeline_reward.png
ADDED
|
docs/results/final_submission_evidence/charts/all/basic_llm_vs_full_pipeline_reward_delta_by_seed.png
ADDED
|
docs/results/final_submission_evidence/charts/all/grpo_reward_curves.png
ADDED
|
docs/results/final_submission_evidence/charts/all/inference_latency_validity.png
ADDED
|
docs/results/final_submission_evidence/charts/all/inference_validity_reward.png
ADDED
|
docs/results/final_submission_evidence/charts/all/legality_rate.png
ADDED
|
docs/results/final_submission_evidence/charts/all/policy_ablation_avg_reward.png
ADDED
|
docs/results/final_submission_evidence/charts/all/policy_ablation_exploit_detection.png
ADDED
|
docs/results/final_submission_evidence/charts/all/policy_ablation_legality.png
ADDED
|
docs/results/final_submission_evidence/charts/all/policy_stack_avg_reward.png
ADDED
|
docs/results/final_submission_evidence/charts/all/primary_reward_channel_bars.png
ADDED
|