Cyber_analyst-round1 / assets /env_rl_training_flow_diagram.mmd
Humanlearning's picture
diagrams updated
5809a6c
%%{init: {"theme": "base", "themeCSS": "svg { background: #ffffff; }", "themeVariables": {"background": "#ffffff", "mainBkg": "#ffffff", "edgeLabelBackground": "#ffffff", "fontFamily": "Arial, Helvetica, sans-serif", "primaryTextColor": "#111827", "lineColor": "#0f172a", "clusterBkg": "#ffffff", "clusterBorder": "#cbd5e1"}, "flowchart": {"htmlLabels": false, "curve": "basis", "nodeSpacing": 58, "rankSpacing": 70, "padding": 24}}}%%
flowchart TD
%% Diagram: environment + RL training flow, laid out top-down.
%% The init directive above selects the base theme, forces white backgrounds
%% for the SVG, nodes, edge labels and clusters, sets the font and text/line
%% colors, and configures the flowchart renderer: htmlLabels off, basis curves,
%% nodeSpacing 58, rankSpacing 70, padding 24.
%%
%% Style classes, one per stage of the flow. All share a 2px stroke and the
%% same dark text color; only fill and stroke colors differ.
%%   setup    - blue   : run preparation nodes
%%   episode  - green  : nodes inside the episode subgraph
%%   train    - violet : training loop, update, logging and evaluation nodes
%%   adapt    - orange : decision, curriculum and cache refresh nodes
%%   artifact - slate  : saved outputs
classDef setup fill:#eff6ff,stroke:#2563eb,stroke-width:2px,color:#111827;
classDef episode fill:#ecfdf5,stroke:#059669,stroke-width:2px,color:#111827;
classDef train fill:#f5f3ff,stroke:#7c3aed,stroke-width:2px,color:#111827;
classDef adapt fill:#fff7ed,stroke:#ea580c,stroke-width:2px,color:#111827;
classDef artifact fill:#f1f5f9,stroke:#64748b,stroke-width:2px,color:#111827;
%% Run setup: a linear chain from the start of the run, through cache
%% preparation for the train / validation / hidden_eval splits and the
%% "require" cache mode, to a baseline evaluation, ending at the entry
%% of the GRPO training loop.
Start["Start run\nbase model + config"] --> Cache["Prepare cache\ntrain / validation / hidden_eval"]
Cache --> Require["Modal cache mode\nrequire"]
Require --> Baseline["Baseline eval\nscripted or model rollouts"]
Baseline --> TrainLoop["GRPO training loop"]
%% Episode subgraph: a single environment episode.
%% After reset, the flow cycles Observe -> Prompt -> Generate -> Step until the
%% "done?" check answers yes; the terminal verifier then runs and feeds the
%% reward components node.
%% NOTE(review): Mermaid documents that a subgraph's direction is ignored when
%% any of its nodes link to nodes outside the subgraph. Reset and Rewards both
%% have outside links, so "direction TB" likely has no effect here and the
%% parent TD direction applies (same orientation) - confirm against the
%% Mermaid version used to render.
subgraph Episode["One OpenEnv episode"]
direction TB
Reset["reset(seed)\nload cached app + policy"] --> Observe["Observation\nphase, hints, tools"]
Observe --> Prompt["Build prompt\nJSON action only"]
Prompt --> Generate["Model generates\none action"]
Generate --> Step["step(action)\nphase gate + tool"]
Step --> Done{"done?"}
%% Not finished: loop back for the next observation.
Done -- no --> Observe
%% Finished: run terminal verification, then compute reward components.
Done -- yes --> Verify["Terminal verifier\nsecurity + regression + anti-cheat"]
Verify --> Rewards["Reward components\ndiscovery, security, regression, safety"]
end
%% Outer training loop: the training loop enters the episode at Reset, and the
%% episode's reward components feed the GRPO update, followed by metric logging.
TrainLoop --> Reset
Rewards --> Update["GRPO update\nLoRA checkpoint"]
Update --> Metrics["Trackio logging\nrewards, pass rates, latency"]
%% After logging, the Decision node has four outgoing branches.
Metrics --> Decision{"next step?"}
%% continue: go straight back to the training loop.
Decision -- continue --> TrainLoop
%% rebalance: pass through the curriculum controller, then resume training.
Decision -- rebalance --> Curriculum["Curriculum controller\nsampling weights"]
Curriculum --> TrainLoop
%% weak spot: refresh the cache, re-entering the flow at the Cache setup node.
Decision -- weak spot --> Refresh["Async cache refresh\nnew validated bundles"]
Refresh --> Cache
%% final: held-out evaluation, before/after comparison, then saved artifacts.
Decision -- final --> Heldout["Held-out eval\nunseen seeds and layouts"]
Heldout --> Compare["Before/after summary\nsuccess + reward lift"]
Compare --> Artifacts["Saved artifacts\noutputs/evals + outputs/rollouts"]
%% Attach the style classes defined by the classDef statements to their nodes.
%% TrainLoop is first declared in the setup chain but is styled as "train".
class Start,Cache,Require,Baseline setup;
class Reset,Observe,Prompt,Generate,Step,Done,Verify,Rewards episode;
class TrainLoop,Update,Metrics,Heldout,Compare train;
class Decision,Curriculum,Refresh adapt;
class Artifacts artifact;
%% Default stroke color and width for every link in the diagram.
linkStyle default stroke:#0f172a,stroke-width:2px;