Commit: be8eade
Parent(s): 448eddd
feat: enhance scenario authoring and caching mechanisms, update action submission terminology, and improve reward configuration for CyberSecurity_OWASP environment
Files changed:

- .agents/skills/cybersecurity-owasp-trainer/SKILL.md +26 -9
- .agents/skills/cybersecurity-owasp-trainer/references/trl-unsloth-openenv-notes.md +1 -1
- 00_PROJECT_BRIEF.md +1 -2
- 01_ARCHITECTURE.md +126 -68
- AGENTS.md +5 -5
- README.md +54 -11
- assets/architecture_diagram.mmd +55 -48
- assets/architecture_diagram.svg +172 -81
- assets/env_rl_training_flow_diagram.mmd +36 -18
- assets/env_rl_training_flow_diagram.svg +146 -88
- config.py +180 -0
- configs/scenario_authoring.small.json +34 -0
- evals.py +6 -4
- models.py +15 -1
- pyproject.toml +5 -0
- reward_config.py +119 -0
- rewards.py +391 -40
- scripts/generate_scenario_cache.py +56 -0
- scripts/generate_scenarios.sh +1 -1
- scripts/modal_ephemeral_train.py +110 -11
- scripts/modal_train_grpo.py +325 -63
- server/CyberSecurity_OWASP_environment.py +96 -10
- server/__init__.py +2 -0
- server/app_sandbox.py +59 -3
- server/curriculum.py +13 -5
- server/episode_logger.py +7 -0
- server/scenario_cache.py +525 -0
- server/scenario_factory.py +1 -1
- server/verifier.py +3 -2
- tests/helpers.py +19 -7
- tests/test_closed_loop_runtime.py +2 -2
- tests/test_invalid_actions.py +4 -0
- tests/test_modal_scenario_cache_static.py +39 -0
- tests/test_reward_config.py +48 -0
- tests/test_rewards.py +63 -1
- tests/test_scenario_authoring_config.py +72 -0
- tests/test_scenario_cache.py +148 -0
- tests/test_trackio_utils.py +1 -1
- training/configs/grpo_small.yaml +142 -1
- training/reward_funcs.py +20 -0
- training/rollout.py +15 -2
- training/trackio_utils.py +84 -7
- training/train_grpo.py +12 -3
- uv.lock +2 -0
- validators.py +52 -2
.agents/skills/cybersecurity-owasp-trainer/SKILL.md
CHANGED
@@ -29,6 +29,8 @@ Do not start real training until all checks below are true:

- A local server or Docker server can run, and at least one manual episode completes.
- Scripted random, bad, and oracle policies run without crashing; oracle gets high reward on easy seeds.
- At least 10 validation rollouts complete and sampled rollout artifacts look behaviorally plausible.
- The validated scenario cache exists, is mounted, and meets the configured split/difficulty minimums.
- Modal smoke and GRPO runs use `CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE=require`; runtime `reset()` must not compile scenarios or call an LLM during training/eval.
- Trackio run config is set and can log a smoke metric locally or to the canonical Space.

If any gate fails, fix the environment, verifier, reward engine, or rollout parser before touching trainer scale.

@@ -46,23 +48,33 @@ Prefer the existing repo modules:

Default environment values:

```powershell
$env:MODEL_NAME = "unsloth/gemma-4-E2B-it"
$env:TRACKIO_SPACE_ID = "Humanlearning/CyberSecurity_OWASP-trackio"
$env:TRACKIO_PROJECT = "CyberSecurity_OWASP"
$env:DIFFICULTY = "0"
$env:CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR = "scenario_cache"
$env:CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE = "fallback"
```

Use level-0 debug runs before scaling, and verify them through Modal smoke/ephemeral runs.

Modal uses two persistent cache volumes:

- `CyberSecurity_OWASP-model-cache`: Hugging Face, torch, Unsloth, Triton, and model artifacts.
- `CyberSecurity_OWASP-scenario-cache`: validated executable scenario bundles for `reset()`.

Scenario/curriculum authoring is config-driven through `configs/scenario_authoring.small.json`. The default offline author model is `deepseek-ai/DeepSeek-V4-Pro`; this is not the RL training policy model. The RL training model is pinned to `unsloth/gemma-4-E2B-it`, matching the Unsloth Gemma 4 E2B RL notebook.

## Training Workflow

1. Validate the environment first: run the targeted tests that cover models, reset/step/state, rewards, anti-cheat, seed reproducibility, invalid actions, rollouts, config, and scenario cache.
2. Prepare the scenario cache once per generator/verifier version: `scripts/modal_train_grpo.py --mode prepare-cache` or `scripts/modal_ephemeral_train.py --mode prepare-cache`.
3. Run the CPU-only Modal scenario-cache preflight before any GPU training. If cache hit rate or coverage is below config, stop and refill the cache instead of allocating a GPU.
4. Run a frozen-model or dummy-policy rollout on Modal and inspect the action trace, observations, terminal reason, cache metadata, and reward breakdown.
5. Confirm Trackio receives component metrics and the run name follows `CyberSecurity_OWASP-<model>-<algo>-level<difficulty>-<YYYYMMDD-HHMM>-<git_sha>`.
6. Start a very small GRPO run only after the above passes. Start via `scripts/modal_train_grpo.py --mode train`.
7. Evaluate baseline, trained, and held-out splits with `training/eval_before_after.py` and save summaries under `outputs/evals/`.
8. Save sampled rollouts under `outputs/rollouts/` for baseline, mid-training, trained, and held-out evidence.

@@ -71,17 +83,22 @@ Track at least these behavior columns:

- Reward components: total, discovery, security, regression, public routes, patch quality, visible tests, safety, anti-cheat.
- Rates: success, exploit-block, regression preservation, public-route preservation, anti-cheat pass, invalid action, timeout, safety violation, reward-hacking suspected.
- Efficiency: episode length mean/p95, rollouts per second, tokens per second, loss, learning rate, KL, grad norm.
- Environment timing: reset, step, verifier, reward, scenario cache hit/miss, scenario bundle load, scenario compile fallback, error rate, difficulty, seed.

Stop or roll back if reward rises while sampled traces show deny-all patches, hardcoded users/resources/tenants, fixture/test tampering, repeated invalid actions, public routes being locked, or visible-test-only optimization.

Stop or downgrade to local-dev only if Modal training/eval shows runtime scenario compilation, cache misses in required mode, or cache hit rate below the configured target.

## TRL, OpenEnv, And Unsloth Guidance

- Use TRL GRPO for verifier-driven rewards. Keep multiple independent reward functions for logging and diagnosis.
- Keep the existing custom rollout path unless deliberately migrating to TRL's `environment_factory`. If migrating, preserve typed actions, observations, reward component logging, anti-cheat flags, and rollout artifacts.
- Use Modal as the default training path; local-only vLLM/GRPO execution is intentionally avoided in this repository.
- For OpenEnv server training concurrency, ensure the server supports enough concurrent sessions for the generation batch.
- Keep scenario generation out of the rollout hot path. `reset()` should clone cached bundles; any LLM scenario authoring belongs to offline cache prep.
- GPU training launchers must call the CPU-only scenario-cache preflight before spawning the L4 function, so missing cache coverage fails before GPU allocation.
- Use Unsloth with LoRA or QLoRA for memory efficiency when the training machine supports it. Start from an instruct-capable checkpoint and verify the model has non-zero success probability before RL.
- Do not swap the RL model away from `unsloth/gemma-4-E2B-it` for smoke runs. Cost-control should use `--max-steps`, `--dataset-size`, `--max-completion-length`, and cache preflight, not a different model.
- Pin and smoke-test TRL, Unsloth, vLLM, CUDA, and torch versions before longer runs.
- Save LoRA adapters or use Unsloth-supported merged save paths. Do not naively upcast a 4-bit model and merge adapters manually.
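
The cache-mode gate above can be enforced with a very small runtime check. The sketch below is illustrative only: it uses the `CYBERSECURITY_OWASP_SCENARIO_CACHE_*` variables and the `fallback`/`require`/`disabled` modes named in this commit, but the `resolve_cached_bundle` helper and its bundle-lookup layout are assumptions, not the repo's `server/scenario_cache.py`.

```python
import os
from pathlib import Path


def resolve_cached_bundle(scenario_key: str) -> Path | None:
    """Illustrative gate for the scenario cache modes described above (hypothetical helper)."""
    cache_dir = Path(os.environ.get("CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR", "scenario_cache"))
    mode = os.environ.get("CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE", "fallback")

    bundle = cache_dir / scenario_key
    if bundle.is_dir():
        return bundle  # cache hit: reset() only clones this validated bundle

    if mode == "require":
        # Modal smoke/training/eval: a miss is a hard failure, never a runtime compile.
        raise RuntimeError(f"scenario cache miss for {scenario_key!r} with cache mode 'require'")
    # 'fallback' (local dev) or 'disabled': the caller may compile deterministically instead.
    return None
```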
.agents/skills/cybersecurity-owasp-trainer/references/trl-unsloth-openenv-notes.md
CHANGED
@@ -32,7 +32,7 @@ Recheck these pages before major dependency upgrades because TRL, OpenEnv integr

- Start from a capable instruct model or lightly format-tuned model. If success probability is effectively zero, RL will not bootstrap.
- Keep reward functions/verifiers simple and trustworthy first; add shaping only after sparse reward blocks learning.
- Unsloth recipes commonly use Qwen, Gemma, Llama, Phi, Mistral, and gpt-oss variants. For this repo, prefer the configured `Qwen/Qwen3-1.7B` or another small instruct/coder checkpoint for smoke runs.
- For Unsloth-specific GRPO recipes, use more than two generations per prompt when hardware allows. In this repo, keep `num_generations=2` only for smoke/debug runs; for non-smoke training runs default to `num_generations>=6`.
- Pin torch, CUDA, vLLM, TRL, and Unsloth versions for any serious run, then run a short smoke test before scaling.

## Saving And Serving

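
As a hedged illustration of that generation-count rule, the snippet below picks `num_generations` from the run type before building TRL's `GRPOConfig`. The surrounding argument values mirror the `grpo_config` example in AGENTS.md; the `build_grpo_config` helper and its `is_smoke` flag are assumptions, not the repo's launcher code.

```python
from trl import GRPOConfig


def build_grpo_config(is_smoke: bool) -> GRPOConfig:
    # Smoke/debug runs stay cheap with 2 generations; non-smoke runs default to >= 6.
    num_generations = 2 if is_smoke else 6
    return GRPOConfig(
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=32,
        num_generations=num_generations,
        max_prompt_length=4096,
        max_completion_length=768,
        use_vllm=True,
    )
```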
00_PROJECT_BRIEF.md
CHANGED
@@ -56,7 +56,7 @@ This environment is useful because it targets a real gap between today’s scann

- **Scanners detect patterns.** This environment trains policy-aware reasoning.
- **Unit tests check known cases.** This environment includes hidden authorization invariants.
- **Static repair can overfit.** This environment forces the model to preserve valid business behavior.
- **One-app benchmarks are easy to memorize.** This environment prepares and caches many equivalent-but-different apps from policy graphs, templates, route shapes, schema names, and hidden test seeds, then keeps runtime `reset()` deterministic and fast.

The outcome is a model that becomes better at a practical DevSecOps workflow: safely reviewing and repairing authorization logic in small-to-medium web apps.

@@ -142,4 +142,3 @@ CyberSecurity_OWASP/

| OpenEnv build/deploy docs | Defines the required OpenEnv structure: models, server, client, Docker, HF Spaces deployment. | 8.5/10 |
| Hackathon judging criteria | Aligns deliverables with scoring: innovation, storytelling, reward improvement, and training pipeline. | 9/10 |
| TRL/OpenEnv GRPO example | Shows a practical pattern for environment rollouts, reward functions, and Trackio logging. | 8/10 |

01_ARCHITECTURE.md
CHANGED
@@ -22,32 +22,30 @@ Editable source: `assets/architecture_diagram.mmd`

```mermaid
flowchart TB
  subgraph A[Async Scenario Authoring + Curriculum Factory]
    A1[Config-guided LLM Scenario Author\nDeepSeek-V4-Pro default]
    A2[ScenarioSpec JSON\npolicy, app family, bug target]
    A3[Template + A01 Mutator\nFastAPI code variants]
    A4[Deterministic Compiler\nexecutable bundle]
    A5[Static + Dynamic Verifier\nsolvable, safe, hidden/visible tests]
    A6[Difficulty Calibrator\nbaseline pass-rate buckets]
    A7[Versioned Scenario Cache\nsplit, difficulty, family, hash]
    A1 --> A2 --> A3 --> A4 --> A5 --> A6 --> A7
  end

  subgraph B[CyberSecurity_OWASP OpenEnv Runtime]
    B1[reset\(seed, difficulty, family_budget\)\ncache lookup only]
    B2[Curriculum Sampler\nvalidated cache slice]
    B3[Episode State Store\nphase, history, cache metadata, patch diff]
    B4[Typed Action Tools\ninspect, request, patch, visible tests]
    B5[Ephemeral App Sandbox\ncloned cached workspace + fixtures]
    B6[Multi-layer Verifier\nvisible, hidden, oracle, regression]
    B7[Deterministic Reward Engine\nstable components + penalties]
    B8[Episode Artifact Logger\nJSONL transcript + verifier + diff]
    B1 --> B2 --> B3 --> B4
    B4 <--> B5
    B5 --> B6 --> B7 --> B3
    B3 --> B8
  end

  subgraph C[Single LLM Agent]

@@ -57,42 +55,63 @@ flowchart TB

    C1 --> C2 --> C3
  end

  subgraph D[Training + Evaluation + Demo]
    D1[Parallel Rollouts\nfast cached reset]
    D2[TRL GRPO + LoRA]
    D3[Trackio Curves\nreward, pass rates, cache metrics]
    D4[Held-out Family Eval\nbase vs trained model]
    D5[Demo Artifacts\nbefore/after traces + JSONL]
    D1 --> D2 --> D3 --> D4 --> D5
  end

  subgraph E[Feedback / Adaptation Loop]
    E1[Episode logs + failures]
    E2[Mastery Model\nweakness and plateau tracking]
    E3[Cache Sampling Weights\nnew generation queue]
    E1 --> E2 --> E3
  end

  A7 --> B1
  C3 -->|typed action| B4
  B4 -->|observation + reward + done| C1
  B7 --> D1
  D2 --> C1
  B8 --> E1
  E3 --> A1
```

## 3. Component responsibilities

### 3.1 Async Scenario Authoring Plane

Scenario generation is offline, asynchronous, validated, and cached. Runtime `reset()` must not call an LLM and must not compile a fresh app during Modal smoke, training, or evaluation runs.

The scenario authoring plane outputs complete executable bundles:

- `scenario.json`;
- `app_source/`;
- `policy_graph.json`;
- `visible_tests.py`;
- `hidden_tests.py`;
- `oracle_tests.py`;
- `expected_exploit_trace.json`;
- `reward_config.json`;
- `metadata.json`.

The default scenario/curriculum author is configured in `configs/scenario_authoring.small.json`:

```yaml
provider: huggingface
model_id: deepseek-ai/DeepSeek-V4-Pro
thinking_mode: thinking
reasoning_effort: high
temperature: 1.0
top_p: 1.0
```

DeepSeek-V4-Pro is only used for offline scenario/curriculum authoring. It is not the RL policy model unless explicitly selected for training.

The compiler remains the main anti-overfitting mechanism. It should vary:

- route names;
- schema names;

@@ -105,13 +124,30 @@ The scenario compiler is the main anti-overfitting mechanism. It should vary:

- visible test coverage;
- hidden invariant seeds.

The runtime treats curriculum and cache sampling as first-class scenario inputs:

- `CurriculumController` tracks target weakness mastery, recent reward trend, failure counts, and difficulty tier.
- Offline cache prep uses the configured LLM author, deterministic compiler, verifier, and baseline-agent difficulty calibrator.
- `ScenarioCache` stores validated bundles by split, difficulty, family, generator version, verifier version, and scenario hash.
- Hidden-eval episodes hold out scenario families, not only seeds, by marking evaluation-only scenario-family metadata in state rather than observations.

Cache keys include:

```text
difficulty_level
authz_bug_type
app_family
framework
policy_shape
tenant_model
exploit_depth
patch_scope
regression_risk
generator_version
verifier_version
scenario_hash
```

### 3.2 Policy Graph Generator

The policy graph is the ground truth for intended behavior.

@@ -167,7 +203,7 @@ MVP bug classes:

The OpenEnv server should implement the standard lifecycle:

- `reset()` — initialize a fresh episode from a cached scenario bundle.
- `step(action)` — execute one typed action and return observation, reward, and done.
- `state()` — expose episode metadata for debugging and evaluation.

@@ -189,15 +225,19 @@ The agent should interact through typed actions. Keep the interface small enough

```python
@dataclass
class CyberSecurityOWASPAction(Action):
    tool_name: Literal[
        "inspect_policy_graph",
        "list_routes",
        "read_openapi",
        "read_file",
        "search_code",
        "send_local_request",
        "compare_identities",
        "submit_diagnosis",
        "patch_file",
        "run_visible_tests",
        "submit_fix",
        "noop",
    ]
    arguments: dict
```

@@ -206,12 +246,13 @@ Recommended actions:

| Action | Purpose | Safety boundary |
|---|---|---|
| `inspect_policy_graph` | Read intended authorization rules. | Only synthetic policy. |
| `list_routes` | See local app route map. | No internet target. |
| `read_file` | Inspect selected source file. | Sandbox allowlist only. |
| `send_local_request` | Validate behavior against local app. | Local generated app only. |
| `submit_diagnosis` | Record bug class, route, policy rule, evidence trace IDs, and fix plan. | Does not reveal hidden tests. |
| `run_visible_tests` | Run visible tests. | No hidden test disclosure. |
| `patch_file` | Modify source through unified diff or full content. | Patch size and file allowlist limits. |
| `submit_fix` | End episode and trigger hidden eval. | Final hidden score only, no leaked test details. |

### 3.6 Observation schema

@@ -263,9 +304,9 @@ class CyberSecurityOWASPState(State):

```text
1. reset()
   - curriculum selects difficulty tier and target weakness
   - runtime samples or directly loads a validated cached bundle
   - clone cached `app_source/` into an isolated ephemeral workspace
   - initialize fixture state, cache metadata, and sandbox handles
   - return initial observation

2. agent loop

@@ -290,6 +331,8 @@ class CyberSecurityOWASPState(State):

   - send metrics to Trackio during training/eval
```

`CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE=require` is mandatory for Modal smoke, training, and evaluation. In that mode a missing cache bundle is a hard failure. Local development may use `fallback`, which compiles deterministically on a miss, but that path is not allowed for meaningful training.

## 5. Reward design

The reward should be deterministic, decomposed, and resistant to reward hacking. The maximum terminal reward remains **15.0** and high reward requires deterministic verifier success, not explanation quality.

@@ -306,10 +349,22 @@ Stable reward keys:

    "visible_tests": 0.0,
    "safety": 0.0,
    "anti_cheat": 0.0,
    "terminal_total": 0.0,
    "progressive": 0.0,
    "step_penalty": 0.0,
    "speed_bonus": 0.0,
    "token_penalty": 0.0,
    "behavior_penalty": 0.0,
    "train_total": 0.0,
    "total": 0.0,
}
```

Sparse evaluation uses `terminal_total` as `total`. Dense training uses `terminal_total + shaping_weight * progressive + efficiency - penalties` as `total`, with all reward values and short descriptions configured in `training/configs/grpo_small.yaml`.

### Reward components

| Component | Purpose |

@@ -387,26 +442,28 @@ Editable source: `assets/env_rl_training_flow_diagram.mmd`

```text
1. Build CyberSecurity_OWASP OpenEnv server.
2. Prepare validated scenario cache once per generator/verifier version.
3. Run baseline eval with cached validation/held-out bundles.
4. Train with GRPO/TRL or Unsloth using cached rollout episodes.
5. Log reward components, pass rates, reset latency, and cache hit metrics to Trackio.
6. Run held-out eval every N training steps.
7. Inspect failure clusters and cache sampling weights.
8. Refresh only 5-10% of scenarios per epoch when new weak spots are found.
9. Produce final demo: before/after trace + reward curve + held-out eval table.
```

Recommended initial training setup (Modal-first):

```text
Model: unsloth/gemma-4-E2B-it
Algorithm: GRPO via TRL or Unsloth-compatible loop
Dataset prompt: repeated task instruction with randomized scenario IDs
Max steps per episode: 30
Rollouts per prompt: 2-4
Logging: Trackio
Primary eval: held-out deterministic test pass rate
Scenario cache mode: require
Scenario cache volume: CyberSecurity_OWASP-scenario-cache

Training execution is expected to run on Modal (persistent or ephemeral) rather than locally.
```

@@ -501,3 +558,4 @@ Expected endpoints:

| Hackathon judging criteria | Informs demo priorities: innovation, storytelling, reward improvement, and training pipeline. | 9/10 |
| TRL/OpenEnv training example | Informs rollout function, decomposed reward functions, and Trackio logging pattern. | 8/10 |
| Kube SRE Gym README | Informs the closed-loop pattern: adversarial scenario design, curriculum mastery tracking, real tool interaction, verification, and artifact-driven storytelling. | 8/10 |
| DeepSeek-V4-Pro Hugging Face model card and encoding notes | Informs the default offline scenario-author config and the note that prompt handling should not assume a Jinja chat template. | 8/10 |
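
The sparse/dense split in the reward-design hunk above can be read as a small pure function over the stable reward keys. The sketch below is a simplified illustration, not the repo's `rewards.py`: the key names and the `terminal_total + shaping_weight * progressive + efficiency - penalties` shape come from this commit, while the `combine_reward` helper name and the exact sign and grouping of the efficiency and penalty terms are assumptions.

```python
def combine_reward(
    components: dict[str, float],
    mode: str = "sparse_eval",
    shaping_weight: float = 1.0,
) -> float:
    """Compose `total` from the stable reward keys (illustrative sketch only)."""
    terminal = components.get("terminal_total", 0.0)
    if mode != "dense_train":
        return terminal  # sparse evaluation: total is the terminal score

    # Dense training: terminal + shaped progressive + efficiency bonus - penalties.
    efficiency = components.get("speed_bonus", 0.0)
    penalties = (
        components.get("step_penalty", 0.0)
        + components.get("token_penalty", 0.0)
        + components.get("behavior_penalty", 0.0)
    )
    return terminal + shaping_weight * components.get("progressive", 0.0) + efficiency - penalties
```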
AGENTS.md
CHANGED
@@ -310,7 +310,7 @@ class CyberSecurityOWASPAction(Action):

        "search_code",
        "send_local_request",
        "compare_identities",
        "submit_diagnosis",
        "patch_file",
        "run_visible_tests",
        "submit_fix",

@@ -370,7 +370,7 @@ Actions must be explicit, typed, serializable, and constrained. Invalid actions

| Phase | Allowed tools |
|---|---|
| discover | `inspect_policy_graph`, `list_routes`, `read_openapi`, `read_file`, `search_code`, `send_local_request`, `compare_identities`, `submit_diagnosis`, `noop` |
| patch | `read_file`, `search_code`, `patch_file`, `run_visible_tests`, `send_local_request`, `submit_fix`, `noop` |
| done | no state-changing tools; return stable done observation |

@@ -397,7 +397,7 @@ Actions must be explicit, typed, serializable, and constrained. Invalid actions

`compare_identities`
: Runs the same local request as two generated users and summarizes behavioral differences.

`submit_diagnosis`
: Accepts structured evidence of the suspected authorization bug. Required before patch phase unless curriculum level explicitly allows blind patching.

`patch_file`

@@ -484,7 +484,7 @@ class CyberSecurityOWASPEnvironment(Environment):

3. Increment step count.
4. Execute the tool.
5. Update state/history.
6. Run verifier if `submit_diagnosis`, `run_visible_tests`, or `submit_fix`.
7. Compute reward components.
8. Check terminal conditions.
9. Return observation, reward, and done through OpenEnv step result handling.

@@ -805,7 +805,7 @@ grpo_config = GRPOConfig(

    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    num_generations=6,
    max_prompt_length=4096,
    max_completion_length=768,
    use_vllm=True,
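
The phase table above maps directly onto a small allowlist check. This is an illustrative sketch, not `server/action_tools.py`: the `ALLOWED_TOOLS` mapping copies the table verbatim, while the `check_phase_gate` function name and the error type are assumptions.

```python
ALLOWED_TOOLS: dict[str, frozenset[str]] = {
    "discover": frozenset({
        "inspect_policy_graph", "list_routes", "read_openapi", "read_file",
        "search_code", "send_local_request", "compare_identities",
        "submit_diagnosis", "noop",
    }),
    "patch": frozenset({
        "read_file", "search_code", "patch_file", "run_visible_tests",
        "send_local_request", "submit_fix", "noop",
    }),
    "done": frozenset(),  # no state-changing tools once the episode is done
}


def check_phase_gate(phase: str, tool_name: str) -> None:
    """Reject tools that are not allowed in the current episode phase."""
    if tool_name not in ALLOWED_TOOLS.get(phase, frozenset()):
        raise ValueError(f"tool {tool_name!r} is not allowed in phase {phase!r}")
```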
README.md
CHANGED
@@ -18,10 +18,10 @@ tags:

`CyberSecurity_OWASP` is an OpenEnv-compliant reinforcement-learning environment for a single LLM agent that performs a defensive authorization-repair workflow:

```text
inspect generated app + policy -> discover authorization bug -> submit diagnosis -> patch code -> preserve intended behavior
```

The current implementation includes a functional closed-loop MVP scenario: an invoices FastAPI-style app with one injected OWASP A01 BOLA/IDOR defect, config-driven curriculum settings, cache-backed scenario reset, an ephemeral app sandbox, multi-layer deterministic verifier checks, anti-cheat safeguards, JSONL episode artifacts, and decomposed reward.

## Diagrams

@@ -36,6 +36,7 @@ Editable Mermaid sources are available in `assets/architecture_diagram.mmd` and

```bash
uv sync --extra dev
uv run --extra dev pytest
uv run python scripts/generate_scenario_cache.py --train-per-bucket 3 --validation-per-bucket 3 --heldout-per-bucket 3
uv run server --port 8000
```

@@ -68,7 +69,7 @@ Supported tools:

- `search_code`
- `send_local_request`
- `compare_identities`
- `submit_diagnosis`
- `patch_file`
- `run_visible_tests`
- `submit_fix`

@@ -76,7 +77,7 @@ Supported tools:

Tools are phase-gated:

- `discover`: inspect policy/routes/files, run safe local requests, compare identities, submit diagnosis.
- `patch`: read/search, patch editable app files, run visible tests, submit final fix.
- `done`: stable terminal observation only.

@@ -94,15 +95,43 @@ Terminal reward uses stable components:

    "visible_tests": 0.0,
    "safety": 0.0,
    "anti_cheat": 0.0,
    "terminal_total": 0.0,
    "progressive": 0.0,
    "step_penalty": 0.0,
    "speed_bonus": 0.0,
    "token_penalty": 0.0,
    "behavior_penalty": 0.0,
    "train_total": 0.0,
    "total": 0.0,
}
```

The verifier rewards blocking the hidden exploit while preserving legitimate owner/admin behavior and intentionally public routes. Terminal scoring requires visible checks, hidden authorization checks, a policy-oracle matrix, regression checks, public-route preservation, and patch-quality checks. It penalizes deny-all fixes, hardcoded IDs, repeated/invalid action patterns, hidden file probes, external URL attempts, and test/fixture tampering.

Training can enable dense rewards with `CYBERSECURITY_OWASP_REWARD_MODE=dense_train`.
Dense mode adds configurable progressive rewards, small efficiency penalties, and capped behavior penalties from `training/configs/grpo_small.yaml`; evaluation defaults to sparse terminal scoring.

## Scenario Cache And Generation

Scenario generation is an offline/cache-prep concern. `reset(seed)` asks the `CurriculumController` for a difficulty tier and target weakness, then loads a validated executable bundle from the scenario cache when `CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE=require`. Local development defaults to `fallback`, which compiles deterministically on a cache miss.

The scenario/curriculum author is config-driven through `configs/scenario_authoring.small.json`. The default offline author model is `deepseek-ai/DeepSeek-V4-Pro` with Hugging Face provider settings, thinking mode enabled, `temperature=1.0`, and `top_p=1.0`. This model config is for scenario authoring, not the RL policy model.

The cache bundle contract is:

- `scenario.json`
- `app_source/`
- `policy_graph.json`
- `visible_tests.py`
- `hidden_tests.py`
- `oracle_tests.py`
- `expected_exploit_trace.json`
- `reward_config.json`
- `metadata.json`

Cache keys include difficulty, authorization bug type, app family, framework, policy shape, tenant model, exploit depth, patch scope, regression risk, generator version, verifier version, and scenario hash.

The MVP compiler currently generates:

- invoices domain policy graph;
- bounded adversarial target metadata such as same-role cross-object access, cross-tenant access, public-route overlocking traps, alternate route/service reachability, or visible-test-only edge cases;

@@ -118,8 +147,9 @@ Additional domains and bug families are scaffolded for extension.

The OpenEnv runtime is split into small server modules:

- `server/curriculum.py` tracks mastery, weak spots, reward trend, and difficulty tier.
- `server/scenario_cache.py` writes and loads validated executable scenario bundles.
- `server/adversarial_designer.py` chooses safe synthetic scenario targets from tracked weaknesses.
- `server/scenario_factory.py` compiles the generated app during cache prep or local fallback.
- `server/app_sandbox.py` handles editable workspace reads, patches, local requests, and OpenAPI summaries.
- `server/action_tools.py` dispatches typed tools through the sandbox.
- `server/authz_oracle.py` builds the hidden allowed/denied user-resource-action matrix.

@@ -153,6 +183,16 @@ The training scaffold is intentionally minimal until the environment/verifier be

Use the Modal launchers in `scripts/modal_train_grpo.py` (persistent) and
`scripts/modal_ephemeral_train.py` (smoke) for real GRPO runs.

Modal smoke and GRPO runs use `CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE=require` and mount the persistent `CyberSecurity_OWASP-scenario-cache` volume. Prepare that cache before smoke/training:

```bash
uv run --extra modal modal run scripts/modal_train_grpo.py --mode prepare-cache
uv run --extra modal modal run scripts/modal_ephemeral_train.py --mode prepare-cache
```

If the cache slice is missing or below the configured per-bucket minimum, Modal training fails before rollouts rather than compiling scenarios during the run.
The persistent GRPO launcher runs a CPU-only scenario-cache preflight before it starts the L4 GPU function, so missing cache coverage fails before GPU allocation.

## Trackio Run Tracking

Trackio is the default tracker for official runs. Set `TRACKIO_SPACE_ID` to log to a hosted Hugging Face Trackio Space; otherwise Trackio records locally.

@@ -184,6 +224,7 @@ uv sync --extra modal

Run a temporary Modal app for a cheap environment/training smoke check:

```bash
uv run --extra modal modal run scripts/modal_ephemeral_train.py --mode prepare-cache
uv run --extra modal modal run scripts/modal_ephemeral_train.py --mode smoke --episodes 4
```

@@ -218,10 +259,11 @@ uv run --extra modal modal run scripts/modal_train_grpo.py --mode config

Run the default smoke GRPO job:

```bash
uv run --extra modal modal run scripts/modal_train_grpo.py --mode prepare-cache
uv run --extra modal modal run scripts/modal_train_grpo.py \
  --max-steps 10 \
  --dataset-size 16 \
  --num-generations 6 \
  --difficulty 0
```

@@ -235,7 +277,7 @@ uv run --extra modal modal run scripts/modal_train_grpo.py \

  --repo-branch master \
  --max-steps 10 \
  --dataset-size 16 \
  --num-generations 6 \
  --difficulty 0
```

@@ -243,10 +285,11 @@ Defaults are derived from `HF_TOKEN`:

- Trackio Space: `<hf-user>/CyberSecurity_OWASP-trackio`
- Trackio project: `CyberSecurity_OWASP-grpo`
- Training model: `unsloth/gemma-4-E2B-it`
- Output repo: `<hf-user>/CyberSecurity_OWASP-unsloth-gemma-4-e2b-it-grpo-lora`

Override these with `--trackio-space-id`, `--trackio-project`, and
`--output-repo-id` when needed. The persistent GRPO launcher intentionally rejects non-Gemma model overrides so smoke runs match the Unsloth Gemma 4 E2B RL notebook.

## Docker / Spaces

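
A minimal sketch of the CPU-only cache preflight described above. It assumes bundles are stored one directory per validated scenario under `<cache>/<split>/<difficulty>/`, that the splits are `train`/`validation`/`hidden_eval`, and that `min_per_bucket` mirrors the configured per-bucket minimum; the `preflight_scenario_cache` name and this layout are assumptions, the real check lives in the Modal launchers and `server/scenario_cache.py`.

```python
from pathlib import Path


def preflight_scenario_cache(
    cache_dir: str,
    splits: tuple[str, ...] = ("train", "validation", "hidden_eval"),
    difficulties: tuple[int, ...] = (0,),
    min_per_bucket: int = 3,
) -> None:
    """Fail before any GPU allocation if a split/difficulty bucket is under-filled."""
    root = Path(cache_dir)
    for split in splits:
        for level in difficulties:
            bucket = root / split / str(level)
            # Assumed layout: one directory per validated bundle inside each bucket.
            count = sum(1 for p in bucket.iterdir() if p.is_dir()) if bucket.is_dir() else 0
            if count < min_per_bucket:
                raise RuntimeError(
                    f"scenario cache preflight failed: {split}/level{level} has {count} "
                    f"bundles, needs at least {min_per_bucket}"
                )
```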
assets/architecture_diagram.mmd
CHANGED
@@ -1,56 +1,63 @@

%%{init: {"theme": "base", "themeVariables": {"fontFamily": "Arial, Helvetica, sans-serif", "primaryTextColor": "#111827", "lineColor": "#0f172a", "clusterBkg": "#ffffff", "clusterBorder": "#cbd5e1"}, "flowchart": {"htmlLabels": false, "curve": "basis", "nodeSpacing": 60, "rankSpacing": 80, "padding": 24}}}%%
flowchart LR
  classDef factory fill:#eff6ff,stroke:#2563eb,stroke-width:2px,color:#111827;
  classDef runtime fill:#ecfdf5,stroke:#059669,stroke-width:2px,color:#111827;
  classDef agent fill:#fff7ed,stroke:#ea580c,stroke-width:2px,color:#111827;
  classDef training fill:#f5f3ff,stroke:#7c3aed,stroke-width:2px,color:#111827;
  classDef feedback fill:#f1f5f9,stroke:#64748b,stroke-width:2px,color:#111827;

  subgraph Factory["Scenario factory\noffline authoring"]
    direction TB
    F1["LLM author\nconfig-driven drafts"] --> F2["ScenarioSpec\npolicy + bug target"]
    F2 --> F3["A01 mutator\nFastAPI variants + traps"]
    F3 --> F4["Compiler\nexecutable app bundle"]
    F4 --> F5["Verifier\nvisible + hidden tests"]
    F5 --> F6["Versioned cache\nsplit + difficulty + hash"]
  end

  subgraph Runtime["OpenEnv runtime\ncache-backed episodes"]
    direction TB
    R1["reset(seed)\nload cached bundle"] --> R2["Curriculum sampler\nvalidated slice"]
    R2 --> R3["Episode state\nphase + history + diff"]
    R3 --> R4["Typed tools\ninspect, request, patch"]
    R4 --> R5["App sandbox\ncloned workspace"]
    R5 --> R6["Verifier\nsecurity + regression"]
    R6 --> R7["Reward engine\ncomponents + penalties"]
    R7 --> R3
    R3 --> R8["API + logger\n/ws, /step, artifacts"]
  end

  subgraph Agent["Single LLM agent"]
    direction TB
    A1["Parse observation"] --> A2["Reason over\npolicy + code"]
    A2 --> A3["Emit one\nJSON action"]
  end

  subgraph Training["Training, eval, demo"]
    direction TB
    T1["Parallel rollouts\nfast cached reset"] --> T2["TRL GRPO + LoRA"]
    T2 --> T3["Trackio metrics\nreward + pass rates"]
    T3 --> T4["Held-out eval\nbaseline vs trained"]
    T4 --> T5["Demo artifacts\nrollouts + summaries"]
  end

  subgraph Feedback["Feedback loop"]
    direction LR
    B1["Episode logs"] --> B2["Failure analysis"]
    B2 --> B3["Sampling weights\nand new jobs"]
  end

  F6 == cached bundle ==> R1
  R8 -- observation --> A1
  A3 -- JSON action --> R4
  R7 -- terminal reward --> T1
  T2 -. adapter checkpoint .-> A2
  R8 -- episode logs --> B1
  B3 -. cache refresh .-> F1

  class F1,F2,F3,F4,F5,F6 factory;
  class R1,R2,R3,R4,R5,R6,R7,R8 runtime;
  class A1,A2,A3 agent;
  class T1,T2,T3,T4,T5 training;
  class B1,B2,B3 feedback;
  linkStyle default stroke:#0f172a,stroke-width:2px;
assets/architecture_diagram.svg
CHANGED
assets/env_rl_training_flow_diagram.mmd
CHANGED
@@ -1,26 +1,44 @@

%%{init: {"theme": "base", "themeVariables": {"fontFamily": "Arial, Helvetica, sans-serif", "primaryTextColor": "#111827", "lineColor": "#0f172a", "clusterBkg": "#ffffff", "clusterBorder": "#cbd5e1"}, "flowchart": {"htmlLabels": false, "curve": "basis", "nodeSpacing": 58, "rankSpacing": 70, "padding": 24}}}%%
flowchart TD
  classDef setup fill:#eff6ff,stroke:#2563eb,stroke-width:2px,color:#111827;
  classDef episode fill:#ecfdf5,stroke:#059669,stroke-width:2px,color:#111827;
  classDef train fill:#f5f3ff,stroke:#7c3aed,stroke-width:2px,color:#111827;
  classDef adapt fill:#fff7ed,stroke:#ea580c,stroke-width:2px,color:#111827;
  classDef artifact fill:#f1f5f9,stroke:#64748b,stroke-width:2px,color:#111827;

  Start["Start run\nbase model + config"] --> Cache["Prepare cache\ntrain / validation / hidden_eval"]
  Cache --> Require["Modal cache mode\nrequire"]
  Require --> Baseline["Baseline eval\nscripted or model rollouts"]
  Baseline --> TrainLoop["GRPO training loop"]

  subgraph Episode["One OpenEnv episode"]
    direction TB
    Reset["reset(seed)\nload cached app + policy"] --> Observe["Observation\nphase, hints, tools"]
    Observe --> Prompt["Build prompt\nJSON action only"]
    Prompt --> Generate["Model generates\none action"]
    Generate --> Step["step(action)\nphase gate + tool"]
    Step --> Done{"done?"}
    Done -- no --> Observe
    Done -- yes --> Verify["Terminal verifier\nsecurity + regression + anti-cheat"]
    Verify --> Rewards["Reward components\ndiscovery, security, regression, safety"]
  end

  TrainLoop --> Reset
  Rewards --> Update["GRPO update\nLoRA checkpoint"]
  Update --> Metrics["Trackio logging\nrewards, pass rates, latency"]
  Metrics --> Decision{"next step?"}
  Decision -- continue --> TrainLoop
  Decision -- rebalance --> Curriculum["Curriculum controller\nsampling weights"]
  Curriculum --> TrainLoop
  Decision -- weak spot --> Refresh["Async cache refresh\nnew validated bundles"]
  Refresh --> Cache
  Decision -- final --> Heldout["Held-out eval\nunseen seeds and layouts"]
  Heldout --> Compare["Before/after summary\nsuccess + reward lift"]
  Compare --> Artifacts["Saved artifacts\noutputs/evals + outputs/rollouts"]

  class Start,Cache,Require,Baseline setup;
  class Reset,Observe,Prompt,Generate,Step,Done,Verify,Rewards episode;
  class TrainLoop,Update,Metrics,Heldout,Compare train;
  class Decision,Curriculum,Refresh adapt;
  class Artifacts artifact;
  linkStyle default stroke:#0f172a,stroke-width:2px;
assets/env_rl_training_flow_diagram.svg
CHANGED
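
For orientation, the episode subgraph in the flow diagram above maps directly onto the OpenEnv client calls. A minimal rollout sketch; the `choose_action` policy is a placeholder (not a repo function) and return-value handling is simplified:

```python
# Minimal sketch of the reset -> observe -> act -> step -> verify loop from the diagram above.
from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
from CyberSecurity_OWASP.server.CyberSecurity_OWASP_environment import (
    CybersecurityOwaspEnvironment,
)


def choose_action(observation) -> CyberSecurityOWASPAction:
    # Placeholder policy: always inspect the authorization policy graph.
    # A real policy would parse the observation and emit one JSON action per step.
    return CyberSecurityOWASPAction(tool_name="inspect_policy_graph", arguments={})


env = CybersecurityOwaspEnvironment()
observation = env.reset(seed=0, split="validation")  # loads the cached app + policy bundle
while not env.state.done:                            # terminates via max_steps or submit_fix
    observation = env.step(choose_action(observation))
print(env.state.reward_history[-1] if env.state.reward_history else {})
```
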
config.py
ADDED

@@ -0,0 +1,180 @@
"""Configuration for scenario authoring, curriculum, and cache-backed reset."""

from __future__ import annotations

import json
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal


ScenarioCacheMode = Literal["fallback", "require", "disabled"]


DEFAULT_SCENARIO_CONFIG_PATH = (
    Path(__file__).resolve().parent / "configs" / "scenario_authoring.small.json"
)


@dataclass(frozen=True)
class ScenarioAuthorConfig:
    provider: str = "huggingface"
    model_id: str = "deepseek-ai/DeepSeek-V4-Pro"
    thinking_mode: str = "thinking"
    reasoning_effort: str = "high"
    temperature: float = 1.0
    top_p: float = 1.0
    max_context_tokens: int = 131072


@dataclass(frozen=True)
class CurriculumCacheConfig:
    difficulty_bucket_count: int = 4
    difficulty_labels: list[str] = field(default_factory=lambda: ["D0", "D1", "D2", "D3"])
    train_scenarios_per_bucket: int = 25
    validation_scenarios_per_bucket: int = 10
    heldout_eval_scenarios_per_bucket: int = 10
    target_cache_hit_rate: float = 0.95
    target_reset_latency_ms: int = 200
    scenario_refresh_rate_per_epoch: float = 0.05
    difficulty_calibration_strategy: str = "baseline_agent_pass_rate"
    pass_rate_thresholds: dict[str, tuple[float, float]] = field(
        default_factory=lambda: {
            "D0": (0.8, 1.0),
            "D1": (0.6, 0.8),
            "D2": (0.4, 0.6),
            "D3": (0.2, 0.4),
        }
    )

    def minimum_for_split(self, split: str) -> int:
        if split == "hidden_eval":
            return self.heldout_eval_scenarios_per_bucket
        if split == "validation":
            return self.validation_scenarios_per_bucket
        return self.train_scenarios_per_bucket


@dataclass(frozen=True)
class ScenarioRuntimeConfig:
    cache_mode: ScenarioCacheMode = "fallback"
    cache_dir: str = "scenario_cache"
    generator_version: str = "scenario_generator_v1"
    verifier_version: str = "verifier_v1"


@dataclass(frozen=True)
class ScenarioAuthoringSettings:
    scenario_author: ScenarioAuthorConfig = field(default_factory=ScenarioAuthorConfig)
    curriculum: CurriculumCacheConfig = field(default_factory=CurriculumCacheConfig)
    runtime: ScenarioRuntimeConfig = field(default_factory=ScenarioRuntimeConfig)
    source_path: str = ""


def load_scenario_authoring_config(path: str | Path | None = None) -> ScenarioAuthoringSettings:
    """Load and validate the small scenario-authoring config with env overrides."""

    configured_path = Path(
        path
        or os.getenv("CYBERSECURITY_OWASP_SCENARIO_CONFIG", "")
        or DEFAULT_SCENARIO_CONFIG_PATH
    )
    raw = json.loads(configured_path.read_text(encoding="utf-8"))
    raw = _apply_env_overrides(raw)
    settings = ScenarioAuthoringSettings(
        scenario_author=ScenarioAuthorConfig(**raw.get("scenario_author", {})),
        curriculum=_curriculum_from_raw(raw.get("curriculum", {})),
        runtime=ScenarioRuntimeConfig(**raw.get("runtime", {})),
        source_path=str(configured_path),
    )
    _validate_settings(settings)
    return settings


def _apply_env_overrides(raw: dict[str, Any]) -> dict[str, Any]:
    data = json.loads(json.dumps(raw))
    author = data.setdefault("scenario_author", {})
    curriculum = data.setdefault("curriculum", {})
    runtime = data.setdefault("runtime", {})

    _set_if_present(author, "model_id", "CYBERSECURITY_OWASP_SCENARIO_AUTHOR_MODEL")
    _set_if_present(author, "provider", "CYBERSECURITY_OWASP_SCENARIO_AUTHOR_PROVIDER")
    _set_if_present(author, "thinking_mode", "CYBERSECURITY_OWASP_SCENARIO_THINKING_MODE")
    _set_if_present(author, "reasoning_effort", "CYBERSECURITY_OWASP_SCENARIO_REASONING_EFFORT")
    _set_if_present(author, "temperature", "CYBERSECURITY_OWASP_SCENARIO_TEMPERATURE", float)
    _set_if_present(author, "top_p", "CYBERSECURITY_OWASP_SCENARIO_TOP_P", float)
    _set_if_present(author, "max_context_tokens", "CYBERSECURITY_OWASP_SCENARIO_MAX_CONTEXT", int)

    _set_if_present(curriculum, "difficulty_bucket_count", "CYBERSECURITY_OWASP_DIFFICULTY_BUCKETS", int)
    _set_if_present(curriculum, "train_scenarios_per_bucket", "CYBERSECURITY_OWASP_TRAIN_SCENARIOS_PER_BUCKET", int)
    _set_if_present(curriculum, "validation_scenarios_per_bucket", "CYBERSECURITY_OWASP_VALIDATION_SCENARIOS_PER_BUCKET", int)
    _set_if_present(curriculum, "heldout_eval_scenarios_per_bucket", "CYBERSECURITY_OWASP_HELDOUT_SCENARIOS_PER_BUCKET", int)
    _set_if_present(curriculum, "target_cache_hit_rate", "CYBERSECURITY_OWASP_TARGET_CACHE_HIT_RATE", float)
    _set_if_present(curriculum, "target_reset_latency_ms", "CYBERSECURITY_OWASP_TARGET_RESET_LATENCY_MS", int)
    _set_if_present(curriculum, "scenario_refresh_rate_per_epoch", "CYBERSECURITY_OWASP_SCENARIO_REFRESH_RATE", float)
    _set_if_present(curriculum, "difficulty_calibration_strategy", "CYBERSECURITY_OWASP_DIFFICULTY_CALIBRATION")

    _set_if_present(runtime, "cache_dir", "CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR")
    _set_if_present(runtime, "cache_mode", "CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE")
    _set_if_present(runtime, "generator_version", "CYBERSECURITY_OWASP_SCENARIO_GENERATOR_VERSION")
    _set_if_present(runtime, "verifier_version", "CYBERSECURITY_OWASP_SCENARIO_VERIFIER_VERSION")
    return data


def _set_if_present(
    target: dict[str, Any],
    key: str,
    env_name: str,
    caster: type | None = None,
) -> None:
    value = os.getenv(env_name)
    if value is None:
        return
    target[key] = caster(value) if caster else value


def _curriculum_from_raw(raw: dict[str, Any]) -> CurriculumCacheConfig:
    values = dict(raw)
    bucket_count = int(values.get("difficulty_bucket_count", 4))
    labels = list(values.get("difficulty_labels") or [])
    if len(labels) < bucket_count:
        labels.extend(f"D{index}" for index in range(len(labels), bucket_count))
    values["difficulty_labels"] = labels
    thresholds = values.get("pass_rate_thresholds") or {}
    values["pass_rate_thresholds"] = {
        str(key): tuple(float(item) for item in value)
        for key, value in thresholds.items()
    }
    return CurriculumCacheConfig(**values)


def _validate_settings(settings: ScenarioAuthoringSettings) -> None:
    author = settings.scenario_author
    curriculum = settings.curriculum
    runtime = settings.runtime

    if not author.model_id:
        raise ValueError("scenario_author.model_id is required")
    if author.temperature <= 0.0 or author.top_p <= 0.0:
        raise ValueError("scenario author sampling values must be positive")
    if author.max_context_tokens < 4096:
        raise ValueError("scenario author max_context_tokens is too small")
    if curriculum.difficulty_bucket_count <= 0:
        raise ValueError("difficulty_bucket_count must be positive")
    if len(curriculum.difficulty_labels) < curriculum.difficulty_bucket_count:
        raise ValueError("difficulty_labels must cover every configured bucket")
    for attr in (
        "train_scenarios_per_bucket",
        "validation_scenarios_per_bucket",
        "heldout_eval_scenarios_per_bucket",
        "target_reset_latency_ms",
    ):
        if int(getattr(curriculum, attr)) <= 0:
            raise ValueError(f"{attr} must be positive")
    if not 0.0 < curriculum.target_cache_hit_rate <= 1.0:
        raise ValueError("target_cache_hit_rate must be in (0, 1]")
    if not 0.0 <= curriculum.scenario_refresh_rate_per_epoch <= 1.0:
        raise ValueError("scenario_refresh_rate_per_epoch must be in [0, 1]")
    if runtime.cache_mode not in {"fallback", "require", "disabled"}:
        raise ValueError("runtime.cache_mode must be fallback, require, or disabled")

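A small usage sketch for the loader above; the environment variables are the ones handled by `_apply_env_overrides`, and the printed values assume the default `configs/scenario_authoring.small.json`:

```python
import os

from CyberSecurity_OWASP.config import load_scenario_authoring_config

# Override two runtime fields for this process only.
os.environ["CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE"] = "require"
os.environ["CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR"] = "/tmp/owasp-scenario-cache"

settings = load_scenario_authoring_config()
print(settings.runtime.cache_mode)                          # require
print(settings.curriculum.minimum_for_split("validation"))  # 10 with the default config
print(settings.source_path)                                 # which JSON file was loaded
```
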
configs/scenario_authoring.small.json
ADDED

@@ -0,0 +1,34 @@
{
  "scenario_author": {
    "provider": "huggingface",
    "model_id": "deepseek-ai/DeepSeek-V4-Pro",
    "thinking_mode": "thinking",
    "reasoning_effort": "high",
    "temperature": 1.0,
    "top_p": 1.0,
    "max_context_tokens": 131072
  },
  "curriculum": {
    "difficulty_bucket_count": 4,
    "difficulty_labels": ["D0", "D1", "D2", "D3"],
    "train_scenarios_per_bucket": 25,
    "validation_scenarios_per_bucket": 10,
    "heldout_eval_scenarios_per_bucket": 10,
    "target_cache_hit_rate": 0.95,
    "target_reset_latency_ms": 200,
    "scenario_refresh_rate_per_epoch": 0.05,
    "difficulty_calibration_strategy": "baseline_agent_pass_rate",
    "pass_rate_thresholds": {
      "D0": [0.8, 1.0],
      "D1": [0.6, 0.8],
      "D2": [0.4, 0.6],
      "D3": [0.2, 0.4]
    }
  },
  "runtime": {
    "cache_mode": "fallback",
    "cache_dir": "scenario_cache",
    "generator_version": "scenario_generator_v1",
    "verifier_version": "verifier_v1"
  }
}

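The `pass_rate_thresholds` block is what difficulty calibration keys on: a generated scenario lands in the bucket whose band contains the baseline agent's pass rate. The helper below is only an illustration of that mapping; it is not taken from `server/scenario_cache.py`:

```python
from CyberSecurity_OWASP.config import load_scenario_authoring_config


def bucket_for_pass_rate(pass_rate: float) -> str:
    # Illustrative only: pick the label whose [low, high) band contains the rate.
    curriculum = load_scenario_authoring_config().curriculum
    for label in curriculum.difficulty_labels:
        low, high = curriculum.pass_rate_thresholds[label]
        if low <= pass_rate < high or (pass_rate == 1.0 and high == 1.0):
            return label
    return curriculum.difficulty_labels[-1]  # below every band -> hardest bucket


print(bucket_for_pass_rate(0.9))   # D0: the baseline solves it 80-100% of the time
print(bucket_for_pass_rate(0.45))  # D2
```
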
evals.py
CHANGED

@@ -51,11 +51,13 @@ def oracle_policy(original_source: str) -> list[CyberSecurityOWASPAction]:
             arguments={"method": "GET", "path": "__EXPLOIT_PATH__", "user_id": "__EXPLOIT_USER__"},
         ),
         CyberSecurityOWASPAction(
+            tool_name="submit_diagnosis",
             arguments={
+                "bug_class": "idor_ownership_bug",
+                "route": "GET /invoices/{invoice_id}",
+                "violated_policy_rule": "Only the owner or billing_admin in the same tenant may read invoices.",
+                "evidence_trace_ids": ["req_001"],
+                "fix_plan": "Add tenant and owner/admin checks before returning invoice data.",
             },
         ),
         CyberSecurityOWASPAction(

models.py
CHANGED

@@ -21,7 +21,7 @@ class CyberSecurityOWASPAction(Action):
         "search_code",
         "send_local_request",
         "compare_identities",
+        "submit_diagnosis",
         "patch_file",
         "run_visible_tests",
         "submit_fix",

@@ -62,17 +62,31 @@ class CyberSecurityOWASPState(State):
     scenario_family: str = ""
     template_id: str = "fastapi_basic"
     target_weakness: str = "same_role_cross_object"
+    cache_key: dict[str, Any] = Field(default_factory=dict)
+    scenario_hash: str = ""
+    generator_version: str = ""
+    verifier_version: str = ""
+    cache_hit: bool = False
+    reset_latency_ms: float = 0.0
     phase: CyberSecurityOWASPPhase = "discover"
     max_steps: int = 40
     done: bool = False
     success: bool = False
     failure_reason: str | None = None
     finding_submitted: bool = False
+    diagnosis_submitted: bool = False
     patch_submitted: bool = False
     accumulated_reward: float = 0.0
     last_reward: float = 0.0
     action_history: list[dict[str, Any]] = Field(default_factory=list)
     reward_history: list[dict[str, float]] = Field(default_factory=list)
+    progress_flags: dict[str, bool] = Field(default_factory=dict)
+    progress_reward_total: float = 0.0
+    diagnosis: dict[str, Any] = Field(default_factory=dict)
+    request_trace: list[dict[str, Any]] = Field(default_factory=list)
+    patch_attempt_count: int = 0
+    visible_test_count: int = 0
+    completion_tokens: int = 0
     visible_facts: dict[str, Any] = Field(default_factory=dict)
     hidden_facts: dict[str, Any] = Field(default_factory=dict)
     curriculum_snapshot: dict[str, Any] = Field(default_factory=dict)

pyproject.toml
CHANGED

@@ -19,6 +19,7 @@ dependencies = [
     # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
     "openenv-core[core]>=0.2.2",
     "trackio>=0.22.0",
+    "PyYAML>=6.0",
     # Environment-specific dependencies
     # Add all dependencies needed for your environment here
     # Examples:

@@ -48,6 +49,10 @@ include-package-data = true
 packages = ["CyberSecurity_OWASP", "CyberSecurity_OWASP.server", "training"]
 package-dir = { "CyberSecurity_OWASP" = ".", "CyberSecurity_OWASP.server" = "server" }
 
+[tool.setuptools.package-data]
+CyberSecurity_OWASP = ["configs/*.json"]
+training = ["configs/*.yaml"]
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 norecursedirs = [

reward_config.py
ADDED

@@ -0,0 +1,119 @@
"""Configurable reward shaping settings for CyberSecurity_OWASP."""

from __future__ import annotations

import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import yaml


DEFAULT_GRPO_CONFIG_PATH = (
    Path(__file__).resolve().parent / "training" / "configs" / "grpo_small.yaml"
)
REWARD_MODES = {"dense_train", "sparse_eval"}
REWARD_STAGES = {"early", "middle", "late", "final"}


@dataclass(frozen=True)
class RewardSettings:
    """Loaded reward settings with stage-aware helpers."""

    mode: str
    training_mode: str
    stage: str
    raw: dict[str, Any]
    source_path: str

    @property
    def dense_train(self) -> bool:
        return self.mode == "dense_train"

    @property
    def shaping_weight(self) -> float:
        override = os.getenv("CYBERSECURITY_OWASP_SHAPING_WEIGHT")
        if override is not None:
            return float(override)
        return self.value("shaping_weight", 0.0)

    def entry(self, name: str) -> dict[str, Any]:
        value = self.raw.get(name, {})
        return value if isinstance(value, dict) else {}

    def value(self, name: str, default: float = 0.0) -> float:
        entry = self.entry(name)
        if self.stage in entry:
            return float(entry[self.stage])
        if "value" in entry:
            return float(entry["value"])
        return float(default)

    def cap(self, name: str, default: float | None = None) -> float | None:
        entry = self.entry(name)
        if "cap" not in entry:
            return default
        return float(entry["cap"])

    def int_value(self, name: str, key: str, default: int) -> int:
        entry = self.entry(name)
        return int(entry.get(key, default))

    def terminate(self, name: str) -> bool:
        return bool(self.entry(name).get("terminate", False))


def load_reward_settings(path: str | Path | None = None) -> RewardSettings:
    """Load reward settings from the GRPO YAML config with env overrides."""

    configured_path = Path(
        path
        or os.getenv("CYBERSECURITY_OWASP_REWARD_CONFIG", "")
        or DEFAULT_GRPO_CONFIG_PATH
    )
    raw = yaml.safe_load(configured_path.read_text(encoding="utf-8")) or {}
    reward = dict(raw.get("reward") or {})
    mode = os.getenv("CYBERSECURITY_OWASP_REWARD_MODE", str(reward.get("mode", "sparse_eval")))
    training_mode = str(reward.get("training_mode", "dense_train"))
    stage = os.getenv("CYBERSECURITY_OWASP_REWARD_STAGE", str(reward.get("stage", "early")))
    settings = RewardSettings(
        mode=mode,
        training_mode=training_mode,
        stage=stage,
        raw=reward,
        source_path=str(configured_path),
    )
    validate_reward_settings(settings)
    return settings


def validate_reward_settings(settings: RewardSettings) -> None:
    if settings.mode not in REWARD_MODES:
        raise ValueError("reward.mode must be dense_train or sparse_eval")
    if settings.training_mode not in REWARD_MODES:
        raise ValueError("reward.training_mode must be dense_train or sparse_eval")
    if settings.stage not in REWARD_STAGES:
        raise ValueError("reward.stage must be early, middle, late, or final")

    for key, value in settings.raw.items():
        if not isinstance(value, dict):
            continue
        if not str(value.get("description", "")).strip():
            raise ValueError(f"reward.{key}.description is required")


def compute_token_penalty(
    completion_tokens: int,
    settings: RewardSettings | None = None,
) -> float:
    """Return the trainer-side token penalty for a completion."""

    settings = settings or load_reward_settings()
    if not settings.dense_train:
        return 0.0
    target = settings.int_value("token_penalty", "target_tokens", 350)
    excess = max(0, int(completion_tokens) - target)
    penalty = settings.value("token_penalty", 0.0) * excess
    cap = settings.cap("token_penalty", -0.5)
    return max(penalty, cap if cap is not None else penalty)

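Usage sketch for the settings above; the environment variables match the ones read in `load_reward_settings`, and the concrete numbers depend on the `reward` block of the GRPO YAML:

```python
import os

from CyberSecurity_OWASP.reward_config import compute_token_penalty, load_reward_settings

# Train-time shaping on, early-stage weights.
os.environ["CYBERSECURITY_OWASP_REWARD_MODE"] = "dense_train"
os.environ["CYBERSECURITY_OWASP_REWARD_STAGE"] = "early"

settings = load_reward_settings()
print(settings.dense_train, settings.stage)  # True early
print(settings.value("step_penalty", 0.0))   # stage-aware lookup from the YAML
print(compute_token_penalty(500, settings))  # scales with tokens over target_tokens, capped by token_penalty.cap
```
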
rewards.py
CHANGED

@@ -4,8 +4,10 @@ from __future__ import annotations
 
 try:
     from .models import CyberSecurityOWASPAction, CyberSecurityOWASPState
+    from .reward_config import RewardSettings, load_reward_settings
 except ImportError:  # pragma: no cover
     from models import CyberSecurityOWASPAction, CyberSecurityOWASPState
+    from reward_config import RewardSettings, load_reward_settings
 
 
 REWARD_KEYS = (

@@ -17,6 +19,13 @@ REWARD_KEYS = (
     "visible_tests",
     "safety",
     "anti_cheat",
+    "terminal_total",
+    "progressive",
+    "step_penalty",
+    "speed_bonus",
+    "token_penalty",
+    "behavior_penalty",
+    "train_total",
     "total",
 )

@@ -30,62 +39,404 @@ def compute_reward(
     action: CyberSecurityOWASPAction,
     verifier_result: dict,
 ) -> dict[str, float]:
+    settings = load_reward_settings()
     reward = empty_reward()
-        reward["discovery"] = (
-            1.0 if finding.get("tied_to_policy") else 0.0
-        )
+    if action.tool_name == "submit_diagnosis":
+        diagnosis = verifier_result.get("diagnosis", verifier_result.get("finding", {}))
+        reward["discovery"] = _diagnosis_score(diagnosis)
     elif action.tool_name == "run_visible_tests":
         visible = verifier_result.get("visible", {})
         reward["visible_tests"] = 1.0 if visible.get("passed") else 0.0
     elif action.tool_name == "submit_fix":
-        public_routes = verifier_result.get("public_routes", {})
-        quality = verifier_result.get("patch_quality", {})
-        visible = verifier_result.get("visible", {})
-        security_passed = bool(security.get("passed") and oracle.get("passed", True))
-        reward["security"] = 5.0 if security_passed else -6.0
-        reward["regression"] = 3.0 if regression.get("passed") else -5.0
-        reward["public_routes"] = 1.0 if public_routes.get("passed") else -3.0
-        reward["patch_quality"] = 2.0 if quality.get("passed") else 0.0
-        if quality.get("passed") and _followed_repair_phase_order(state):
-            reward["patch_quality"] += 0.5
-        reward["visible_tests"] = 1.0 if visible.get("passed") else -6.0
-        if quality.get("hardcoded"):
-            reward["anti_cheat"] -= 4.0
-        if quality.get("deny_all"):
-            reward["safety"] -= 5.0
-
-    flags = sorted(
-        set(verifier_result.get("anti_cheat_flags", []) or [])
-        | set(state.anti_cheat_flags or [])
-    )
-    if flags:
-        if "external_network_attempt" in flags:
-            reward["safety"] -= 8.0
-        if "blocked_file_probe" in flags:
-            reward["anti_cheat"] -= 8.0
-        if "hardcoded_hidden_identifier" in flags:
-            reward["anti_cheat"] -= 4.0
+        _add_terminal_submit_fix_reward(state, verifier_result, reward, settings)
+
+    _add_current_anti_cheat_penalties(verifier_result, reward, settings)
 
     if verifier_result.get("invalid_action"):
+        reward["behavior_penalty"] += settings.value("invalid_action", -0.2)
     if verifier_result.get("repeated_action"):
+        reward["behavior_penalty"] += (
+            settings.value("repeated_invalid_action", -0.3)
+            if verifier_result.get("invalid_action")
+            else settings.value("repeated_low_value_action", -0.1)
+        )
+
+    reward["progressive"] = _compute_progressive_reward(
+        state,
+        action,
+        verifier_result,
+        settings,
+    )
+    reward["step_penalty"] = _compute_step_penalty(state, settings)
+    reward["speed_bonus"] = _compute_speed_bonus(state, action, verifier_result, settings)
+    reward["behavior_penalty"] += _compute_behavior_penalty(
+        state,
+        action,
+        verifier_result,
+        settings,
+        reward["progressive"],
+    )
 
+    terminal_total = (
+        _component_total(reward)
+        if action.tool_name == "submit_fix"
+        else reward["safety"] + reward["anti_cheat"]
+    )
+    reward["terminal_total"] = _cap_terminal(terminal_total, settings)
+    reward["train_total"] = _cap_train(
+        reward["terminal_total"]
+        + settings.shaping_weight * reward["progressive"]
+        + reward["step_penalty"]
+        + reward["speed_bonus"]
+        + reward["token_penalty"]
+        + reward["behavior_penalty"],
+        settings,
+        state,
+    )
+    reward["total"] = reward["train_total"] if settings.dense_train else reward["terminal_total"]
     return reward
 
 
 def _followed_repair_phase_order(state: CyberSecurityOWASPState) -> bool:
     tools = [item.get("tool_name") for item in state.action_history]
+    required = ["submit_diagnosis", "patch_file", "run_visible_tests", "submit_fix"]
     cursor = 0
     for tool in tools:
         if cursor < len(required) and tool == required[cursor]:
             cursor += 1
     return cursor == len(required)
+
+
+def should_terminate_for_flags(flags: list[str], settings: RewardSettings | None = None) -> bool:
+    settings = settings or load_reward_settings()
+    mapping = {
+        "external_network_attempt": "external_url_attempt",
+        "blocked_file_probe": "hidden_file_probe",
+        "host_path_escape": "host_path_escape",
+        "test_file_edit_attempt": "test_file_edit",
+    }
+    return any(settings.terminate(mapping[flag]) for flag in flags if flag in mapping)
+
+
+def _diagnosis_score(diagnosis: dict) -> float:
+    return sum(
+        0.25
+        for key in (
+            "correct_route",
+            "correct_bug_class",
+            "correct_policy_rule",
+            "valid_local_evidence",
+        )
+        if diagnosis.get(key)
+    )
+
+
+def _add_terminal_submit_fix_reward(
+    state: CyberSecurityOWASPState,
+    verifier_result: dict,
+    reward: dict[str, float],
+    settings: RewardSettings,
+) -> None:
+    security = verifier_result.get("security", {})
+    oracle = verifier_result.get("oracle_matrix", {})
+    regression = verifier_result.get("regression", {})
+    public_routes = verifier_result.get("public_routes", {})
+    quality = verifier_result.get("patch_quality", {})
+    visible = verifier_result.get("visible", {})
+    security_passed = bool(security.get("passed") and oracle.get("passed", True))
+    reward["security"] = 5.0 if security_passed else -6.0
+    reward["regression"] = 3.0 if regression.get("passed") else -5.0
+    reward["public_routes"] = 1.0 if public_routes.get("passed") else -3.0
+    reward["patch_quality"] = 2.0 if quality.get("passed") else 0.0
+    if quality.get("passed") and _followed_repair_phase_order(state):
+        reward["patch_quality"] += 0.5
+    reward["visible_tests"] = 1.0 if visible.get("passed") else -6.0
+    if quality.get("hardcoded"):
+        reward["anti_cheat"] += settings.value("hardcoded_identifier", -2.0)
+    if quality.get("deny_all"):
+        reward["safety"] += settings.value("deny_all_patch", -3.0)
+
+
+def _add_current_anti_cheat_penalties(
+    verifier_result: dict,
+    reward: dict[str, float],
+    settings: RewardSettings,
+) -> None:
+    flags = set(verifier_result.get("anti_cheat_flags", []) or [])
+    if "external_network_attempt" in flags:
+        reward["safety"] += settings.value("external_url_attempt", -5.0)
+    if "blocked_file_probe" in flags:
+        reward["anti_cheat"] += settings.value("hidden_file_probe", -5.0)
+    if "host_path_escape" in flags:
+        reward["anti_cheat"] += settings.value("host_path_escape", -4.0)
+    if "test_file_edit_attempt" in flags:
+        reward["anti_cheat"] += settings.value("test_file_edit", -5.0)
+    if "hardcoded_hidden_identifier" in flags:
+        reward["anti_cheat"] += settings.value("hardcoded_identifier", -2.0)
+
+
+def _compute_progressive_reward(
+    state: CyberSecurityOWASPState,
+    action: CyberSecurityOWASPAction,
+    verifier_result: dict,
+    settings: RewardSettings,
+) -> float:
+    if not settings.dense_train:
+        return 0.0
+    delta = 0.0
+    if action.tool_name == "inspect_policy_graph":
+        delta += _award_progress_once(state, "policy_seen", "policy_inspected", settings)
+    if action.tool_name in {"list_routes", "read_openapi"}:
+        delta += _award_progress_once(state, "route_map_seen", "route_map_inspected", settings)
+    if action.tool_name in {"read_file", "search_code"} and _is_relevant_code_action(action):
+        delta += _award_progress_once(
+            state,
+            "relevant_file_seen",
+            "relevant_file_inspected",
+            settings,
+        )
+    if action.tool_name in {"send_local_request", "compare_identities"} and any(
+        trace.get("unauthorized_success") for trace in state.request_trace
+    ):
+        delta += _award_progress_once(
+            state,
+            "local_evidence_found",
+            "local_evidence_found",
+            settings,
+        )
+    if action.tool_name == "submit_diagnosis":
+        diagnosis = verifier_result.get("diagnosis", verifier_result.get("finding", {}))
+        if all(
+            diagnosis.get(key)
+            for key in (
+                "correct_route",
+                "correct_bug_class",
+                "correct_policy_rule",
+                "valid_local_evidence",
+            )
+        ):
+            delta += _award_progress_once(
+                state,
+                "diagnosis_correct",
+                "diagnosis_correct",
+                settings,
+            )
+    if action.tool_name == "patch_file" and not verifier_result.get("invalid_action"):
+        delta += _award_progress_once(state, "patch_applies", "patch_applies", settings)
+    if action.tool_name == "run_visible_tests":
+        visible = verifier_result.get("visible", {})
+        checks = visible.get("checks", {}) if isinstance(visible, dict) else {}
+        if visible.get("passed"):
+            delta += _award_progress_once(
+                state,
+                "app_boots",
+                "app_boots_after_patch",
+                settings,
+            )
+            delta += _award_progress_once(
+                state,
+                "visible_tests_improved",
+                "visible_tests_improved",
+                settings,
+            )
+        if checks.get("health_public"):
+            delta += _award_progress_once(
+                state,
+                "public_routes_visible_pass",
+                "public_routes_visible_pass",
+                settings,
+            )
+    return delta
+
+
+def _award_progress_once(
+    state: CyberSecurityOWASPState,
+    flag_name: str,
+    config_name: str,
+    settings: RewardSettings,
+) -> float:
+    if state.progress_flags.get(flag_name):
+        return 0.0
+    cap = settings.value("progressive_cap", 5.0)
+    remaining = max(0.0, cap - float(state.progress_reward_total or 0.0))
+    if remaining <= 0.0:
+        return 0.0
+    state.progress_flags[flag_name] = True
+    value = min(settings.value(config_name, 0.0), remaining)
+    state.progress_reward_total += value
+    return value
+
+
+def _is_relevant_code_action(action: CyberSecurityOWASPAction) -> bool:
+    args = action.arguments or {}
+    text = f"{args.get('path', '')} {args.get('query', '')}".lower()
+    return any(
+        term in text
+        for term in ("auth", "tenant", "owner", "role", "invoice", "route", "guard", "policy")
+    )
+
+
+def _compute_step_penalty(
+    state: CyberSecurityOWASPState,
+    settings: RewardSettings,
+) -> float:
+    if not settings.dense_train:
+        return 0.0
+    rate = settings.value("step_penalty", 0.0)
+    if rate >= 0.0:
+        return 0.0
+    current = float(state.metrics.get("step_penalty_total", 0.0))
+    cap = settings.cap("step_penalty", -0.6)
+    delta = max(rate, float(cap) - current) if cap is not None else rate
+    state.metrics["step_penalty_total"] = current + delta
+    return delta
+
+
+def _compute_speed_bonus(
+    state: CyberSecurityOWASPState,
+    action: CyberSecurityOWASPAction,
+    verifier_result: dict,
+    settings: RewardSettings,
+) -> float:
+    if not settings.dense_train or action.tool_name != "submit_fix":
+        return 0.0
+    success = all(
+        bool((verifier_result.get(key) or {}).get("passed", False))
+        for key in ("security", "oracle_matrix", "regression", "public_routes", "patch_quality")
+    )
+    if not success:
+        return 0.0
+    max_steps = max(1, int(state.max_steps or 1))
+    bonus = settings.value("speed_bonus", 1.0) * (1.0 - min(state.step_count, max_steps) / max_steps)
+    return max(0.0, bonus)
+
+
+def _compute_behavior_penalty(
+    state: CyberSecurityOWASPState,
+    action: CyberSecurityOWASPAction,
+    verifier_result: dict,
+    settings: RewardSettings,
+    progressive_delta: float,
+) -> float:
+    if not settings.dense_train:
+        return 0.0
+    penalty = 0.0
+    tools = [item.get("tool_name") for item in state.action_history]
+    if action.tool_name == "noop":
+        penalty += settings.value("noop_action", -0.02)
+    if action.tool_name == "read_file":
+        path = str((action.arguments or {}).get("path", ""))
+        reads = [
+            item
+            for item in state.action_history
+            if item.get("tool_name") == "read_file"
+            and str((item.get("arguments") or {}).get("path", "")) == path
+        ]
+        if len(reads) > 1:
+            penalty += settings.value("repeated_file_read", -0.05)
+    if action.tool_name == "send_local_request":
+        args = action.arguments or {}
+        current = (
+            str(args.get("method", "GET")).upper(),
+            str(args.get("path", "")),
+            str(args.get("user_id", "")),
+        )
+        matches = [
+            item
+            for item in state.action_history
+            if item.get("tool_name") == "send_local_request"
+            and (
+                str((item.get("arguments") or {}).get("method", "GET")).upper(),
+                str((item.get("arguments") or {}).get("path", "")),
+                str((item.get("arguments") or {}).get("user_id", "")),
+            )
+            == current
+        ]
+        if len(matches) > 1:
+            penalty += settings.value("repeated_local_request", -0.05)
+    if action.tool_name == "run_visible_tests" and state.visible_test_count > 1:
+        penalty += settings.value("repeated_visible_tests", -0.1)
+    if action.tool_name == "patch_file" and not state.progress_flags.get("policy_seen"):
+        penalty += settings.value("patch_before_policy", -0.3)
+    if action.tool_name == "submit_fix":
+        if "patch_file" not in tools:
+            penalty += settings.value("submit_without_patch", -0.5)
+        if state.patch_attempt_count > 0 and state.visible_test_count == 0:
+            penalty += settings.value("submit_without_visible_tests", -0.3)
+    if action.tool_name == "patch_file" and state.patch_attempt_count > 3:
+        penalty += settings.value("excessive_patch_attempt", -0.2)
+    files_touched = state.metrics.get("files_touched", [])
+    if isinstance(files_touched, list) and len(files_touched) > 5:
+        penalty += settings.value("too_many_files_changed", -0.5)
+    if action.tool_name == "patch_file":
+        penalty += _oversized_patch_penalty(state, settings)
+    if (
+        progressive_delta <= 0.0
+        and not verifier_result.get("invalid_action")
+        and action.tool_name
+        in {
+            "inspect_policy_graph",
+            "list_routes",
+            "read_openapi",
+            "noop",
+            "run_visible_tests",
+            "send_local_request",
+            "compare_identities",
+        }
+    ):
+        penalty += settings.value("no_progress_action", -0.05)
+    return penalty
+
+
+def _oversized_patch_penalty(
+    state: CyberSecurityOWASPState,
+    settings: RewardSettings,
+) -> float:
+    diff_lines = [
+        line
+        for line in str(state.patch_diff or "").splitlines()
+        if (line.startswith("+") or line.startswith("-"))
+        and not line.startswith("+++")
+        and not line.startswith("---")
+    ]
+    entry = settings.entry("oversized_patch")
+    threshold = int(entry.get("threshold_lines", 80))
+    severe_threshold = int(entry.get("severe_threshold_lines", 180))
+    if len(diff_lines) >= severe_threshold:
+        return float(entry.get("severe_value", -1.0))
+    if len(diff_lines) >= threshold:
+        return settings.value("oversized_patch", -0.25)
+    return 0.0
+
+
+def _component_total(reward: dict[str, float]) -> float:
+    excluded = {
+        "total",
+        "terminal_total",
+        "progressive",
+        "step_penalty",
+        "speed_bonus",
+        "token_penalty",
+        "behavior_penalty",
+        "train_total",
+    }
+    return sum(value for key, value in reward.items() if key not in excluded)
+
+
+def _cap_terminal(total: float, settings: RewardSettings) -> float:
+    cap = settings.value("terminal_cap", 15.0)
+    return min(cap, total) if total > 0 else total
+
+
+def _cap_train(
+    total: float,
+    settings: RewardSettings,
+    state: CyberSecurityOWASPState,
+) -> float:
+    floor = settings.value("penalty_floor", -6.0)
+    capped = max(floor, total)
+    cap = settings.value("train_cap", 21.0)
+    if capped > 0.0:
+        remaining = max(0.0, cap - float(state.accumulated_reward or 0.0))
+        return min(capped, remaining)
+    return capped

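For intuition, the train-side total is the capped sum of the terminal outcome and the shaped terms computed above. A worked sketch with assumed numbers; the real weights come from the `reward` block in `training/configs/grpo_small.yaml`:

```python
# All numbers below are assumed for illustration; they are not the shipped config values.
terminal_total = 5.0 + 3.0 + 1.0 + 2.5 + 1.0  # security, regression, public_routes, patch_quality, visible_tests
shaping_weight = 0.5      # settings.shaping_weight for the current stage
progressive = 2.0         # one-time exploration milestones, capped by progressive_cap
step_penalty = -0.3       # accumulated per-step cost, capped by step_penalty.cap
speed_bonus = 0.4         # only paid on a fully passing submit_fix
token_penalty = -0.1      # trainer-side, from compute_token_penalty
behavior_penalty = -0.15  # repeated or low-value action costs

train_total = (
    terminal_total
    + shaping_weight * progressive
    + step_penalty
    + speed_bonus
    + token_penalty
    + behavior_penalty
)
print(round(train_total, 2))  # 13.35; _cap_train then applies penalty_floor and the train_cap budget
```
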
scripts/generate_scenario_cache.py
ADDED

@@ -0,0 +1,56 @@
"""Prepare the validated CyberSecurity_OWASP scenario cache.

This command is intentionally offline/cache-prep work. Runtime ``reset()`` can
load these bundles in required mode without compiling a fresh scenario during a
Modal smoke or training run.
"""

from __future__ import annotations

import argparse
import json
import os
from pathlib import Path

from CyberSecurity_OWASP.config import load_scenario_authoring_config
from CyberSecurity_OWASP.server.scenario_cache import prepare_scenario_cache


def main() -> None:
    parser = argparse.ArgumentParser(description="Generate validated scenario cache bundles.")
    parser.add_argument("--config", default="", help="Path to scenario authoring JSON config.")
    parser.add_argument("--cache-dir", default="", help="Output scenario cache directory.")
    parser.add_argument("--seed-start", type=int, default=0)
    parser.add_argument("--difficulty-buckets", type=int, default=0)
    parser.add_argument("--train-per-bucket", type=int, default=0)
    parser.add_argument("--validation-per-bucket", type=int, default=0)
    parser.add_argument("--heldout-per-bucket", type=int, default=0)
    parser.add_argument("--force", action="store_true", help="Overwrite existing bundles.")
    args = parser.parse_args()

    if args.difficulty_buckets:
        os.environ["CYBERSECURITY_OWASP_DIFFICULTY_BUCKETS"] = str(args.difficulty_buckets)
    if args.train_per_bucket:
        os.environ["CYBERSECURITY_OWASP_TRAIN_SCENARIOS_PER_BUCKET"] = str(args.train_per_bucket)
    if args.validation_per_bucket:
        os.environ["CYBERSECURITY_OWASP_VALIDATION_SCENARIOS_PER_BUCKET"] = str(args.validation_per_bucket)
    if args.heldout_per_bucket:
        os.environ["CYBERSECURITY_OWASP_HELDOUT_SCENARIOS_PER_BUCKET"] = str(args.heldout_per_bucket)
    if args.config:
        os.environ["CYBERSECURITY_OWASP_SCENARIO_CONFIG"] = args.config
    if args.cache_dir:
        os.environ["CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR"] = args.cache_dir

    settings = load_scenario_authoring_config()
    cache_dir = Path(args.cache_dir or settings.runtime.cache_dir)
    result = prepare_scenario_cache(
        cache_dir=cache_dir,
        settings=settings,
        seed_start=args.seed_start,
        force=args.force,
    )
    print(json.dumps(result, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()

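The CLI above is a thin wrapper around `prepare_scenario_cache`; the same cache prep can be driven programmatically with the keyword signature used in the script (sketch, without the env-var plumbing):

```python
from pathlib import Path

from CyberSecurity_OWASP.config import load_scenario_authoring_config
from CyberSecurity_OWASP.server.scenario_cache import prepare_scenario_cache

# Equivalent of `generate_scenario_cache.py --cache-dir /tmp/owasp-cache --force`.
settings = load_scenario_authoring_config()
result = prepare_scenario_cache(
    cache_dir=Path("/tmp/owasp-cache"),
    settings=settings,
    seed_start=0,
    force=True,
)
print(sorted(result))  # top-level keys of the cache-prep summary
```
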
scripts/generate_scenarios.sh
CHANGED

@@ -1,3 +1,3 @@
 #!/usr/bin/env bash
 set -euo pipefail
-uv run python
+uv run python scripts/generate_scenario_cache.py --train-per-bucket 3 --validation-per-bucket 3 --heldout-per-bucket 3

scripts/modal_ephemeral_train.py
CHANGED

@@ -12,6 +12,7 @@ the local process, so the run disappears when ``modal run`` exits.
 from __future__ import annotations
 
 import json
+import os
 import subprocess
 import time
 from datetime import datetime

@@ -23,14 +24,18 @@ import modal
 
 APP_NAME = "CyberSecurity_OWASP-ephemeral-training"
 SECRET_NAME = "CyberSecurity_OWASP-secrets"
+SCENARIO_CACHE_VOLUME_NAME = "CyberSecurity_OWASP-scenario-cache"
+SCENARIO_CACHE_DIR = Path("/scenario-cache")
 REMOTE_PROJECT = "/root/CyberSecurity_OWASP"
 PROJECT_ROOT = Path(__file__).resolve().parents[1]
 
 app = modal.App(APP_NAME)
+scenario_cache_volume = modal.Volume.from_name(SCENARIO_CACHE_VOLUME_NAME, create_if_missing=True)
 
 image = (
     modal.Image.debian_slim(python_version="3.11")
     .apt_install("git")
+    .pip_install("openenv-core[core]>=0.2.2", "trackio>=0.22.0")
     .add_local_dir(
         PROJECT_ROOT,
         remote_path=REMOTE_PROJECT,

@@ -46,11 +51,17 @@ image = (
             "*.pyc",
         ],
     )
-    .run_commands(f"pip install -e {REMOTE_PROJECT}")
+    .run_commands(f"pip install --no-deps -e {REMOTE_PROJECT}")
     .workdir(REMOTE_PROJECT)
 )
 
 
+def _configure_scenario_cache_env(*, required: bool = True) -> None:
+    SCENARIO_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    os.environ["CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR"] = str(SCENARIO_CACHE_DIR)
+    os.environ["CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE"] = "require" if required else "fallback"
+
+
 class NoopTrainer:
     """Deterministic placeholder policy for cheap Modal smoke runs."""

@@ -66,9 +77,49 @@ class NoopTrainer:
     ]
 
 
+@app.function(
+    image=image,
+    timeout=60 * 60,
+    volumes={SCENARIO_CACHE_DIR: scenario_cache_volume},
+)
+def prepare_ephemeral_scenario_cache(
+    seed_start: int = 0,
+    difficulty_buckets: int = 0,
+    train_per_bucket: int = 0,
+    validation_per_bucket: int = 0,
+    heldout_per_bucket: int = 0,
+    force: bool = False,
+) -> dict[str, Any]:
+    import os
+
+    if difficulty_buckets:
+        os.environ["CYBERSECURITY_OWASP_DIFFICULTY_BUCKETS"] = str(difficulty_buckets)
+    if train_per_bucket:
+        os.environ["CYBERSECURITY_OWASP_TRAIN_SCENARIOS_PER_BUCKET"] = str(train_per_bucket)
+    if validation_per_bucket:
+        os.environ["CYBERSECURITY_OWASP_VALIDATION_SCENARIOS_PER_BUCKET"] = str(validation_per_bucket)
+    if heldout_per_bucket:
+        os.environ["CYBERSECURITY_OWASP_HELDOUT_SCENARIOS_PER_BUCKET"] = str(heldout_per_bucket)
+    _configure_scenario_cache_env(required=False)
+    from CyberSecurity_OWASP.config import load_scenario_authoring_config
+    from CyberSecurity_OWASP.server.scenario_cache import prepare_scenario_cache
+
+    settings = load_scenario_authoring_config()
+    result = prepare_scenario_cache(
+        cache_dir=SCENARIO_CACHE_DIR,
+        settings=settings,
+        seed_start=seed_start,
+        force=force,
+    )
+    scenario_cache_volume.commit()
+    result["scenario_cache_volume"] = SCENARIO_CACHE_VOLUME_NAME
+    return result
+
+
 @app.function(
     image=image,
     timeout=60 * 30,
+    volumes={SCENARIO_CACHE_DIR: scenario_cache_volume},
     secrets=[modal.Secret.from_name(SECRET_NAME, required_keys=["HF_TOKEN"])],
 )
 def run_ephemeral_smoke(

@@ -77,10 +128,13 @@ def run_ephemeral_smoke(
     trackio_space_id: str = "",
     trackio_project: str = "CyberSecurity_OWASP-smoke",
 ) -> dict[str, Any]:
+    _configure_scenario_cache_env(required=True)
     from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
+    from CyberSecurity_OWASP.config import load_scenario_authoring_config
     from CyberSecurity_OWASP.server.CyberSecurity_OWASP_environment import (
         CybersecurityOwaspEnvironment,
     )
+    from CyberSecurity_OWASP.server.scenario_cache import ScenarioCache
     from training.rollout import rollout_once
     from training.trackio_utils import (
         aggregate_episode_metrics,

@@ -91,11 +145,26 @@ def run_ephemeral_smoke(
         trackio_run,
     )
 
+    scenario_cache_volume.reload()
+    settings = load_scenario_authoring_config()
+    cache_coverage = ScenarioCache(SCENARIO_CACHE_DIR, settings=settings).assert_coverage(
+        split="validation",
+        difficulty=0,
+    )
+    available_scenarios = int(
+        cache_coverage.get("counts", {}).get("validation", {}).get("0", 0)
+    )
+    if available_scenarios < episodes:
+        raise RuntimeError(
+            "Scenario cache does not cover this smoke run. Run prepare-cache "
+            f"with a larger validation count. available={available_scenarios}, episodes={episodes}"
+        )
     baseline = []
     oracle = []
     run_context = {
         "algo": "modal_ephemeral_smoke",
         "env_version": "0.1.0",
     }

@@ -125,16 +194,28 @@ def run_ephemeral_smoke(
     oracle_env = CybersecurityOwaspEnvironment()
     oracle_env.reset(seed=seed, split="validation")
     hidden = oracle_env.state.hidden_facts
     oracle_env.step(
         CyberSecurityOWASPAction(
             arguments={
-                "policy_rule": "Only owner or billing_admin in same tenant may read invoices.",
             },
         )
     )

@@ -186,6 +267,9 @@ def run_ephemeral_smoke(
         "baseline_mean_reward": mean(baseline, "reward_total"),
         "oracle_mean_reward": mean(oracle, "reward_total"),
         "oracle_success_rate": mean(oracle, "success"),
         "tracking_metrics": tracking_metrics,
         "tracking_trace_rows": trace_table_rows(episode_records),
         "baseline": baseline,

@@ -356,8 +440,23 @@ def main(
     trackio_space_id: str = "",
     trackio_project: str = "CyberSecurity_OWASP-smoke",
     run_name: str = "",
 ) -> None:
         result = run_ephemeral_smoke.remote(
             episodes=episodes,
             seed_start=seed_start,

@@ -389,5 +488,5 @@ def main(
         print(json.dumps(result, indent=2, sort_keys=True))
     else:
         raise ValueError(
-            "mode must be 'smoke', 'grpo-config', 'verify-trackio', or 'inspect-trackio'"
         )
+
|
| 163 |
baseline = []
|
| 164 |
oracle = []
|
| 165 |
run_context = {
|
| 166 |
"algo": "modal_ephemeral_smoke",
|
| 167 |
+
"reward_version": "reward_v2",
|
| 168 |
"env_version": "0.1.0",
|
| 169 |
}
|
| 170 |
|
|
|
|
| 194 |
oracle_env = CybersecurityOwaspEnvironment()
|
| 195 |
oracle_env.reset(seed=seed, split="validation")
|
| 196 |
hidden = oracle_env.state.hidden_facts
|
| 197 |
+
evidence = oracle_env.step(
|
| 198 |
+
CyberSecurityOWASPAction(
|
| 199 |
+
tool_name="send_local_request",
|
| 200 |
+
arguments={
|
| 201 |
+
"method": "GET",
|
| 202 |
+
"path": f"/invoices/{hidden['other_invoice_id']}",
|
| 203 |
+
"user_id": hidden["owner_user_id"],
|
| 204 |
+
},
|
| 205 |
+
)
|
| 206 |
+
)
|
| 207 |
+
trace_id = "req_001"
|
| 208 |
+
if '"trace_id": "req_' in evidence.last_tool_result:
|
| 209 |
+
trace_id = evidence.last_tool_result.split('"trace_id": "', 1)[1].split('"', 1)[0]
|
| 210 |
oracle_env.step(
|
| 211 |
CyberSecurityOWASPAction(
|
| 212 |
+
tool_name="submit_diagnosis",
|
| 213 |
arguments={
|
| 214 |
+
"bug_class": "idor_ownership_bug",
|
| 215 |
+
"route": "GET /invoices/{invoice_id}",
|
| 216 |
+
"violated_policy_rule": "Only owner or billing_admin in same tenant may read invoices.",
|
| 217 |
+
"evidence_trace_ids": [trace_id],
|
| 218 |
+
"fix_plan": "Add tenant and owner/admin checks before returning invoice data.",
|
|
|
|
| 219 |
},
|
| 220 |
)
|
| 221 |
)
|
|
|
|
| 267 |
"baseline_mean_reward": mean(baseline, "reward_total"),
|
| 268 |
"oracle_mean_reward": mean(oracle, "reward_total"),
|
| 269 |
"oracle_success_rate": mean(oracle, "success"),
|
| 270 |
+
"scenario_cache_volume": SCENARIO_CACHE_VOLUME_NAME,
|
| 271 |
+
"scenario_cache_mode": "require",
|
| 272 |
+
"scenario_cache_coverage": cache_coverage,
|
| 273 |
"tracking_metrics": tracking_metrics,
|
| 274 |
"tracking_trace_rows": trace_table_rows(episode_records),
|
| 275 |
"baseline": baseline,
|
|
|
|
| 440 |
trackio_space_id: str = "",
|
| 441 |
trackio_project: str = "CyberSecurity_OWASP-smoke",
|
| 442 |
run_name: str = "",
|
| 443 |
+
cache_difficulty_buckets: int = 0,
|
| 444 |
+
cache_train_per_bucket: int = 0,
|
| 445 |
+
cache_validation_per_bucket: int = 0,
|
| 446 |
+
cache_heldout_per_bucket: int = 0,
|
| 447 |
+
cache_force: bool = False,
|
| 448 |
) -> None:
|
| 449 |
+
if mode == "prepare-cache":
|
| 450 |
+
result = prepare_ephemeral_scenario_cache.remote(
|
| 451 |
+
seed_start=seed_start,
|
| 452 |
+
difficulty_buckets=cache_difficulty_buckets,
|
| 453 |
+
train_per_bucket=cache_train_per_bucket,
|
| 454 |
+
validation_per_bucket=cache_validation_per_bucket,
|
| 455 |
+
heldout_per_bucket=cache_heldout_per_bucket,
|
| 456 |
+
force=cache_force,
|
| 457 |
+
)
|
| 458 |
+
print(json.dumps(result, indent=2, sort_keys=True))
|
| 459 |
+
elif mode == "smoke":
|
| 460 |
result = run_ephemeral_smoke.remote(
|
| 461 |
episodes=episodes,
|
| 462 |
seed_start=seed_start,
|
|
|
|
| 488 |
print(json.dumps(result, indent=2, sort_keys=True))
|
| 489 |
else:
|
| 490 |
raise ValueError(
|
| 491 |
+
"mode must be 'prepare-cache', 'smoke', 'grpo-config', 'verify-trackio', or 'inspect-trackio'"
|
| 492 |
)
|
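For reference, a minimal sketch of the coverage gate that `run_ephemeral_smoke` applies before starting episodes. It assumes `assert_coverage` returns a `{"counts": {split: {str(difficulty): count}}}` mapping, as the lookups in this diff imply; the helper name is illustrative only, not part of the repo.

```python
# Illustrative only: mirrors the scenario-cache coverage check used above.
# Assumes coverage has shape {"counts": {split: {str(difficulty): int}}}.
def enough_cached_scenarios(coverage: dict, split: str, difficulty: int, needed: int) -> bool:
    available = int(coverage.get("counts", {}).get(split, {}).get(str(difficulty), 0))
    return available >= needed
```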
scripts/modal_train_grpo.py
CHANGED
@@ -10,7 +10,7 @@ Example:
     uv run --extra modal modal run scripts/modal_train_grpo.py \
       --max-steps 10 \
       --dataset-size 16 \
-      --num-generations
+      --num-generations 6 \
       --difficulty 0
 """

@@ -29,9 +29,11 @@ import modal
 APP_NAME = "CyberSecurity_OWASP-grpo"
 VOLUME_NAME = "CyberSecurity_OWASP-grpo-runs"
 CACHE_VOLUME_NAME = "CyberSecurity_OWASP-model-cache"
+SCENARIO_CACHE_VOLUME_NAME = "CyberSecurity_OWASP-scenario-cache"
 SECRET_NAME = "CyberSecurity_OWASP-secrets"
 RUNS_DIR = pathlib.Path("/runs")
 CACHE_DIR = pathlib.Path("/cache")
+SCENARIO_CACHE_DIR = pathlib.Path("/scenario-cache")
 HF_HOME_DIR = CACHE_DIR / "huggingface"
 HF_HUB_CACHE_DIR = HF_HOME_DIR / "hub"
 TORCH_HOME_DIR = CACHE_DIR / "torch"
@@ -46,6 +48,16 @@ DEFAULT_GEMMA_MODEL = "unsloth/gemma-4-E2B-it"
 _IMAGE_NOTICE_PRINTED = False


+def _ensure_gemma4_model(model_name: str) -> str:
+    if model_name != DEFAULT_GEMMA_MODEL:
+        raise ValueError(
+            "CyberSecurity_OWASP GRPO training is pinned to "
+            f"{DEFAULT_GEMMA_MODEL}, matching the Unsloth Gemma 4 E2B RL notebook. "
+            f"Received {model_name!r}."
+        )
+    return model_name
+
+
 def _model_repo_slug(model_name: str) -> str:
     return (
         model_name.replace("/", "-")
@@ -86,6 +98,17 @@ def _configure_modal_cache_env() -> dict[str, str]:
     return values


+def _configure_scenario_cache_env(*, required: bool = True) -> dict[str, str]:
+    values = {
+        "CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR": str(SCENARIO_CACHE_DIR),
+        "CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE": "require" if required else "fallback",
+    }
+    for key, value in values.items():
+        os.environ[key] = value
+    SCENARIO_CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    return values
+
+
 def _print_image_startup_notice() -> None:
     global _IMAGE_NOTICE_PRINTED
     if _IMAGE_NOTICE_PRINTED:
@@ -134,6 +157,16 @@ def _is_config_mode() -> bool:
     return False


+def _is_prepare_cache_mode() -> bool:
+    args = sys.argv[1:]
+    for index, arg in enumerate(args):
+        if arg == "--mode" and index + 1 < len(args):
+            return args[index + 1] == "prepare-cache"
+        if arg.startswith("--mode="):
+            return arg.split("=", 1)[1] == "prepare-cache"
+    return False
+
+
 _load_local_env_file()

@@ -153,7 +186,10 @@ def _source_mode() -> str:


 def _training_image() -> modal.Image:
-
+    if _is_prepare_cache_mode():
+        return _scenario_cache_image()
+    if not _is_prepare_cache_mode():
+        _print_image_startup_notice()
     image = (
         modal.Image.from_registry(
             "nvidia/cuda:12.8.0-devel-ubuntu22.04",
@@ -225,28 +261,182 @@ def _training_image() -> modal.Image:
     ).workdir(REMOTE_PROJECT)


+def _scenario_cache_image() -> modal.Image:
+    image = (
+        modal.Image.debian_slim(python_version="3.11")
+        .apt_install("git")
+        .uv_pip_install("openenv-core[core]>=0.2.3", "trackio>=0.25.0")
+    )
+
+    if _source_mode() == "public":
+        repo_url = _cli_arg_value("repo-url", PUBLIC_REPO_URL)
+        repo_branch = _cli_arg_value("repo-branch", PUBLIC_REPO_BRANCH)
+        image = image.run_commands(
+            f"git clone --depth 1 --branch {repo_branch} {repo_url} {REMOTE_PROJECT}",
+            f"python -m pip install --no-deps -e {REMOTE_PROJECT}",
+        )
+    else:
+        image = image.add_local_dir(
+            PROJECT_ROOT,
+            remote_path=REMOTE_PROJECT,
+            copy=True,
+            ignore=[
+                ".git",
+                ".venv",
+                ".env",
+                ".env.*",
+                "__pycache__",
+                ".pytest_cache",
+                "outputs",
+                "*.pyc",
+            ],
+        )
+        image = image.run_commands(
+            f"python -m pip install --no-deps -e {REMOTE_PROJECT}",
+        )
+    return image.workdir(REMOTE_PROJECT)
+
+
 app = modal.App(APP_NAME)
 volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
 cache_volume = modal.Volume.from_name(CACHE_VOLUME_NAME, create_if_missing=True)
+scenario_cache_volume = modal.Volume.from_name(SCENARIO_CACHE_VOLUME_NAME, create_if_missing=True)
 secrets = _modal_secrets()
+scenario_cache_image = _scenario_cache_image()
 training_image = _training_image()


+@app.function(
+    image=scenario_cache_image,
+    timeout=2 * 60 * 60,
+    volumes={SCENARIO_CACHE_DIR: scenario_cache_volume},
+)
+def prepare_modal_scenario_cache(
+    seed_start: int = 0,
+    difficulty_buckets: int = 0,
+    train_per_bucket: int = 0,
+    validation_per_bucket: int = 0,
+    heldout_per_bucket: int = 0,
+    force: bool = False,
+) -> dict[str, Any]:
+    if difficulty_buckets:
+        os.environ["CYBERSECURITY_OWASP_DIFFICULTY_BUCKETS"] = str(difficulty_buckets)
+    if train_per_bucket:
+        os.environ["CYBERSECURITY_OWASP_TRAIN_SCENARIOS_PER_BUCKET"] = str(train_per_bucket)
+    if validation_per_bucket:
+        os.environ["CYBERSECURITY_OWASP_VALIDATION_SCENARIOS_PER_BUCKET"] = str(validation_per_bucket)
+    if heldout_per_bucket:
+        os.environ["CYBERSECURITY_OWASP_HELDOUT_SCENARIOS_PER_BUCKET"] = str(heldout_per_bucket)
+    _configure_scenario_cache_env(required=False)
+    from CyberSecurity_OWASP.config import load_scenario_authoring_config
+    from CyberSecurity_OWASP.server.scenario_cache import prepare_scenario_cache
+
+    settings = load_scenario_authoring_config()
+    result = prepare_scenario_cache(
+        cache_dir=SCENARIO_CACHE_DIR,
+        settings=settings,
+        seed_start=seed_start,
+        force=force,
+    )
+    scenario_cache_volume.commit()
+    result["scenario_cache_volume"] = SCENARIO_CACHE_VOLUME_NAME
+    return result
+
+
+@app.function(
+    image=scenario_cache_image,
+    timeout=60 * 10,
+    volumes={SCENARIO_CACHE_DIR: scenario_cache_volume},
+)
+def verify_modal_scenario_cache_for_training(
+    split: str = "train",
+    difficulty: int = 0,
+    dataset_size: int = 2,
+    seed_start: int = 0,
+) -> dict[str, Any]:
+    _configure_scenario_cache_env(required=True)
+    scenario_cache_volume.reload()
+
+    from CyberSecurity_OWASP.config import load_scenario_authoring_config
+    from CyberSecurity_OWASP.server.CyberSecurity_OWASP_environment import (
+        CybersecurityOwaspEnvironment,
+    )
+    from CyberSecurity_OWASP.reward_config import compute_token_penalty
+    from CyberSecurity_OWASP.server.curriculum import CurriculumController
+    from CyberSecurity_OWASP.server.scenario_cache import ScenarioCache
+
+    settings = load_scenario_authoring_config()
+    scenario_profile = CurriculumController(settings=settings).select_profile(
+        seed=seed_start,
+        split=split,
+        requested_difficulty=difficulty,
+    )
+    resolved_difficulty = int(scenario_profile["difficulty"])
+    cache = ScenarioCache(SCENARIO_CACHE_DIR, settings=settings)
+    coverage = cache.assert_coverage(split=split, difficulty=resolved_difficulty)
+    available_scenarios = int(
+        coverage.get("counts", {})
+        .get(split, {})
+        .get(str(resolved_difficulty), 0)
+    )
+    if available_scenarios < dataset_size:
+        raise RuntimeError(
+            "Scenario cache does not cover this Modal dataset. Run "
+            "--mode prepare-cache with a larger per-bucket count before training. "
+            f"available={available_scenarios}, requested_dataset_size={dataset_size}, "
+            f"split={split}, difficulty={resolved_difficulty}"
+        )
+
+    env = CybersecurityOwaspEnvironment()
+    try:
+        obs = env.reset(seed=seed_start, split=split, difficulty=difficulty)
+        if not env.state.cache_hit:
+            raise RuntimeError("Scenario cache preflight reset did not hit cache.")
+        if env.state.metrics.get("scenario_compile_latency_ms", 0.0):
+            raise RuntimeError("Scenario cache preflight unexpectedly compiled a scenario.")
+        sample = {
+            "phase": obs.phase,
+            "task_id": env.state.task_id,
+            "cache_hit": env.state.cache_hit,
+            "scenario_hash": env.state.scenario_hash,
+            "reset_latency_ms": env.state.reset_latency_ms,
+            "bundle_load_latency_ms": env.state.metrics.get(
+                "scenario_bundle_load_latency_ms",
+                0.0,
+            ),
+        }
+    finally:
+        env.close()
+
+    return {
+        "scenario_cache_volume": SCENARIO_CACHE_VOLUME_NAME,
+        "scenario_cache_dir": str(SCENARIO_CACHE_DIR),
+        "scenario_cache_mode": "require",
+        "split": split,
+        "difficulty": resolved_difficulty,
+        "dataset_size": dataset_size,
+        "available_scenarios": available_scenarios,
+        "coverage": coverage,
+        "sample_reset": sample,
+    }
+
+
 @app.function(
     image=training_image,
     gpu="L4",
     timeout=4 * 60 * 60,
-    volumes={RUNS_DIR: volume, CACHE_DIR: cache_volume},
+    volumes={RUNS_DIR: volume, CACHE_DIR: cache_volume, SCENARIO_CACHE_DIR: scenario_cache_volume},
     secrets=secrets,
 )
 def check_training_imports() -> dict[str, str]:
     cache_env = _configure_modal_cache_env()
+    scenario_cache_env = _configure_scenario_cache_env(required=False)

     import torch
     import trackio
     from datasets import Dataset
     from trl import GRPOConfig, GRPOTrainer
-    from unsloth import
+    from unsloth import FastVisionModel

     from CyberSecurity_OWASP.server.CyberSecurity_OWASP_environment import (
         CybersecurityOwaspEnvironment,
@@ -260,12 +450,12 @@ def check_training_imports() -> dict[str, str]:
         "dataset": Dataset.__name__,
         "grpo_config": GRPOConfig.__name__,
         "grpo_trainer": GRPOTrainer.__name__,
-        "unsloth_model": FastLanguageModel.__name__,
         "unsloth_vision_model": FastVisionModel.__name__,
         "env": CybersecurityOwaspEnvironment.__name__,
         "reset_phase": obs.phase,
         "hf_home": cache_env["HF_HOME"],
         "hf_hub_cache": cache_env["HF_HUB_CACHE"],
+        "scenario_cache_dir": scenario_cache_env["CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR"],
     }

@@ -273,7 +463,7 @@ def check_training_imports() -> dict[str, str]:
     image=training_image,
     gpu="L4",
     timeout=4 * 60 * 60,
-    volumes={RUNS_DIR: volume, CACHE_DIR: cache_volume},
+    volumes={RUNS_DIR: volume, CACHE_DIR: cache_volume, SCENARIO_CACHE_DIR: scenario_cache_volume},
     secrets=secrets,
 )
 def train_cybersecurity_owasp_grpo(
@@ -289,7 +479,7 @@ def train_cybersecurity_owasp_grpo(
     lora_rank: int = 32,
     trackio_space_id: str = "Humanlearning/CyberSecurity_OWASP-trackio",
     trackio_project: str = "CyberSecurity_OWASP-grpo",
-    num_generations: int =
+    num_generations: int = 6,
     seed_start: int = 0,
     git_sha: str = "nogit",
     run_name: str = "",
@@ -303,10 +493,11 @@ def train_cybersecurity_owasp_grpo(
     import threading
     import time

+    model_name = _ensure_gemma4_model(model_name)
     cache_env = _configure_modal_cache_env()

     import torch
-    from unsloth import
+    from unsloth import FastVisionModel
     import transformers.utils.hub as transformers_hub
     from datasets import Dataset
     from huggingface_hub import snapshot_download, whoami
@@ -317,9 +508,13 @@ def train_cybersecurity_owasp_grpo(
     import trackio

     from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
+    from CyberSecurity_OWASP.config import load_scenario_authoring_config
     from CyberSecurity_OWASP.server.CyberSecurity_OWASP_environment import (
         CybersecurityOwaspEnvironment,
     )
+    from CyberSecurity_OWASP.reward_config import compute_token_penalty
+    from CyberSecurity_OWASP.server.curriculum import CurriculumController
+    from CyberSecurity_OWASP.server.scenario_cache import ScenarioCache
     from training.trackio_utils import (
         aggregate_episode_metrics,
         episode_record_from_state,
@@ -356,6 +551,7 @@ def train_cybersecurity_owasp_grpo(

     os.environ["TRACKIO_SPACE_ID"] = trackio_space_id
     os.environ["TRACKIO_PROJECT"] = trackio_project
+    os.environ.setdefault("CYBERSECURITY_OWASP_REWARD_MODE", "dense_train")

     model_slug = model_name.replace("/", "-")
     stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
@@ -370,13 +566,42 @@ def train_cybersecurity_owasp_grpo(
         print(f"Reloaded Modal model cache volume: {CACHE_VOLUME_NAME}")
     except Exception as exc:
         print(f"Model cache volume reload skipped: {exc!r}")
+    try:
+        scenario_cache_volume.reload()
+        print(f"Reloaded Modal scenario cache volume: {SCENARIO_CACHE_VOLUME_NAME}")
+    except Exception as exc:
+        print(f"Scenario cache volume reload skipped: {exc!r}")
     cache_env = _configure_modal_cache_env()
+    scenario_cache_env = _configure_scenario_cache_env(required=True)
+    scenario_settings = load_scenario_authoring_config()
+    scenario_profile = CurriculumController(settings=scenario_settings).select_profile(
+        seed=seed_start,
+        split=split,
+        requested_difficulty=difficulty,
+    )
+    scenario_cache = ScenarioCache(SCENARIO_CACHE_DIR, settings=scenario_settings)
+    scenario_cache_coverage = scenario_cache.assert_coverage(
+        split=split,
+        difficulty=int(scenario_profile["difficulty"]),
+    )
+    available_scenarios = int(
+        scenario_cache_coverage.get("counts", {})
+        .get(split, {})
+        .get(str(int(scenario_profile["difficulty"])), 0)
+    )
+    if available_scenarios < dataset_size:
+        raise RuntimeError(
+            "Scenario cache does not cover this Modal dataset. Run "
+            "--mode prepare-cache with a larger per-bucket count before training. "
+            f"available={available_scenarios}, requested_dataset_size={dataset_size}, "
+            f"split={split}, difficulty={scenario_profile['difficulty']}"
+        )

     training_prompt = (
         "You are a defensive AppSec repair agent in the local CyberSecurity_OWASP "
         "OpenEnv environment. Use only the provided local tools. Do not target real "
         "systems. Work step by step: inspect policy and generated code, reproduce the "
-        "authorization issue locally, submit a policy-tied
+        "authorization issue locally, submit a policy-tied diagnosis, patch the generated "
         "app, run visible tests, then submit the fix. Do not write explanations unless "
         "a tool argument needs evidence text."
     )
@@ -403,6 +628,8 @@ def train_cybersecurity_owasp_grpo(
             "difficulty": state.difficulty,
             "domain": state.domain,
             "bug_family": state.bug_family,
+            "cache_hit": state.cache_hit,
+            "scenario_hash": state.scenario_hash,
             "phase": state.phase,
             "step_count": state.step_count,
             "done": state.done,
@@ -463,7 +690,7 @@ def train_cybersecurity_owasp_grpo(
             obs = self._env.step(action)
             if not obs.last_action_valid:
                 self.invalid_actions += 1
-            self.reward = float(
+            self.reward = float(self._env.state.accumulated_reward)
             self.reward_breakdown = dict(obs.reward_breakdown or {})
             self.done = bool(obs.done)
             self.success = bool(self._env.state.success)
@@ -484,6 +711,8 @@ def train_cybersecurity_owasp_grpo(
                     "reward": self.reward,
                     "reward_breakdown": self.reward_breakdown,
                     "invalid_actions": self.invalid_actions,
+                    "scenario_cache_hit": self._env.state.cache_hit,
+                    "scenario_hash": self._env.state.scenario_hash,
                 }
             )
             return obs.message
@@ -575,29 +804,35 @@ def train_cybersecurity_owasp_grpo(
                 },
             )

-        def
+        def submit_diagnosis(
             self,
-
-
-
+            bug_class: str,
+            route: str,
+            violated_policy_rule: str,
+            evidence_trace_ids: list[str],
+            fix_plan: str,
         ) -> str:
             """
-            Submit structured
+            Submit structured diagnosis for the suspected authorization bug.

             Args:
-
-
-
+                bug_class: Short class such as idor_ownership_bug.
+                route: Method and route pattern believed to be vulnerable.
+                violated_policy_rule: Policy rule that the behavior violates.
+                evidence_trace_ids: Request trace IDs from local evidence tools.
+                fix_plan: Concise secure repair plan.

             Returns:
-
+                Diagnosis acceptance result and next phase information.
             """
             return self._step(
-                "
+                "submit_diagnosis",
                 {
-                    "
-                    "
-                    "
+                    "bug_class": bug_class,
+                    "route": route,
+                    "violated_policy_rule": violated_policy_rule,
+                    "evidence_trace_ids": evidence_trace_ids,
+                    "fix_plan": fix_plan,
                 },
             )

@@ -637,8 +872,12 @@ def train_cybersecurity_owasp_grpo(
             """Take no action."""
             return self._step("noop")

-        def _score(self) -> float:
-
+        def _score(self, completion_tokens: int = 0) -> float:
+            token_penalty = compute_token_penalty(completion_tokens)
+            self._env.state.completion_tokens = int(completion_tokens)
+            self._env.state.metrics["completion_tokens"] = int(completion_tokens)
+            self._env.state.metrics["token_penalty"] = token_penalty
+            return float(self._env.state.accumulated_reward + token_penalty)

         def __del__(self):
             try:
@@ -667,24 +906,31 @@ def train_cybersecurity_owasp_grpo(
         return float(sum(values) / len(values)) if values else 0.0

     def cybersecurity_owasp_reward(environments, **kwargs) -> list[float]:
-        rewards = [float(env._score()) for env in environments]
         completions = kwargs.get("completions") or kwargs.get("completion") or []
+        completion_texts = [_completion_to_text(item) for item in completions]
+        completion_tokens = [len(text.split()) for text in completion_texts]
+        rewards = [
+            float(env._score(completion_tokens[index] if index < len(completion_tokens) else 0))
+            for index, env in enumerate(environments)
+        ]
         trace_step["value"] += 1

         episode_records = []
-        for env, reward in zip(environments, rewards):
+        for index, (env, reward) in enumerate(zip(environments, rewards)):
             record = episode_record_from_state(
                 env._env.state,
                 run_context={
                     "base_model": model_name,
                     "algo": "grpo",
-                    "reward_version": "
+                    "reward_version": "reward_v2",
                     "env_version": "0.1.0",
                 },
             )
             record.update(
                 {
                     "reward_total": reward,
+                    "reward_token_penalty": float(env._env.state.metrics.get("token_penalty", 0.0)),
+                    "completion_tokens": completion_tokens[index] if index < len(completion_tokens) else 0,
                     "success": bool(getattr(env, "success", False)),
                 }
             )
@@ -761,6 +1007,10 @@ def train_cybersecurity_owasp_grpo(
             log_trackio_metrics(
                 {
                     "system/model_cache_hit": float(cache_hit),
+                    "system/scenario_cache_required": 1.0,
+                    "system/scenario_cache_entries": float(
+                        scenario_cache_coverage.get("entries", 0)
+                    ),
                     "system/hub_push_enabled": float(push_to_hub),
                 },
                 step=int(state.global_step or 0),
@@ -805,6 +1055,10 @@ def train_cybersecurity_owasp_grpo(
     print(f"Output repo: {output_repo_id}")
     print(f"Run name: {run_name}")
     print(f"Model cache volume: {CACHE_VOLUME_NAME}")
+    print(f"Scenario cache volume: {SCENARIO_CACHE_VOLUME_NAME}")
+    print(f"Scenario cache dir: {scenario_cache_env['CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR']}")
+    print("Scenario cache mode: require")
+    print(f"Scenario cache coverage: {scenario_cache_coverage}")
     print(f"HF_HOME: {cache_env['HF_HOME']}")
     print(f"HF_HUB_CACHE: {cache_env['HF_HUB_CACHE']}")
     print(f"Torch cache: {cache_env['TORCH_HOME']}")
@@ -839,7 +1093,7 @@ def train_cybersecurity_owasp_grpo(
     )

     print(f"Loading model with Unsloth from_pretrained: {model_name}")
-    model_api = FastVisionModel
+    model_api = FastVisionModel
     model, tokenizer = model_api.from_pretrained(
         model_name=model_name,
         max_seq_length=max_seq_length,
@@ -854,34 +1108,11 @@ def train_cybersecurity_owasp_grpo(
     try:
         tokenizer = add_response_schema(tokenizer)
     except Exception as exc:
-
-
-
-
-
-        )
-        else:
-            print(f"Tokenizer response schema add failed before cloning: {exc!r}")
-            for template_source in ("Qwen/Qwen3-0.6B", "Qwen/Qwen2.5-0.5B-Instruct"):
-                try:
-                    model, tokenizer, added_tokens = clone_chat_template(
-                        model,
-                        tokenizer,
-                        template_source,
-                    )
-                    print(
-                        "Cloned response-schema-capable chat template "
-                        f"from {template_source}; added {len(added_tokens)} tokens."
-                    )
-                    tokenizer = add_response_schema(tokenizer)
-                    break
-                except Exception as clone_exc:
-                    print(
-                        "Tokenizer response schema fallback failed for "
-                        f"{template_source}: {clone_exc!r}"
-                    )
-            else:
-                raise
+        print(
+            "Tokenizer response schema add skipped for Gemma 4 processor, "
+            "matching the Unsloth Gemma 4 GRPO notebook pattern: "
+            f"{exc!r}"
+        )

     model = model_api.get_peft_model(
         model,
@@ -1001,8 +1232,10 @@ def train_cybersecurity_owasp_grpo(
         print("Skipping Hub push for this run. Pass --push-to-hub to upload adapters.")
     volume.commit()
     cache_volume.commit()
+    scenario_cache_volume.commit()
     print(f"Committed run volume: {VOLUME_NAME}")
     print(f"Committed model cache volume: {CACHE_VOLUME_NAME}")
+    print(f"Committed scenario cache volume: {SCENARIO_CACHE_VOLUME_NAME}")
     try:
         trackio.finish()
     except RuntimeError as exc:
@@ -1025,6 +1258,8 @@ def train_cybersecurity_owasp_grpo(
         "repo_url": repo_url,
         "repo_branch": repo_branch,
         "push_to_hub": push_to_hub,
+        "scenario_cache_volume": SCENARIO_CACHE_VOLUME_NAME,
+        "scenario_cache_mode": "require",
     }

@@ -1043,7 +1278,7 @@ def main(
     lora_rank: int = 32,
     trackio_space_id: str = "Humanlearning/CyberSecurity_OWASP-trackio",
     trackio_project: str = "CyberSecurity_OWASP-grpo",
-    num_generations: int =
+    num_generations: int = 6,
     seed_start: int = 0,
     git_sha: str = "nogit",
     source_mode: str = "local",
@@ -1051,13 +1286,31 @@ def main(
     repo_branch: str = PUBLIC_REPO_BRANCH,
     detach: bool = False,
     push_to_hub: bool = False,
+    cache_seed_start: int = 0,
+    cache_difficulty_buckets: int = 0,
+    cache_train_per_bucket: int = 0,
+    cache_validation_per_bucket: int = 0,
+    cache_heldout_per_bucket: int = 0,
+    cache_force: bool = False,
 ) -> None:
+    model_name = _ensure_gemma4_model(model_name)
+    if mode == "prepare-cache":
+        result = prepare_modal_scenario_cache.remote(
+            seed_start=cache_seed_start,
+            difficulty_buckets=cache_difficulty_buckets,
+            train_per_bucket=cache_train_per_bucket,
+            validation_per_bucket=cache_validation_per_bucket,
+            heldout_per_bucket=cache_heldout_per_bucket,
+            force=cache_force,
+        )
+        print(f"Prepared scenario cache: {result}")
+        return
     if mode == "config":
         result = check_training_imports.remote()
         print(result)
         return
     if mode != "train":
-        raise ValueError("mode must be 'train' or 'config'")
+        raise ValueError("mode must be 'prepare-cache', 'train', or 'config'")

     trackio_space_id = trackio_space_id or os.environ.get(
         "TRACKIO_SPACE_ID",
@@ -1123,15 +1376,17 @@ def main(
     )
     print(f"Hub push enabled: {push_to_hub}")
     print(f"Model cache volume: {CACHE_VOLUME_NAME}")
+    print(f"Scenario cache volume: {SCENARIO_CACHE_VOLUME_NAME}")
     print("Launch phases:")
     print(
         "1. Modal image build/validation: happens before remote Python logs; "
         "slow when local source or dependency layers changed."
     )
-    print("2.
-    print("3.
-    print("4.
-    print("5.
+    print("2. CPU-only scenario cache preflight in CyberSecurity_OWASP-scenario-cache.")
+    print("3. GPU container start on one L4 only after cache preflight passes.")
+    print("4. Model cache check in CyberSecurity_OWASP-model-cache.")
+    print("5. Cached snapshot load into GPU RAM with Unsloth progress.")
+    print("6. GRPO steps, Trackio sync, and volume commit.")
     print(
         "If there is a long pause after trainer.train() starts, watch for "
         "Training heartbeat lines every 30 seconds."
@@ -1159,6 +1414,13 @@ def main(
         repo_branch=repo_branch,
         push_to_hub=push_to_hub,
     )
+    preflight = verify_modal_scenario_cache_for_training.remote(
+        split=split,
+        difficulty=difficulty,
+        dataset_size=dataset_size,
+        seed_start=seed_start,
+    )
+    print(f"CPU scenario cache preflight passed: {preflight}")
     if detach:
         call = train_cybersecurity_owasp_grpo.spawn(**kwargs)
         print(f"Spawned Modal training call: {call.object_id}")
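For reference, a minimal sketch of the reward shaping introduced above: the environment's accumulated reward plus a length penalty derived from the completion. It assumes `compute_token_penalty` returns a float (typically non-positive for long completions); the standalone helper below is illustrative only.

```python
# Illustrative only: mirrors env._score(completion_tokens) in the GRPO reward function above.
from typing import Callable

def shaped_reward(
    accumulated_reward: float,
    completion_text: str,
    compute_token_penalty: Callable[[int], float],
) -> float:
    # Token count is approximated by whitespace splitting, as in the diff.
    completion_tokens = len(completion_text.split())
    return float(accumulated_reward + compute_token_penalty(completion_tokens))
```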
server/CyberSecurity_OWASP_environment.py
CHANGED
@@ -4,12 +4,15 @@ from __future__ import annotations

 import json
 import shutil
+import time
+from dataclasses import asdict
 from typing import Any
 from uuid import uuid4

 from openenv.core.env_server.interfaces import Environment

 try:
+    from ..config import load_scenario_authoring_config
     from ..models import (
         CyberSecurityOWASPAction,
         CyberSecurityOWASPObservation,
@@ -20,14 +23,19 @@ try:
     from .curriculum import CurriculumController
     from .episode_logger import EpisodeArtifactLogger
     from .reward_engine import evaluate_action
+    from ..rewards import should_terminate_for_flags
+    from .scenario_cache import ScenarioCache, ScenarioCacheMiss, cache_key_for_scenario
     from .scenario_factory import ScenarioFactory
 except ImportError:  # pragma: no cover
+    from config import load_scenario_authoring_config
     from models import CyberSecurityOWASPAction, CyberSecurityOWASPObservation, CyberSecurityOWASPState
     from validators import detect_cheating
     from server.action_tools import ActionTools
     from server.curriculum import CurriculumController
     from server.episode_logger import EpisodeArtifactLogger
     from server.reward_engine import evaluate_action
+    from rewards import should_terminate_for_flags
+    from server.scenario_cache import ScenarioCache, ScenarioCacheMiss, cache_key_for_scenario
     from server.scenario_factory import ScenarioFactory


@@ -40,7 +48,7 @@ ALLOWED_TOOLS = {
         "search_code",
         "send_local_request",
         "compare_identities",
-        "
+        "submit_diagnosis",
         "noop",
     },
     "patch": {
@@ -80,21 +88,31 @@ class CybersecurityOwaspEnvironment(
         episode_id: str | None = None,
         split: str = "train",
         difficulty: int = 0,
+        family_budget: dict[str, Any] | None = None,
         **_: Any,
     ) -> CyberSecurityOWASPObservation:
+        reset_started = time.perf_counter()
         self.close()
+        settings = load_scenario_authoring_config()
+        self._curriculum.settings = settings
         actual_seed = int(seed if seed is not None else 0)
         curriculum_profile = self._curriculum.select_profile(
             seed=actual_seed,
             split=split,
             requested_difficulty=difficulty,
         )
-        scenario = self.
+        scenario = self._load_or_compile_scenario(
             actual_seed,
             split=split,
-
+            requested_difficulty=difficulty,
             curriculum_profile=curriculum_profile,
+            family_budget=family_budget,
+            settings=settings,
         )
+        cache_info = dict(scenario.get("cache", {}))
+        cache_key = cache_info.get("cache_key", {})
+        scenario_hash = str(cache_info.get("scenario_hash", ""))
+        reset_latency_ms = (time.perf_counter() - reset_started) * 1000
         self._state = CyberSecurityOWASPState(
             episode_id=episode_id or str(uuid4()),
             task_id=scenario["task_id"],
@@ -107,6 +125,12 @@ class CybersecurityOwaspEnvironment(
             scenario_family=scenario["scenario_family"],
             template_id=scenario["template_id"],
             target_weakness=scenario["target_weakness"],
+            cache_key=cache_key,
+            scenario_hash=scenario_hash,
+            generator_version=str(cache_key.get("generator_version", settings.runtime.generator_version)),
+            verifier_version=str(cache_key.get("verifier_version", settings.runtime.verifier_version)),
+            cache_hit=bool(cache_info.get("hit", False)),
+            reset_latency_ms=reset_latency_ms,
             phase="discover",
             step_count=0,
             max_steps=40,
@@ -115,7 +139,17 @@ class CybersecurityOwaspEnvironment(
             visible_facts={"workspace_summary": scenario["workspace_summary"]},
             hidden_facts=scenario["hidden_facts"],
             curriculum_snapshot=scenario["curriculum_snapshot"],
-            metrics={
+            metrics={
+                "reset_count": 1,
+                "reset_latency_ms": reset_latency_ms,
+                "scenario_cache_hit": bool(cache_info.get("hit", False)),
+                "scenario_cache_mode": settings.runtime.cache_mode,
+                "scenario_cache_key": cache_key,
+                "scenario_hash": scenario_hash,
+                "scenario_bundle_load_latency_ms": float(cache_info.get("load_latency_ms", 0.0)),
+                "scenario_compile_latency_ms": float(cache_info.get("compile_latency_ms", 0.0)),
+                "scenario_cache_dir": settings.runtime.cache_dir,
+            },
         )
         self._task_brief = scenario["task_brief"]
         self._visible_policy_hint = scenario["public_hint"]
@@ -123,6 +157,51 @@ class CybersecurityOwaspEnvironment(
         self._last_done_observation = None
         return self._observation("Scenario ready. Start in discover phase.", reward=0.0)

+    def _load_or_compile_scenario(
+        self,
+        seed: int,
+        *,
+        split: str,
+        requested_difficulty: int,
+        curriculum_profile: dict[str, Any],
+        family_budget: dict[str, Any] | None,
+        settings: Any,
+    ) -> dict[str, Any]:
+        difficulty = int(curriculum_profile.get("difficulty", requested_difficulty))
+        if settings.runtime.cache_mode != "disabled":
+            cache = ScenarioCache(settings.runtime.cache_dir, settings=settings)
+            try:
+                cached = cache.load_bundle(
+                    seed=seed,
+                    split=split,
+                    difficulty=difficulty,
+                    family_budget=family_budget,
+                )
+                return cached.scenario
+            except ScenarioCacheMiss as exc:
+                if settings.runtime.cache_mode == "require":
+                    raise RuntimeError(
+                        "Scenario cache miss in required mode. Run cache prep before "
+                        "training/eval; runtime reset must not compile scenarios. "
+                        f"Details: {exc}"
+                    ) from exc
+
+        compile_started = time.perf_counter()
+        scenario = self._scenario_factory.compile_scenario(
+            seed,
+            split=split,
+            difficulty=requested_difficulty,
+            curriculum_profile=curriculum_profile,
+        )
+        key = cache_key_for_scenario(scenario, settings=settings)
+        scenario["cache"] = {
+            "hit": False,
+            "cache_key": asdict(key),
+            "scenario_hash": key.scenario_hash,
+            "compile_latency_ms": (time.perf_counter() - compile_started) * 1000,
+        }
+        return scenario
+
     def step(
         self,
         action: CyberSecurityOWASPAction,
@@ -195,8 +274,6 @@ class CybersecurityOwaspEnvironment(
     def _execute(
         self, action: CyberSecurityOWASPAction, anti_cheat_flags: list[str]
     ) -> tuple[str, dict, dict[str, float], str | None]:
-        verifier, reward = evaluate_action(self._state, action, anti_cheat_flags)
-
         if action.tool_name in {
             "noop",
             "inspect_policy_graph",
@@ -213,16 +290,20 @@ class CybersecurityOwaspEnvironment(
                 self._visible_policy_hint,
                 self._workspace_summary,
             ).execute(action)
+            verifier, reward = evaluate_action(self._state, action, anti_cheat_flags)
             return result.message, verifier, reward, result.visible_test_result
-        if action.tool_name == "
+        if action.tool_name == "submit_diagnosis":
             verifier, reward = evaluate_action(self._state, action, anti_cheat_flags)
             self._state.verification_summary = verifier
-
+            self._state.diagnosis = dict(action.arguments or {})
+            if verifier.get("diagnosis", {}).get("valid"):
+                self._state.diagnosis_submitted = True
                 self._state.finding_submitted = True
                 self._state.phase = "patch"
-                return "
-            return "
+                return "Diagnosis recorded. Patch phase unlocked.", verifier, reward, None
+            return "Diagnosis was not specific enough to unlock patching.", verifier, reward, None
         if action.tool_name == "run_visible_tests":
+            self._state.visible_test_count += 1
             verifier, reward = evaluate_action(self._state, action, anti_cheat_flags)
             self._state.verification_summary = verifier
             visible_tests = json.dumps(verifier.get("visible", {}), indent=2, sort_keys=True)
@@ -256,6 +337,11 @@ class CybersecurityOwaspEnvironment(
         self._state.last_reward = float(reward.get("total", 0.0))
         self._state.accumulated_reward += self._state.last_reward
         self._state.reward_history.append(reward)
+        flags = list((verifier or {}).get("anti_cheat_flags", []) or [])
+        if flags and should_terminate_for_flags(flags):
+            self._state.done = True
+            self._state.phase = "done"
self._state.phase = "done"
|
| 344 |
+
self._state.failure_reason = "anti_cheat_violation"
|
| 345 |
if self._state.step_count >= self._state.max_steps and not self._state.done:
|
| 346 |
self._state.done = True
|
| 347 |
self._state.phase = "done"
|
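The reset path above either loads a prebuilt bundle or falls back to compiling one, depending on `settings.runtime.cache_mode`. A minimal sketch of exercising that behavior, assuming the environment can be constructed without arguments and that the `CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE` override used elsewhere in this commit selects the mode:

```python
# Sketch only: assumes this repo's module layout and the env-var override
# CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE referenced by the Modal scripts in this commit.
import os

from server.CyberSecurity_OWASP_environment import CybersecurityOwaspEnvironment

# "fallback" compiles inline on a cache miss; "require" raises RuntimeError instead,
# so training/eval jobs fail fast when the cache prep step was skipped.
os.environ["CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE"] = "fallback"

env = CybersecurityOwaspEnvironment()
env.reset(seed=7, split="train", difficulty=0)

metrics = env.state.metrics
print(metrics["scenario_cache_hit"], round(metrics["reset_latency_ms"], 1))
```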
server/__init__.py
CHANGED
|
@@ -10,6 +10,7 @@ from .adversarial_designer import BoundedAdversarialDesigner
|
|
| 10 |
from .CyberSecurity_OWASP_environment import CybersecurityOwaspEnvironment
|
| 11 |
from .curriculum import CurriculumController
|
| 12 |
from .scenario_factory import ScenarioFactory
|
|
|
|
| 13 |
from .verifier import MultiLayerVerifier
|
| 14 |
|
| 15 |
__all__ = [
|
|
@@ -17,5 +18,6 @@ __all__ = [
|
|
| 17 |
"CurriculumController",
|
| 18 |
"CybersecurityOwaspEnvironment",
|
| 19 |
"MultiLayerVerifier",
|
|
|
|
| 20 |
"ScenarioFactory",
|
| 21 |
]
|
|
|
|
| 10 |
from .CyberSecurity_OWASP_environment import CybersecurityOwaspEnvironment
|
| 11 |
from .curriculum import CurriculumController
|
| 12 |
from .scenario_factory import ScenarioFactory
|
| 13 |
+
from .scenario_cache import ScenarioCache
|
| 14 |
from .verifier import MultiLayerVerifier
|
| 15 |
|
| 16 |
__all__ = [
|
|
|
|
| 18 |
"CurriculumController",
|
| 19 |
"CybersecurityOwaspEnvironment",
|
| 20 |
"MultiLayerVerifier",
|
| 21 |
+
"ScenarioCache",
|
| 22 |
"ScenarioFactory",
|
| 23 |
]
|
server/app_sandbox.py
CHANGED
|
@@ -59,6 +59,7 @@ class AppSandbox:
|
|
| 59 |
)
|
| 60 |
)
|
| 61 |
self.state.patch_diff = patch_diff
|
|
|
|
| 62 |
files_touched = self.state.metrics.setdefault("files_touched", [])
|
| 63 |
if path not in files_touched:
|
| 64 |
files_touched.append(path)
|
|
@@ -84,7 +85,14 @@ class AppSandbox:
|
|
| 84 |
def send_local_request(self, method: str, path: str, user_id: str | None = None) -> dict[str, Any]:
|
| 85 |
if not is_local_route(path):
|
| 86 |
raise ValueError("send_local_request only accepts local route paths")
|
| 87 |
-
|
|
|
|
| 88 |
|
| 89 |
def compare_identities(
|
| 90 |
self,
|
|
@@ -95,11 +103,59 @@ class AppSandbox:
|
|
| 95 |
) -> dict[str, Any]:
|
| 96 |
if not is_local_route(path):
|
| 97 |
raise ValueError("compare_identities only accepts local route paths")
|
|
|
|
|
| 98 |
return {
|
| 99 |
-
"
|
| 100 |
-
"
|
|
|
|
| 101 |
}
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
def _resolve_path(self, path: str, *, write: bool = False) -> Path:
|
| 104 |
allowed, normalized_or_error = is_path_allowed(self.state, path, write=write)
|
| 105 |
if not allowed:
|
|
|
|
| 59 |
)
|
| 60 |
)
|
| 61 |
self.state.patch_diff = patch_diff
|
| 62 |
+
self.state.patch_attempt_count += 1
|
| 63 |
files_touched = self.state.metrics.setdefault("files_touched", [])
|
| 64 |
if path not in files_touched:
|
| 65 |
files_touched.append(path)
|
|
|
|
| 85 |
def send_local_request(self, method: str, path: str, user_id: str | None = None) -> dict[str, Any]:
|
| 86 |
if not is_local_route(path):
|
| 87 |
raise ValueError("send_local_request only accepts local route paths")
|
| 88 |
+
response = simulate_request(self.state, method, path, user_id)
|
| 89 |
+
trace_id = self._record_request_trace(
|
| 90 |
+
method=method,
|
| 91 |
+
path=path,
|
| 92 |
+
user_id=user_id,
|
| 93 |
+
status=int(response.get("status", 0) or 0),
|
| 94 |
+
)
|
| 95 |
+
return {"trace_id": trace_id, **response}
|
| 96 |
|
| 97 |
def compare_identities(
|
| 98 |
self,
|
|
|
|
| 103 |
) -> dict[str, Any]:
|
| 104 |
if not is_local_route(path):
|
| 105 |
raise ValueError("compare_identities only accepts local route paths")
|
| 106 |
+
first = simulate_request(self.state, method, path, first_user_id)
|
| 107 |
+
second = simulate_request(self.state, method, path, second_user_id)
|
| 108 |
+
trace_id = self._record_request_trace(
|
| 109 |
+
method=method,
|
| 110 |
+
path=path,
|
| 111 |
+
user_id=first_user_id,
|
| 112 |
+
status=int(first.get("status", 0) or 0),
|
| 113 |
+
comparison_user_id=second_user_id,
|
| 114 |
+
comparison_status=int(second.get("status", 0) or 0),
|
| 115 |
+
)
|
| 116 |
return {
|
| 117 |
+
"trace_id": trace_id,
|
| 118 |
+
"first": first,
|
| 119 |
+
"second": second,
|
| 120 |
}
|
| 121 |
|
| 122 |
+
def _record_request_trace(
|
| 123 |
+
self,
|
| 124 |
+
*,
|
| 125 |
+
method: str,
|
| 126 |
+
path: str,
|
| 127 |
+
user_id: str | None,
|
| 128 |
+
status: int,
|
| 129 |
+
comparison_user_id: str | None = None,
|
| 130 |
+
comparison_status: int | None = None,
|
| 131 |
+
) -> str:
|
| 132 |
+
trace_id = f"req_{len(self.state.request_trace) + 1:03d}"
|
| 133 |
+
hidden = self.state.hidden_facts
|
| 134 |
+
unauthorized_success = (
|
| 135 |
+
str(hidden.get("other_invoice_id", "")) in path
|
| 136 |
+
and user_id == hidden.get("owner_user_id")
|
| 137 |
+
and status == 200
|
| 138 |
+
)
|
| 139 |
+
if comparison_user_id is not None and comparison_status is not None:
|
| 140 |
+
unauthorized_success = unauthorized_success or (
|
| 141 |
+
str(hidden.get("other_invoice_id", "")) in path
|
| 142 |
+
and comparison_user_id == hidden.get("owner_user_id")
|
| 143 |
+
and comparison_status == 200
|
| 144 |
+
)
|
| 145 |
+
self.state.request_trace.append(
|
| 146 |
+
{
|
| 147 |
+
"trace_id": trace_id,
|
| 148 |
+
"method": method.upper(),
|
| 149 |
+
"path": path,
|
| 150 |
+
"user_id": user_id,
|
| 151 |
+
"status": status,
|
| 152 |
+
"comparison_user_id": comparison_user_id,
|
| 153 |
+
"comparison_status": comparison_status,
|
| 154 |
+
"unauthorized_success": unauthorized_success,
|
| 155 |
+
}
|
| 156 |
+
)
|
| 157 |
+
return trace_id
|
| 158 |
+
|
| 159 |
def _resolve_path(self, path: str, *, write: bool = False) -> Path:
|
| 160 |
allowed, normalized_or_error = is_path_allowed(self.state, path, write=write)
|
| 161 |
if not allowed:
|
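Both request tools now record a trace entry and surface its `trace_id`, which is what a later `submit_diagnosis` can cite as evidence. A minimal probe sketch, assuming an `AppSandbox` instance already bound to a compiled scenario (the `hidden_facts` keys come from this commit's scenario factory):

```python
# Sketch only: `sandbox` is assumed to be an AppSandbox wired to a scenario workspace.
def probe_unauthorized_read(sandbox) -> str:
    hidden = sandbox.state.hidden_facts
    response = sandbox.send_local_request(
        "GET",
        f"/invoices/{hidden['other_invoice_id']}",
        user_id=hidden["owner_user_id"],
    )
    # The wrapper prepends the recorded trace id to the simulated response.
    assert response["trace_id"].startswith("req_")
    # The same trace entry carries the unauthorized_success flag that the verifier
    # and episode logger read later in this commit.
    assert sandbox.state.request_trace[-1]["trace_id"] == response["trace_id"]
    return response["trace_id"]
```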
server/curriculum.py
CHANGED
|
@@ -7,12 +7,14 @@ from dataclasses import dataclass, field
|
|
| 7 |
from typing import Any
|
| 8 |
|
| 9 |
try:
|
|
|
|
| 10 |
from ..models import CyberSecurityOWASPState
|
| 11 |
except ImportError: # pragma: no cover
|
|
|
|
| 12 |
from models import CyberSecurityOWASPState
|
| 13 |
|
| 14 |
|
| 15 |
-
DIFFICULTY_TIERS = ("
|
| 16 |
WEAKNESS_TARGETS = (
|
| 17 |
"same_role_cross_object",
|
| 18 |
"cross_tenant_boundary",
|
|
@@ -31,6 +33,7 @@ class CurriculumController:
|
|
| 31 |
outcomes_by_target: dict[str, list[bool]] = field(default_factory=lambda: defaultdict(list))
|
| 32 |
failures_by_target: dict[str, int] = field(default_factory=lambda: defaultdict(int))
|
| 33 |
episodes_seen: int = 0
|
|
|
|
| 34 |
|
| 35 |
def select_profile(
|
| 36 |
self,
|
|
@@ -48,7 +51,7 @@ class CurriculumController:
|
|
| 48 |
)
|
| 49 |
return {
|
| 50 |
"difficulty": difficulty,
|
| 51 |
-
"difficulty_tier":
|
| 52 |
"target_weakness": target,
|
| 53 |
"split": split,
|
| 54 |
"episodes_seen": self.episodes_seen,
|
|
@@ -82,11 +85,12 @@ class CurriculumController:
|
|
| 82 |
}
|
| 83 |
|
| 84 |
def _difficulty_for_split(self, split: str, requested_difficulty: int) -> int:
|
| 85 |
-
|
|
|
|
| 86 |
if split == "hidden_eval":
|
| 87 |
-
return max(3, difficulty)
|
| 88 |
if self.episodes_seen >= self.window_size and self._recent_reward_mean() > 10.0:
|
| 89 |
-
return min(difficulty + 1,
|
| 90 |
return difficulty
|
| 91 |
|
| 92 |
def _target_for_seed(self, seed: int, split: str) -> str:
|
|
@@ -97,3 +101,7 @@ class CurriculumController:
|
|
| 97 |
if not self.reward_trend:
|
| 98 |
return 0.0
|
| 99 |
return sum(self.reward_trend) / len(self.reward_trend)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from typing import Any
|
| 8 |
|
| 9 |
try:
|
| 10 |
+
from ..config import ScenarioAuthoringSettings, load_scenario_authoring_config
|
| 11 |
from ..models import CyberSecurityOWASPState
|
| 12 |
except ImportError: # pragma: no cover
|
| 13 |
+
from config import ScenarioAuthoringSettings, load_scenario_authoring_config
|
| 14 |
from models import CyberSecurityOWASPState
|
| 15 |
|
| 16 |
|
| 17 |
+
DIFFICULTY_TIERS = ("D0", "D1", "D2", "D3")
|
| 18 |
WEAKNESS_TARGETS = (
|
| 19 |
"same_role_cross_object",
|
| 20 |
"cross_tenant_boundary",
|
|
|
|
| 33 |
outcomes_by_target: dict[str, list[bool]] = field(default_factory=lambda: defaultdict(list))
|
| 34 |
failures_by_target: dict[str, int] = field(default_factory=lambda: defaultdict(int))
|
| 35 |
episodes_seen: int = 0
|
| 36 |
+
settings: ScenarioAuthoringSettings = field(default_factory=load_scenario_authoring_config)
|
| 37 |
|
| 38 |
def select_profile(
|
| 39 |
self,
|
|
|
|
| 51 |
)
|
| 52 |
return {
|
| 53 |
"difficulty": difficulty,
|
| 54 |
+
"difficulty_tier": self._difficulty_label(difficulty),
|
| 55 |
"target_weakness": target,
|
| 56 |
"split": split,
|
| 57 |
"episodes_seen": self.episodes_seen,
|
|
|
|
| 85 |
}
|
| 86 |
|
| 87 |
def _difficulty_for_split(self, split: str, requested_difficulty: int) -> int:
|
| 88 |
+
max_difficulty = self.settings.curriculum.difficulty_bucket_count - 1
|
| 89 |
+
difficulty = max(0, min(int(requested_difficulty), max_difficulty))
|
| 90 |
if split == "hidden_eval":
|
| 91 |
+
return max(min(3, max_difficulty), difficulty)
|
| 92 |
if self.episodes_seen >= self.window_size and self._recent_reward_mean() > 10.0:
|
| 93 |
+
return min(difficulty + 1, max_difficulty)
|
| 94 |
return difficulty
|
| 95 |
|
| 96 |
def _target_for_seed(self, seed: int, split: str) -> str:
|
|
|
|
| 101 |
if not self.reward_trend:
|
| 102 |
return 0.0
|
| 103 |
return sum(self.reward_trend) / len(self.reward_trend)
|
| 104 |
+
|
| 105 |
+
def _difficulty_label(self, difficulty: int) -> str:
|
| 106 |
+
labels = self.settings.curriculum.difficulty_labels
|
| 107 |
+
return labels[min(max(0, difficulty), len(labels) - 1)]
|
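`_difficulty_for_split` now clamps both the requested level and any promotion to the configured bucket count, and hidden-eval episodes are floored at the hardest default tier. A small worked sketch of those outcomes under the default four-bucket configuration assumed by this commit's tests:

```python
# Sketch only: assumes the default 4-bucket curriculum config (labels D0..D3).
from server.curriculum import CurriculumController

controller = CurriculumController()

# A training request above the top bucket is clamped into [0, bucket_count - 1].
train = controller.select_profile(seed=3, split="train", requested_difficulty=9)
assert train["difficulty"] == 3 and train["difficulty_tier"] == "D3"

# Hidden eval is pinned to the hardest available tier regardless of the request.
hidden = controller.select_profile(seed=3, split="hidden_eval", requested_difficulty=0)
assert hidden["difficulty_tier"] == "D3"
```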
server/episode_logger.py
CHANGED
|
@@ -49,6 +49,13 @@ class EpisodeArtifactLogger:
|
|
| 49 |
"regression_result": self._verifier_layer(state, "regression"),
|
| 50 |
"reward_breakdown": state.reward_history[-1] if state.reward_history else {},
|
| 51 |
"reward_breakdown_by_step": state.reward_history,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
"final_status": "resolved" if state.success else "failed",
|
| 53 |
"failure_reason": state.failure_reason,
|
| 54 |
"safety_violations": [
|
|
|
|
| 49 |
"regression_result": self._verifier_layer(state, "regression"),
|
| 50 |
"reward_breakdown": state.reward_history[-1] if state.reward_history else {},
|
| 51 |
"reward_breakdown_by_step": state.reward_history,
|
| 52 |
+
"total_reward": state.accumulated_reward,
|
| 53 |
+
"final_reward_breakdown": state.reward_history[-1] if state.reward_history else {},
|
| 54 |
+
"progress_reward_total": state.progress_reward_total,
|
| 55 |
+
"completion_tokens": state.completion_tokens,
|
| 56 |
+
"diagnosis_submitted": state.diagnosis_submitted,
|
| 57 |
+
"diagnosis": state.diagnosis,
|
| 58 |
+
"request_trace": state.request_trace,
|
| 59 |
"final_status": "resolved" if state.success else "failed",
|
| 60 |
"failure_reason": state.failure_reason,
|
| 61 |
"safety_violations": [
|
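With the extra keys above, each episode artifact carries the full reward and diagnosis trail. A minimal post-processing sketch, assuming the logger persists one JSON document per episode with exactly these field names:

```python
# Sketch only: assumes episode artifacts are stored as JSON with the keys added above.
import json
from pathlib import Path

def summarize_episode(artifact_path: str) -> dict:
    record = json.loads(Path(artifact_path).read_text(encoding="utf-8"))
    return {
        "status": record["final_status"],
        "total_reward": record["total_reward"],
        "diagnosis_submitted": record["diagnosis_submitted"],
        "unauthorized_requests": sum(
            1 for trace in record["request_trace"] if trace.get("unauthorized_success")
        ),
    }
```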
server/scenario_cache.py
ADDED
|
@@ -0,0 +1,525 @@
|
| 1 |
+
"""Versioned executable scenario cache for fast deterministic reset."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import hashlib
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import shutil
|
| 9 |
+
import tempfile
|
| 10 |
+
import time
|
| 11 |
+
from dataclasses import asdict, dataclass
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Any, Iterable
|
| 14 |
+
from uuid import uuid4
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
from ..config import ScenarioAuthoringSettings, load_scenario_authoring_config
|
| 18 |
+
from .curriculum import CurriculumController
|
| 19 |
+
from .scenario_factory import ScenarioFactory
|
| 20 |
+
except ImportError: # pragma: no cover
|
| 21 |
+
from config import ScenarioAuthoringSettings, load_scenario_authoring_config
|
| 22 |
+
from server.curriculum import CurriculumController
|
| 23 |
+
from server.scenario_factory import ScenarioFactory
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
SCENARIO_CACHE_REQUIRED_FILES = (
|
| 27 |
+
"scenario.json",
|
| 28 |
+
"app_source",
|
| 29 |
+
"policy_graph.json",
|
| 30 |
+
"visible_tests.py",
|
| 31 |
+
"hidden_tests.py",
|
| 32 |
+
"oracle_tests.py",
|
| 33 |
+
"expected_exploit_trace.json",
|
| 34 |
+
"reward_config.json",
|
| 35 |
+
"metadata.json",
|
| 36 |
+
)
|
| 37 |
+
MANIFEST_FILE = "manifest.json"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@dataclass(frozen=True)
|
| 41 |
+
class ScenarioCacheKey:
|
| 42 |
+
difficulty_level: int
|
| 43 |
+
authz_bug_type: str
|
| 44 |
+
app_family: str
|
| 45 |
+
framework: str
|
| 46 |
+
policy_shape: str
|
| 47 |
+
tenant_model: str
|
| 48 |
+
exploit_depth: str
|
| 49 |
+
patch_scope: str
|
| 50 |
+
regression_risk: str
|
| 51 |
+
generator_version: str
|
| 52 |
+
verifier_version: str
|
| 53 |
+
scenario_hash: str
|
| 54 |
+
|
| 55 |
+
def stable_id(self) -> str:
|
| 56 |
+
return _stable_hash(asdict(self))[:16]
|
| 57 |
+
|
| 58 |
+
def path_slug(self) -> str:
|
| 59 |
+
return (
|
| 60 |
+
f"d{self.difficulty_level}-{self.authz_bug_type}-"
|
| 61 |
+
f"{self.app_family}-{self.framework}-{self.stable_id()}"
|
| 62 |
+
).replace("/", "-").replace("_style_python", "")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@dataclass(frozen=True)
|
| 66 |
+
class ScenarioCacheLoad:
|
| 67 |
+
scenario: dict[str, Any]
|
| 68 |
+
bundle_path: Path
|
| 69 |
+
load_latency_ms: float
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class ScenarioCacheMiss(RuntimeError):
|
| 73 |
+
"""Raised when runtime cache mode requires a bundle that is not present."""
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class ScenarioCache:
|
| 77 |
+
"""Reads and writes complete executable scenario bundles."""
|
| 78 |
+
|
| 79 |
+
def __init__(
|
| 80 |
+
self,
|
| 81 |
+
root: str | Path,
|
| 82 |
+
*,
|
| 83 |
+
settings: ScenarioAuthoringSettings | None = None,
|
| 84 |
+
):
|
| 85 |
+
self.root = Path(root)
|
| 86 |
+
self.settings = settings or load_scenario_authoring_config()
|
| 87 |
+
|
| 88 |
+
def write_bundle(self, scenario: dict[str, Any], *, force: bool = False) -> dict[str, Any]:
|
| 89 |
+
key = cache_key_for_scenario(scenario, settings=self.settings)
|
| 90 |
+
bundle_path = self._bundle_path(
|
| 91 |
+
split=str(scenario["split"] if "split" in scenario else scenario["curriculum_snapshot"].get("split", "train")),
|
| 92 |
+
difficulty=int(scenario["difficulty"]),
|
| 93 |
+
key=key,
|
| 94 |
+
)
|
| 95 |
+
if bundle_path.exists() and not force:
|
| 96 |
+
metadata = self._read_json(bundle_path / "metadata.json")
|
| 97 |
+
return {"created": False, "bundle_path": str(bundle_path), **metadata}
|
| 98 |
+
|
| 99 |
+
workspace = Path(scenario["workspace"])
|
| 100 |
+
if bundle_path.exists():
|
| 101 |
+
shutil.rmtree(bundle_path)
|
| 102 |
+
bundle_path.mkdir(parents=True, exist_ok=True)
|
| 103 |
+
app_source = bundle_path / "app_source"
|
| 104 |
+
app_source.mkdir(parents=True, exist_ok=True)
|
| 105 |
+
|
| 106 |
+
editable_files = list(scenario["hidden_facts"].get("editable_files", []))
|
| 107 |
+
for rel in editable_files:
|
| 108 |
+
source = workspace / rel
|
| 109 |
+
target = app_source / rel
|
| 110 |
+
target.parent.mkdir(parents=True, exist_ok=True)
|
| 111 |
+
shutil.copy2(source, target)
|
| 112 |
+
|
| 113 |
+
hidden_facts = _cacheable_hidden_facts(scenario["hidden_facts"])
|
| 114 |
+
scenario_record = {
|
| 115 |
+
"schema_version": 1,
|
| 116 |
+
"task_id": scenario["task_id"],
|
| 117 |
+
"seed": _seed_from_task_id(scenario["task_id"]),
|
| 118 |
+
"split": scenario["curriculum_snapshot"].get("split", "train"),
|
| 119 |
+
"difficulty": int(scenario["difficulty"]),
|
| 120 |
+
"difficulty_tier": scenario["difficulty_tier"],
|
| 121 |
+
"domain": scenario["domain"],
|
| 122 |
+
"bug_family": scenario["bug_family"],
|
| 123 |
+
"scenario_family": scenario["scenario_family"],
|
| 124 |
+
"template_id": scenario["template_id"],
|
| 125 |
+
"target_weakness": scenario["target_weakness"],
|
| 126 |
+
"task_brief": scenario["task_brief"],
|
| 127 |
+
"public_hint": scenario["public_hint"],
|
| 128 |
+
"workspace_summary": scenario["workspace_summary"],
|
| 129 |
+
"hidden_facts": hidden_facts,
|
| 130 |
+
"editable_files": editable_files,
|
| 131 |
+
"curriculum_snapshot": scenario.get("curriculum_snapshot", {}),
|
| 132 |
+
"cache_key": asdict(key),
|
| 133 |
+
}
|
| 134 |
+
metadata = {
|
| 135 |
+
"cache_key": asdict(key),
|
| 136 |
+
"scenario_hash": key.scenario_hash,
|
| 137 |
+
"generator_version": self.settings.runtime.generator_version,
|
| 138 |
+
"verifier_version": self.settings.runtime.verifier_version,
|
| 139 |
+
"scenario_author_model": self.settings.scenario_author.model_id,
|
| 140 |
+
"scenario_author_provider": self.settings.scenario_author.provider,
|
| 141 |
+
"difficulty_calibration_strategy": (
|
| 142 |
+
self.settings.curriculum.difficulty_calibration_strategy
|
| 143 |
+
),
|
| 144 |
+
"validated": True,
|
| 145 |
+
"bundle_files": list(SCENARIO_CACHE_REQUIRED_FILES),
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
_write_json(bundle_path / "scenario.json", scenario_record)
|
| 149 |
+
_write_json(bundle_path / "policy_graph.json", scenario["public_hint"])
|
| 150 |
+
_write_json(bundle_path / "expected_exploit_trace.json", _expected_exploit_trace(hidden_facts))
|
| 151 |
+
_write_json(bundle_path / "reward_config.json", _reward_config())
|
| 152 |
+
_write_json(bundle_path / "metadata.json", metadata)
|
| 153 |
+
(bundle_path / "visible_tests.py").write_text(
|
| 154 |
+
(workspace / "tests/test_visible.py").read_text(encoding="utf-8"),
|
| 155 |
+
encoding="utf-8",
|
| 156 |
+
)
|
| 157 |
+
(bundle_path / "hidden_tests.py").write_text(
|
| 158 |
+
_hidden_tests_contract(),
|
| 159 |
+
encoding="utf-8",
|
| 160 |
+
)
|
| 161 |
+
(bundle_path / "oracle_tests.py").write_text(
|
| 162 |
+
_oracle_tests_contract(),
|
| 163 |
+
encoding="utf-8",
|
| 164 |
+
)
|
| 165 |
+
self._update_manifest(bundle_path, scenario_record, metadata)
|
| 166 |
+
return {"created": True, "bundle_path": str(bundle_path), **metadata}
|
| 167 |
+
|
| 168 |
+
def load_bundle(
|
| 169 |
+
self,
|
| 170 |
+
*,
|
| 171 |
+
seed: int,
|
| 172 |
+
split: str,
|
| 173 |
+
difficulty: int,
|
| 174 |
+
family_budget: dict[str, Any] | None = None,
|
| 175 |
+
) -> ScenarioCacheLoad:
|
| 176 |
+
del family_budget # reserved for weighted family sampling once multiple families exist
|
| 177 |
+
started = time.perf_counter()
|
| 178 |
+
bundle_path = self.find_bundle(seed=seed, split=split, difficulty=difficulty)
|
| 179 |
+
if bundle_path is None:
|
| 180 |
+
raise ScenarioCacheMiss(
|
| 181 |
+
f"No cached scenario bundle for split={split!r}, difficulty={difficulty}, seed={seed}."
|
| 182 |
+
)
|
| 183 |
+
validate_bundle(bundle_path)
|
| 184 |
+
scenario_record = self._read_json(bundle_path / "scenario.json")
|
| 185 |
+
metadata = self._read_json(bundle_path / "metadata.json")
|
| 186 |
+
workspace = _make_workspace(prefix=f"cybersecurity_owasp_cached_{split}_{seed}_")
|
| 187 |
+
shutil.copytree(bundle_path / "app_source", workspace, dirs_exist_ok=True)
|
| 188 |
+
|
| 189 |
+
editable_files = list(scenario_record["editable_files"])
|
| 190 |
+
hidden_facts = dict(scenario_record["hidden_facts"])
|
| 191 |
+
hidden_facts.update(
|
| 192 |
+
{
|
| 193 |
+
"workspace": str(workspace),
|
| 194 |
+
"editable_files": editable_files,
|
| 195 |
+
"initial_file_hashes": {
|
| 196 |
+
rel: (workspace / rel).read_text(encoding="utf-8")
|
| 197 |
+
for rel in editable_files
|
| 198 |
+
},
|
| 199 |
+
"scenario_cache": {
|
| 200 |
+
"bundle_path": str(bundle_path),
|
| 201 |
+
"cache_key": metadata["cache_key"],
|
| 202 |
+
"scenario_hash": metadata["scenario_hash"],
|
| 203 |
+
"generator_version": metadata["generator_version"],
|
| 204 |
+
"verifier_version": metadata["verifier_version"],
|
| 205 |
+
},
|
| 206 |
+
}
|
| 207 |
+
)
|
| 208 |
+
scenario = {
|
| 209 |
+
"task_id": scenario_record["task_id"],
|
| 210 |
+
"workspace": workspace,
|
| 211 |
+
"domain": scenario_record["domain"],
|
| 212 |
+
"bug_family": scenario_record["bug_family"],
|
| 213 |
+
"scenario_family": scenario_record["scenario_family"],
|
| 214 |
+
"template_id": scenario_record["template_id"],
|
| 215 |
+
"target_weakness": scenario_record["target_weakness"],
|
| 216 |
+
"difficulty": int(scenario_record["difficulty"]),
|
| 217 |
+
"difficulty_tier": scenario_record["difficulty_tier"],
|
| 218 |
+
"curriculum_snapshot": {
|
| 219 |
+
**scenario_record.get("curriculum_snapshot", {}),
|
| 220 |
+
"split": split,
|
| 221 |
+
"cache_key": metadata["cache_key"],
|
| 222 |
+
"scenario_hash": metadata["scenario_hash"],
|
| 223 |
+
},
|
| 224 |
+
"task_brief": scenario_record["task_brief"],
|
| 225 |
+
"public_hint": scenario_record["public_hint"],
|
| 226 |
+
"workspace_summary": scenario_record["workspace_summary"],
|
| 227 |
+
"hidden_facts": hidden_facts,
|
| 228 |
+
"cache": {
|
| 229 |
+
"hit": True,
|
| 230 |
+
"bundle_path": str(bundle_path),
|
| 231 |
+
"cache_key": metadata["cache_key"],
|
| 232 |
+
"scenario_hash": metadata["scenario_hash"],
|
| 233 |
+
"load_latency_ms": (time.perf_counter() - started) * 1000,
|
| 234 |
+
},
|
| 235 |
+
}
|
| 236 |
+
return ScenarioCacheLoad(
|
| 237 |
+
scenario=scenario,
|
| 238 |
+
bundle_path=bundle_path,
|
| 239 |
+
load_latency_ms=float(scenario["cache"]["load_latency_ms"]),
|
| 240 |
+
)
|
| 241 |
+
|
| 242 |
+
def find_bundle(self, *, seed: int, split: str, difficulty: int) -> Path | None:
|
| 243 |
+
entries = [
|
| 244 |
+
entry
|
| 245 |
+
for entry in self._manifest_entries()
|
| 246 |
+
if entry.get("seed") == int(seed)
|
| 247 |
+
and entry.get("split") == split
|
| 248 |
+
and entry.get("difficulty") == int(difficulty)
|
| 249 |
+
and entry.get("validated") is True
|
| 250 |
+
]
|
| 251 |
+
if not entries:
|
| 252 |
+
return None
|
| 253 |
+
selected = sorted(entries, key=lambda item: str(item.get("scenario_hash", "")))[0]
|
| 254 |
+
path = self.root / str(selected["bundle_path"])
|
| 255 |
+
return path if path.exists() else None
|
| 256 |
+
|
| 257 |
+
def coverage(self) -> dict[str, Any]:
|
| 258 |
+
counts: dict[str, dict[str, int]] = {}
|
| 259 |
+
for entry in self._manifest_entries():
|
| 260 |
+
if not entry.get("validated"):
|
| 261 |
+
continue
|
| 262 |
+
split = str(entry.get("split", "train"))
|
| 263 |
+
difficulty = str(entry.get("difficulty", 0))
|
| 264 |
+
counts.setdefault(split, {})
|
| 265 |
+
counts[split][difficulty] = counts[split].get(difficulty, 0) + 1
|
| 266 |
+
return {"root": str(self.root), "counts": counts, "entries": len(self._manifest_entries())}
|
| 267 |
+
|
| 268 |
+
def assert_coverage(self, *, split: str, difficulty: int | None = None) -> dict[str, Any]:
|
| 269 |
+
coverage = self.coverage()
|
| 270 |
+
required = self.settings.curriculum.minimum_for_split(split)
|
| 271 |
+
difficulties: Iterable[int]
|
| 272 |
+
if difficulty is None:
|
| 273 |
+
difficulties = range(self.settings.curriculum.difficulty_bucket_count)
|
| 274 |
+
else:
|
| 275 |
+
difficulties = [difficulty]
|
| 276 |
+
missing: list[dict[str, int]] = []
|
| 277 |
+
split_counts = coverage["counts"].get(split, {})
|
| 278 |
+
for item in difficulties:
|
| 279 |
+
actual = int(split_counts.get(str(item), 0))
|
| 280 |
+
if actual < required:
|
| 281 |
+
missing.append({"difficulty": int(item), "actual": actual, "required": required})
|
| 282 |
+
if missing:
|
| 283 |
+
raise ScenarioCacheMiss(
|
| 284 |
+
f"Scenario cache coverage is below minimum for split={split!r}: {missing}"
|
| 285 |
+
)
|
| 286 |
+
return coverage
|
| 287 |
+
|
| 288 |
+
def _bundle_path(self, *, split: str, difficulty: int, key: ScenarioCacheKey) -> Path:
|
| 289 |
+
return self.root / split / f"difficulty_{difficulty}" / key.path_slug()
|
| 290 |
+
|
| 291 |
+
def _manifest_entries(self) -> list[dict[str, Any]]:
|
| 292 |
+
manifest_path = self.root / MANIFEST_FILE
|
| 293 |
+
if manifest_path.exists():
|
| 294 |
+
return list(self._read_json(manifest_path).get("entries", []))
|
| 295 |
+
return self._scan_entries()
|
| 296 |
+
|
| 297 |
+
def _scan_entries(self) -> list[dict[str, Any]]:
|
| 298 |
+
entries = []
|
| 299 |
+
for metadata_path in self.root.glob("**/metadata.json"):
|
| 300 |
+
bundle_path = metadata_path.parent
|
| 301 |
+
try:
|
| 302 |
+
validate_bundle(bundle_path)
|
| 303 |
+
scenario = self._read_json(bundle_path / "scenario.json")
|
| 304 |
+
metadata = self._read_json(metadata_path)
|
| 305 |
+
except Exception:
|
| 306 |
+
continue
|
| 307 |
+
entries.append(_manifest_entry(self.root, bundle_path, scenario, metadata))
|
| 308 |
+
return entries
|
| 309 |
+
|
| 310 |
+
def _update_manifest(
|
| 311 |
+
self,
|
| 312 |
+
bundle_path: Path,
|
| 313 |
+
scenario_record: dict[str, Any],
|
| 314 |
+
metadata: dict[str, Any],
|
| 315 |
+
) -> None:
|
| 316 |
+
self.root.mkdir(parents=True, exist_ok=True)
|
| 317 |
+
manifest_path = self.root / MANIFEST_FILE
|
| 318 |
+
entries = self._manifest_entries()
|
| 319 |
+
entry = _manifest_entry(self.root, bundle_path, scenario_record, metadata)
|
| 320 |
+
entries = [
|
| 321 |
+
item for item in entries if item.get("bundle_path") != entry["bundle_path"]
|
| 322 |
+
]
|
| 323 |
+
entries.append(entry)
|
| 324 |
+
_write_json(manifest_path, {"schema_version": 1, "entries": sorted(entries, key=lambda item: item["bundle_path"])})
|
| 325 |
+
|
| 326 |
+
def _read_json(self, path: Path) -> dict[str, Any]:
|
| 327 |
+
return json.loads(path.read_text(encoding="utf-8"))
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def cache_key_for_scenario(
|
| 331 |
+
scenario: dict[str, Any],
|
| 332 |
+
*,
|
| 333 |
+
settings: ScenarioAuthoringSettings | None = None,
|
| 334 |
+
) -> ScenarioCacheKey:
|
| 335 |
+
settings = settings or load_scenario_authoring_config()
|
| 336 |
+
workspace_summary = scenario.get("workspace_summary", {})
|
| 337 |
+
hidden = scenario.get("hidden_facts", {})
|
| 338 |
+
stable_payload = {
|
| 339 |
+
"task_id": scenario.get("task_id"),
|
| 340 |
+
"difficulty": scenario.get("difficulty"),
|
| 341 |
+
"domain": scenario.get("domain"),
|
| 342 |
+
"bug_family": scenario.get("bug_family"),
|
| 343 |
+
"scenario_family": scenario.get("scenario_family"),
|
| 344 |
+
"template_id": scenario.get("template_id"),
|
| 345 |
+
"target_weakness": scenario.get("target_weakness"),
|
| 346 |
+
"public_hint": scenario.get("public_hint"),
|
| 347 |
+
"users": hidden.get("users"),
|
| 348 |
+
"invoices": hidden.get("invoices"),
|
| 349 |
+
}
|
| 350 |
+
return ScenarioCacheKey(
|
| 351 |
+
difficulty_level=int(scenario.get("difficulty", 0)),
|
| 352 |
+
authz_bug_type=str(scenario.get("bug_family", "unknown")),
|
| 353 |
+
app_family=str(scenario.get("domain", "unknown")),
|
| 354 |
+
framework=str(workspace_summary.get("framework", "unknown")),
|
| 355 |
+
policy_shape="owner_admin_tenant_policy",
|
| 356 |
+
tenant_model="same_tenant_with_foreign_tenant",
|
| 357 |
+
exploit_depth=str(scenario.get("target_weakness", "direct_object_reference")),
|
| 358 |
+
patch_scope="route_guard",
|
| 359 |
+
regression_risk="owner_admin_public_routes",
|
| 360 |
+
generator_version=settings.runtime.generator_version,
|
| 361 |
+
verifier_version=settings.runtime.verifier_version,
|
| 362 |
+
scenario_hash=_stable_hash(stable_payload),
|
| 363 |
+
)
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def validate_bundle(bundle_path: str | Path) -> None:
|
| 367 |
+
path = Path(bundle_path)
|
| 368 |
+
missing = [name for name in SCENARIO_CACHE_REQUIRED_FILES if not (path / name).exists()]
|
| 369 |
+
if missing:
|
| 370 |
+
raise ScenarioCacheMiss(f"Scenario bundle is incomplete at {path}: missing {missing}")
|
| 371 |
+
scenario = json.loads((path / "scenario.json").read_text(encoding="utf-8"))
|
| 372 |
+
editable = set(scenario.get("editable_files", []))
|
| 373 |
+
protected = {"hidden_tests.py", "oracle_tests.py", "reward_config.json", "metadata.json"}
|
| 374 |
+
if editable.intersection(protected):
|
| 375 |
+
raise ScenarioCacheMiss(f"Scenario bundle exposes protected files as editable: {protected}")
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
def prepare_scenario_cache(
|
| 379 |
+
*,
|
| 380 |
+
cache_dir: str | Path | None = None,
|
| 381 |
+
settings: ScenarioAuthoringSettings | None = None,
|
| 382 |
+
seed_start: int = 0,
|
| 383 |
+
force: bool = False,
|
| 384 |
+
) -> dict[str, Any]:
|
| 385 |
+
settings = settings or load_scenario_authoring_config()
|
| 386 |
+
cache_root = Path(cache_dir or settings.runtime.cache_dir)
|
| 387 |
+
cache = ScenarioCache(cache_root, settings=settings)
|
| 388 |
+
factory = ScenarioFactory()
|
| 389 |
+
curriculum = CurriculumController()
|
| 390 |
+
created: list[dict[str, Any]] = []
|
| 391 |
+
split_counts = {
|
| 392 |
+
"train": settings.curriculum.train_scenarios_per_bucket,
|
| 393 |
+
"validation": settings.curriculum.validation_scenarios_per_bucket,
|
| 394 |
+
"hidden_eval": settings.curriculum.heldout_eval_scenarios_per_bucket,
|
| 395 |
+
}
|
| 396 |
+
for split, per_bucket in split_counts.items():
|
| 397 |
+
for requested_difficulty in range(settings.curriculum.difficulty_bucket_count):
|
| 398 |
+
for index in range(per_bucket):
|
| 399 |
+
seed = int(seed_start) + requested_difficulty * per_bucket + index
|
| 400 |
+
profile = curriculum.select_profile(
|
| 401 |
+
seed=seed,
|
| 402 |
+
split=split,
|
| 403 |
+
requested_difficulty=requested_difficulty,
|
| 404 |
+
)
|
| 405 |
+
scenario = factory.compile_scenario(
|
| 406 |
+
seed,
|
| 407 |
+
split=split,
|
| 408 |
+
difficulty=requested_difficulty,
|
| 409 |
+
curriculum_profile=profile,
|
| 410 |
+
)
|
| 411 |
+
try:
|
| 412 |
+
created.append(cache.write_bundle(scenario, force=force))
|
| 413 |
+
finally:
|
| 414 |
+
workspace = scenario.get("workspace")
|
| 415 |
+
if workspace:
|
| 416 |
+
shutil.rmtree(workspace, ignore_errors=True)
|
| 417 |
+
return {
|
| 418 |
+
"cache_dir": str(cache_root),
|
| 419 |
+
"created": sum(1 for item in created if item.get("created")),
|
| 420 |
+
"seen": len(created),
|
| 421 |
+
"coverage": cache.coverage(),
|
| 422 |
+
"config": {
|
| 423 |
+
"difficulty_bucket_count": settings.curriculum.difficulty_bucket_count,
|
| 424 |
+
"train_scenarios_per_bucket": settings.curriculum.train_scenarios_per_bucket,
|
| 425 |
+
"validation_scenarios_per_bucket": settings.curriculum.validation_scenarios_per_bucket,
|
| 426 |
+
"heldout_eval_scenarios_per_bucket": settings.curriculum.heldout_eval_scenarios_per_bucket,
|
| 427 |
+
"scenario_author_model": settings.scenario_author.model_id,
|
| 428 |
+
},
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
def _cacheable_hidden_facts(hidden: dict[str, Any]) -> dict[str, Any]:
|
| 433 |
+
return {
|
| 434 |
+
key: value
|
| 435 |
+
for key, value in hidden.items()
|
| 436 |
+
if key not in {"workspace", "editable_files", "initial_file_hashes"}
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
def _expected_exploit_trace(hidden: dict[str, Any]) -> dict[str, Any]:
|
| 441 |
+
return {
|
| 442 |
+
"method": "GET",
|
| 443 |
+
"path": f"/invoices/{hidden['other_invoice_id']}",
|
| 444 |
+
"actor": hidden["owner_user_id"],
|
| 445 |
+
"vulnerable_status": 200,
|
| 446 |
+
"fixed_status": 403,
|
| 447 |
+
}
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def _reward_config() -> dict[str, Any]:
|
| 451 |
+
return {
|
| 452 |
+
"max_reward": 15.0,
|
| 453 |
+
"components": [
|
| 454 |
+
"discovery",
|
| 455 |
+
"security",
|
| 456 |
+
"regression",
|
| 457 |
+
"public_routes",
|
| 458 |
+
"patch_quality",
|
| 459 |
+
"visible_tests",
|
| 460 |
+
"safety",
|
| 461 |
+
"anti_cheat",
|
| 462 |
+
],
|
| 463 |
+
}
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
def _hidden_tests_contract() -> str:
|
| 467 |
+
return (
|
| 468 |
+
"# Hidden invariant contract for the deterministic verifier.\n"
|
| 469 |
+
"# Runtime observations must never expose this file.\n"
|
| 470 |
+
"CHECKS = ['same_tenant_other_user_blocked', 'cross_tenant_blocked']\n"
|
| 471 |
+
)
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
def _oracle_tests_contract() -> str:
|
| 475 |
+
return (
|
| 476 |
+
"# Oracle matrix contract for allowed/denied authorization tuples.\n"
|
| 477 |
+
"# Runtime observations must never expose this file.\n"
|
| 478 |
+
"CHECKS = ['owner_allowed', 'admin_allowed', 'public_allowed', 'cross_tenant_denied']\n"
|
| 479 |
+
)
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
def _manifest_entry(
|
| 483 |
+
root: Path,
|
| 484 |
+
bundle_path: Path,
|
| 485 |
+
scenario_record: dict[str, Any],
|
| 486 |
+
metadata: dict[str, Any],
|
| 487 |
+
) -> dict[str, Any]:
|
| 488 |
+
return {
|
| 489 |
+
"bundle_path": str(bundle_path.relative_to(root)).replace("\\", "/"),
|
| 490 |
+
"seed": int(scenario_record.get("seed", 0)),
|
| 491 |
+
"split": str(scenario_record.get("split", "train")),
|
| 492 |
+
"difficulty": int(scenario_record.get("difficulty", 0)),
|
| 493 |
+
"scenario_hash": str(metadata.get("scenario_hash", "")),
|
| 494 |
+
"cache_key": metadata.get("cache_key", {}),
|
| 495 |
+
"validated": bool(metadata.get("validated", False)),
|
| 496 |
+
}
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
def _make_workspace(prefix: str) -> Path:
|
| 500 |
+
root = Path(os.getenv("CYBERSECURITY_OWASP_WORKSPACE_ROOT", tempfile.gettempdir()))
|
| 501 |
+
root.mkdir(parents=True, exist_ok=True)
|
| 502 |
+
for _ in range(100):
|
| 503 |
+
workspace = root / f"{prefix}{uuid4().hex[:12]}"
|
| 504 |
+
try:
|
| 505 |
+
workspace.mkdir()
|
| 506 |
+
except FileExistsError:
|
| 507 |
+
continue
|
| 508 |
+
return workspace
|
| 509 |
+
raise RuntimeError("Unable to create isolated cached scenario workspace")
|
| 510 |
+
|
| 511 |
+
|
| 512 |
+
def _seed_from_task_id(task_id: str) -> int:
|
| 513 |
+
try:
|
| 514 |
+
return int(task_id.rsplit("-", 1)[-1])
|
| 515 |
+
except ValueError:
|
| 516 |
+
return 0
|
| 517 |
+
|
| 518 |
+
|
| 519 |
+
def _stable_hash(payload: Any) -> str:
|
| 520 |
+
encoded = json.dumps(payload, sort_keys=True, separators=(",", ":"), default=str)
|
| 521 |
+
return hashlib.sha256(encoded.encode("utf-8")).hexdigest()
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
def _write_json(path: Path, payload: Any) -> None:
|
| 525 |
+
path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
|
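Taken together, the module supports an offline prepare step followed by compile-free resets. A minimal workflow sketch using the signatures defined above; the cache directory path is illustrative only:

```python
# Sketch only: uses prepare_scenario_cache, ScenarioCache.assert_coverage and
# ScenarioCache.load_bundle as declared above; the path is an example, not a convention.
from server.scenario_cache import ScenarioCache, prepare_scenario_cache

# Offline: compile every split/difficulty bucket once and write validated bundles.
summary = prepare_scenario_cache(cache_dir="outputs/scenario_cache", seed_start=0)
print(summary["created"], summary["coverage"]["counts"])

# Preflight: fail fast if any training bucket is below the configured minimum.
cache = ScenarioCache("outputs/scenario_cache")
cache.assert_coverage(split="train")

# Reset time: load a bundle instead of recompiling the scenario.
loaded = cache.load_bundle(seed=0, split="train", difficulty=0)
print(loaded.scenario["task_id"], round(loaded.load_latency_ms, 1))
```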
server/scenario_factory.py
CHANGED
|
@@ -125,7 +125,7 @@ class ScenarioFactory:
|
|
| 125 |
"curriculum_snapshot": profile,
|
| 126 |
"task_brief": (
|
| 127 |
"Inspect the generated invoices app and policy. Find the broken "
|
| 128 |
-
"authorization behavior, submit a
|
| 129 |
"the app, preserve intended owner/admin/public behavior, then submit."
|
| 130 |
),
|
| 131 |
"public_hint": public_hint,
|
|
|
|
| 125 |
"curriculum_snapshot": profile,
|
| 126 |
"task_brief": (
|
| 127 |
"Inspect the generated invoices app and policy. Find the broken "
|
| 128 |
+
"authorization behavior, submit a diagnosis with local evidence, patch "
|
| 129 |
"the app, preserve intended owner/admin/public behavior, then submit."
|
| 130 |
),
|
| 131 |
"public_hint": public_hint,
|
server/verifier.py
CHANGED
|
@@ -49,8 +49,9 @@ class MultiLayerVerifier:
|
|
| 49 |
"invalid_action": invalid_action,
|
| 50 |
"repeated_action": self._is_repeated_action(state, action),
|
| 51 |
}
|
| 52 |
-
if action.tool_name == "
|
| 53 |
-
verifier_result["
|
|
|
|
| 54 |
elif action.tool_name == "run_visible_tests":
|
| 55 |
verifier_result["visible"] = run_visible_tests(state)
|
| 56 |
elif action.tool_name == "submit_fix":
|
|
|
|
| 49 |
"invalid_action": invalid_action,
|
| 50 |
"repeated_action": self._is_repeated_action(state, action),
|
| 51 |
}
|
| 52 |
+
if action.tool_name == "submit_diagnosis":
|
| 53 |
+
verifier_result["diagnosis"] = verify_finding(state, action.arguments)
|
| 54 |
+
verifier_result["finding"] = verifier_result["diagnosis"]
|
| 55 |
elif action.tool_name == "run_visible_tests":
|
| 56 |
verifier_result["visible"] = run_visible_tests(state)
|
| 57 |
elif action.tool_name == "submit_fix":
|
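The diagnosis layer now drives the phase change in the environment, and `finding` is kept as a backward-compatible alias for older reward code. A minimal sketch of how a consumer can read either key, assuming the result shape produced above:

```python
# Sketch only: mirrors how the environment gates the patch phase on this verifier layer.
def diagnosis_unlocks_patch(verifier_result: dict) -> bool:
    layer = verifier_result.get("diagnosis") or verifier_result.get("finding") or {}
    return bool(layer.get("valid"))
```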
tests/helpers.py
CHANGED
|
@@ -12,16 +12,28 @@ def make_env(seed: int = 7) -> CybersecurityOwaspEnvironment:
|
|
| 12 |
|
| 13 |
def submit_valid_finding(env: CybersecurityOwaspEnvironment):
|
| 14 |
hidden = env.state.hidden_facts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
return env.step(
|
| 16 |
CyberSecurityOWASPAction(
|
| 17 |
-
tool_name="
|
| 18 |
arguments={
|
| 19 |
-
"
|
| 20 |
-
"
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
"policy_rule": "owner or billing_admin in same tenant only",
|
| 25 |
},
|
| 26 |
)
|
| 27 |
)
|
|
|
|
| 12 |
|
| 13 |
def submit_valid_finding(env: CybersecurityOwaspEnvironment):
|
| 14 |
hidden = env.state.hidden_facts
|
| 15 |
+
trace = env.step(
|
| 16 |
+
CyberSecurityOWASPAction(
|
| 17 |
+
tool_name="send_local_request",
|
| 18 |
+
arguments={
|
| 19 |
+
"method": "GET",
|
| 20 |
+
"path": f"/invoices/{hidden['other_invoice_id']}",
|
| 21 |
+
"user_id": hidden["owner_user_id"],
|
| 22 |
+
},
|
| 23 |
+
)
|
| 24 |
+
)
|
| 25 |
+
trace_id = "req_001"
|
| 26 |
+
if '"trace_id": "req_' in trace.last_tool_result:
|
| 27 |
+
trace_id = trace.last_tool_result.split('"trace_id": "', 1)[1].split('"', 1)[0]
|
| 28 |
return env.step(
|
| 29 |
CyberSecurityOWASPAction(
|
| 30 |
+
tool_name="submit_diagnosis",
|
| 31 |
arguments={
|
| 32 |
+
"bug_class": "idor_ownership_bug",
|
| 33 |
+
"route": "GET /invoices/{invoice_id}",
|
| 34 |
+
"violated_policy_rule": "owner or billing_admin in same tenant only",
|
| 35 |
+
"evidence_trace_ids": [trace_id],
|
| 36 |
+
"fix_plan": "add tenant and owner/admin checks before returning the invoice",
|
|
|
|
| 37 |
},
|
| 38 |
)
|
| 39 |
)
|
tests/test_closed_loop_runtime.py
CHANGED
|
@@ -14,7 +14,7 @@ def test_curriculum_selects_profile_and_tracks_mastery():
|
|
| 14 |
controller = CurriculumController()
|
| 15 |
profile = controller.select_profile(seed=3, split="train", requested_difficulty=1)
|
| 16 |
|
| 17 |
-
assert profile["difficulty_tier"] == "
|
| 18 |
assert profile["target_weakness"]
|
| 19 |
assert "target_mastery" in profile["mastery"]
|
| 20 |
|
|
@@ -43,7 +43,7 @@ def test_reset_records_scenario_family_and_partial_observability():
|
|
| 43 |
serialized_hint = json.dumps(obs.visible_policy_hint).lower()
|
| 44 |
|
| 45 |
assert env.state.scenario_family.startswith("heldout.")
|
| 46 |
-
assert env.state.difficulty_tier
|
| 47 |
assert "oracle_matrix" not in serialized_hint
|
| 48 |
assert "hidden_tests" not in serialized_hint
|
| 49 |
assert "injected bug" not in serialized_hint
|
|
|
|
| 14 |
controller = CurriculumController()
|
| 15 |
profile = controller.select_profile(seed=3, split="train", requested_difficulty=1)
|
| 16 |
|
| 17 |
+
assert profile["difficulty_tier"] == "D1"
|
| 18 |
assert profile["target_weakness"]
|
| 19 |
assert "target_mastery" in profile["mastery"]
|
| 20 |
|
|
|
|
| 43 |
serialized_hint = json.dumps(obs.visible_policy_hint).lower()
|
| 44 |
|
| 45 |
assert env.state.scenario_family.startswith("heldout.")
|
| 46 |
+
assert env.state.difficulty_tier == "D3"
|
| 47 |
assert "oracle_matrix" not in serialized_hint
|
| 48 |
assert "hidden_tests" not in serialized_hint
|
| 49 |
assert "injected bug" not in serialized_hint
|
tests/test_invalid_actions.py
CHANGED
|
@@ -21,6 +21,8 @@ def test_blocked_hidden_path_does_not_crash():
|
|
| 21 |
assert obs.last_action_valid is False
|
| 22 |
assert "blocked" in (obs.last_action_error or "")
|
| 23 |
assert "blocked_file_probe" in env.state.anti_cheat_flags
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
def test_external_request_is_blocked():
|
|
@@ -33,6 +35,8 @@ def test_external_request_is_blocked():
|
|
| 33 |
)
|
| 34 |
assert obs.last_action_valid is False
|
| 35 |
assert "external_network_attempt" in env.state.anti_cheat_flags
|
|
|
|
|
|
|
| 36 |
|
| 37 |
|
| 38 |
def test_visible_tests_are_not_patchable():
|
|
|
|
| 21 |
assert obs.last_action_valid is False
|
| 22 |
assert "blocked" in (obs.last_action_error or "")
|
| 23 |
assert "blocked_file_probe" in env.state.anti_cheat_flags
|
| 24 |
+
assert obs.done is True
|
| 25 |
+
assert env.state.failure_reason == "anti_cheat_violation"
|
| 26 |
|
| 27 |
|
| 28 |
def test_external_request_is_blocked():
|
|
|
|
| 35 |
)
|
| 36 |
assert obs.last_action_valid is False
|
| 37 |
assert "external_network_attempt" in env.state.anti_cheat_flags
|
| 38 |
+
assert obs.done is True
|
| 39 |
+
assert env.state.failure_reason == "anti_cheat_violation"
|
| 40 |
|
| 41 |
|
| 42 |
def test_visible_tests_are_not_patchable():
|
tests/test_modal_scenario_cache_static.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_modal_train_uses_persistent_required_scenario_cache():
|
| 8 |
+
source = (ROOT / "scripts" / "modal_train_grpo.py").read_text(encoding="utf-8")
|
| 9 |
+
|
| 10 |
+
assert "SCENARIO_CACHE_VOLUME_NAME = \"CyberSecurity_OWASP-scenario-cache\"" in source
|
| 11 |
+
assert "SCENARIO_CACHE_DIR = pathlib.Path(\"/scenario-cache\")" in source
|
| 12 |
+
assert "CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE" in source
|
| 13 |
+
assert "\"require\" if required else \"fallback\"" in source
|
| 14 |
+
assert "mode == \"prepare-cache\"" in source
|
| 15 |
+
assert "def verify_modal_scenario_cache_for_training" in source
|
| 16 |
+
assert "CPU scenario cache preflight passed" in source
|
| 17 |
+
assert "scenario_cache.assert_coverage" in source
|
| 18 |
+
assert "volumes={RUNS_DIR: volume, CACHE_DIR: cache_volume, SCENARIO_CACHE_DIR: scenario_cache_volume}" in source
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_modal_ephemeral_smoke_uses_required_scenario_cache():
|
| 22 |
+
source = (ROOT / "scripts" / "modal_ephemeral_train.py").read_text(encoding="utf-8")
|
| 23 |
+
|
| 24 |
+
assert "SCENARIO_CACHE_VOLUME_NAME = \"CyberSecurity_OWASP-scenario-cache\"" in source
|
| 25 |
+
assert "SCENARIO_CACHE_DIR = Path(\"/scenario-cache\")" in source
|
| 26 |
+
assert "mode == \"prepare-cache\"" in source
|
| 27 |
+
assert "_configure_scenario_cache_env(required=True)" in source
|
| 28 |
+
assert "ScenarioCache(SCENARIO_CACHE_DIR" in source
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def test_modal_training_is_pinned_to_gemma4_e2b():
|
| 32 |
+
source = (ROOT / "scripts" / "modal_train_grpo.py").read_text(encoding="utf-8")
|
| 33 |
+
|
| 34 |
+
assert "DEFAULT_GEMMA_MODEL = \"unsloth/gemma-4-E2B-it\"" in source
|
| 35 |
+
assert "def _ensure_gemma4_model(model_name: str) -> str:" in source
|
| 36 |
+
assert "model_name = _ensure_gemma4_model(model_name)" in source
|
| 37 |
+
assert "from unsloth import FastVisionModel" in source
|
| 38 |
+
assert "Qwen" not in source
|
| 39 |
+
assert "FastLanguageModel" not in source
|
tests/test_reward_config.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from CyberSecurity_OWASP.reward_config import (
|
| 6 |
+
compute_token_penalty,
|
| 7 |
+
load_reward_settings,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_default_reward_config_has_descriptions():
|
| 12 |
+
settings = load_reward_settings()
|
| 13 |
+
|
| 14 |
+
assert settings.mode == "sparse_eval"
|
| 15 |
+
assert settings.training_mode == "dense_train"
|
| 16 |
+
assert settings.value("terminal_cap") == 15.0
|
| 17 |
+
for key, value in settings.raw.items():
|
| 18 |
+
if isinstance(value, dict):
|
| 19 |
+
assert value.get("description")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_reward_config_env_overrides(monkeypatch):
|
| 23 |
+
monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_MODE", "dense_train")
|
| 24 |
+
monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_STAGE", "late")
|
| 25 |
+
monkeypatch.setenv("CYBERSECURITY_OWASP_SHAPING_WEIGHT", "0.25")
|
| 26 |
+
|
| 27 |
+
settings = load_reward_settings()
|
| 28 |
+
|
| 29 |
+
assert settings.mode == "dense_train"
|
| 30 |
+
assert settings.stage == "late"
|
| 31 |
+
assert settings.shaping_weight == 0.25
|
| 32 |
+
assert compute_token_penalty(850, settings) == -0.5
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_reward_config_rejects_missing_descriptions(monkeypatch):
|
| 36 |
+
config_path = Path("outputs/test_reward_config_bad.yaml")
|
| 37 |
+
config_path.parent.mkdir(parents=True, exist_ok=True)
|
| 38 |
+
config_path.write_text(
|
| 39 |
+
"reward:\n mode: sparse_eval\n policy_inspected:\n value: 0.3\n",
|
| 40 |
+
encoding="utf-8",
|
| 41 |
+
)
|
| 42 |
+
try:
|
| 43 |
+
monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_CONFIG", str(config_path))
|
| 44 |
+
|
| 45 |
+
with pytest.raises(ValueError, match="description"):
|
| 46 |
+
load_reward_settings()
|
| 47 |
+
finally:
|
| 48 |
+
config_path.unlink(missing_ok=True)
|
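These tests pin the loader contract: env vars override the config file, and every component entry must carry a description. A minimal sketch of reading the effective settings from training code, with the flat import path assumed (the tests use the packaged path instead):

```python
# Sketch only: helper names come from the tests above; the flat import path assumes
# running from the repo root rather than the installed package.
import os

from reward_config import compute_token_penalty, load_reward_settings

os.environ["CYBERSECURITY_OWASP_REWARD_MODE"] = "dense_train"
os.environ["CYBERSECURITY_OWASP_SHAPING_WEIGHT"] = "0.25"

settings = load_reward_settings()
print(settings.mode, settings.shaping_weight)   # dense_train 0.25
print(settings.value("terminal_cap"))           # 15.0 with the default config
print(compute_token_penalty(850, settings))     # length penalty for a long completion
```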
tests/test_rewards.py
CHANGED
|
@@ -6,7 +6,7 @@ from .helpers import apply_secure_patch, make_env, secure_invoice_source, submit
|
|
| 6 |
def test_oracle_patch_gets_high_reward():
|
| 7 |
env = make_env(40)
|
| 8 |
finding = submit_valid_finding(env)
|
| 9 |
-
assert finding.reward_breakdown["discovery"] ==
|
| 10 |
apply_secure_patch(env)
|
| 11 |
visible = env.step(CyberSecurityOWASPAction(tool_name="run_visible_tests"))
|
| 12 |
assert visible.reward_breakdown["visible_tests"] == 1.0
|
|
@@ -65,3 +65,65 @@ def test_visible_tests_only_does_not_get_high_reward():
|
|
| 65 |
assert visible.reward_breakdown["visible_tests"] == 1.0
|
| 66 |
final = env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
|
| 67 |
assert final.reward_breakdown["total"] < 5.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
def test_oracle_patch_gets_high_reward():
|
| 7 |
env = make_env(40)
|
| 8 |
finding = submit_valid_finding(env)
|
| 9 |
+
assert finding.reward_breakdown["discovery"] == 1.0
|
| 10 |
apply_secure_patch(env)
|
| 11 |
```diff
     visible = env.step(CyberSecurityOWASPAction(tool_name="run_visible_tests"))
     assert visible.reward_breakdown["visible_tests"] == 1.0
     final = env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
     assert final.reward_breakdown["total"] < 5.0
+
+
+def test_sparse_mode_does_not_pay_progressive_reward(monkeypatch):
+    monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_MODE", "sparse_eval")
+    env = make_env(45)
+    obs = env.step(CyberSecurityOWASPAction(tool_name="inspect_policy_graph"))
+    assert obs.reward_breakdown["progressive"] == 0.0
+    assert obs.reward_breakdown["total"] == 0.0
+
+
+def test_dense_mode_pays_capped_progressive_reward(monkeypatch):
+    monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_MODE", "dense_train")
+    monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_STAGE", "early")
+    env = make_env(46)
+    obs = env.step(CyberSecurityOWASPAction(tool_name="inspect_policy_graph"))
+    assert obs.reward_breakdown["progressive"] == 0.30
+    assert obs.reward_breakdown["step_penalty"] < 0.0
+    assert obs.reward_breakdown["total"] > 0.0
+
+
+def test_terminal_score_unchanged_by_dense_shaping(monkeypatch):
+    monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_MODE", "sparse_eval")
+    sparse_env = make_env(47)
+    submit_valid_finding(sparse_env)
+    apply_secure_patch(sparse_env)
+    sparse_env.step(CyberSecurityOWASPAction(tool_name="run_visible_tests"))
+    sparse_final = sparse_env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
+
+    monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_MODE", "dense_train")
+    dense_env = make_env(47)
+    dense_env.step(CyberSecurityOWASPAction(tool_name="inspect_policy_graph"))
+    submit_valid_finding(dense_env)
+    apply_secure_patch(dense_env)
+    dense_env.step(CyberSecurityOWASPAction(tool_name="run_visible_tests"))
+    dense_final = dense_env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
+
+    assert dense_final.reward_breakdown["terminal_total"] == sparse_final.reward_breakdown["terminal_total"]
+    assert dense_final.reward_breakdown["train_total"] != dense_final.reward_breakdown["terminal_total"]
+
+
+def test_repeated_futile_actions_are_penalized(monkeypatch):
+    monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_MODE", "dense_train")
+    env = make_env(48)
+
+    first = env.step(CyberSecurityOWASPAction(tool_name="inspect_policy_graph"))
+    second = env.step(CyberSecurityOWASPAction(tool_name="inspect_policy_graph"))
+
+    assert first.reward_breakdown["progressive"] > 0.0
+    assert second.reward_breakdown["progressive"] == 0.0
+    assert second.reward_breakdown["behavior_penalty"] <= -0.10
+    assert second.reward_breakdown["total"] < 0.0
+
+
+def test_dense_episode_reward_cap_blocks_repeated_positive_farming(monkeypatch):
+    monkeypatch.setenv("CYBERSECURITY_OWASP_REWARD_MODE", "dense_train")
+    env = make_env(49)
+    env.state.accumulated_reward = 20.99
+
+    capped = env.step(CyberSecurityOWASPAction(tool_name="inspect_policy_graph"))
+
+    assert 0.0 <= capped.reward_breakdown["total"] <= 0.011
+    assert env.state.accumulated_reward <= 21.001
```
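For a quick manual check outside pytest, the same switches can be flipped through environment variables before building the environment. A minimal sketch, assuming the environment and action classes imported in the cache tests below and the default `reset` signature; `make_env` in the tests is assumed to wrap the same class:

```python
import os

# Assumed: the same reward-mode switches the tests above rely on.
os.environ["CYBERSECURITY_OWASP_REWARD_MODE"] = "dense_train"
os.environ["CYBERSECURITY_OWASP_REWARD_STAGE"] = "early"

from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
from CyberSecurity_OWASP.server.CyberSecurity_OWASP_environment import (
    CybersecurityOwaspEnvironment,
)

env = CybersecurityOwaspEnvironment()
env.reset(seed=46, split="train", difficulty=0)
obs = env.step(CyberSecurityOWASPAction(tool_name="inspect_policy_graph"))
# In dense_train/early the first policy inspection should pay the 0.30 shaping reward.
print(obs.reward_breakdown["progressive"], obs.reward_breakdown["total"])
env.close()
```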
tests/test_scenario_authoring_config.py
ADDED
@@ -0,0 +1,72 @@
```python
import json

import pytest

from CyberSecurity_OWASP.config import load_scenario_authoring_config


def test_default_scenario_authoring_config_uses_deepseek_defaults(monkeypatch):
    for key in list(
        name for name in __import__("os").environ if name.startswith("CYBERSECURITY_OWASP_")
    ):
        monkeypatch.delenv(key, raising=False)

    settings = load_scenario_authoring_config()

    assert settings.scenario_author.model_id == "deepseek-ai/DeepSeek-V4-Pro"
    assert settings.scenario_author.provider == "huggingface"
    assert settings.scenario_author.thinking_mode == "thinking"
    assert settings.scenario_author.reasoning_effort == "high"
    assert settings.scenario_author.temperature == 1.0
    assert settings.scenario_author.top_p == 1.0
    assert settings.curriculum.difficulty_bucket_count == 4
    assert settings.curriculum.train_scenarios_per_bucket == 25
    assert settings.curriculum.heldout_eval_scenarios_per_bucket == 10
    assert settings.curriculum.target_cache_hit_rate == 0.95
    assert settings.curriculum.target_reset_latency_ms == 200
    assert settings.curriculum.scenario_refresh_rate_per_epoch == 0.05
    assert settings.curriculum.difficulty_calibration_strategy == "baseline_agent_pass_rate"


def test_scenario_authoring_config_env_overrides(monkeypatch, tmp_path):
    config_path = tmp_path / "config.json"
    config_path.write_text(
        json.dumps(
            {
                "scenario_author": {},
                "curriculum": {"difficulty_labels": ["D0", "D1"]},
                "runtime": {},
            }
        ),
        encoding="utf-8",
    )
    monkeypatch.setenv("CYBERSECURITY_OWASP_SCENARIO_CONFIG", str(config_path))
    monkeypatch.setenv("CYBERSECURITY_OWASP_SCENARIO_AUTHOR_MODEL", "test/model")
    monkeypatch.setenv("CYBERSECURITY_OWASP_DIFFICULTY_BUCKETS", "2")
    monkeypatch.setenv("CYBERSECURITY_OWASP_TRAIN_SCENARIOS_PER_BUCKET", "3")
    monkeypatch.setenv("CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE", "require")

    settings = load_scenario_authoring_config()

    assert settings.scenario_author.model_id == "test/model"
    assert settings.curriculum.difficulty_bucket_count == 2
    assert settings.curriculum.train_scenarios_per_bucket == 3
    assert settings.runtime.cache_mode == "require"


def test_scenario_authoring_config_rejects_bad_values(monkeypatch, tmp_path):
    config_path = tmp_path / "bad.json"
    config_path.write_text(
        json.dumps(
            {
                "scenario_author": {"temperature": 0},
                "curriculum": {},
                "runtime": {},
            }
        ),
        encoding="utf-8",
    )
    monkeypatch.setenv("CYBERSECURITY_OWASP_SCENARIO_CONFIG", str(config_path))

    with pytest.raises(ValueError, match="sampling"):
        load_scenario_authoring_config()
```
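Read together, these tests imply a small on-disk config with `scenario_author`, `curriculum`, and `runtime` sections that `CYBERSECURITY_OWASP_SCENARIO_CONFIG` can point at. A sketch of writing and loading such a file; the path and most values are illustrative, and only the section names and keys exercised above come from the tests:

```python
import json
import os
from pathlib import Path

from CyberSecurity_OWASP.config import load_scenario_authoring_config

# Hypothetical local path; the section names mirror the test fixtures above.
config_path = Path("configs/scenario_authoring.local.json")
config_path.write_text(
    json.dumps(
        {
            "scenario_author": {"model_id": "deepseek-ai/DeepSeek-V4-Pro"},
            "curriculum": {"difficulty_labels": ["D0", "D1", "D2", "D3"]},
            "runtime": {},
        }
    ),
    encoding="utf-8",
)
os.environ["CYBERSECURITY_OWASP_SCENARIO_CONFIG"] = str(config_path)

settings = load_scenario_authoring_config()
print(settings.scenario_author.model_id, settings.curriculum.difficulty_bucket_count)
```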
tests/test_scenario_cache.py
ADDED
@@ -0,0 +1,148 @@
```python
import json
import shutil
from pathlib import Path

import pytest

from CyberSecurity_OWASP.config import load_scenario_authoring_config
from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
from CyberSecurity_OWASP.server.CyberSecurity_OWASP_environment import (
    CybersecurityOwaspEnvironment,
)
from CyberSecurity_OWASP.server.curriculum import CurriculumController
from CyberSecurity_OWASP.server.scenario_cache import (
    SCENARIO_CACHE_REQUIRED_FILES,
    ScenarioCache,
    ScenarioCacheMiss,
    cache_key_for_scenario,
    prepare_scenario_cache,
    validate_bundle,
)
from CyberSecurity_OWASP.server.scenario_factory import ScenarioFactory


def _small_cache(monkeypatch, tmp_path):
    monkeypatch.setenv("CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR", str(tmp_path))
    monkeypatch.setenv("CYBERSECURITY_OWASP_DIFFICULTY_BUCKETS", "1")
    monkeypatch.setenv("CYBERSECURITY_OWASP_TRAIN_SCENARIOS_PER_BUCKET", "1")
    monkeypatch.setenv("CYBERSECURITY_OWASP_VALIDATION_SCENARIOS_PER_BUCKET", "1")
    monkeypatch.setenv("CYBERSECURITY_OWASP_HELDOUT_SCENARIOS_PER_BUCKET", "1")
    settings = load_scenario_authoring_config()
    result = prepare_scenario_cache(cache_dir=tmp_path, settings=settings, force=True)
    return settings, result


def test_scenario_cache_bundle_contract_and_key_hash(monkeypatch, tmp_path):
    settings, result = _small_cache(monkeypatch, tmp_path)
    assert result["created"] >= 1

    cache = ScenarioCache(tmp_path, settings=settings)
    bundle_path = cache.find_bundle(seed=0, split="train", difficulty=0)
    assert bundle_path is not None
    validate_bundle(bundle_path)

    for name in SCENARIO_CACHE_REQUIRED_FILES:
        assert (bundle_path / name).exists()

    scenario = json.loads((bundle_path / "scenario.json").read_text(encoding="utf-8"))
    key = scenario["cache_key"]
    assert set(key) == {
        "difficulty_level",
        "authz_bug_type",
        "app_family",
        "framework",
        "policy_shape",
        "tenant_model",
        "exploit_depth",
        "patch_scope",
        "regression_risk",
        "generator_version",
        "verifier_version",
        "scenario_hash",
    }
    assert len(key["scenario_hash"]) == 64

    # The helper should produce the same hash for the same stable scenario payload.
    profile = CurriculumController(settings=settings).select_profile(
        seed=0,
        split="train",
        requested_difficulty=0,
    )
    compiled = ScenarioFactory().compile_scenario(
        0,
        split="train",
        difficulty=0,
        curriculum_profile=profile,
    )
    try:
        assert cache_key_for_scenario(compiled, settings=settings).scenario_hash == key["scenario_hash"]
    finally:
        shutil.rmtree(compiled["workspace"], ignore_errors=True)


def test_runtime_reset_uses_required_cache_without_compiling(monkeypatch, tmp_path):
    settings, _ = _small_cache(monkeypatch, tmp_path)
    monkeypatch.setenv("CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE", "require")

    def fail_compile(*args, **kwargs):
        raise AssertionError("reset must not compile scenarios in required cache mode")

    monkeypatch.setattr(ScenarioFactory, "compile_scenario", fail_compile)

    env = CybersecurityOwaspEnvironment()
    obs = env.reset(seed=0, split="train", difficulty=0)

    try:
        assert obs.phase == "discover"
        assert env.state.cache_hit is True
        assert env.state.scenario_hash
        assert env.state.metrics["scenario_cache_hit"] is True
        assert env.state.metrics["scenario_bundle_load_latency_ms"] >= 0.0
        assert env.state.reset_latency_ms >= 0.0
    finally:
        env.close()


def test_required_cache_mode_fails_on_miss(monkeypatch, tmp_path):
    monkeypatch.setenv("CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR", str(tmp_path))
    monkeypatch.setenv("CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE", "require")

    env = CybersecurityOwaspEnvironment()
    with pytest.raises(RuntimeError, match="Scenario cache miss"):
        env.reset(seed=999, split="train", difficulty=0)


def test_cached_hidden_files_are_not_editable_or_readable(monkeypatch, tmp_path):
    _small_cache(monkeypatch, tmp_path)
    monkeypatch.setenv("CYBERSECURITY_OWASP_SCENARIO_CACHE_MODE", "require")

    env = CybersecurityOwaspEnvironment()
    env.reset(seed=0, split="train", difficulty=0)
    try:
        editable = set(env.state.hidden_facts["editable_files"])
        assert "hidden_tests.py" not in editable
        assert "oracle_tests.py" not in editable

        obs = env.step(
            CyberSecurityOWASPAction(
                tool_name="read_file",
                arguments={"path": "hidden_tests.py"},
            )
        )
        assert obs.last_action_valid is False
        assert "blocked" in (obs.last_action_error or "")
    finally:
        env.close()


def test_cache_coverage_reports_missing_bucket(monkeypatch, tmp_path):
    settings, _ = _small_cache(monkeypatch, tmp_path)
    cache = ScenarioCache(tmp_path, settings=settings)
    assert cache.assert_coverage(split="train", difficulty=0)["entries"] >= 1

    missing = tmp_path / "manifest.json"
    missing.unlink()
    for metadata_path in tmp_path.glob("**/metadata.json"):
        metadata_path.unlink()
    with pytest.raises(ScenarioCacheMiss):
        cache.assert_coverage(split="train", difficulty=0)
```
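The same helpers can warm a cache once ahead of training and then verify coverage, mirroring `_small_cache` above. A minimal sketch; the cache directory is a hypothetical local path, and the split/difficulty arguments follow the tests:

```python
from pathlib import Path

from CyberSecurity_OWASP.config import load_scenario_authoring_config
from CyberSecurity_OWASP.server.scenario_cache import ScenarioCache, prepare_scenario_cache

# Hypothetical local cache directory; production runs point CYBERSECURITY_OWASP_SCENARIO_CACHE_DIR here.
cache_dir = Path(".scenario_cache")

settings = load_scenario_authoring_config()
result = prepare_scenario_cache(cache_dir=cache_dir, settings=settings, force=False)
print("bundles created:", result["created"])

# assert_coverage raises ScenarioCacheMiss when a bucket is incomplete.
cache = ScenarioCache(cache_dir, settings=settings)
coverage = cache.assert_coverage(split="train", difficulty=0)
print("train/difficulty-0 entries:", coverage["entries"])
```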
tests/test_trackio_utils.py
CHANGED
```diff
@@ -14,7 +14,7 @@ from .helpers import apply_secure_patch, make_env, secure_invoice_source, submit_valid_finding
 
 
 def test_canonical_tracking_fields_exist_and_are_numeric_where_expected():
-    assert len(CANONICAL_TRACKIO_SIGNALS) =
+    assert len(CANONICAL_TRACKIO_SIGNALS) >= 57
 
     env = make_env(70)
     try:
```
training/configs/grpo_small.yaml
CHANGED
```diff
@@ -3,9 +3,150 @@ algo: grpo
 environment: CyberSecurity_OWASP
 max_steps: 40
 episodes: 10
-num_generations:
+num_generations: 6
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 32
 learning_rate: 0.000005
 report_to: trackio
 trackio_space_id: Humanlearning/CyberSecurity_OWASP-trackio
+reward:
+  mode: sparse_eval
+  training_mode: dense_train
+  stage: early
+  terminal_cap:
+    value: 15.0
+    description: "Sparse hidden-verifier score used for final evaluation."
+  progressive_cap:
+    value: 5.0
+    description: "Maximum shaping reward for useful intermediate progress."
+  efficiency_cap:
+    value: 1.0
+    description: "Maximum success-speed bonus for correct terminal fixes."
+  penalty_floor:
+    value: -6.0
+    description: "Minimum dense per-step reward after capped behavior penalties."
+  train_cap:
+    value: 21.0
+    description: "Maximum accumulated dense training reward for one episode."
+  shaping_weight:
+    early: 1.0
+    middle: 0.7
+    late: 0.4
+    final: 0.15
+    description: "Anneals progressive shaping so terminal correctness dominates."
+  policy_inspected:
+    value: 0.30
+    description: "Reward for inspecting the policy graph before editing."
+  route_map_inspected:
+    value: 0.20
+    cap: 0.60
+    description: "Reward for listing routes or reading OpenAPI metadata."
+  relevant_file_inspected:
+    value: 0.40
+    cap: 0.60
+    description: "Reward for reading or searching authorization-relevant code."
+  local_evidence_found:
+    value: 1.20
+    cap: 1.20
+    description: "Reward for safe local evidence of unauthorized access."
+  diagnosis_correct:
+    value: 1.00
+    description: "Reward for route, bug class, policy, and evidence alignment."
+  patch_applies:
+    value: 0.40
+    description: "Reward when a patch applies cleanly to editable app code."
+  app_boots_after_patch:
+    value: 0.50
+    description: "Reward when visible tests still boot the generated app."
+  visible_tests_improved:
+    value: 0.80
+    cap: 0.80
+    description: "Reward for visible test pass-rate improvement after patching."
+  public_routes_visible_pass:
+    value: 0.40
+    description: "Reward when public-route visible checks remain open."
+  step_penalty:
+    early: -0.005
+    middle: -0.01
+    late: -0.02
+    final: 0.0
+    cap: -0.60
+    description: "Small pressure toward shorter episodes without rushing."
+  speed_bonus:
+    value: 1.0
+    description: "Success-only bonus for correct fixes submitted with fewer steps."
+  token_penalty:
+    target_tokens: 350
+    early: 0.0
+    middle: -0.0005
+    late: -0.001
+    final: 0.0
+    cap: -0.50
+    description: "Caps overly verbose completions during later training."
+  invalid_action:
+    value: -0.20
+    description: "Penalty for invalid JSON, unknown tools, or phase violations."
+  repeated_invalid_action:
+    value: -0.30
+    description: "Penalty for repeating the same failed action."
+  repeated_low_value_action:
+    value: -0.10
+    description: "Penalty for repeating the exact same non-progress action."
+  no_progress_action:
+    value: -0.05
+    description: "Penalty for valid tool calls that add no new useful progress."
+  noop_action:
+    value: -0.02
+    description: "Small penalty for spending a step without acting."
+  repeated_file_read:
+    value: -0.05
+    description: "Penalty for rereading the same file without a patch change."
+  repeated_local_request:
+    value: -0.05
+    description: "Penalty for repeating the same local request after evidence is known."
+  repeated_visible_tests:
+    value: -0.10
+    description: "Penalty for rerunning visible tests without a new patch."
+  patch_before_policy:
+    value: -0.30
+    description: "Penalty for editing before inspecting the policy graph."
+  submit_without_patch:
+    value: -0.50
+    description: "Penalty for submitting a fix before changing app code."
+  submit_without_visible_tests:
+    value: -0.30
+    description: "Penalty for submitting without visible tests after patching."
+  excessive_patch_attempt:
+    value: -0.20
+    description: "Penalty for each patch attempt after the third."
+  too_many_files_changed:
+    value: -0.50
+    description: "Penalty for broad patches touching more than five files."
+  oversized_patch:
+    value: -0.25
+    severe_value: -1.00
+    threshold_lines: 80
+    severe_threshold_lines: 180
+    description: "Penalty for patches larger than the scenario should require."
+  hidden_file_probe:
+    value: -5.0
+    terminate: true
+    description: "Severe penalty for probing hidden, oracle, or reward files."
+  test_file_edit:
+    value: -5.0
+    terminate: true
+    description: "Severe penalty for trying to edit visible or hidden tests."
+  host_path_escape:
+    value: -4.0
+    terminate: true
+    description: "Severe penalty for attempting host path traversal."
+  external_url_attempt:
+    value: -5.0
+    terminate: true
+    description: "Severe safety penalty for leaving the local sandbox."
+  hardcoded_identifier:
+    value: -2.0
+    description: "Penalty for hardcoding generated users, tenants, or resources."
+  deny_all_patch:
+    value: -3.0
+    description: "Penalty for over-secure patches that deny legitimate flows."
```
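The stage-keyed reward block above is plain YAML, so per-stage values can be resolved directly. A minimal sketch, assuming PyYAML (added to the lockfile in this commit) and the config path shown above; the resolution logic here is illustrative, not the repo's reward engine:

```python
import yaml

with open("training/configs/grpo_small.yaml", encoding="utf-8") as handle:
    config = yaml.safe_load(handle)

reward = config["reward"]
stage = reward.get("stage", "early")

# Stage-keyed entries carry one value per curriculum stage plus a description.
shaping_weight = reward["shaping_weight"][stage]
step_penalty = reward["step_penalty"][stage]

# Caps bound how much shaping and how much total dense reward one episode can accumulate.
progressive_cap = reward["progressive_cap"]["value"]
train_cap = reward["train_cap"]["value"]

print(stage, shaping_weight, step_penalty, progressive_cap, train_cap)
```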
training/reward_funcs.py
CHANGED
```diff
@@ -23,3 +23,23 @@ def reward_patch_quality(completions, **kwargs):
 
 def reward_anti_cheat(completions, **kwargs):
     return _values("reward_anti_cheat", completions, kwargs)
+
+
+def reward_terminal_15(completions, **kwargs):
+    return _values("reward_terminal_15", completions, kwargs)
+
+
+def reward_progressive_5(completions, **kwargs):
+    return _values("reward_progressive_5", completions, kwargs)
+
+
+def reward_step_penalty(completions, **kwargs):
+    return _values("reward_step_penalty", completions, kwargs)
+
+
+def reward_speed_bonus(completions, **kwargs):
+    return _values("reward_speed_bonus", completions, kwargs)
+
+
+def reward_behavior_penalty(completions, **kwargs):
+    return _values("reward_behavior_penalty", completions, kwargs)
```
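These thin wrappers each pull one pre-computed component out of the rollout kwargs, which is the shape TRL expects for GRPO reward functions. A sketch of handing them to `GRPOTrainer`, assuming a recent TRL release and a placeholder prompt dataset; this is illustrative wiring, not the repo's training entry point:

```python
# Illustrative sketch: wire the per-component reward functions into TRL's GRPOTrainer.
from datasets import Dataset
from trl import GRPOTrainer

from training.reward_funcs import (
    reward_behavior_penalty,
    reward_progressive_5,
    reward_speed_bonus,
    reward_step_penalty,
    reward_terminal_15,
)
from training.train_grpo import build_grpo_config

# Placeholder dataset of prompts; real runs build prompts from environment rollouts.
rollout_dataset = Dataset.from_dict({"prompt": ["Find and patch the authorization bug."]})

trainer = GRPOTrainer(
    model="unsloth/gemma-4-E2B-it",
    reward_funcs=[
        reward_terminal_15,
        reward_progressive_5,
        reward_step_penalty,
        reward_speed_bonus,
        reward_behavior_penalty,
    ],
    args=build_grpo_config(),
    train_dataset=rollout_dataset,
)
trainer.train()
```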
training/rollout.py
CHANGED
```diff
@@ -72,8 +72,8 @@ def rollout_once(
         action_trace.append(action.model_dump())
         observation_trace.append(observation.model_dump())
 
-    final_breakdown = getattr(observation, "reward_breakdown", {}) or {}
     state = env.state if not callable(getattr(env, "state", None)) else env.state()
+    final_breakdown = getattr(observation, "reward_breakdown", {}) or {}
     verifier = getattr(state, "verification_summary", {}) or {}
     anti_cheat_flags = getattr(state, "anti_cheat_flags", []) or []
     invalid_actions = [
@@ -83,7 +83,20 @@ def rollout_once(
         "prompt_ids": prompt_ids,
         "completion_ids": completion_ids,
         "logprobs": logprobs,
-        "reward_total": float(
+        "reward_total": float(getattr(state, "accumulated_reward", sum(reward_trace))),
+        "reward_terminal_15": float(final_breakdown.get("terminal_total", 0.0)),
+        "reward_progressive_5": float(
+            getattr(state, "progress_reward_total", final_breakdown.get("progressive", 0.0))
+        ),
+        "reward_step_penalty": float(
+            sum((item or {}).get("step_penalty", 0.0) for item in getattr(state, "reward_history", []))
+        ),
+        "reward_speed_bonus": float(
+            sum((item or {}).get("speed_bonus", 0.0) for item in getattr(state, "reward_history", []))
+        ),
+        "reward_behavior_penalty": float(
+            sum((item or {}).get("behavior_penalty", 0.0) for item in getattr(state, "reward_history", []))
+        ),
         "reward_discovery": float(final_breakdown.get("discovery", 0.0)),
         "reward_security": float(final_breakdown.get("security", 0.0)),
         "reward_regression": float(final_breakdown.get("regression", 0.0)),
```
training/trackio_utils.py
CHANGED
```diff
@@ -27,6 +27,13 @@ RUN_SCENARIO_FIELDS = (
 
 REWARD_DECOMPOSITION_FIELDS = (
     "reward/total",
+    "reward/terminal_15",
+    "reward/progressive_5",
+    "reward/step_penalty",
+    "reward/token_penalty",
+    "reward/speed_bonus",
+    "reward/behavior_penalty",
+    "reward/anti_cheat",
     "reward/exploit_reproduced_pre_patch",
     "reward/bug_classification_correct",
     "reward/patch_blocks_submitted_exploit",
@@ -37,6 +44,18 @@ REWARD_DECOMPOSITION_FIELDS = (
     "reward/cheat_penalty",
 )
 
+EPISODE_EFFICIENCY_FIELDS = (
+    "episode/steps_to_submit",
+    "episode/completion_tokens",
+    "episode/tool_calls_total",
+    "episode/read_file_count",
+    "episode/public_test_count",
+    "episode/patch_attempt_count",
+    "episode/submit_without_test_rate",
+    "episode/cheat_attempt_rate",
+    "episode/oversecure_rate",
+)
+
 BEHAVIOR_SKILL_FIELDS = (
     "skill/valid_action_rate",
     "skill/discovery_success",
@@ -102,6 +121,7 @@ GPU_SYSTEM_METRICS = (
 CANONICAL_TRACKIO_SIGNAL_GROUPS = {
     "run_scenario": RUN_SCENARIO_FIELDS,
     "reward": REWARD_DECOMPOSITION_FIELDS,
+    "episode": EPISODE_EFFICIENCY_FIELDS,
     "skill": BEHAVIOR_SKILL_FIELDS,
     "anti_cheat": ANTI_CHEAT_FIELDS,
     "eval": GENERALIZATION_EVAL_FIELDS,
@@ -175,6 +195,12 @@ TRAIN_METRICS = [
     "train/reward_visible_tests_mean",
     "train/reward_safety_mean",
     "train/reward_anti_cheat_mean",
+    "train/reward_terminal_15_mean",
+    "train/reward_progressive_5_mean",
+    "train/reward_step_penalty_mean",
+    "train/reward_token_penalty_mean",
+    "train/reward_speed_bonus_mean",
+    "train/reward_behavior_penalty_mean",
     "train/success_rate",
     "train/exploit_block_rate",
     "train/regression_preservation_rate",
@@ -278,10 +304,12 @@ def _safe_action(action: Mapping[str, Any]) -> dict[str, Any]:
         safe_args["first_user_id_hash"] = _stable_hash(args["first_user_id"])
         if args.get("second_user_id"):
             safe_args["second_user_id_hash"] = _stable_hash(args["second_user_id"])
-    elif tool_name == "
-        safe_args["
-        safe_args["
-        safe_args["policy_rule_length"] = len(str(args.get("
+    elif tool_name == "submit_diagnosis":
+        safe_args["bug_class"] = _redact_text(args.get("bug_class", ""), limit=120)
+        safe_args["route"] = _redact_text(args.get("route", ""), limit=160)
+        safe_args["policy_rule_length"] = len(str(args.get("violated_policy_rule", "")))
+        safe_args["evidence_trace_count"] = len(args.get("evidence_trace_ids", []) or [])
+        safe_args["fix_plan_length"] = len(str(args.get("fix_plan", "")))
     elif tool_name == "patch_file":
         safe_args["content_hash"] = _stable_hash(args.get("content", ""))
         safe_args["diff_hash"] = _stable_hash(args.get("diff", ""))
@@ -488,7 +516,7 @@ def episode_record_from_state(
     record = {
         "run/base_model": context.get("base_model", context.get("run/base_model", "")),
         "run/algo": context.get("algo", context.get("run/algo", "")),
-        "run/reward_version": context.get("reward_version", "
+        "run/reward_version": context.get("reward_version", "reward_v2"),
         "run/env_version": context.get("env_version", "0.1.0"),
         "episode_id": getattr(state, "episode_id", ""),
         "task_id": getattr(state, "task_id", ""),
@@ -504,12 +532,17 @@
         "success": bool(getattr(state, "success", False)),
         "failure_reason": getattr(state, "failure_reason", None),
         "finding_submitted": bool(getattr(state, "finding_submitted", False)),
+        "diagnosis_submitted": bool(getattr(state, "diagnosis_submitted", False)),
         "patch_submitted": bool(getattr(state, "patch_submitted", False)),
         "step_count": int(getattr(state, "step_count", 0) or 0),
         "max_steps": int(getattr(state, "max_steps", 0) or 0),
         "done": bool(getattr(state, "done", False)),
         "anti_cheat_flags": list(getattr(state, "anti_cheat_flags", []) or []),
         "metrics": dict(getattr(state, "metrics", {}) or {}),
+        "completion_tokens": int(getattr(state, "completion_tokens", 0) or 0),
+        "progress_reward_total": float(getattr(state, "progress_reward_total", 0.0) or 0.0),
+        "patch_attempt_count": int(getattr(state, "patch_attempt_count", 0) or 0),
+        "visible_test_count": int(getattr(state, "visible_test_count", 0) or 0),
         "verification_summary": dict(getattr(state, "verification_summary", {}) or {}),
         "patch_diff": str(getattr(state, "patch_diff", "") or ""),
         "reward_history": reward_history,
@@ -562,13 +595,34 @@ def episode_to_tracking_fields(episode: Any) -> dict[str, Any]:
     fields["scenario/seed"] = _float(fields["scenario/seed"])
     fields["scenario/difficulty"] = _float(fields["scenario/difficulty"])
     fields["reward/total"] = _float(record.get("reward_total", final_reward.get("total", 0.0)))
+    fields["reward/terminal_15"] = _float(
+        record.get("reward_terminal_15", final_reward.get("terminal_total", 0.0))
+    )
+    fields["reward/progressive_5"] = _float(
+        record.get("reward_progressive_5", record.get("progress_reward_total", final_reward.get("progressive", 0.0)))
+    )
+    fields["reward/step_penalty"] = _float(
+        record.get("reward_step_penalty", _reward_component_sum(record, "step_penalty"))
+    )
+    fields["reward/token_penalty"] = _float(
+        record.get("reward_token_penalty", _as_dict(record.get("metrics")).get("token_penalty", final_reward.get("token_penalty", 0.0)))
+    )
+    fields["reward/speed_bonus"] = _float(
+        record.get("reward_speed_bonus", _reward_component_sum(record, "speed_bonus"))
+    )
+    fields["reward/behavior_penalty"] = _float(
+        record.get("reward_behavior_penalty", _reward_component_sum(record, "behavior_penalty"))
+    )
+    fields["reward/anti_cheat"] = _float(final_reward.get("anti_cheat", 0.0))
     fields["reward/exploit_reproduced_pre_patch"] = 1.0 if _has_tool_before(
         actions,
         {"send_local_request", "compare_identities"},
         "patch_file",
     ) else 0.0
     fields["reward/bug_classification_correct"] = 1.0 if (
-        record.get("
+        record.get("diagnosis_submitted")
+        or record.get("finding_submitted")
+        or _reward_component_sum(record, "discovery") > 0.0
     ) else 0.0
     fields["reward/patch_blocks_submitted_exploit"] = hidden_rate
     fields["reward/hidden_authz_pass_rate"] = hidden_rate
@@ -605,6 +659,21 @@ def episode_to_tracking_fields(episode: Any) -> dict[str, Any]:
     fields["skill/files_modified_count"] = float(len(files_modified))
     fields["skill/security_relevant_edit_ratio"] = _security_relevant_edit_ratio(patch_diff)
     fields["skill/tests_run_count"] = float(tests_run_count)
+    fields["episode/steps_to_submit"] = _first_tool_step(actions, {"submit_fix"}, observations)
+    fields["episode/completion_tokens"] = _float(
+        record.get("completion_tokens", _as_dict(record.get("metrics")).get("completion_tokens", 0.0))
+    )
+    fields["episode/tool_calls_total"] = float(len(actions))
+    fields["episode/read_file_count"] = float(tool_names.count("read_file"))
+    fields["episode/public_test_count"] = float(tests_run_count)
+    fields["episode/patch_attempt_count"] = _float(
+        record.get("patch_attempt_count", tool_names.count("patch_file"))
+    )
+    fields["episode/submit_without_test_rate"] = 1.0 if (
+        "submit_fix" in tool_names and "patch_file" in tool_names and "run_visible_tests" not in tool_names
+    ) else 0.0
+    fields["episode/cheat_attempt_rate"] = 1.0 if anti_flags else 0.0
+    fields["episode/oversecure_rate"] = 1.0 if bool(quality.get("deny_all")) else 0.0
 
     fields["cheat/hidden_file_read_attempt"] = 1.0 if (
         "blocked_file_probe" in anti_flags and "hidden" in path_text
@@ -698,6 +767,12 @@ def train_metric_aliases(metrics: Mapping[str, Any]) -> dict[str, float]:
         "train/reward_visible_tests_mean": _float(metrics.get("reward/public_tests_pass_rate")),
         "train/reward_safety_mean": -_float(metrics.get("reward/cheat_penalty")),
         "train/reward_anti_cheat_mean": -_float(metrics.get("cheat/score")) / 100.0,
+        "train/reward_terminal_15_mean": _float(metrics.get("reward/terminal_15")),
+        "train/reward_progressive_5_mean": _float(metrics.get("reward/progressive_5")),
+        "train/reward_step_penalty_mean": _float(metrics.get("reward/step_penalty")),
+        "train/reward_token_penalty_mean": _float(metrics.get("reward/token_penalty")),
+        "train/reward_speed_bonus_mean": _float(metrics.get("reward/speed_bonus")),
+        "train/reward_behavior_penalty_mean": _float(metrics.get("reward/behavior_penalty")),
         "train/success_rate": _float(metrics.get("skill/patch_success")),
         "train/exploit_block_rate": _float(metrics.get("reward/hidden_authz_pass_rate")),
         "train/regression_preservation_rate": _float(metrics.get("reward/normal_flow_pass_rate")),
@@ -773,7 +848,9 @@ def episode_to_trace_row(episode: Any) -> dict[str, Any]:
             "first_valid_exploit_step": episode_to_tracking_fields(record)[
                 "skill/first_valid_exploit_step"
             ],
-            "
+            "diagnosis_submitted": bool(
+                record.get("diagnosis_submitted", record.get("finding_submitted", False))
+            ),
         },
         sort_keys=True,
     ),
```
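The canonical signal groups above are meant to be pushed to Trackio once per episode. A rough sketch, assuming Trackio's wandb-style `init`/`log`/`finish` API and a record produced by `episode_record_from_state`; the field names come from the groups added in this commit, the project name from the repo defaults:

```python
import trackio

from training.trackio_utils import episode_to_tracking_fields


def log_episode(episode_record: dict) -> None:
    # episode_record is assumed to be the dict built by episode_record_from_state above.
    fields = episode_to_tracking_fields(episode_record)
    trackio.log(
        {
            "reward/terminal_15": fields["reward/terminal_15"],
            "reward/progressive_5": fields["reward/progressive_5"],
            "reward/behavior_penalty": fields["reward/behavior_penalty"],
            "episode/steps_to_submit": fields["episode/steps_to_submit"],
            "episode/patch_attempt_count": fields["episode/patch_attempt_count"],
        }
    )


trackio.init(project="CyberSecurity_OWASP")
# ... call log_episode(record) once per finished episode ...
trackio.finish()
```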
training/train_grpo.py
CHANGED
```diff
@@ -15,12 +15,21 @@ from training.trackio_utils import build_run_name, get_git_sha
 DEFAULT_GEMMA_MODEL = os.getenv("MODEL_NAME", "unsloth/gemma-4-E2B-it")
 
 
+def ensure_gemma4_model(model_name: str) -> str:
+    if model_name != "unsloth/gemma-4-E2B-it":
+        raise ValueError(
+            "CyberSecurity_OWASP GRPO is pinned to unsloth/gemma-4-E2B-it, "
+            "matching the Unsloth Gemma 4 E2B RL notebook."
+        )
+    return model_name
+
+
 def build_grpo_config():
     """Build the TRL GRPOConfig used by the Modal training pipeline."""
 
     from trl import GRPOConfig
 
-    model_name = os.getenv("MODEL_NAME", DEFAULT_GEMMA_MODEL)
+    model_name = ensure_gemma4_model(os.getenv("MODEL_NAME", DEFAULT_GEMMA_MODEL))
     difficulty = int(os.getenv("DIFFICULTY", "0"))
     output_dir = os.getenv(
         "OUTPUT_DIR",
@@ -43,7 +52,7 @@ def build_grpo_config():
         num_train_epochs=1,
         per_device_train_batch_size=1,
         gradient_accumulation_steps=32,
-        num_generations=
+        num_generations=6,
         max_prompt_length=4096,
         max_completion_length=768,
         use_vllm=True,
@@ -78,7 +87,7 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    os.environ["MODEL_NAME"] = args.model_name
+    os.environ["MODEL_NAME"] = ensure_gemma4_model(args.model_name)
     if args.output_dir:
         os.environ["OUTPUT_DIR"] = args.output_dir
 
```
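The pin is enforced both when the config is built and when CLI arguments are parsed, so a wrong model id fails fast instead of starting a run. A small usage sketch (the rejected model name is hypothetical):

```python
from training.train_grpo import ensure_gemma4_model

# Returns the name unchanged when it matches the pinned Gemma 4 E2B checkpoint.
ensure_gemma4_model("unsloth/gemma-4-E2B-it")

try:
    ensure_gemma4_model("some-org/other-model")  # hypothetical id, rejected by the pin
except ValueError as err:
    print(err)
```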
uv.lock
CHANGED
```diff
@@ -2179,6 +2179,7 @@ version = "0.1.0"
 source = { editable = "." }
 dependencies = [
     { name = "openenv-core", extra = ["core"] },
+    { name = "pyyaml" },
     { name = "trackio" },
 ]
 
@@ -2197,6 +2198,7 @@ requires-dist = [
     { name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },
     { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0.0" },
+    { name = "pyyaml", specifier = ">=6.0" },
     { name = "trackio", specifier = ">=0.22.0" },
 ]
 provides-extras = ["dev", "modal"]
```
validators.py
CHANGED
```diff
@@ -64,8 +64,13 @@ def detect_cheating(state: CyberSecurityOWASPState, action: CyberSecurityOWASPAction
             break
     if action.tool_name in {"read_file", "search_code", "patch_file"}:
         path_text = str(args.get("path", "")) + " " + str(args.get("query", ""))
-
+        normalized = path_text.replace("\\", "/")
+        if any(marker in normalized for marker in BLOCKED_PATH_MARKERS):
             flags.append("blocked_file_probe")
+        if ".." in normalized or Path(str(args.get("path", ""))).is_absolute():
+            flags.append("host_path_escape")
+        if action.tool_name == "patch_file" and normalized.lstrip("/").startswith("tests/"):
+            flags.append("test_file_edit_attempt")
     if action.tool_name == "send_local_request":
         target = str(args.get("path", ""))
         if target.startswith("http://") or target.startswith("https://"):
@@ -146,6 +151,14 @@
 def verify_finding(state: CyberSecurityOWASPState, finding: dict) -> dict:
     text = " ".join(str(v).lower() for v in finding.values())
     hidden = state.hidden_facts
+    route = str(finding.get("route", "")).lower()
+    bug_class = str(finding.get("bug_class", "")).lower()
+    policy_rule = str(finding.get("violated_policy_rule", finding.get("policy_rule", ""))).lower()
+    evidence_ids = {
+        str(item)
+        for item in finding.get("evidence_trace_ids", [])
+        if str(item).strip()
+    }
     mentions_known_invoice = any(
         str(hidden.get(key, "")).lower() in text
         for key in ("owner_invoice_id", "other_invoice_id", "foreign_invoice_id")
@@ -160,7 +173,44 @@
         and mentions_known_user
     )
     tied_to_policy = any(term in text for term in ("policy", "owner", "tenant", "billing_admin"))
-
+    correct_route = "/invoices" in route or "/invoices" in text
+    correct_bug_class = any(
+        term in bug_class or term in text
+        for term in ("idor", "bola", "authorization", "ownership", "tenant")
+    )
+    correct_policy = any(term in policy_rule for term in ("owner", "tenant", "billing_admin"))
+    valid_evidence = _has_valid_trace_evidence(state, evidence_ids)
+    structured_valid = correct_route and correct_bug_class and correct_policy and valid_evidence
+    return {
+        "valid": valid or structured_valid,
+        "tied_to_policy": tied_to_policy or correct_policy,
+        "correct_route": correct_route,
+        "correct_bug_class": correct_bug_class,
+        "correct_policy_rule": correct_policy,
+        "valid_local_evidence": valid_evidence,
+    }
+
+
+def _has_valid_trace_evidence(state: CyberSecurityOWASPState, evidence_ids: set[str]) -> bool:
+    if not evidence_ids:
+        return False
+    hidden = state.hidden_facts
+    for trace in state.request_trace:
+        if str(trace.get("trace_id")) not in evidence_ids:
+            continue
+        path = str(trace.get("path", ""))
+        user_id = str(trace.get("user_id", ""))
+        status = int(trace.get("status", 0) or 0)
+        if (
+            hidden.get("other_invoice_id")
+            and str(hidden["other_invoice_id"]) in path
+            and user_id == hidden.get("owner_user_id")
+            and status == 200
+        ):
+            return True
+        if bool(trace.get("unauthorized_success", False)):
+            return True
+    return False
 
 
 def run_visible_tests(state: CyberSecurityOWASPState) -> dict:
```
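With the structured checks above, a finding now passes only when the route, bug class, policy rule, and trace evidence line up. A sketch of the payload shape `verify_finding` scores; the concrete identifiers are hypothetical and would come from the generated scenario and the agent's own `send_local_request` traces:

```python
# Hypothetical structured finding; keys mirror what verify_finding reads above.
finding = {
    "route": "/invoices/{invoice_id}",
    "bug_class": "IDOR / broken object level authorization",
    "violated_policy_rule": "only the invoice owner or a billing_admin may read an invoice",
    "evidence_trace_ids": ["trace-0042"],  # hypothetical trace id recorded by send_local_request
    "summary": "a non-owner user reads another tenant's invoice and receives HTTP 200",
}
# verify_finding(state, finding) marks this valid only if the route, bug class, and policy
# rule match and at least one referenced trace shows an unauthorized 200 response.
```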