Spaces:
Sleeping
Sleeping
Commit ·
3807ea3
1
Parent(s): 06bfd31
feat: implement core RL training infrastructure, including GRPO training, evaluation utilities, custom environments, and Modal-based execution scripts.
Browse files- README.md +98 -189
- __init__.py +15 -9
- bug_mutator.py +17 -0
- client.py +21 -81
- evals.py +63 -0
- fixture_generator.py +17 -0
- models.py +72 -18
- policy_graph.py +105 -0
- pyproject.toml +4 -1
- rewards.py +66 -0
- safety.py +17 -0
- scenario_compiler.py +46 -0
- scripts/docker_build.sh +3 -0
- scripts/docker_run.sh +3 -0
- scripts/generate_scenarios.sh +3 -0
- scripts/modal_ephemeral_train.py +163 -0
- scripts/modal_run_ephemeral.sh +3 -0
- scripts/push_space.sh +3 -0
- scripts/run_local.sh +3 -0
- scripts/smoke_test.sh +3 -0
- server/CyberSecurity_OWASP_environment.py +344 -82
- server/app.py +6 -28
- server/reward_engine.py +49 -0
- template_renderer.py +97 -0
- tests/__init__.py +1 -0
- tests/helpers.py +51 -0
- tests/test_anti_cheat.py +16 -0
- tests/test_invalid_actions.py +48 -0
- tests/test_models.py +14 -0
- tests/test_reset_step_state.py +25 -0
- tests/test_rewards.py +67 -0
- tests/test_rollouts.py +29 -0
- tests/test_seed_reproducibility.py +10 -0
- training/configs/grpo_small.yaml +9 -0
- training/eval_before_after.py +29 -0
- training/reward_funcs.py +25 -0
- training/rollout.py +84 -0
- training/trackio_utils.py +40 -0
- training/train_grpo.py +46 -0
- uv.lock +0 -0
- validators.py +224 -0
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: gray
|
| 6 |
sdk: docker
|
|
@@ -9,247 +9,156 @@ app_port: 8000
|
|
| 9 |
base_path: /web
|
| 10 |
tags:
|
| 11 |
- openenv
|
|
|
|
|
|
|
| 12 |
---
|
| 13 |
|
| 14 |
-
#
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
The simplest way to use the Cybersecurity Owasp environment is through the `CybersecurityOwaspEnv` class:
|
| 21 |
-
|
| 22 |
-
```python
|
| 23 |
-
from CyberSecurity_OWASP import CybersecurityOwaspAction, CybersecurityOwaspEnv
|
| 24 |
-
|
| 25 |
-
try:
|
| 26 |
-
# Create environment from Docker image
|
| 27 |
-
CyberSecurity_OWASPenv = CybersecurityOwaspEnv.from_docker_image("CyberSecurity_OWASP-env:latest")
|
| 28 |
-
|
| 29 |
-
# Reset
|
| 30 |
-
result = CyberSecurity_OWASPenv.reset()
|
| 31 |
-
print(f"Reset: {result.observation.echoed_message}")
|
| 32 |
-
|
| 33 |
-
# Send multiple messages
|
| 34 |
-
messages = ["Hello, World!", "Testing echo", "Final message"]
|
| 35 |
-
|
| 36 |
-
for msg in messages:
|
| 37 |
-
result = CyberSecurity_OWASPenv.step(CybersecurityOwaspAction(message=msg))
|
| 38 |
-
print(f"Sent: '{msg}'")
|
| 39 |
-
print(f" → Echoed: '{result.observation.echoed_message}'")
|
| 40 |
-
print(f" → Length: {result.observation.message_length}")
|
| 41 |
-
print(f" → Reward: {result.reward}")
|
| 42 |
-
|
| 43 |
-
finally:
|
| 44 |
-
# Always clean up
|
| 45 |
-
CyberSecurity_OWASPenv.close()
|
| 46 |
```
|
| 47 |
|
| 48 |
-
|
| 49 |
-
- Starting the Docker container
|
| 50 |
-
- Waiting for the server to be ready
|
| 51 |
-
- Connecting to the environment
|
| 52 |
-
- Container cleanup when you call `close()`
|
| 53 |
-
|
| 54 |
-
## Building the Docker Image
|
| 55 |
|
| 56 |
-
|
| 57 |
|
| 58 |
```bash
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
```
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
|
| 66 |
|
| 67 |
-
```
|
| 68 |
-
|
| 69 |
-
openenv push
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
| 73 |
```
|
| 74 |
|
| 75 |
-
|
| 76 |
-
1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
|
| 77 |
-
2. Prepare a custom build for Hugging Face Docker space (enables web interface)
|
| 78 |
-
3. Upload to Hugging Face (ensuring you're logged in)
|
| 79 |
-
|
| 80 |
-
### Prerequisites
|
| 81 |
|
| 82 |
-
|
| 83 |
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
|
| 86 |
-
|
| 87 |
-
- `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
|
| 88 |
-
- `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
|
| 89 |
-
- `--private`: Deploy the space as private (default: public)
|
| 90 |
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
-
|
| 94 |
-
# Push to your personal namespace (defaults to username/env-name from openenv.yaml)
|
| 95 |
-
openenv push
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
|
|
|
| 99 |
|
| 100 |
-
#
|
| 101 |
-
openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
|
| 102 |
|
| 103 |
-
|
| 104 |
-
openenv push --private
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
```
|
| 109 |
|
| 110 |
-
|
| 111 |
-
`https://huggingface.co/spaces/<repo-id>`
|
| 112 |
-
|
| 113 |
-
The deployed space includes:
|
| 114 |
-
- **Web Interface** at `/web` - Interactive UI for exploring the environment
|
| 115 |
-
- **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
|
| 116 |
-
- **Health Check** at `/health` - Container health monitoring
|
| 117 |
-
- **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
|
| 118 |
|
| 119 |
-
##
|
| 120 |
|
| 121 |
-
|
| 122 |
-
**CybersecurityOwaspAction**: Contains a single field
|
| 123 |
-
- `message` (str) - The message to echo back
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
-
|
| 128 |
-
-
|
| 129 |
-
-
|
| 130 |
-
- `done` (bool) - Always False for echo environment
|
| 131 |
-
- `metadata` (dict) - Additional info like step count
|
| 132 |
|
| 133 |
-
|
| 134 |
-
The reward is calculated as: `message_length × 0.1`
|
| 135 |
-
- "Hi" → reward: 0.2
|
| 136 |
-
- "Hello, World!" → reward: 1.3
|
| 137 |
-
- Empty message → reward: 0.0
|
| 138 |
|
| 139 |
-
##
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
If you already have a Cybersecurity Owasp environment server running, you can connect directly:
|
| 144 |
-
|
| 145 |
-
```python
|
| 146 |
-
from CyberSecurity_OWASP import CybersecurityOwaspEnv
|
| 147 |
-
|
| 148 |
-
# Connect to existing server
|
| 149 |
-
CyberSecurity_OWASPenv = CybersecurityOwaspEnv(base_url="<ENV_HTTP_URL_HERE>")
|
| 150 |
-
|
| 151 |
-
# Use as normal
|
| 152 |
-
result = CyberSecurity_OWASPenv.reset()
|
| 153 |
-
result = CyberSecurity_OWASPenv.step(CybersecurityOwaspAction(message="Hello!"))
|
| 154 |
```
|
| 155 |
|
| 156 |
-
|
| 157 |
|
| 158 |
-
##
|
| 159 |
|
| 160 |
-
|
| 161 |
|
| 162 |
-
``
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
print(f"Reset: {result.observation.echoed_message}")
|
| 169 |
-
# Multiple steps with low latency
|
| 170 |
-
for msg in ["Hello", "World", "!"]:
|
| 171 |
-
result = env.step(CybersecurityOwaspAction(message=msg))
|
| 172 |
-
print(f"Echoed: {result.observation.echoed_message}")
|
| 173 |
-
```
|
| 174 |
|
| 175 |
-
The
|
| 176 |
-
- **Lower latency**: No HTTP connection overhead per request
|
| 177 |
-
- **Persistent session**: Server maintains your environment state
|
| 178 |
-
- **Efficient for episodes**: Better for many sequential steps
|
| 179 |
|
| 180 |
-
##
|
| 181 |
|
| 182 |
-
|
| 183 |
-
modify `server/app.py` to use factory mode:
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
CybersecurityOwaspAction,
|
| 190 |
-
CybersecurityOwaspObservation,
|
| 191 |
-
max_concurrent_envs=4, # Allow 4 concurrent sessions
|
| 192 |
-
)
|
| 193 |
```
|
| 194 |
|
| 195 |
-
|
| 196 |
|
| 197 |
-
```
|
| 198 |
-
|
| 199 |
-
from concurrent.futures import ThreadPoolExecutor
|
| 200 |
-
|
| 201 |
-
def run_episode(client_id: int):
|
| 202 |
-
with CybersecurityOwaspEnv(base_url="http://localhost:8000") as env:
|
| 203 |
-
result = env.reset()
|
| 204 |
-
for i in range(10):
|
| 205 |
-
result = env.step(CybersecurityOwaspAction(message=f"Client {client_id}, step {i}"))
|
| 206 |
-
return client_id, result.observation.message_length
|
| 207 |
-
|
| 208 |
-
# Run 4 episodes concurrently
|
| 209 |
-
with ThreadPoolExecutor(max_workers=4) as executor:
|
| 210 |
-
results = list(executor.map(run_episode, range(4)))
|
| 211 |
```
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
### Direct Environment Testing
|
| 216 |
|
| 217 |
-
|
| 218 |
|
| 219 |
```bash
|
| 220 |
-
|
| 221 |
-
python3 server/CyberSecurity_OWASP_environment.py
|
| 222 |
```
|
| 223 |
|
| 224 |
-
|
| 225 |
-
- Environment resets correctly
|
| 226 |
-
- Step executes actions properly
|
| 227 |
-
- State tracking works
|
| 228 |
-
- Rewards are calculated correctly
|
| 229 |
-
|
| 230 |
-
### Running Locally
|
| 231 |
-
|
| 232 |
-
Run the server locally for development:
|
| 233 |
|
| 234 |
```bash
|
| 235 |
-
|
| 236 |
```
|
| 237 |
|
| 238 |
-
##
|
| 239 |
|
| 240 |
-
```
|
| 241 |
-
CyberSecurity_OWASP/
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
├── README.md # This file
|
| 245 |
-
├── openenv.yaml # OpenEnv manifest
|
| 246 |
-
├── pyproject.toml # Project metadata and dependencies
|
| 247 |
-
├── uv.lock # Locked dependencies (generated)
|
| 248 |
-
├── client.py # CybersecurityOwaspEnv client
|
| 249 |
-
├── models.py # Action and Observation models
|
| 250 |
-
└── server/
|
| 251 |
-
├── __init__.py # Server module exports
|
| 252 |
-
├── CyberSecurity_OWASP_environment.py # Core environment logic
|
| 253 |
-
├── app.py # FastAPI application (HTTP + WebSocket endpoints)
|
| 254 |
-
└── Dockerfile # Container image definition
|
| 255 |
```
|
|
|
|
| 1 |
---
|
| 2 |
+
title: CyberSecurity_OWASP Environment Server
|
| 3 |
+
emoji: 🛡️
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: gray
|
| 6 |
sdk: docker
|
|
|
|
| 9 |
base_path: /web
|
| 10 |
tags:
|
| 11 |
- openenv
|
| 12 |
+
- cybersecurity
|
| 13 |
+
- owasp
|
| 14 |
---
|
| 15 |
|
| 16 |
+
# CyberSecurity_OWASP
|
| 17 |
|
| 18 |
+
`CyberSecurity_OWASP` is an OpenEnv-compliant reinforcement-learning environment for a single LLM agent that performs a defensive authorization-repair workflow:
|
| 19 |
|
| 20 |
+
```text
|
| 21 |
+
inspect generated app + policy -> discover authorization bug -> submit finding -> patch code -> preserve intended behavior
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
```
|
| 23 |
|
| 24 |
+
The current implementation includes a functional MVP scenario: an invoices FastAPI-style app with one injected OWASP A01 BOLA/IDOR defect, visible tests, hidden deterministic verifier checks, anti-cheat safeguards, and a decomposed reward.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
## Quick Start
|
| 27 |
|
| 28 |
```bash
|
| 29 |
+
uv sync --extra dev
|
| 30 |
+
uv run --extra dev pytest
|
| 31 |
+
uv run server --port 8000
|
| 32 |
```
|
| 33 |
|
| 34 |
+
Then connect with the OpenEnv client:
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
```python
|
| 37 |
+
from CyberSecurity_OWASP import CyberSecurityOWASPAction, CyberSecurityOWASPEnv
|
|
|
|
| 38 |
|
| 39 |
+
with CyberSecurityOWASPEnv(base_url="http://localhost:8000") as env:
|
| 40 |
+
result = env.reset(seed=7)
|
| 41 |
+
print(result.observation.task_brief)
|
| 42 |
+
result = env.step(CyberSecurityOWASPAction(tool_name="list_routes"))
|
| 43 |
+
print(result.observation.last_tool_result)
|
| 44 |
```
|
| 45 |
|
| 46 |
+
## Action Space
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
+
The agent emits one JSON action at a time:
|
| 49 |
|
| 50 |
+
```json
|
| 51 |
+
{"tool_name":"read_file","arguments":{"path":"app/routes/invoices.py"}}
|
| 52 |
+
```
|
| 53 |
|
| 54 |
+
Supported tools:
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
- `inspect_policy_graph`
|
| 57 |
+
- `list_routes`
|
| 58 |
+
- `read_openapi`
|
| 59 |
+
- `read_file`
|
| 60 |
+
- `search_code`
|
| 61 |
+
- `send_local_request`
|
| 62 |
+
- `compare_identities`
|
| 63 |
+
- `submit_finding`
|
| 64 |
+
- `patch_file`
|
| 65 |
+
- `run_visible_tests`
|
| 66 |
+
- `submit_fix`
|
| 67 |
+
- `noop`
|
| 68 |
|
| 69 |
+
Tools are phase-gated:
|
|
|
|
|
|
|
| 70 |
|
| 71 |
+
- `discover`: inspect policy/routes/files, run safe local requests, compare identities, submit finding.
|
| 72 |
+
- `patch`: read/search, patch editable app files, run visible tests, submit final fix.
|
| 73 |
+
- `done`: stable terminal observation only.
|
| 74 |
|
| 75 |
+
## Reward
|
|
|
|
| 76 |
|
| 77 |
+
Terminal reward uses stable components:
|
|
|
|
| 78 |
|
| 79 |
+
```python
|
| 80 |
+
{
|
| 81 |
+
"discovery": 0.0,
|
| 82 |
+
"security": 0.0,
|
| 83 |
+
"regression": 0.0,
|
| 84 |
+
"public_routes": 0.0,
|
| 85 |
+
"patch_quality": 0.0,
|
| 86 |
+
"visible_tests": 0.0,
|
| 87 |
+
"safety": 0.0,
|
| 88 |
+
"anti_cheat": 0.0,
|
| 89 |
+
"total": 0.0,
|
| 90 |
+
}
|
| 91 |
```
|
| 92 |
|
| 93 |
+
The verifier rewards blocking the hidden exploit while preserving legitimate owner/admin behavior and intentionally public routes. It penalizes deny-all fixes, hardcoded IDs, hidden file probes, external URL attempts, and test/fixture tampering.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
+
## Scenario Generation
|
| 96 |
|
| 97 |
+
`reset(seed)` compiles a fresh isolated workspace under a temp directory. The MVP compiler generates:
|
|
|
|
|
|
|
| 98 |
|
| 99 |
+
- invoices domain policy graph;
|
| 100 |
+
- randomized users, tenants, invoices, and IDs;
|
| 101 |
+
- generated app files under `app/`;
|
| 102 |
+
- visible tests under `tests/test_visible.py`;
|
| 103 |
+
- hidden facts kept only in state for deterministic verification.
|
|
|
|
|
|
|
| 104 |
|
| 105 |
+
Additional domains and bug families are scaffolded for extension.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
+
## Testing
|
| 108 |
|
| 109 |
+
```bash
|
| 110 |
+
uv run --extra dev pytest
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
```
|
| 112 |
|
| 113 |
+
The suite covers model serialization, reset/step/state behavior, seed reproducibility, invalid actions, reward outcomes, anti-cheat checks, and scripted rollout policies.
|
| 114 |
|
| 115 |
+
## Training Scaffold
|
| 116 |
|
| 117 |
+
Training files are under `training/`:
|
| 118 |
|
| 119 |
+
- `rollout.py`
|
| 120 |
+
- `reward_funcs.py`
|
| 121 |
+
- `train_grpo.py`
|
| 122 |
+
- `eval_before_after.py`
|
| 123 |
+
- `trackio_utils.py`
|
| 124 |
+
- `configs/grpo_small.yaml`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
+
The training scaffold is intentionally minimal until the environment/verifier behavior is stable. Trackio metric names and GRPO defaults follow the project brief.
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
+
## Modal Ephemeral Runs
|
| 129 |
|
| 130 |
+
Modal Labs support is kept in a separate launcher script so the local OpenEnv server and core training scaffold stay unchanged.
|
|
|
|
| 131 |
|
| 132 |
+
Install the optional local Modal client:
|
| 133 |
+
|
| 134 |
+
```bash
|
| 135 |
+
uv sync --extra modal
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
```
|
| 137 |
|
| 138 |
+
Run a temporary Modal app for a cheap environment/training smoke check:
|
| 139 |
|
| 140 |
+
```bash
|
| 141 |
+
uv run --extra modal modal run scripts/modal_ephemeral_train.py --mode smoke --episodes 4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
```
|
| 143 |
|
| 144 |
+
The app is ephemeral: Modal starts it for the command and stops it when the command exits. The remote result is written locally under `outputs/rollouts/`.
|
|
|
|
|
|
|
| 145 |
|
| 146 |
+
You can also validate the GRPO config construction remotely:
|
| 147 |
|
| 148 |
```bash
|
| 149 |
+
uv run --extra modal modal run scripts/modal_ephemeral_train.py --mode grpo-config
|
|
|
|
| 150 |
```
|
| 151 |
|
| 152 |
+
The shell wrapper is equivalent:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
```bash
|
| 155 |
+
MODE=smoke EPISODES=4 uv run --extra modal bash scripts/modal_run_ephemeral.sh
|
| 156 |
```
|
| 157 |
|
| 158 |
+
## Docker / Spaces
|
| 159 |
|
| 160 |
+
```bash
|
| 161 |
+
docker build -t cybersecurity_owasp:latest -f server/Dockerfile .
|
| 162 |
+
docker run --rm -p 8000:8000 cybersecurity_owasp:latest
|
| 163 |
+
openenv push --repo-id <username>/CyberSecurity_OWASP
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
```
|
__init__.py
CHANGED
|
@@ -1,16 +1,22 @@
|
|
| 1 |
-
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
__all__ = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
"CybersecurityOwaspAction",
|
| 14 |
"CybersecurityOwaspObservation",
|
|
|
|
| 15 |
"CybersecurityOwaspEnv",
|
| 16 |
]
|
|
|
|
| 1 |
+
"""CyberSecurity_OWASP OpenEnv package."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from .client import CyberSecurityOWASPEnv, CybersecurityOwaspEnv
|
| 4 |
+
from .models import (
|
| 5 |
+
CyberSecurityOWASPAction,
|
| 6 |
+
CyberSecurityOWASPObservation,
|
| 7 |
+
CyberSecurityOWASPState,
|
| 8 |
+
CybersecurityOwaspAction,
|
| 9 |
+
CybersecurityOwaspObservation,
|
| 10 |
+
CybersecurityOwaspState,
|
| 11 |
+
)
|
| 12 |
|
| 13 |
__all__ = [
|
| 14 |
+
"CyberSecurityOWASPAction",
|
| 15 |
+
"CyberSecurityOWASPObservation",
|
| 16 |
+
"CyberSecurityOWASPState",
|
| 17 |
+
"CyberSecurityOWASPEnv",
|
| 18 |
"CybersecurityOwaspAction",
|
| 19 |
"CybersecurityOwaspObservation",
|
| 20 |
+
"CybersecurityOwaspState",
|
| 21 |
"CybersecurityOwaspEnv",
|
| 22 |
]
|
bug_mutator.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Bug-family metadata for generated authorization defects."""
|
| 2 |
+
|
| 3 |
+
# Registry of authorization bug families keyed by a short identifier.
# Only "bola_idor" carries a defect/repair description; the remaining entries
# are placeholders marked with status "scaffolded".
BUG_FAMILIES = {
    "bola_idor": {
        "name": "BOLA/IDOR",
        "defect": "Invoice lookup returns any invoice to any authenticated user.",
        "repair": "Require same tenant and either owner or billing_admin.",
    },
    "bfla": {"name": "BFLA", "status": "scaffolded"},
    "tenant_leak": {"name": "Tenant leak", "status": "scaffolded"},
    "jwt_claim_trust": {"name": "JWT claim trust", "status": "scaffolded"},
    "public_route_trap": {"name": "Public route trap", "status": "scaffolded"},
}


def describe_bug_family(name: str) -> dict:
    """Return the metadata dict for the bug family registered under *name*.

    Args:
        name: Key into ``BUG_FAMILIES`` (for example ``"bola_idor"``).

    Returns:
        A shallow copy of the registered metadata, or
        ``{"name": name, "status": "unknown"}`` when *name* is not registered.
    """
    family = BUG_FAMILIES.get(name)
    if family is None:
        return {"name": name, "status": "unknown"}
    # Copy so callers that tweak the result cannot corrupt the shared registry.
    return dict(family)
|
client.py
CHANGED
|
@@ -1,99 +1,39 @@
|
|
| 1 |
-
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
|
| 8 |
|
| 9 |
-
from typing import
|
| 10 |
|
| 11 |
from openenv.core import EnvClient
|
| 12 |
from openenv.core.client_types import StepResult
|
| 13 |
-
from openenv.core.env_server.types import State
|
| 14 |
|
| 15 |
-
from .models import
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
-
class
|
| 19 |
-
EnvClient[
|
| 20 |
):
|
| 21 |
-
"""
|
| 22 |
-
Client for the Cybersecurity Owasp Environment.
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
Each client instance has its own dedicated environment session on the server.
|
| 27 |
|
| 28 |
-
|
| 29 |
-
>>> # Connect to a running server
|
| 30 |
-
>>> with CybersecurityOwaspEnv(base_url="http://localhost:8000") as client:
|
| 31 |
-
... result = client.reset()
|
| 32 |
-
... print(result.observation.echoed_message)
|
| 33 |
-
...
|
| 34 |
-
... result = client.step(CybersecurityOwaspAction(message="Hello!"))
|
| 35 |
-
... print(result.observation.echoed_message)
|
| 36 |
-
|
| 37 |
-
Example with Docker:
|
| 38 |
-
>>> # Automatically start container and connect
|
| 39 |
-
>>> client = CybersecurityOwaspEnv.from_docker_image("CyberSecurity_OWASP-env:latest")
|
| 40 |
-
>>> try:
|
| 41 |
-
... result = client.reset()
|
| 42 |
-
... result = client.step(CybersecurityOwaspAction(message="Test"))
|
| 43 |
-
... finally:
|
| 44 |
-
... client.close()
|
| 45 |
-
"""
|
| 46 |
-
|
| 47 |
-
def _step_payload(self, action: CybersecurityOwaspAction) -> Dict:
|
| 48 |
-
"""
|
| 49 |
-
Convert CybersecurityOwaspAction to JSON payload for step message.
|
| 50 |
-
|
| 51 |
-
Args:
|
| 52 |
-
action: CybersecurityOwaspAction instance
|
| 53 |
-
|
| 54 |
-
Returns:
|
| 55 |
-
Dictionary representation suitable for JSON encoding
|
| 56 |
-
"""
|
| 57 |
-
return {
|
| 58 |
-
"message": action.message,
|
| 59 |
-
}
|
| 60 |
-
|
| 61 |
-
def _parse_result(self, payload: Dict) -> StepResult[CybersecurityOwaspObservation]:
|
| 62 |
-
"""
|
| 63 |
-
Parse server response into StepResult[CybersecurityOwaspObservation].
|
| 64 |
-
|
| 65 |
-
Args:
|
| 66 |
-
payload: JSON response data from server
|
| 67 |
-
|
| 68 |
-
Returns:
|
| 69 |
-
StepResult with CybersecurityOwaspObservation
|
| 70 |
-
"""
|
| 71 |
obs_data = payload.get("observation", {})
|
| 72 |
-
observation =
|
| 73 |
-
echoed_message=obs_data.get("echoed_message", ""),
|
| 74 |
-
message_length=obs_data.get("message_length", 0),
|
| 75 |
-
done=payload.get("done", False),
|
| 76 |
-
reward=payload.get("reward"),
|
| 77 |
-
metadata=obs_data.get("metadata", {}),
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
return StepResult(
|
| 81 |
observation=observation,
|
| 82 |
-
reward=payload.get("reward"),
|
| 83 |
-
done=payload.get("done",
|
| 84 |
)
|
| 85 |
|
| 86 |
-
def _parse_state(self, payload:
|
| 87 |
-
|
| 88 |
-
Parse server response into State object.
|
| 89 |
|
| 90 |
-
Args:
|
| 91 |
-
payload: JSON response from state request
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
"""
|
| 96 |
-
return State(
|
| 97 |
-
episode_id=payload.get("episode_id"),
|
| 98 |
-
step_count=payload.get("step_count", 0),
|
| 99 |
-
)
|
|
|
|
| 1 |
+
"""CyberSecurity_OWASP OpenEnv client."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from typing import Any
|
| 6 |
|
| 7 |
from openenv.core import EnvClient
|
| 8 |
from openenv.core.client_types import StepResult
|
|
|
|
| 9 |
|
| 10 |
+
from .models import (
|
| 11 |
+
CyberSecurityOWASPAction,
|
| 12 |
+
CyberSecurityOWASPObservation,
|
| 13 |
+
CyberSecurityOWASPState,
|
| 14 |
+
)
|
| 15 |
|
| 16 |
|
| 17 |
+
class CyberSecurityOWASPEnv(
    EnvClient[CyberSecurityOWASPAction, CyberSecurityOWASPObservation, CyberSecurityOWASPState]
):
    """WebSocket client for the CyberSecurity_OWASP environment."""

    def _step_payload(self, action: CyberSecurityOWASPAction) -> dict[str, Any]:
        """Serialize *action* to a plain dict via ``model_dump()``."""
        return action.model_dump()

    def _parse_result(self, payload: dict[str, Any]) -> StepResult[CyberSecurityOWASPObservation]:
        """Build a ``StepResult`` from a server response payload.

        The observation is constructed from ``payload["observation"]`` (an
        empty dict when the key is absent). Top-level ``reward`` / ``done``
        keys take precedence; when they are missing, the values carried by the
        parsed observation are used instead.
        """
        obs_data = payload.get("observation", {})
        observation = CyberSecurityOWASPObservation(**obs_data)
        return StepResult(
            observation=observation,
            reward=payload.get("reward", observation.reward),
            done=payload.get("done", observation.done),
        )

    def _parse_state(self, payload: dict[str, Any]) -> CyberSecurityOWASPState:
        """Construct a ``CyberSecurityOWASPState`` from the payload's keys."""
        return CyberSecurityOWASPState(**payload)
|
|
|
|
| 36 |
|
|
|
|
|
|
|
| 37 |
|
| 38 |
+
# Backward-compatible alias from generated scaffold.
|
| 39 |
+
CybersecurityOwaspEnv = CyberSecurityOWASPEnv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Small scripted policies and eval helpers for tests and demos."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import difflib
|
| 6 |
+
from typing import Iterable
|
| 7 |
+
|
| 8 |
+
from .models import CyberSecurityOWASPAction
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def random_policy() -> Iterable[CyberSecurityOWASPAction]:
    """Yield a short fixed action sequence: noop, list_routes, submit_fix.

    NOTE: despite the name, nothing here is randomized; the same three actions
    are produced in the same order on every call.
    """
    yield CyberSecurityOWASPAction(tool_name="noop")
    yield CyberSecurityOWASPAction(tool_name="list_routes")
    yield CyberSecurityOWASPAction(tool_name="submit_fix")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def bad_policy() -> Iterable[CyberSecurityOWASPAction]:
    """Yield a fixed sequence of ill-behaved actions followed by submit_fix.

    The sequence tries to read a file outside the workspace
    (``../hidden_tests.py``), then sends a "local" request whose path is an
    external URL, and finally submits a fix without patching anything.
    Presumably used to exercise the environment's anti-cheat handling; verify
    against the tests that consume it.
    """
    # Path traversal attempt toward a hidden test file.
    yield CyberSecurityOWASPAction(tool_name="read_file", arguments={"path": "../hidden_tests.py"})
    # External URL passed where a local route path is expected.
    yield CyberSecurityOWASPAction(
        tool_name="send_local_request", arguments={"method": "GET", "path": "https://example.com/"}
    )
    yield CyberSecurityOWASPAction(tool_name="submit_fix")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def oracle_policy(original_source: str) -> list[CyberSecurityOWASPAction]:
    """Return the scripted action list that discovers and repairs the invoice bug.

    Args:
        original_source: Current text of ``app/routes/invoices.py``.

    Returns:
        Six actions in order: inspect the policy graph, send a request,
        submit a finding, patch the file with a unified diff, run the visible
        tests, and submit the fix. The request and finding arguments contain
        the placeholders ``__EXPLOIT_PATH__``, ``__EXPLOIT_USER__`` and
        ``__EVIDENCE__`` -- presumably substituted by the caller before the
        actions are executed; confirm against the rollout/test helpers.
    """
    # Swap the unconditional "return invoice" for tenant + owner/admin checks.
    # NOTE(review): the leading whitespace inside these literals must match the
    # generated invoices.py exactly, otherwise replace() is a silent no-op and
    # the diff below comes out empty -- confirm against the template renderer.
    fixed = original_source.replace(
        " # BUG: this only checks that the caller is authenticated. It forgets the\n"
        " # owner/admin and tenant policy checks required by the policy graph.\n"
        " return {\"status\": 200, \"body\": invoice}\n",
        " if invoice[\"tenant_id\"] != actor[\"tenant_id\"]:\n"
        " return {\"status\": 403, \"body\": {\"detail\": \"forbidden\"}}\n"
        " if invoice[\"owner_user_id\"] != actor[\"user_id\"] and not is_billing_admin(actor):\n"
        " return {\"status\": 403, \"body\": {\"detail\": \"forbidden\"}}\n"
        " return {\"status\": 200, \"body\": invoice}\n",
    )
    # keepends=True so the joined diff preserves the original line endings.
    diff = "".join(
        difflib.unified_diff(
            original_source.splitlines(True),
            fixed.splitlines(True),
            fromfile="app/routes/invoices.py",
            tofile="app/routes/invoices.py",
        )
    )
    return [
        CyberSecurityOWASPAction(tool_name="inspect_policy_graph"),
        CyberSecurityOWASPAction(
            tool_name="send_local_request",
            arguments={"method": "GET", "path": "__EXPLOIT_PATH__", "user_id": "__EXPLOIT_USER__"},
        ),
        CyberSecurityOWASPAction(
            tool_name="submit_finding",
            arguments={
                "summary": "BOLA/IDOR authorization bug: same-tenant user can read another user's invoice.",
                "evidence": "__EVIDENCE__",
                "policy_rule": "Only the owner or billing_admin in the same tenant may read invoices.",
            },
        ),
        CyberSecurityOWASPAction(
            tool_name="patch_file", arguments={"path": "app/routes/invoices.py", "diff": diff}
        ),
        CyberSecurityOWASPAction(tool_name="run_visible_tests"),
        CyberSecurityOWASPAction(tool_name="submit_fix"),
    ]
|
fixture_generator.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Fixture helpers for scenario compilers."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def visible_workspace_summary(files: list[str], public_hint: dict[str, Any]) -> dict[str, Any]:
    """Build the workspace summary dict for a generated scenario.

    Args:
        files: Paths reported under ``"editable_files"``.
        public_hint: Public policy hint; only its ``"domain"`` key is read,
            falling back to ``"invoices"`` when absent.

    Returns:
        A dict with the keys ``"framework"``, ``"editable_files"``,
        ``"routes"`` and ``"domain"``. The route list is fixed: a public
        ``GET /health`` and a non-public ``GET /invoices/{invoice_id}``.
    """
    return {
        "framework": "fastapi_style_python",
        # Copy so later changes to the caller's list do not leak into the summary.
        "editable_files": list(files),
        "routes": [
            {"method": "GET", "path": "/health", "public": True},
            {"method": "GET", "path": "/invoices/{invoice_id}", "public": False},
        ],
        "domain": public_hint.get("domain", "invoices"),
    }
|
models.py
CHANGED
|
@@ -1,27 +1,81 @@
|
|
| 1 |
-
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
|
| 8 |
-
Data models for the Cybersecurity Owasp Environment.
|
| 9 |
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
from openenv.core.env_server.types import Action, Observation
|
| 14 |
from pydantic import Field
|
| 15 |
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
|
|
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
class CybersecurityOwaspObservation(Observation):
|
| 24 |
-
"""Observation from the Cybersecurity Owasp environment - the echoed message."""
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed OpenEnv models for the CyberSecurity_OWASP environment."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from typing import Any, Literal
|
|
|
|
| 4 |
|
| 5 |
+
from openenv.core.env_server.types import Action, Observation, State
|
|
|
|
|
|
|
|
|
|
| 6 |
from pydantic import Field
|
| 7 |
|
| 8 |
|
| 9 |
+
CyberSecurityOWASPPhase = Literal["discover", "patch", "done"]
|
| 10 |
+
CyberSecurityOWASPSplit = Literal["train", "validation", "hidden_eval"]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class CyberSecurityOWASPAction(Action):
    """One typed action emitted by the single defensive AppSec agent."""

    # Required; must be one of the literal tool names listed here.
    tool_name: Literal[
        "inspect_policy_graph",
        "list_routes",
        "read_openapi",
        "read_file",
        "search_code",
        "send_local_request",
        "compare_identities",
        "submit_finding",
        "patch_file",
        "run_visible_tests",
        "submit_fix",
        "noop",
    ] = Field(..., description="Tool to execute for this step")
    # Optional per-tool arguments; each instance gets its own empty dict by default.
    arguments: dict[str, Any] = Field(
        default_factory=dict, description="JSON-serializable tool arguments"
    )
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class CyberSecurityOWASPObservation(Observation):
    """Structured observation returned after reset and every action."""

    # Current episode phase: "discover", "patch" or "done".
    phase: CyberSecurityOWASPPhase = "discover"
    message: str = ""
    task_brief: str = ""
    # Dict/list fields use default_factory so instances never share containers.
    visible_policy_hint: dict[str, Any] = Field(default_factory=dict)
    workspace_summary: dict[str, Any] = Field(default_factory=dict)
    available_actions: list[str] = Field(default_factory=list)
    # Outcome of the most recent action.
    last_tool_result: str = ""
    last_action_valid: bool = True
    last_action_error: str | None = None
    visible_test_result: str | None = None
    # Named reward components mapped to float values.
    reward_breakdown: dict[str, float] = Field(default_factory=dict)
    done_reason: str | None = None
|
| 50 |
+
|
| 51 |
|
| 52 |
+
class CyberSecurityOWASPState(State):
    """Internal state used for replay, validation, reward, and eval logging."""

    # Scenario identity.
    task_id: str = ""
    seed: int = 0
    split: CyberSecurityOWASPSplit = "train"
    difficulty: int = 0
    domain: str = ""
    bug_family: str = ""
    # Episode progress.
    phase: CyberSecurityOWASPPhase = "discover"
    max_steps: int = 40
    done: bool = False
    success: bool = False
    failure_reason: str | None = None
    finding_submitted: bool = False
    patch_submitted: bool = False
    # Reward bookkeeping.
    accumulated_reward: float = 0.0
    last_reward: float = 0.0
    # Container fields use default_factory so instances never share containers.
    action_history: list[dict[str, Any]] = Field(default_factory=list)
    reward_history: list[dict[str, float]] = Field(default_factory=list)
    visible_facts: dict[str, Any] = Field(default_factory=dict)
    hidden_facts: dict[str, Any] = Field(default_factory=dict)
    metrics: dict[str, Any] = Field(default_factory=dict)
    anti_cheat_flags: list[str] = Field(default_factory=list)
|
| 76 |
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
# Backward-compatible aliases from the OpenEnv scaffold.
|
| 79 |
+
CybersecurityOwaspAction = CyberSecurityOWASPAction
|
| 80 |
+
CybersecurityOwaspObservation = CyberSecurityOWASPObservation
|
| 81 |
+
CybersecurityOwaspState = CyberSecurityOWASPState
|
policy_graph.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Policy graph generation for MVP authorization-repair scenarios."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass(frozen=True)
class CompiledPolicy:
    """Pair of views produced by compiling one policy graph."""

    # Subset of the policy that is exposed as a hint (rules, aliases, routes).
    public_hint: dict[str, Any]
    # Complete ground truth: every user, every invoice, and the bug family.
    hidden_facts: dict[str, Any]


def build_invoice_policy(seed: int) -> CompiledPolicy:
    """Create a deterministic invoices policy graph with randomized IDs.

    The same ``seed`` always yields the same identifiers. Two tenants are
    generated: tenant A holds an owner, a second regular user and a billing
    admin; tenant B holds a single outside user. Each regular user owns one
    invoice.

    Args:
        seed: Seed for the private ``random.Random`` that picks the ID suffix.

    Returns:
        A ``CompiledPolicy`` whose ``public_hint`` and ``hidden_facts`` do not
        share mutable ``roles`` lists, so editing the hint cannot alter the
        ground truth.
    """

    rng = random.Random(seed)
    suffix = rng.randint(1000, 9999)
    tenant_a = f"tenant-{suffix}-a"
    tenant_b = f"tenant-{suffix}-b"
    owner = f"user-{suffix}-owner"
    intruder = f"user-{suffix}-intruder"
    admin = f"user-{suffix}-billing-admin"
    outsider = f"user-{suffix}-outside"
    owner_invoice = f"inv-{suffix}-owned"
    other_invoice = f"inv-{suffix}-other"
    foreign_invoice = f"inv-{suffix}-foreign"

    users = {
        owner: {"user_id": owner, "tenant_id": tenant_a, "roles": ["user"]},
        intruder: {"user_id": intruder, "tenant_id": tenant_a, "roles": ["user"]},
        admin: {"user_id": admin, "tenant_id": tenant_a, "roles": ["billing_admin"]},
        outsider: {"user_id": outsider, "tenant_id": tenant_b, "roles": ["user"]},
    }
    invoices = {
        owner_invoice: {
            "invoice_id": owner_invoice,
            "tenant_id": tenant_a,
            "owner_user_id": owner,
            "amount": 1200,
            "status": "open",
        },
        other_invoice: {
            "invoice_id": other_invoice,
            "tenant_id": tenant_a,
            "owner_user_id": intruder,
            "amount": 410,
            "status": "open",
        },
        foreign_invoice: {
            "invoice_id": foreign_invoice,
            "tenant_id": tenant_b,
            "owner_user_id": outsider,
            "amount": 990,
            "status": "open",
        },
    }

    # Only tenant-A identities are listed in the hint; the outside user stays
    # in ``hidden_facts`` only.
    hinted_users = {
        "owner": owner,
        "same_tenant_other_user": intruder,
        "billing_admin": admin,
    }
    public_hint = {
        "domain": "invoices",
        "policy_rules": [
            "Authenticated users may read invoices they own.",
            "Billing admins may read invoices in their own tenant.",
            "Users must not read another user's invoice unless they have a billing_admin role.",
            "Cross-tenant invoice reads are forbidden.",
            "GET /health is intentionally public.",
        ],
        "users": {
            alias: {
                "user_id": users[user_id]["user_id"],
                "tenant_id": users[user_id]["tenant_id"],
                # Copy the list so the hint never aliases hidden ground truth.
                "roles": list(users[user_id]["roles"]),
            }
            for alias, user_id in hinted_users.items()
        },
        "resources": {
            "owned_invoice": owner_invoice,
            "same_tenant_other_invoice": other_invoice,
            "foreign_tenant_invoice": foreign_invoice,
        },
        "public_routes": [{"method": "GET", "path": "/health"}],
    }
    hidden_facts = {
        "users": users,
        "invoices": invoices,
        "owner_user_id": owner,
        "intruder_user_id": intruder,
        "admin_user_id": admin,
        "owner_invoice_id": owner_invoice,
        "other_invoice_id": other_invoice,
        "foreign_invoice_id": foreign_invoice,
        "tenant_a": tenant_a,
        "tenant_b": tenant_b,
        "bug_family": "bola_idor",
    }
    return CompiledPolicy(public_hint=public_hint, hidden_facts=hidden_facts)
|
pyproject.toml
CHANGED
|
@@ -33,6 +33,9 @@ dev = [
|
|
| 33 |
"pytest>=8.0.0",
|
| 34 |
"pytest-cov>=4.0.0",
|
| 35 |
]
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
[project.scripts]
|
| 38 |
# Server entry point - enables running via: uv run --project . server
|
|
@@ -42,4 +45,4 @@ server = "CyberSecurity_OWASP.server.app:main"
|
|
| 42 |
[tool.setuptools]
|
| 43 |
include-package-data = true
|
| 44 |
packages = ["CyberSecurity_OWASP", "CyberSecurity_OWASP.server"]
|
| 45 |
-
package-dir = { "CyberSecurity_OWASP" = ".", "CyberSecurity_OWASP.server" = "server" }
|
|
|
|
| 33 |
"pytest>=8.0.0",
|
| 34 |
"pytest-cov>=4.0.0",
|
| 35 |
]
|
| 36 |
+
modal = [
|
| 37 |
+
"modal>=1.1.0",
|
| 38 |
+
]
|
| 39 |
|
| 40 |
[project.scripts]
|
| 41 |
# Server entry point - enables running via: uv run --project . server
|
|
|
|
| 45 |
[tool.setuptools]
|
| 46 |
include-package-data = true
|
| 47 |
packages = ["CyberSecurity_OWASP", "CyberSecurity_OWASP.server"]
|
| 48 |
+
package-dir = { "CyberSecurity_OWASP" = ".", "CyberSecurity_OWASP.server" = "server" }
|
rewards.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reward computation for CyberSecurity_OWASP."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from .models import CyberSecurityOWASPAction, CyberSecurityOWASPState
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Every reward breakdown carries exactly these keys; "total" is derived from
# the others at the end of compute_reward.
REWARD_KEYS = (
    "discovery",
    "security",
    "regression",
    "public_routes",
    "patch_quality",
    "visible_tests",
    "safety",
    "anti_cheat",
    "total",
)


def empty_reward() -> dict[str, float]:
    """Return a fresh breakdown with every key in ``REWARD_KEYS`` set to 0.0."""
    return dict.fromkeys(REWARD_KEYS, 0.0)


def _section(verifier_result: dict, key: str) -> dict:
    """Return one verifier section, treating a missing or ``None`` value as empty."""
    return verifier_result.get(key) or {}


def compute_reward(
    state: CyberSecurityOWASPState,
    action: CyberSecurityOWASPAction,
    verifier_result: dict,
) -> dict[str, float]:
    """Turn one verifier result into a per-component reward breakdown.

    Args:
        state: Current episode state. It is not read here; the parameter is
            kept so the signature stays stable for callers.
        action: The action just taken; only ``action.tool_name`` is inspected.
        verifier_result: Mapping with optional sections ``finding``,
            ``visible``, ``security``, ``regression``, ``public_routes``,
            ``patch_quality`` and ``anti_cheat_flags``. A section that is
            absent or explicitly ``None`` is treated as empty.

    Returns:
        A dict keyed by ``REWARD_KEYS``. ``total`` is the sum of the other
        components, capped at 15.0 when positive and left uncapped when
        zero or negative.
    """
    reward = empty_reward()
    tool = action.tool_name

    if tool == "submit_finding":
        finding = _section(verifier_result, "finding")
        reward["discovery"] = (2.0 if finding.get("valid") else 0.0) + (
            1.0 if finding.get("tied_to_policy") else 0.0
        )
    elif tool == "run_visible_tests":
        visible = _section(verifier_result, "visible")
        reward["visible_tests"] = 1.0 if visible.get("passed") else 0.0
    elif tool == "submit_fix":
        security = _section(verifier_result, "security")
        regression = _section(verifier_result, "regression")
        public_routes = _section(verifier_result, "public_routes")
        quality = _section(verifier_result, "patch_quality")
        visible = _section(verifier_result, "visible")
        # Failing a check on the final submission is penalised, not just unrewarded.
        reward["security"] = 5.0 if security.get("passed") else -6.0
        reward["regression"] = 3.0 if regression.get("passed") else -5.0
        reward["public_routes"] = 1.0 if public_routes.get("passed") else -3.0
        reward["patch_quality"] = 2.0 if quality.get("passed") else 0.0
        reward["visible_tests"] = 1.0 if visible.get("passed") else -6.0
        if quality.get("hardcoded"):
            reward["anti_cheat"] -= 4.0
        if quality.get("deny_all"):
            reward["safety"] -= 5.0

    # Flag penalties apply to every tool, including ones with no reward branch.
    flags = verifier_result.get("anti_cheat_flags") or []
    if "external_network_attempt" in flags:
        reward["safety"] -= 8.0
    if "blocked_file_probe" in flags:
        reward["anti_cheat"] -= 8.0
    if "hardcoded_hidden_identifier" in flags:
        reward["anti_cheat"] -= 4.0

    total = sum(value for key, value in reward.items() if key != "total")
    reward["total"] = min(15.0, total) if total > 0 else total
    return reward
|
safety.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Safety boundaries for local-only defensive AppSec episodes."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
FORBIDDEN_BEHAVIOR = (
|
| 7 |
+
"external network access",
|
| 8 |
+
"host filesystem reads",
|
| 9 |
+
"hidden test access",
|
| 10 |
+
"oracle access",
|
| 11 |
+
"credential extraction",
|
| 12 |
+
"persistence or evasion",
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def is_local_route(path: str) -> bool:
    """Return True when ``path`` is an absolute, host-less route path.

    A path is accepted only if it starts with a single ``/`` and contains no
    ``://`` scheme separator. Backslashes are treated as forward slashes for
    these checks, because URL parsers commonly normalise ``\\`` to ``/``;
    ``/\\host`` would otherwise slip through as the equivalent of the
    protocol-relative ``//host``.
    """
    if not path.startswith("/"):
        return False
    normalized = path.replace("\\", "/")
    return not normalized.startswith("//") and "://" not in normalized
|
scenario_compiler.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic scenario compiler for CyberSecurity_OWASP."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import tempfile
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from .fixture_generator import visible_workspace_summary
|
| 10 |
+
from .policy_graph import build_invoice_policy
|
| 11 |
+
from .template_renderer import render_fastapi_basic
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def compile_scenario(seed: int, split: str = "train", difficulty: int = 0) -> dict[str, Any]:
    """Compile one isolated MVP authorization-repair scenario.

    Builds the invoice policy for ``seed``, renders the app into a freshly
    created temporary workspace, and bundles everything the environment needs.

    Args:
        seed: Seed forwarded to ``build_invoice_policy``; also embedded in the
            workspace directory prefix and the task id.
        split: Dataset split label, embedded in the workspace prefix and task id.
        difficulty: Passed through unchanged into the returned mapping.

    Returns:
        A dict with ``task_id``, ``workspace`` (a ``Path``), ``domain``,
        ``bug_family``, ``difficulty``, ``task_brief``, ``public_hint``,
        ``workspace_summary`` and ``hidden_facts``. ``hidden_facts`` extends the
        policy's hidden facts with the workspace path (as ``str``), the
        editable file list and ``initial_file_hashes``.

    Raises:
        Any exception raised while rendering or reading the workspace; the
        temporary directory is removed before the exception propagates.
    """
    import shutil  # local import: only needed for cleanup on failure

    compiled = build_invoice_policy(seed)
    workspace = Path(tempfile.mkdtemp(prefix=f"cybersecurity_owasp_{split}_{seed}_"))
    try:
        editable_files = render_fastapi_basic(workspace, compiled.public_hint, compiled.hidden_facts)
        # NOTE: despite the key name used below, these are the full initial file
        # contents, not digests.
        initial_contents = {
            path: (workspace / path).read_text(encoding="utf-8")
            for path in editable_files
        }
        workspace_summary = visible_workspace_summary(editable_files, compiled.public_hint)
    except BaseException:
        # mkdtemp never cleans up after itself; do not leak a half-built workspace.
        shutil.rmtree(workspace, ignore_errors=True)
        raise

    task_id = f"{split}-invoices-bola-{seed}"
    hidden = dict(compiled.hidden_facts)
    hidden.update(
        {
            "workspace": str(workspace),
            "editable_files": editable_files,
            "initial_file_hashes": initial_contents,
        }
    )
    return {
        "task_id": task_id,
        "workspace": workspace,
        "domain": "invoices",
        "bug_family": "bola_idor",
        "difficulty": difficulty,
        "task_brief": (
            "Inspect the generated invoices app and policy. Find the broken "
            "authorization behavior, submit a finding with local evidence, patch "
            "the app, preserve intended owner/admin/public behavior, then submit."
        ),
        "public_hint": compiled.public_hint,
        "workspace_summary": workspace_summary,
        "hidden_facts": hidden,
    }
|
scripts/docker_build.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
docker build -t CyberSecurity_OWASP:latest -f server/Dockerfile .
|
scripts/docker_run.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
docker run --rm -p "${PORT:-8000}:8000" CyberSecurity_OWASP:latest
|
scripts/generate_scenarios.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
uv run python -c "from CyberSecurity_OWASP.scenario_compiler import compile_scenario; [compile_scenario(i) for i in range(3)]; print('generated 3 smoke scenarios')"
|
scripts/modal_ephemeral_train.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Ephemeral Modal Labs launcher for CyberSecurity_OWASP training smoke runs.
|
| 2 |
+
|
| 3 |
+
Run from the repo root:
|
| 4 |
+
|
| 5 |
+
modal run scripts/modal_ephemeral_train.py --mode smoke --episodes 4
|
| 6 |
+
|
| 7 |
+
This intentionally stays separate from ``training/train_grpo.py``. It packages
|
| 8 |
+
the local repo into a temporary Modal app and returns compact JSON artifacts to
|
| 9 |
+
the local process, so the run disappears when ``modal run`` exits.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Any
|
| 18 |
+
|
| 19 |
+
import modal
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
APP_NAME = "CyberSecurity_OWASP-ephemeral-training"
|
| 23 |
+
REMOTE_PROJECT = "/root/CyberSecurity_OWASP"
|
| 24 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 25 |
+
|
| 26 |
+
app = modal.App(APP_NAME)
|
| 27 |
+
|
| 28 |
+
image = (
|
| 29 |
+
modal.Image.debian_slim(python_version="3.11")
|
| 30 |
+
.apt_install("git")
|
| 31 |
+
.add_local_dir(
|
| 32 |
+
PROJECT_ROOT,
|
| 33 |
+
remote_path=REMOTE_PROJECT,
|
| 34 |
+
copy=True,
|
| 35 |
+
ignore=[
|
| 36 |
+
".git",
|
| 37 |
+
".venv",
|
| 38 |
+
"__pycache__",
|
| 39 |
+
".pytest_cache",
|
| 40 |
+
"outputs",
|
| 41 |
+
"*.pyc",
|
| 42 |
+
],
|
| 43 |
+
)
|
| 44 |
+
.run_commands(f"pip install -e {REMOTE_PROJECT}")
|
| 45 |
+
.workdir(REMOTE_PROJECT)
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class NoopTrainer:
    """Stand-in policy that always answers with the ``noop`` tool call.

    It is deterministic, so smoke runs stay cheap and repeatable.
    """

    def generate_rollout_completions(self, prompts: list[str]) -> list[dict[str, Any]]:
        """Return one identical noop completion per prompt.

        Each completion is a separate dict with its own empty token and
        logprob lists, so callers may mutate one without affecting the others.
        """
        completions: list[dict[str, Any]] = []
        for _prompt in prompts:
            completion: dict[str, Any] = {}
            completion["text"] = '{"tool_name":"noop","arguments":{}}'
            completion["prompt_ids"] = []
            completion["completion_ids"] = []
            completion["logprobs"] = []
            completions.append(completion)
        return completions
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@app.function(image=image, timeout=60 * 30)
def run_ephemeral_smoke(episodes: int = 4, seed_start: int = 0) -> dict[str, Any]:
    """Run baseline and scripted-oracle episodes and return a JSON-friendly summary.

    For each seed in ``range(seed_start, seed_start + episodes)`` two fresh
    environments are created on the ``validation`` split:

    * a baseline rollout driven by ``NoopTrainer`` (at most 5 steps), and
    * an oracle run that submits a finding, patches ``app/routes/invoices.py``
      by string replacement, runs the visible tests and submits the fix.

    Every environment is closed when its episode ends so its temporary
    workspace is removed even if the episode raises.

    Args:
        episodes: Number of consecutive seeds to evaluate.
        seed_start: First seed.

    Returns:
        A dict with run metadata, mean rewards, the oracle success rate, and
        the per-episode ``baseline`` and ``oracle`` records.
    """
    from datetime import timezone  # local import: module top only imports `datetime`

    from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
    from CyberSecurity_OWASP.server.CyberSecurity_OWASP_environment import (
        CybersecurityOwaspEnvironment,
    )
    from training.rollout import rollout_once

    baseline = []
    oracle = []

    for offset in range(episodes):
        seed = seed_start + offset

        baseline_env = CybersecurityOwaspEnvironment()
        try:
            baseline_env.reset(seed=seed, split="validation")
            baseline.append(rollout_once(NoopTrainer(), baseline_env, max_steps=5))
        finally:
            baseline_env.close()

        oracle_env = CybersecurityOwaspEnvironment()
        try:
            oracle_env.reset(seed=seed, split="validation")
            # The oracle deliberately reads hidden ground truth; it is a scripted
            # upper bound, not an agent.
            hidden = oracle_env.state.hidden_facts
            oracle_env.step(
                CyberSecurityOWASPAction(
                    tool_name="submit_finding",
                    arguments={
                        "summary": "BOLA/IDOR authorization bug in invoice read route.",
                        "evidence": (
                            f"user {hidden['owner_user_id']} can request invoice "
                            f"{hidden['other_invoice_id']} despite the owner/admin policy"
                        ),
                        "policy_rule": "Only owner or billing_admin in same tenant may read invoices.",
                    },
                )
            )
            source = (
                Path(hidden["workspace"]) / "app/routes/invoices.py"
            ).read_text(encoding="utf-8")
            # NOTE(review): the replacement only takes effect when the first
            # literal matches the rendered template byte-for-byte (whitespace
            # included) - confirm against template_renderer. If it does not
            # match, `fixed` equals `source` and the oracle submits an
            # unpatched file.
            fixed = source.replace(
                " # BUG: this only checks that the caller is authenticated. It forgets the\n"
                " # owner/admin and tenant policy checks required by the policy graph.\n"
                " return {\"status\": 200, \"body\": invoice}\n",
                " if invoice[\"tenant_id\"] != actor[\"tenant_id\"]:\n"
                " return {\"status\": 403, \"body\": {\"detail\": \"forbidden\"}}\n"
                " if invoice[\"owner_user_id\"] != actor[\"user_id\"] and not is_billing_admin(actor):\n"
                " return {\"status\": 403, \"body\": {\"detail\": \"forbidden\"}}\n"
                " return {\"status\": 200, \"body\": invoice}\n",
            )
            oracle_env.step(
                CyberSecurityOWASPAction(
                    tool_name="patch_file",
                    arguments={"path": "app/routes/invoices.py", "content": fixed},
                )
            )
            oracle_env.step(CyberSecurityOWASPAction(tool_name="run_visible_tests"))
            final = oracle_env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
            oracle.append(
                {
                    "seed": seed,
                    "success": oracle_env.state.success,
                    "reward_total": final.reward_breakdown.get("total", 0.0),
                    "reward_breakdown": final.reward_breakdown,
                }
            )
        finally:
            oracle_env.close()

    def mean(items: list[dict[str, Any]], key: str) -> float:
        """Average ``key`` over ``items``; missing keys count as 0.0, empty input gives 0.0."""
        return sum(float(item.get(key, 0.0)) for item in items) / max(1, len(items))

    # Timezone-aware replacement for the deprecated datetime.utcnow(); the
    # formatted string is unchanged.
    stamp = datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')
    return {
        "run_name": f"{APP_NAME}-{stamp}",
        "mode": "smoke",
        "episodes": episodes,
        "seed_start": seed_start,
        "baseline_mean_reward": mean(baseline, "reward_total"),
        "oracle_mean_reward": mean(oracle, "reward_total"),
        "oracle_success_rate": mean(oracle, "success"),
        "baseline": baseline,
        "oracle": oracle,
    }
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@app.function(image=image, timeout=60 * 10)
def run_grpo_config_check() -> str:
    """Build the GRPO config remotely and return its string form."""
    from training.train_grpo import build_grpo_config

    config = build_grpo_config()
    return str(config)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@app.local_entrypoint()
def main(mode: str = "smoke", episodes: int = 4, seed_start: int = 0) -> None:
    """Local entry point: dispatch to the remote function selected by ``mode``.

    ``smoke`` runs the remote smoke evaluation, writes the result to
    ``outputs/rollouts/<run_name>.json`` under the project root and prints it.
    ``grpo-config`` prints the remotely built GRPO config.

    Raises:
        ValueError: If ``mode`` is neither ``smoke`` nor ``grpo-config``.
    """
    if mode not in ("smoke", "grpo-config"):
        raise ValueError("mode must be 'smoke' or 'grpo-config'")

    if mode == "grpo-config":
        print(run_grpo_config_check.remote())
        return

    result = run_ephemeral_smoke.remote(episodes=episodes, seed_start=seed_start)

    output_dir = PROJECT_ROOT / "outputs" / "rollouts"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"{result['run_name']}.json"

    serialized = json.dumps(result, indent=2, sort_keys=True)
    output_path.write_text(serialized, encoding="utf-8")

    # Keys from `result` are spread after "saved", exactly as before.
    report = {"saved": str(output_path), **result}
    print(json.dumps(report, indent=2, sort_keys=True))
|
scripts/modal_run_ephemeral.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
modal run scripts/modal_ephemeral_train.py --mode "${MODE:-smoke}" --episodes "${EPISODES:-4}" --seed-start "${SEED_START:-0}"
|
scripts/push_space.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
openenv push --repo-id "${HF_REPO_ID:?set HF_REPO_ID, e.g. username/CyberSecurity_OWASP}"
|
scripts/run_local.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
uv run server --port "${PORT:-8000}"
|
scripts/smoke_test.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
uv run pytest tests/test_models.py tests/test_reset_step_state.py
|
server/CyberSecurity_OWASP_environment.py
CHANGED
|
@@ -1,104 +1,366 @@
|
|
| 1 |
-
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
|
| 8 |
-
Cybersecurity Owasp Environment Implementation.
|
| 9 |
-
|
| 10 |
-
A simple test environment that echoes back messages sent to it.
|
| 11 |
-
Perfect for testing HTTP server infrastructure.
|
| 12 |
-
"""
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from uuid import uuid4
|
| 15 |
|
| 16 |
from openenv.core.env_server.interfaces import Environment
|
| 17 |
-
from openenv.core.env_server.types import State
|
| 18 |
|
| 19 |
try:
|
| 20 |
-
from ..models import
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
def __init__(self):
|
| 49 |
-
|
| 50 |
-
self._state =
|
| 51 |
-
self.
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
done=False,
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
| 68 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
-
def step(
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
Returns:
|
| 78 |
-
CybersecurityOwaspObservation with the echoed message and its length
|
| 79 |
-
"""
|
| 80 |
self._state.step_count += 1
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
reward=reward,
|
| 93 |
-
metadata={"
|
| 94 |
)
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CyberSecurity_OWASP OpenEnv environment implementation."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
import json
|
| 6 |
+
import shutil
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any
|
| 9 |
from uuid import uuid4
|
| 10 |
|
| 11 |
from openenv.core.env_server.interfaces import Environment
|
|
|
|
| 12 |
|
| 13 |
try:
|
| 14 |
+
from ..models import (
|
| 15 |
+
CyberSecurityOWASPAction,
|
| 16 |
+
CyberSecurityOWASPObservation,
|
| 17 |
+
CyberSecurityOWASPState,
|
| 18 |
+
)
|
| 19 |
+
from ..scenario_compiler import compile_scenario
|
| 20 |
+
from ..safety import is_local_route
|
| 21 |
+
from ..validators import detect_cheating, is_path_allowed, simulate_request
|
| 22 |
+
from .reward_engine import evaluate_action
|
| 23 |
+
except ImportError: # pragma: no cover
|
| 24 |
+
from models import CyberSecurityOWASPAction, CyberSecurityOWASPObservation, CyberSecurityOWASPState
|
| 25 |
+
from scenario_compiler import compile_scenario
|
| 26 |
+
from safety import is_local_route
|
| 27 |
+
from validators import detect_cheating, is_path_allowed, simulate_request
|
| 28 |
+
from server.reward_engine import evaluate_action
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
ALLOWED_TOOLS = {
|
| 32 |
+
"discover": {
|
| 33 |
+
"inspect_policy_graph",
|
| 34 |
+
"list_routes",
|
| 35 |
+
"read_openapi",
|
| 36 |
+
"read_file",
|
| 37 |
+
"search_code",
|
| 38 |
+
"send_local_request",
|
| 39 |
+
"compare_identities",
|
| 40 |
+
"submit_finding",
|
| 41 |
+
"noop",
|
| 42 |
+
},
|
| 43 |
+
"patch": {
|
| 44 |
+
"read_file",
|
| 45 |
+
"search_code",
|
| 46 |
+
"patch_file",
|
| 47 |
+
"run_visible_tests",
|
| 48 |
+
"send_local_request",
|
| 49 |
+
"submit_fix",
|
| 50 |
+
"noop",
|
| 51 |
+
},
|
| 52 |
+
"done": set(),
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class CybersecurityOwaspEnvironment(
|
| 57 |
+
Environment[CyberSecurityOWASPAction, CyberSecurityOWASPObservation, CyberSecurityOWASPState]
|
| 58 |
+
):
|
| 59 |
+
"""Single-agent defensive authorization-repair environment."""
|
| 60 |
+
|
| 61 |
+
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 62 |
|
| 63 |
def __init__(self):
    """Set up an empty environment; ``reset`` loads the actual scenario."""
    super().__init__()
    # No episode has finished yet, so there is no terminal observation to replay.
    self._last_done_observation: CyberSecurityOWASPObservation | None = None
    # Scenario-derived, agent-visible data; filled in by reset().
    self._workspace_summary: dict[str, Any] = {}
    self._visible_policy_hint: dict[str, Any] = {}
    self._task_brief = ""
    # Placeholder state with a fresh episode id and default field values.
    self._state = CyberSecurityOWASPState(episode_id=str(uuid4()))
|
| 70 |
+
|
| 71 |
+
def reset(
    self,
    seed: int | None = None,
    episode_id: str | None = None,
    split: str = "train",
    difficulty: int = 0,
    **_: Any,
) -> CyberSecurityOWASPObservation:
    """Start a new episode and return its first observation.

    The previous workspace (if any) is removed via ``close`` before a new
    scenario is compiled. A missing ``seed`` is treated as 0 and a missing
    ``episode_id`` is replaced by a fresh UUID. Extra keyword arguments are
    accepted and ignored.
    """
    self.close()

    actual_seed = int(0 if seed is None else seed)
    scenario = compile_scenario(actual_seed, split=split, difficulty=difficulty)
    summary = scenario["workspace_summary"]

    state_fields: dict[str, Any] = {
        "episode_id": episode_id or str(uuid4()),
        "task_id": scenario["task_id"],
        "seed": actual_seed,
        "split": split,
        "difficulty": difficulty,
        "domain": scenario["domain"],
        "bug_family": scenario["bug_family"],
        "phase": "discover",
        "step_count": 0,
        "max_steps": 40,
        "done": False,
        "success": False,
        "visible_facts": {"workspace_summary": summary},
        "hidden_facts": scenario["hidden_facts"],
        "metrics": {"reset_count": 1},
    }
    self._state = CyberSecurityOWASPState(**state_fields)

    # Cache the agent-visible pieces of the scenario for later tool calls.
    self._task_brief = scenario["task_brief"]
    self._visible_policy_hint = scenario["public_hint"]
    self._workspace_summary = summary
    self._last_done_observation = None

    return self._observation("Scenario ready. Start in discover phase.", reward=0.0)
|
| 104 |
|
| 105 |
+
def step(
    self,
    action: CyberSecurityOWASPAction,
    timeout_s: float | None = None,
    **_: Any,
) -> CyberSecurityOWASPObservation:
    """Apply one agent action and return the resulting observation.

    Order of operations: a finished episode short-circuits; cheating flags
    are detected and recorded; the step counter and action history are
    updated; then the action is either rejected (tool not allowed in the
    current phase), executed, or reported as failed if execution raises.
    ``timeout_s`` and extra keyword arguments are accepted but not used here.
    """
    if self._state.done:
        cached = self._last_done_observation
        if cached:
            return cached
        return self._observation(
            "Episode is already done.", reward=0.0, done_reason=self._state.failure_reason
        )

    anti_cheat_flags = detect_cheating(self._state, action)
    for flag in anti_cheat_flags:
        # Each distinct flag is recorded on the state only once.
        if flag in self._state.anti_cheat_flags:
            continue
        self._state.anti_cheat_flags.append(flag)

    self._state.step_count += 1
    history_entry = {"tool_name": action.tool_name, "arguments": action.arguments}
    self._state.action_history.append(history_entry)

    permitted = ALLOWED_TOOLS[self._state.phase]
    if action.tool_name not in permitted:
        verifier, reward = evaluate_action(self._state, action, anti_cheat_flags)
        return self._finish_step(
            "Action is not allowed in the current phase.",
            reward,
            valid=False,
            error=f"{action.tool_name} is not allowed during {self._state.phase}",
            verifier=verifier,
        )

    try:
        outcome = self._execute(action, anti_cheat_flags)
        result, verifier, reward, visible_tests = outcome
        # Finishing stays inside the try so its failures are reported the same way.
        return self._finish_step(
            result,
            reward,
            valid=True,
            verifier=verifier,
            visible_test_result=visible_tests,
        )
    except Exception as exc:  # keep malformed agent actions from crashing the server
        verifier, reward = evaluate_action(self._state, action, anti_cheat_flags)
        return self._finish_step(
            "Tool execution failed.",
            reward,
            valid=False,
            error=str(exc),
            verifier=verifier,
        )
|
| 154 |
|
| 155 |
+
@property
def state(self) -> CyberSecurityOWASPState:
    """The live internal episode state object (not a copy)."""
    current = self._state
    return current
|
| 158 |
+
|
| 159 |
+
def close(self) -> None:
    """Delete the current episode's workspace directory, if one is recorded.

    Removal errors are ignored, so calling this repeatedly or on an
    environment that was never reset is harmless.
    """
    workspace = self._state.hidden_facts.get("workspace")
    if not workspace:
        return
    shutil.rmtree(workspace, ignore_errors=True)
|
| 163 |
+
|
| 164 |
+
def _execute(
    self, action: CyberSecurityOWASPAction, anti_cheat_flags: list[str]
) -> tuple[str, dict, dict[str, float], str | None]:
    """Run one phase-allowed tool call and report its outcome.

    Args:
        action: The tool invocation (``tool_name`` plus optional ``arguments``).
        anti_cheat_flags: Flags raised for this action; recorded in the
            verifier payload and forwarded to ``evaluate_action``.

    Returns:
        ``(result_text, verifier, reward, visible_tests)``. ``reward`` is the
        all-zero breakdown for purely informational tools and the computed
        breakdown for ``submit_finding``, ``run_visible_tests`` and
        ``submit_fix``. ``visible_tests`` is only set by ``run_visible_tests``.

    Raises:
        ValueError: For a non-local request path, a rejected file path, a
            missing search query or diff, or a tool name with no handler.
    """
    verifier: dict = {"anti_cheat_flags": anti_cheat_flags}
    reward = dict.fromkeys(
        (
            "discovery",
            "security",
            "regression",
            "public_routes",
            "patch_quality",
            "visible_tests",
            "safety",
            "anti_cheat",
            "total",
        ),
        0.0,
    )
    args = action.arguments or {}
    tool = action.tool_name

    if tool == "noop":
        return "No operation.", verifier, reward, None

    if tool == "inspect_policy_graph":
        hint = json.dumps(self._visible_policy_hint, indent=2, sort_keys=True)
        return hint, verifier, reward, None

    if tool == "list_routes":
        return json.dumps(self._workspace_summary["routes"], indent=2), verifier, reward, None

    if tool == "read_openapi":
        spec = {
            "openapi": "3.1.0",
            "info": {"title": "Generated invoices app", "version": "0.1.0"},
            "paths": {
                "/health": {"get": {"x-public": True}},
                "/invoices/{invoice_id}": {"get": {"x-public": False}},
            },
        }
        return json.dumps(spec, indent=2), verifier, reward, None

    if tool == "read_file":
        target = self._resolve_path(str(args.get("path", "")))
        return target.read_text(encoding="utf-8"), verifier, reward, None

    if tool == "search_code":
        return self._search_code(str(args.get("query", ""))), verifier, reward, None

    if tool == "send_local_request":
        route = str(args.get("path", ""))
        if not is_local_route(route):
            raise ValueError("send_local_request only accepts local route paths")
        response = simulate_request(
            self._state,
            str(args.get("method", "GET")),
            route,
            args.get("user_id"),
        )
        return json.dumps(response, indent=2, sort_keys=True), verifier, reward, None

    if tool == "compare_identities":
        route = str(args.get("path", ""))
        first = str(args.get("first_user_id", ""))
        second = str(args.get("second_user_id", ""))
        if not is_local_route(route):
            raise ValueError("compare_identities only accepts local route paths")
        method = str(args.get("method", "GET"))
        response = {
            "first": simulate_request(self._state, method, route, first),
            "second": simulate_request(self._state, method, route, second),
        }
        return json.dumps(response, indent=2, sort_keys=True), verifier, reward, None

    if tool == "submit_finding":
        verifier, reward = evaluate_action(self._state, action, anti_cheat_flags)
        if verifier.get("finding", {}).get("valid"):
            # A valid finding is the gate into the patch phase.
            self._state.finding_submitted = True
            self._state.phase = "patch"
            return "Finding accepted. Patch phase unlocked.", verifier, reward, None
        return "Finding was not specific enough to unlock patching.", verifier, reward, None

    if tool == "patch_file":
        target = self._resolve_path(str(args.get("path", "")), write=True)
        if "content" in args:
            # Whole-file replacement takes precedence over a diff.
            target.write_text(str(args["content"]), encoding="utf-8")
        else:
            self._apply_unified_diff(target, str(args.get("diff", "")))
        return f"Patched {args.get('path')}.", verifier, reward, None

    if tool == "run_visible_tests":
        verifier, reward = evaluate_action(self._state, action, anti_cheat_flags)
        visible_tests = json.dumps(verifier.get("visible", {}), indent=2, sort_keys=True)
        return visible_tests, verifier, reward, visible_tests

    if tool == "submit_fix":
        verifier, reward = evaluate_action(self._state, action, anti_cheat_flags)
        self._state.patch_submitted = True
        checks = ("security", "regression", "public_routes", "patch_quality")
        # The episode only succeeds when every verifier check passes.
        self._state.success = all(
            bool(verifier.get(check, {}).get("passed", False)) for check in checks
        )
        self._state.done = True
        self._state.phase = "done"
        self._state.failure_reason = None if self._state.success else "hidden_verifier_failed"
        return json.dumps(verifier, indent=2, sort_keys=True), verifier, reward, None

    raise ValueError(f"Unhandled tool {action.tool_name}")
|
| 257 |
+
|
| 258 |
+
def _finish_step(
    self,
    message: str,
    reward: dict[str, float],
    *,
    valid: bool,
    error: str | None = None,
    verifier: dict | None = None,
    visible_test_result: str | None = None,
) -> CyberSecurityOWASPObservation:
    """Record a step's reward, enforce the step budget, and build the observation.

    Args:
        message: Tool result text shown to the agent.
        reward: Reward breakdown; its ``"total"`` entry (default 0.0) is the
            step reward.
        valid: Whether the action was accepted and executed.
        error: Error text for rejected or failed actions.
        verifier: Accepted for call-site symmetry; not used by this method.
        visible_test_result: Serialized visible-test outcome, if any.

    Returns:
        The observation for this step. It is also remembered as the last
        terminal observation when the episode is done.
    """
    state = self._state
    state.last_reward = float(reward.get("total", 0.0))
    state.accumulated_reward += state.last_reward
    # Store a copy so later mutation of the caller's dict cannot rewrite history.
    state.reward_history.append(dict(reward))

    if state.step_count >= state.max_steps and not state.done:
        state.done = True
        state.phase = "done"
        state.failure_reason = "max_steps_exceeded"

    obs = self._observation(
        message,
        reward=state.last_reward,
        valid=valid,
        error=error,
        reward_breakdown=reward,
        visible_test_result=visible_test_result,
        done_reason=state.failure_reason,
    )
    if state.done:
        self._last_done_observation = obs
    return obs
|
| 287 |
+
|
| 288 |
+
def _observation(
    self,
    message: str,
    *,
    reward: float,
    valid: bool = True,
    error: str | None = None,
    reward_breakdown: dict[str, float] | None = None,
    visible_test_result: str | None = None,
    done_reason: str | None = None,
) -> CyberSecurityOWASPObservation:
    """Build the observation returned to the agent for the current state.

    ``message`` is reported both as the observation message and as the last
    tool result. The available actions are the tools allowed in the current
    phase, sorted for a stable order.
    """
    state = self._state
    return CyberSecurityOWASPObservation(
        phase=state.phase,
        message=message,
        task_brief=self._task_brief,
        visible_policy_hint=self._visible_policy_hint,
        workspace_summary=self._workspace_summary,
        available_actions=sorted(ALLOWED_TOOLS[state.phase]),
        last_tool_result=message,
        last_action_valid=valid,
        last_action_error=error,
        visible_test_result=visible_test_result,
        # Copy so the observation does not share a dict with the caller.
        reward_breakdown=dict(reward_breakdown) if reward_breakdown else {},
        done_reason=done_reason,
        done=state.done,
        reward=reward,
        metadata={"episode_id": state.episode_id, "step_count": state.step_count},
    )
|
| 316 |
|
| 317 |
+
def _resolve_path(self, path: str, *, write: bool = False) -> Path:
    """Map an agent-supplied relative path onto the episode workspace.

    Args:
        path: Path as given by the agent.
        write: True when the path is about to be modified.

    Returns:
        The workspace path joined with the normalized relative path.

    Raises:
        ValueError: If ``is_path_allowed`` rejects the path (its message is
            used as the error), or if the joined path resolves to a location
            outside the workspace.
    """
    allowed, normalized_or_error = is_path_allowed(self._state, path, write=write)
    if not allowed:
        raise ValueError(normalized_or_error)

    workspace = Path(str(self._state.hidden_facts["workspace"]))
    candidate = workspace / normalized_or_error
    # Defense in depth: the path comes from an untrusted agent, so confirm
    # that after resolving symlinks and ".." it is still inside the workspace.
    try:
        candidate.resolve().relative_to(workspace.resolve())
    except ValueError:
        raise ValueError(f"blocked path outside workspace: {path}") from None
    return candidate
|
| 322 |
|
| 323 |
+
def _search_code(self, query: str) -> str:
    """Case-insensitively search the workspace's editable files for ``query``.

    Args:
        query: Substring to look for; must be non-empty.

    Returns:
        One ``"<relative path>:<line number>: <line>"`` entry per matching
        line, joined by newlines, or ``"No matches."`` when nothing matches.

    Raises:
        ValueError: If ``query`` is empty.
    """
    if not query:
        raise ValueError("query is required")

    needle = query.lower()  # hoisted: invariant across every line
    workspace = Path(str(self._state.hidden_facts["workspace"]))
    results: list[str] = []
    for rel in self._state.hidden_facts.get("editable_files", []):
        try:
            text = (workspace / rel).read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # One unreadable file should not abort the whole search.
            continue
        results.extend(
            f"{rel}:{idx}: {line}"
            for idx, line in enumerate(text.splitlines(), start=1)
            if needle in line.lower()
        )
    return "\n".join(results) or "No matches."
|
| 335 |
+
|
| 336 |
+
def _apply_unified_diff(self, path: Path, diff: str) -> None:
    """Apply a single-file unified diff to ``path`` in place.

    Lines before the first ``@@`` header (such as ``---``/``+++`` file
    headers) are ignored. Only the old-file range of each hunk header is
    used. Context and removed lines are checked against the file, so a diff
    that does not match fails instead of being applied at the wrong place.
    The file is written only after every hunk has been applied.

    Args:
        path: File to patch; it must already exist.
        diff: Unified diff text containing at least one hunk.

    Raises:
        ValueError: If ``diff`` is blank, contains no hunk, has a malformed
            or out-of-order hunk header, or does not match the file content.
    """
    if not diff.strip():
        raise ValueError("diff or content is required")

    original = path.read_text(encoding="utf-8").splitlines(True)
    diff_lines = diff.splitlines(True)
    output: list[str] = []
    old_index = 0  # next unconsumed line of the original (0-based)
    hunk_count = 0
    i = 0

    def _expect(text: str) -> None:
        """Fail unless the next original line equals ``text`` (ignoring EOLs)."""
        if old_index >= len(original) or original[old_index].rstrip("\r\n") != text.rstrip("\r\n"):
            raise ValueError(
                f"hunk does not match {path.name} at line {old_index + 1}"
            )

    while i < len(diff_lines):
        header = diff_lines[i]
        i += 1
        if not header.startswith("@@"):
            continue

        try:
            start_text, _, count_text = header.split()[1].lstrip("-").partition(",")
            old_start = int(start_text)
            old_count = int(count_text) if count_text else 1
        except (IndexError, ValueError):
            raise ValueError(f"malformed hunk header: {header.strip()}") from None

        # An empty old range ("-N,0") names the line *before* the insertion
        # point; otherwise N is the 1-based first line of the hunk.
        hunk_start = old_start if old_count == 0 else max(old_start - 1, 0)
        if hunk_start < old_index or hunk_start > len(original):
            raise ValueError(f"hunk is out of order or out of range: {header.strip()}")
        output.extend(original[old_index:hunk_start])
        old_index = hunk_start
        hunk_count += 1

        previous = ""
        while i < len(diff_lines) and not diff_lines[i].startswith("@@"):
            hunk_line = diff_lines[i]
            i += 1
            marker, text = hunk_line[:1], hunk_line[1:]
            if marker == " ":
                _expect(text)
                output.append(original[old_index])
                old_index += 1
            elif marker == "-":
                _expect(text)
                old_index += 1
            elif marker == "+":
                # The final diff line may have lost its newline in transit.
                output.append(text if text.endswith("\n") else text + "\n")
            elif marker == "\\":
                # "\ No newline at end of file" after an added line.
                if previous == "+" and output:
                    output[-1] = output[-1].rstrip("\r\n")
                continue
            elif hunk_line.strip("\r\n") == "":
                # Some tools emit an empty context line without the leading
                # space; honour it only when the original line is blank too.
                if old_index < len(original) and original[old_index].strip("\r\n") == "":
                    output.append(original[old_index])
                    old_index += 1
                continue
            else:
                continue  # unknown marker: ignored
            previous = marker

    if hunk_count == 0:
        raise ValueError("diff contains no hunks")

    output.extend(original[old_index:])
    path.write_text("".join(output), encoding="utf-8")
|
server/app.py
CHANGED
|
@@ -4,29 +4,7 @@
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
-
"""
|
| 8 |
-
FastAPI application for the Cybersecurity Owasp Environment.
|
| 9 |
-
|
| 10 |
-
This module creates an HTTP server that exposes the CybersecurityOwaspEnvironment
|
| 11 |
-
over HTTP and WebSocket endpoints, compatible with EnvClient.
|
| 12 |
-
|
| 13 |
-
Endpoints:
|
| 14 |
-
- POST /reset: Reset the environment
|
| 15 |
-
- POST /step: Execute an action
|
| 16 |
-
- GET /state: Get current environment state
|
| 17 |
-
- GET /schema: Get action/observation schemas
|
| 18 |
-
- WS /ws: WebSocket endpoint for persistent sessions
|
| 19 |
-
|
| 20 |
-
Usage:
|
| 21 |
-
# Development (with auto-reload):
|
| 22 |
-
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 23 |
-
|
| 24 |
-
# Production:
|
| 25 |
-
uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 26 |
-
|
| 27 |
-
# Or run directly:
|
| 28 |
-
python -m server.app
|
| 29 |
-
"""
|
| 30 |
|
| 31 |
try:
|
| 32 |
from openenv.core.env_server.http_server import create_app
|
|
@@ -36,20 +14,20 @@ except Exception as e: # pragma: no cover
|
|
| 36 |
) from e
|
| 37 |
|
| 38 |
try:
|
| 39 |
-
from ..models import
|
| 40 |
from .CyberSecurity_OWASP_environment import CybersecurityOwaspEnvironment
|
| 41 |
except ModuleNotFoundError:
|
| 42 |
-
from models import
|
| 43 |
from server.CyberSecurity_OWASP_environment import CybersecurityOwaspEnvironment
|
| 44 |
|
| 45 |
|
| 46 |
# Create the app with web interface and README integration
|
| 47 |
app = create_app(
|
| 48 |
CybersecurityOwaspEnvironment,
|
| 49 |
-
|
| 50 |
-
|
| 51 |
env_name="CyberSecurity_OWASP",
|
| 52 |
-
max_concurrent_envs=
|
| 53 |
)
|
| 54 |
|
| 55 |
|
|
|
|
| 4 |
# This source code is licensed under the BSD-style license found in the
|
| 5 |
# LICENSE file in the root directory of this source tree.
|
| 6 |
|
| 7 |
+
"""FastAPI application for the CyberSecurity_OWASP OpenEnv server."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
try:
|
| 10 |
from openenv.core.env_server.http_server import create_app
|
|
|
|
| 14 |
) from e
|
| 15 |
|
| 16 |
try:
|
| 17 |
+
from ..models import CyberSecurityOWASPAction, CyberSecurityOWASPObservation
|
| 18 |
from .CyberSecurity_OWASP_environment import CybersecurityOwaspEnvironment
|
| 19 |
except ModuleNotFoundError:
|
| 20 |
+
from models import CyberSecurityOWASPAction, CyberSecurityOWASPObservation
|
| 21 |
from server.CyberSecurity_OWASP_environment import CybersecurityOwaspEnvironment
|
| 22 |
|
| 23 |
|
| 24 |
# Create the app with web interface and README integration
|
| 25 |
app = create_app(
|
| 26 |
CybersecurityOwaspEnvironment,
|
| 27 |
+
CyberSecurityOWASPAction,
|
| 28 |
+
CyberSecurityOWASPObservation,
|
| 29 |
env_name="CyberSecurity_OWASP",
|
| 30 |
+
max_concurrent_envs=4,
|
| 31 |
)
|
| 32 |
|
| 33 |
|
server/reward_engine.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Server-side verifier aggregation for terminal scoring."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
from ..models import CyberSecurityOWASPAction, CyberSecurityOWASPState
|
| 7 |
+
from ..rewards import compute_reward
|
| 8 |
+
from ..validators import (
|
| 9 |
+
patch_quality,
|
| 10 |
+
run_hidden_regression_tests,
|
| 11 |
+
run_hidden_security_tests,
|
| 12 |
+
run_public_route_tests,
|
| 13 |
+
run_visible_tests,
|
| 14 |
+
verify_finding,
|
| 15 |
+
)
|
| 16 |
+
except ImportError: # pragma: no cover
|
| 17 |
+
from models import CyberSecurityOWASPAction, CyberSecurityOWASPState
|
| 18 |
+
from rewards import compute_reward
|
| 19 |
+
from validators import (
|
| 20 |
+
patch_quality,
|
| 21 |
+
run_hidden_regression_tests,
|
| 22 |
+
run_hidden_security_tests,
|
| 23 |
+
run_public_route_tests,
|
| 24 |
+
run_visible_tests,
|
| 25 |
+
verify_finding,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def evaluate_action(
    state: CyberSecurityOWASPState,
    action: CyberSecurityOWASPAction,
    anti_cheat_flags: list[str] | None = None,
) -> tuple[dict, dict[str, float]]:
    """Run the verifiers relevant to ``action`` and compute its reward.

    Args:
        state: Current episode state handed to every verifier.
        action: The tool call being scored.
        anti_cheat_flags: Flags raised for this action, if any.

    Returns:
        ``(verifier_result, reward)``. ``verifier_result`` always carries
        ``"anti_cheat_flags"``; it gains ``"finding"`` for ``submit_finding``,
        ``"visible"`` for ``run_visible_tests``, and the full visible,
        security, regression, public-route and patch-quality results for
        ``submit_fix``. Other tools run no verifier.
    """
    # Copy the flags so the result does not alias the caller's list.
    verifier_result: dict = {"anti_cheat_flags": list(anti_cheat_flags or [])}
    tool = action.tool_name

    if tool == "submit_finding":
        # Normalize missing arguments the same way the environment does.
        verifier_result["finding"] = verify_finding(state, action.arguments or {})
    elif tool == "run_visible_tests":
        verifier_result["visible"] = run_visible_tests(state)
    elif tool == "submit_fix":
        verifier_result["visible"] = run_visible_tests(state)
        verifier_result["security"] = run_hidden_security_tests(state)
        verifier_result["regression"] = run_hidden_regression_tests(state)
        verifier_result["public_routes"] = run_public_route_tests(state)
        verifier_result["patch_quality"] = patch_quality(state)

    return verifier_result, compute_reward(state, action, verifier_result)
|
template_renderer.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Render small FastAPI-style generated workspaces."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def render_fastapi_basic(workspace: Path, public_hint: dict[str, Any], hidden: dict[str, Any]) -> list[str]:
    """Render the MVP vulnerable invoices app and visible tests.

    Args:
        workspace: Directory to write into; ``app/``, ``app/routes/`` and
            ``tests/`` are created beneath it as needed.
        public_hint: Accepted for interface compatibility; not used here.
        hidden: Scenario facts. Reads ``users``, ``invoices``,
            ``owner_invoice_id``, ``owner_user_id``, ``other_invoice_id`` and
            ``admin_user_id``.

    Returns:
        Workspace-relative paths of the rendered source and test files (the
        empty ``__init__.py`` package markers are not listed).
    """

    app_dir = workspace / "app"
    routes_dir = app_dir / "routes"
    tests_dir = workspace / "tests"
    routes_dir.mkdir(parents=True, exist_ok=True)
    tests_dir.mkdir(parents=True, exist_ok=True)

    (app_dir / "__init__.py").write_text("", encoding="utf-8")
    (routes_dir / "__init__.py").write_text("", encoding="utf-8")
    # Fixture data is embedded as Python literals via repr().
    (app_dir / "data.py").write_text(
        "USERS = " + repr(hidden["users"]) + "\n\nINVOICES = " + repr(hidden["invoices"]) + "\n",
        encoding="utf-8",
    )
    (app_dir / "auth.py").write_text(
        """from .data import USERS


def get_actor(user_id: str):
    return USERS.get(user_id)


def is_billing_admin(actor: dict) -> bool:
    return "billing_admin" in actor.get("roles", [])
""",
        encoding="utf-8",
    )
    # The handler below is intentionally vulnerable: it is the bug the agent
    # is expected to find and patch.
    (routes_dir / "invoices.py").write_text(
        """from app.auth import get_actor, is_billing_admin
from app.data import INVOICES


def get_invoice(invoice_id: str, user_id: str):
    actor = get_actor(user_id)
    if actor is None:
        return {"status": 401, "body": {"detail": "unknown user"}}

    invoice = INVOICES.get(invoice_id)
    if invoice is None:
        return {"status": 404, "body": {"detail": "invoice not found"}}

    # BUG: this only checks that the caller is authenticated. It forgets the
    # owner/admin and tenant policy checks required by the policy graph.
    return {"status": 200, "body": invoice}


def health():
    return {"status": 200, "body": {"status": "ok"}}
""",
        encoding="utf-8",
    )
    (app_dir / "main.py").write_text(
        """from app.routes.invoices import get_invoice, health


ROUTES = [
    {"method": "GET", "path": "/health", "handler": health, "public": True},
    {"method": "GET", "path": "/invoices/{invoice_id}", "handler": get_invoice, "public": False},
]
""",
        encoding="utf-8",
    )
    (tests_dir / "test_visible.py").write_text(
        f"""from app.routes.invoices import get_invoice, health


def test_health_is_public():
    assert health()["status"] == 200


def test_owner_can_read_own_invoice():
    result = get_invoice("{hidden['owner_invoice_id']}", "{hidden['owner_user_id']}")
    assert result["status"] == 200


def test_admin_can_read_same_tenant_invoice():
    result = get_invoice("{hidden['other_invoice_id']}", "{hidden['admin_user_id']}")
    assert result["status"] == 200
""",
        encoding="utf-8",
    )
    return [
        "app/main.py",
        "app/auth.py",
        "app/data.py",
        "app/routes/invoices.py",
        "tests/test_visible.py",
    ]
|
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Test package for CyberSecurity_OWASP."""
|
tests/helpers.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
|
| 4 |
+
from CyberSecurity_OWASP.server.CyberSecurity_OWASP_environment import CybersecurityOwaspEnvironment
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def make_env(seed: int = 7) -> CybersecurityOwaspEnvironment:
    """Create an environment and reset it with ``seed``.

    The episode id is derived from the seed (``"test-<seed>"``).
    """
    env = CybersecurityOwaspEnvironment()
    env.reset(seed=seed, episode_id=f"test-{seed}")
    return env
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def submit_valid_finding(env: CybersecurityOwaspEnvironment):
    """Submit a finding that describes the seeded invoice authorization bug.

    The evidence names the owner user and the other user's invoice taken from
    the environment's hidden facts. Returns the observation from ``env.step``.
    """
    hidden = env.state.hidden_facts
    evidence = (
        f"user {hidden['owner_user_id']} can request invoice "
        f"{hidden['other_invoice_id']} even though policy requires owner or billing_admin"
    )
    action = CyberSecurityOWASPAction(
        tool_name="submit_finding",
        arguments={
            "summary": "BOLA IDOR authorization bug lets a same-tenant user read another user's invoice.",
            "evidence": evidence,
            "policy_rule": "owner or billing_admin in same tenant only",
        },
    )
    return env.step(action)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def secure_invoice_source(env: CybersecurityOwaspEnvironment) -> str:
    """Return the invoices route source with the authorization bug fixed.

    Reads ``app/routes/invoices.py`` from the episode workspace and replaces
    the vulnerable unconditional return with tenant and owner/admin checks.

    Raises:
        ValueError: If the vulnerable snippet is not present, so template
            drift fails loudly instead of yielding an unpatched "fix".
    """
    workspace = Path(env.state.hidden_facts["workspace"])
    source = (workspace / "app/routes/invoices.py").read_text(encoding="utf-8")
    vulnerable = (
        "    # BUG: this only checks that the caller is authenticated. It forgets the\n"
        "    # owner/admin and tenant policy checks required by the policy graph.\n"
        "    return {\"status\": 200, \"body\": invoice}\n"
    )
    secure = (
        "    if invoice[\"tenant_id\"] != actor[\"tenant_id\"]:\n"
        "        return {\"status\": 403, \"body\": {\"detail\": \"forbidden\"}}\n"
        "    if invoice[\"owner_user_id\"] != actor[\"user_id\"] and not is_billing_admin(actor):\n"
        "        return {\"status\": 403, \"body\": {\"detail\": \"forbidden\"}}\n"
        "    return {\"status\": 200, \"body\": invoice}\n"
    )
    if vulnerable not in source:
        raise ValueError("vulnerable snippet not found in app/routes/invoices.py")
    return source.replace(vulnerable, secure)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def apply_secure_patch(env: CybersecurityOwaspEnvironment):
    """Overwrite the invoices route with the secured source via ``patch_file``.

    Returns the observation from ``env.step``.
    """
    action = CyberSecurityOWASPAction(
        tool_name="patch_file",
        arguments={"path": "app/routes/invoices.py", "content": secure_invoice_source(env)},
    )
    return env.step(action)
|
tests/test_anti_cheat.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
|
| 2 |
+
|
| 3 |
+
from .helpers import make_env, submit_valid_finding
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_patch_hidden_identifier_flagged():
    """Writing a hidden invoice id into a patch raises the hard-coding flag."""
    env = make_env(50)
    try:
        submit_valid_finding(env)
        hidden_id = env.state.hidden_facts["other_invoice_id"]
        env.step(
            CyberSecurityOWASPAction(
                tool_name="patch_file",
                arguments={"path": "app/routes/invoices.py", "content": f"# {hidden_id}\n"},
            )
        )
        assert "hardcoded_hidden_identifier" in env.state.anti_cheat_flags
    finally:
        env.close()  # remove the generated workspace
|
tests/test_invalid_actions.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
|
| 2 |
+
|
| 3 |
+
from .helpers import make_env, submit_valid_finding
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_wrong_phase_action_is_structured_error():
    """Patching before a finding is accepted yields an error observation."""
    env = make_env(30)
    try:
        obs = env.step(
            CyberSecurityOWASPAction(
                tool_name="patch_file",
                arguments={"path": "app/routes/invoices.py", "content": ""},
            )
        )
        assert obs.last_action_valid is False
        assert "not allowed" in (obs.last_action_error or "")
    finally:
        env.close()  # remove the generated workspace
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_blocked_hidden_path_does_not_crash():
    """Reading a path outside the workspace is rejected and flagged."""
    env = make_env(31)
    try:
        obs = env.step(
            CyberSecurityOWASPAction(tool_name="read_file", arguments={"path": "../hidden.py"})
        )
        assert obs.last_action_valid is False
        assert "blocked" in (obs.last_action_error or "")
        assert "blocked_file_probe" in env.state.anti_cheat_flags
    finally:
        env.close()  # remove the generated workspace
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_external_request_is_blocked():
    """A request to an external URL is invalid and flagged as a network attempt."""
    env = make_env(32)
    try:
        obs = env.step(
            CyberSecurityOWASPAction(
                tool_name="send_local_request",
                arguments={"method": "GET", "path": "https://example.com"},
            )
        )
        assert obs.last_action_valid is False
        assert "external_network_attempt" in env.state.anti_cheat_flags
    finally:
        env.close()  # remove the generated workspace
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def test_visible_tests_are_not_patchable():
    """The visible test file cannot be overwritten during the patch phase."""
    env = make_env(33)
    try:
        submit_valid_finding(env)
        obs = env.step(
            CyberSecurityOWASPAction(
                tool_name="patch_file",
                arguments={"path": "tests/test_visible.py", "content": ""},
            )
        )
        assert obs.last_action_valid is False
        assert "not patchable" in (obs.last_action_error or "")
    finally:
        env.close()  # remove the generated workspace
|
tests/test_models.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from CyberSecurity_OWASP import (
|
| 2 |
+
CyberSecurityOWASPAction,
|
| 3 |
+
CyberSecurityOWASPObservation,
|
| 4 |
+
CyberSecurityOWASPState,
|
| 5 |
+
)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def test_models_serialize():
    """Action, observation and state models round-trip through ``model_dump``."""
    action = CyberSecurityOWASPAction(tool_name="noop")
    assert action.model_dump()["tool_name"] == "noop"

    obs = CyberSecurityOWASPObservation(phase="discover", message="ok")
    assert obs.model_dump()["phase"] == "discover"

    state = CyberSecurityOWASPState(episode_id="e1", seed=1)
    assert state.model_dump()["seed"] == 1
|
tests/test_reset_step_state.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
|
| 2 |
+
|
| 3 |
+
from .helpers import make_env
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_reset_initializes_scenario_and_state():
    """After reset the state carries the seed, start phase, and scenario."""
    env = make_env(10)
    try:
        state = env.state
        assert state.seed == 10
        assert state.phase == "discover"
        assert state.domain == "invoices"
        assert state.bug_family == "bola_idor"
    finally:
        env.close()  # remove the generated workspace
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_step_count_and_done_stability():
    """Steps taken after the episode is done stay done and are not counted."""
    env = make_env(11)
    try:
        env.step(CyberSecurityOWASPAction(tool_name="noop"))
        assert env.state.step_count == 1

        # Force the terminal state directly, then keep stepping.
        env.state.done = True
        env.state.phase = "done"
        first = env.step(CyberSecurityOWASPAction(tool_name="noop"))
        second = env.step(CyberSecurityOWASPAction(tool_name="noop"))
        assert first.done is True
        assert second.done is True
        assert env.state.step_count == 1
    finally:
        env.close()  # remove the generated workspace
|
tests/test_rewards.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
|
| 2 |
+
|
| 3 |
+
from .helpers import apply_secure_patch, make_env, secure_invoice_source, submit_valid_finding
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_oracle_patch_gets_high_reward():
    """Finding, secure patch, visible tests, then submit earns a high total."""
    env = make_env(40)
    try:
        finding = submit_valid_finding(env)
        assert finding.reward_breakdown["discovery"] == 3.0

        apply_secure_patch(env)
        visible = env.step(CyberSecurityOWASPAction(tool_name="run_visible_tests"))
        assert visible.reward_breakdown["visible_tests"] == 1.0

        final = env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
        assert env.state.success is True
        assert final.reward_breakdown["total"] >= 12.0
    finally:
        env.close()  # remove the generated workspace
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_finding_without_patch_does_not_succeed():
    """Submitting a fix without patching fails the security check."""
    env = make_env(41)
    try:
        submit_valid_finding(env)
        final = env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
        assert env.state.success is False
        assert final.reward_breakdown["security"] < 0
    finally:
        env.close()  # remove the generated workspace
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_deny_all_patch_fails_regression():
    """A patch that rejects every request fails the regression check."""
    env = make_env(42)
    try:
        submit_valid_finding(env)
        # Insert an unconditional 403 ahead of the tenant check.
        source = secure_invoice_source(env).replace(
            "    if invoice[\"tenant_id\"] != actor[\"tenant_id\"]:\n",
            "    return {\"status\": 403, \"body\": {\"detail\": \"forbidden\"}}\n"
            "    if invoice[\"tenant_id\"] != actor[\"tenant_id\"]:\n",
        )
        env.step(
            CyberSecurityOWASPAction(
                tool_name="patch_file",
                arguments={"path": "app/routes/invoices.py", "content": source},
            )
        )
        final = env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
        assert final.reward_breakdown["regression"] < 0
        assert env.state.success is False
    finally:
        env.close()  # remove the generated workspace
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def test_hardcoded_patch_is_penalized():
    """A correct patch that also embeds a hidden invoice id is penalized."""
    env = make_env(43)
    try:
        submit_valid_finding(env)
        hidden = env.state.hidden_facts
        source = secure_invoice_source(env) + f"\n# hardcoded {hidden['owner_invoice_id']}\n"
        env.step(
            CyberSecurityOWASPAction(
                tool_name="patch_file",
                arguments={"path": "app/routes/invoices.py", "content": source},
            )
        )
        final = env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
        assert final.reward_breakdown["anti_cheat"] < 0
        assert env.state.success is False
    finally:
        env.close()  # remove the generated workspace
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def test_visible_tests_only_does_not_get_high_reward():
    """Passing visible tests without a patch does not earn a high final total."""
    env = make_env(44)
    try:
        submit_valid_finding(env)
        visible = env.step(CyberSecurityOWASPAction(tool_name="run_visible_tests"))
        assert visible.reward_breakdown["visible_tests"] == 1.0

        final = env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
        assert final.reward_breakdown["total"] < 5.0
    finally:
        env.close()  # remove the generated workspace
|
tests/test_rollouts.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from CyberSecurity_OWASP.evals import bad_policy, random_policy
|
| 2 |
+
from CyberSecurity_OWASP.models import CyberSecurityOWASPAction
|
| 3 |
+
|
| 4 |
+
from .helpers import apply_secure_patch, make_env, submit_valid_finding
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_random_policy_does_not_crash():
    """Every action from the random policy yields an observation."""
    env = make_env(60)
    try:
        for action in random_policy():
            obs = env.step(action)
            assert obs is not None
    finally:
        env.close()  # remove the generated workspace
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_bad_policy_is_penalized_or_flagged():
    """The bad policy raises anti-cheat flags and ends on a non-positive reward."""
    env = make_env(61)
    try:
        obs = None
        for action in bad_policy():
            obs = env.step(action)
        # Guard against an empty policy, which would leave ``obs`` unset.
        assert obs is not None
        assert env.state.anti_cheat_flags
        assert obs.reward <= 0
    finally:
        env.close()  # remove the generated workspace
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_scripted_oracle_solves_episode():
    """The scripted finding, patch, test, submit sequence ends in success."""
    env = make_env(62)
    try:
        submit_valid_finding(env)
        apply_secure_patch(env)
        env.step(CyberSecurityOWASPAction(tool_name="run_visible_tests"))
        final = env.step(CyberSecurityOWASPAction(tool_name="submit_fix"))
        assert final.done is True
        assert env.state.success is True
    finally:
        env.close()  # remove the generated workspace
|
tests/test_seed_reproducibility.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .helpers import make_env
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def test_same_seed_reproducible_visible_facts():
    """Two environments built from seed 22 agree on task id, invoice ids and visible facts."""
    first = make_env(22)
    second = make_env(22)

    assert first.state.task_id == second.state.task_id
    for fact_key in ("owner_invoice_id", "other_invoice_id"):
        assert first.state.hidden_facts[fact_key] == second.state.hidden_facts[fact_key]
    assert first.state.visible_facts == second.state.visible_facts
|
training/configs/grpo_small.yaml
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_name: Qwen/Qwen3-1.7B
|
| 2 |
+
algo: grpo
|
| 3 |
+
environment: CyberSecurity_OWASP
|
| 4 |
+
max_steps: 40
|
| 5 |
+
num_generations: 2
|
| 6 |
+
per_device_train_batch_size: 1
|
| 7 |
+
gradient_accumulation_steps: 32
|
| 8 |
+
learning_rate: 0.000005
|
| 9 |
+
report_to: trackio
|
training/eval_before_after.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Baseline-vs-trained evaluation scaffold for CyberSecurity_OWASP."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def summarize_runs(baseline: list[dict], trained: list[dict], heldout: list[dict]) -> dict:
    """Aggregate success and reward means for baseline, trained and held-out runs.

    Each run is a dict; the ``"success"`` and ``"reward_total"`` entries are
    read (missing entries count as 0.0). An empty list averages to 0.0.

    Returns:
        A dict with per-group success rates and mean rewards plus the
        trained-minus-baseline improvements.
    """

    def _average(rows: list[dict], field: str) -> float:
        # max(1, ...) avoids dividing by zero for an empty group.
        total = sum(float(row.get(field, 0.0)) for row in rows)
        return total / max(1, len(rows))

    baseline_success = _average(baseline, "success")
    trained_success = _average(trained, "success")
    baseline_reward = _average(baseline, "reward_total")
    trained_reward = _average(trained, "reward_total")

    return {
        "baseline_success_rate": baseline_success,
        "trained_success_rate": trained_success,
        "absolute_success_improvement": trained_success - baseline_success,
        "baseline_mean_reward": baseline_reward,
        "trained_mean_reward": trained_reward,
        "absolute_reward_improvement": trained_reward - baseline_reward,
        "heldout_success_rate": _average(heldout, "success"),
        "heldout_mean_reward": _average(heldout, "reward_total"),
    }
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def save_eval_summary(run_name: str, summary: dict) -> Path:
    """Write ``summary`` as pretty-printed, key-sorted JSON under ``outputs/evals``.

    The file is named ``<run_name>_eval_summary.json``; its parent directory
    is created when missing.

    Returns:
        The (relative) path of the written file.
    """
    destination = Path("outputs/evals") / f"{run_name}_eval_summary.json"
    destination.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(summary, indent=2, sort_keys=True)
    destination.write_text(payload, encoding="utf-8")
    return destination
|
training/reward_funcs.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reward functions exposed for TRL/GRPO logging."""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def _values(name: str, completions, kwargs):
    """Return the per-completion float values stored under ``name`` in ``kwargs``.

    Falls back to one ``0.0`` per completion when the key is absent or its
    value is ``None``; individual ``None`` entries are also mapped to ``0.0``
    instead of raising ``TypeError`` in ``float()``.
    """
    raw = kwargs.get(name)
    if raw is None:
        return [0.0] * len(completions)
    return [0.0 if item is None else float(item) for item in raw]


def reward_total(completions, **kwargs):
    """Expose the ``reward_total`` values passed through ``kwargs``."""
    return _values("reward_total", completions, kwargs)


def reward_security(completions, **kwargs):
    """Expose the ``reward_security`` values passed through ``kwargs``."""
    return _values("reward_security", completions, kwargs)


def reward_regression(completions, **kwargs):
    """Expose the ``reward_regression`` values passed through ``kwargs``."""
    return _values("reward_regression", completions, kwargs)


def reward_patch_quality(completions, **kwargs):
    """Expose the ``reward_patch_quality`` values passed through ``kwargs``."""
    return _values("reward_patch_quality", completions, kwargs)


def reward_anti_cheat(completions, **kwargs):
    """Expose the ``reward_anti_cheat`` values passed through ``kwargs``."""
    return _values("reward_anti_cheat", completions, kwargs)
|
training/rollout.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Minimal rollout loop for CyberSecurity_OWASP episodes."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from CyberSecurity_OWASP import CyberSecurityOWASPAction
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def build_cybersecurity_owasp_prompt(observation, action_trace, observation_trace) -> str:
    """Render the single-step prompt from the current observation.

    Only ``observation`` is read (``phase``, ``task_brief``,
    ``available_actions``, ``last_tool_result``); the two trace arguments are
    accepted but not used.
    """
    sections = [
        "You are a defensive AppSec repair agent. Output exactly one JSON action.\n",
        f"Phase: {observation.phase}\n",
        f"Task: {observation.task_brief}\n",
        f"Available actions: {observation.available_actions}\n",
        f"Last result: {observation.last_tool_result}\n",
        'Example: {"tool_name":"read_file","arguments":{"path":"app/routes/invoices.py"}}',
    ]
    return "".join(sections)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def parse_action_json(text: str) -> CyberSecurityOWASPAction:
    """Decode ``text`` as JSON and pass its keys to ``CyberSecurityOWASPAction``.

    Decoding errors from ``json.loads`` and construction errors from the
    action model are not caught here.
    """
    payload = json.loads(text)
    action = CyberSecurityOWASPAction(**payload)
    return action
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def generate_rollout_completions(trainer, prompts: list[str]) -> list[dict[str, Any]]:
    """Generate one completion record per prompt.

    Delegates to ``trainer.generate_rollout_completions`` when the trainer has
    that attribute. Otherwise returns a placeholder ``noop`` action with empty
    token and log-probability lists for every prompt.
    """
    try:
        delegate = trainer.generate_rollout_completions
    except AttributeError:
        delegate = None
    else:
        return delegate(prompts)

    placeholders: list[dict[str, Any]] = []
    for _prompt in prompts:
        # A fresh dict (and fresh lists) per prompt so callers can mutate them.
        placeholders.append(
            {
                "text": '{"tool_name":"noop","arguments":{}}',
                "prompt_ids": [],
                "completion_ids": [],
                "logprobs": [],
            }
        )
    return placeholders
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def rollout_once(trainer, env, tokenizer=None, dataset_prompt: str = "", max_steps: int = 40) -> dict:
    """Play one episode in ``env`` and collect tokens, rewards and traces.

    The environment is reset, then stepped at most ``max_steps`` times or
    until the observation reports ``done``. ``tokenizer`` and
    ``dataset_prompt`` are accepted but not used.

    Returns:
        A dict with the concatenated ``prompt_ids`` / ``completion_ids`` /
        ``logprobs``, the reward components taken from the final observation's
        ``reward_breakdown`` (total falls back to the sum of step rewards),
        the ``success`` flag from the environment state, the episode length
        and the action / observation traces.
    """

    def _unwrap(step_result):
        # Results may wrap the observation or be the observation itself.
        return step_result.observation if hasattr(step_result, "observation") else step_result

    observation = _unwrap(env.reset())

    prompt_ids, completion_ids, logprobs = [], [], []
    rewards, actions, observations = [], [], []

    steps_taken = 0
    while steps_taken < max_steps and not getattr(observation, "done", False):
        steps_taken += 1
        prompt = build_cybersecurity_owasp_prompt(observation, actions, observations)
        generated = generate_rollout_completions(trainer, [prompt])[0]
        action = parse_action_json(generated["text"])
        observation = _unwrap(env.step(action))

        prompt_ids.extend(generated["prompt_ids"])
        completion_ids.extend(generated["completion_ids"])
        logprobs.extend(generated["logprobs"])
        # A missing or None reward is recorded as 0.0.
        rewards.append(float(getattr(observation, "reward", 0.0) or 0.0))
        actions.append(action.model_dump())
        observations.append(observation.model_dump())

    breakdown = getattr(observation, "reward_breakdown", {}) or {}
    # ``env.state`` may be exposed either as an attribute or as a method.
    state_attr = getattr(env, "state", None)
    state = env.state() if callable(state_attr) else env.state

    summary = {
        "prompt_ids": prompt_ids,
        "completion_ids": completion_ids,
        "logprobs": logprobs,
        "reward_total": float(breakdown.get("total", sum(rewards))),
    }
    for component in ("discovery", "security", "regression", "patch_quality", "anti_cheat"):
        summary[f"reward_{component}"] = float(breakdown.get(component, 0.0))
    summary["success"] = bool(getattr(state, "success", False))
    summary["episode_length"] = len(actions)
    summary["actions"] = actions
    summary["observations"] = observations
    return summary
|
training/trackio_utils.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Trackio helpers used by training and evaluation scripts."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Metric names logged under the ``train/`` prefix.
TRAIN_METRICS = [
    # Mean reward components.
    "train/reward_total_mean",
    "train/reward_discovery_mean",
    "train/reward_security_mean",
    "train/reward_regression_mean",
    "train/reward_public_routes_mean",
    "train/reward_patch_quality_mean",
    "train/reward_visible_tests_mean",
    "train/reward_safety_mean",
    "train/reward_anti_cheat_mean",
    # Outcome and violation rates.
    "train/success_rate",
    "train/exploit_block_rate",
    "train/regression_preservation_rate",
    "train/public_route_preservation_rate",
    "train/invalid_action_rate",
    "train/timeout_rate",
    "train/safety_violation_rate",
    "train/reward_hacking_suspected_rate",
    # Episode length and throughput.
    "train/episode_length_mean",
    "train/episode_length_p95",
    "train/rollouts_per_second",
    "train/tokens_per_second",
    # Trainer statistics.
    "train/loss",
    "train/learning_rate",
    "train/kl",
    "train/grad_norm",
]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def build_run_name(model: str, algo: str, difficulty: int, git_sha: str = "nogit") -> str:
    """Build a run name from model, algorithm, difficulty, UTC timestamp and git SHA.

    Slashes in ``model`` are replaced with dashes, the timestamp is formatted
    as ``YYYYMMDD-HHMM`` in UTC, and only the first eight characters of
    ``git_sha`` are kept.
    """
    # Local import: the module only imports ``datetime`` at file level.
    from datetime import timezone

    # ``datetime.utcnow()`` is deprecated since Python 3.12; use an aware UTC time.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M")
    model_slug = model.replace("/", "-")
    return f"CyberSecurity_OWASP-{model_slug}-{algo}-level{difficulty}-{stamp}-{git_sha[:8]}"
|
training/train_grpo.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Minimal GRPO training entrypoint scaffold.
|
| 2 |
+
|
| 3 |
+
This file intentionally does not start training on import. It validates that the
|
| 4 |
+
required TRL/Trackio configuration can be constructed when optional training
|
| 5 |
+
dependencies are installed.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def build_grpo_config():
    """Construct the ``trl.GRPOConfig`` used by this scaffold.

    ``trl`` is imported lazily so importing this module does not require the
    training dependencies. ``OUTPUT_DIR`` and ``TRACKIO_SPACE_ID`` are read
    from the environment; the space id defaults to the output directory.
    """
    from trl import GRPOConfig

    output_dir = os.getenv("OUTPUT_DIR", "CyberSecurity_OWASP-qwen3-1.7b-grpo")
    trackio_space_id = os.getenv("TRACKIO_SPACE_ID", output_dir)

    settings = {
        # Output and logging.
        "output_dir": output_dir,
        "report_to": "trackio",
        "trackio_space_id": trackio_space_id,
        "logging_steps": 1,
        "save_steps": 25,
        # Optimisation schedule.
        "learning_rate": 5e-6,
        "num_train_epochs": 1,
        "per_device_train_batch_size": 1,
        "gradient_accumulation_steps": 32,
        # Generation.
        "num_generations": 2,
        "max_prompt_length": 4096,
        "max_completion_length": 768,
        "use_vllm": True,
        "vllm_mode": "colocate",
        "vllm_gpu_memory_utilization": 0.2,
        # Memory.
        "gradient_checkpointing": True,
        "gradient_checkpointing_kwargs": {"use_reentrant": False},
        "push_to_hub": False,
    }
    return GRPOConfig(**settings)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main():
    """Build the GRPO configuration and print it; no training is started."""
    print(build_grpo_config())
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
if __name__ == "__main__":
|
| 46 |
+
main()
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
validators.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic validators and hidden checks for CyberSecurity_OWASP."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from .models import CyberSecurityOWASPAction, CyberSecurityOWASPState
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Substrings that make a path (or search query) off-limits; matched with ``in``.
BLOCKED_PATH_MARKERS = (
    "hidden",
    "oracle",
    "reward_engine",
    "validators.py",
    "rewards.py",
    ".git",
    "..",  # parent-directory traversal
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _workspace(state: CyberSecurityOWASPState) -> Path:
|
| 23 |
+
return Path(str(state.hidden_facts["workspace"]))
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _route_source(state: CyberSecurityOWASPState) -> str:
    """Read ``app/routes/invoices.py`` from the episode workspace as UTF-8 text."""
    route_file = _workspace(state) / "app/routes/invoices.py"
    return route_file.read_text(encoding="utf-8")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def is_path_allowed(state: CyberSecurityOWASPState, path: str, *, write: bool = False) -> tuple[bool, str]:
    """Decide whether the agent may access ``path``.

    Returns:
        ``(True, normalized_path)`` when access is allowed, otherwise
        ``(False, reason)``. The normalized path uses forward slashes and has
        leading slashes removed.
    """
    normalized = path.replace("\\", "/").lstrip("/")

    for marker in BLOCKED_PATH_MARKERS:
        if marker in normalized:
            return False, "blocked path"

    # Checked on the raw path, since normalization strips leading slashes.
    if Path(path).is_absolute():
        return False, "absolute host paths are blocked"

    # Only files listed in ``editable_files`` are reachable at all.
    allowed_files = set(state.hidden_facts.get("editable_files", []))
    if normalized not in allowed_files:
        return False, "path is outside editable generated workspace"

    if write and normalized.startswith("tests/"):
        return False, "visible tests are readable but not patchable"

    return True, normalized
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def detect_cheating(state: CyberSecurityOWASPState, action: CyberSecurityOWASPAction) -> list[str]:
    """Return the sorted, de-duplicated anti-cheat flags raised by ``action``.

    Flags:
        ``hardcoded_hidden_identifier``: a ``patch_file`` action whose
            arguments contain one of the hidden user / invoice / tenant ids.
        ``blocked_file_probe``: a ``read_file`` / ``search_code`` /
            ``patch_file`` action whose ``path`` or ``query`` contains a
            blocked marker.
        ``external_network_attempt``: a ``send_local_request`` action whose
            ``path`` is an ``http://`` or ``https://`` URL.
    """
    flags: set[str] = set()
    args = action.arguments or {}

    if action.tool_name == "patch_file":
        text = " ".join(str(v) for v in args.values())
        for key in (
            "owner_user_id",
            "intruder_user_id",
            "admin_user_id",
            "owner_invoice_id",
            "other_invoice_id",
            "foreign_invoice_id",
            "tenant_a",
            "tenant_b",
        ):
            value = str(state.hidden_facts.get(key, ""))
            # Empty values are skipped: "" is a substring of every string.
            if value and value in text:
                flags.add("hardcoded_hidden_identifier")
                break

    if action.tool_name in {"read_file", "search_code", "patch_file"}:
        path_text = str(args.get("path", "")) + " " + str(args.get("query", ""))
        if any(marker in path_text.replace("\\", "/") for marker in BLOCKED_PATH_MARKERS):
            flags.add("blocked_file_probe")

    if action.tool_name == "send_local_request":
        # URL schemes are case-insensitive, and surrounding whitespace must not
        # hide the scheme, so normalize before matching.
        target = str(args.get("path", "")).strip().lower()
        if target.startswith(("http://", "https://")):
            flags.add("external_network_attempt")

    return sorted(flags)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _source_is_secure(source: str) -> bool:
|
| 74 |
+
has_tenant_check = "tenant_id" in source and "actor" in source and "!=" in source
|
| 75 |
+
has_owner = "owner_user_id" in source and "actor" in source
|
| 76 |
+
has_admin = "is_billing_admin" in source
|
| 77 |
+
has_forbidden = '"status": 403' in source or "'status': 403" in source
|
| 78 |
+
return has_tenant_check and has_owner and has_admin and has_forbidden
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _source_is_deny_all(source: str) -> bool:
|
| 82 |
+
early_unconditional = (
|
| 83 |
+
" return {\"status\": 403" in source
|
| 84 |
+
and " return {\"status\": 403" in source.split("if invoice[", 1)[0]
|
| 85 |
+
)
|
| 86 |
+
missing_positive_policy = "owner_user_id" not in source or "is_billing_admin" not in source
|
| 87 |
+
return early_unconditional or (
|
| 88 |
+
("return {\"status\": 403" in source or "return {'status': 403" in source)
|
| 89 |
+
and missing_positive_policy
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _source_has_hardcoding(state: CyberSecurityOWASPState, source: str) -> bool:
|
| 94 |
+
values = [
|
| 95 |
+
str(state.hidden_facts.get(key, ""))
|
| 96 |
+
for key in (
|
| 97 |
+
"owner_user_id",
|
| 98 |
+
"intruder_user_id",
|
| 99 |
+
"admin_user_id",
|
| 100 |
+
"owner_invoice_id",
|
| 101 |
+
"other_invoice_id",
|
| 102 |
+
"foreign_invoice_id",
|
| 103 |
+
"tenant_a",
|
| 104 |
+
"tenant_b",
|
| 105 |
+
)
|
| 106 |
+
]
|
| 107 |
+
return any(value and value in source for value in values)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def simulate_request(
    state: CyberSecurityOWASPState, method: str, path: str, user_id: str | None = None
) -> dict[str, Any]:
    """Simulate a request against the workspace route file.

    The route source is not executed; the response is derived from substring
    heuristics on that source plus the users and invoices in
    ``state.hidden_facts``.

    Returns:
        A dict with ``status`` and ``body``: 405 for non-GET methods, 200/404
        for ``/health``, 404 for unknown routes or invoices, 401 for unknown
        users, and 200/403 for invoice reads.
    """
    if method.upper() != "GET":
        return {"status": 405, "body": {"detail": "method not allowed"}}

    source = _route_source(state)

    if path == "/health":
        health_defined = "def health" in source and "status\": 200" in source
        if health_defined:
            return {"status": 200, "body": {"status": "ok"}}
        return {"status": 404, "body": {"detail": "health unavailable"}}

    if not path.startswith("/invoices/"):
        return {"status": 404, "body": {"detail": "route not found"}}

    # The invoice id is the final path segment.
    invoice_id = path.rsplit("/", 1)[-1]
    users = state.hidden_facts["users"]
    invoices = state.hidden_facts["invoices"]
    actor = users.get(user_id or "")
    invoice = invoices.get(invoice_id)

    if actor is None:
        return {"status": 401, "body": {"detail": "unknown user"}}
    if invoice is None:
        return {"status": 404, "body": {"detail": "invoice not found"}}

    if _source_is_deny_all(source):
        return {"status": 403, "body": {"detail": "forbidden"}}

    if not _source_is_secure(source):
        # No recognizable authorization check: every known user can read.
        return {"status": 200, "body": invoice}

    in_same_tenant = invoice["tenant_id"] == actor["tenant_id"]
    owner_or_admin = invoice["owner_user_id"] == actor["user_id"] or "billing_admin" in actor["roles"]
    if in_same_tenant and owner_or_admin:
        return {"status": 200, "body": invoice}
    return {"status": 403, "body": {"detail": "forbidden"}}
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def verify_finding(state: CyberSecurityOWASPState, finding: dict) -> dict:
    """Score a submitted finding against the hidden scenario facts.

    All values of ``finding`` are lower-cased and joined into one text.

    Returns:
        ``{"valid": bool, "tied_to_policy": bool}``. ``valid`` requires an
        authorization-related term plus a mention of at least one known
        invoice id and one known user id. ``tied_to_policy`` requires a
        policy-related term.
    """
    text = " ".join(str(v).lower() for v in finding.values())
    hidden = state.hidden_facts

    def _mentions_any(keys: tuple[str, ...]) -> bool:
        for key in keys:
            fact = hidden.get(key)
            # A missing or empty fact must never match: "" is a substring of
            # every text and would make any finding look grounded.
            if fact is None or fact == "":
                continue
            if str(fact).lower() in text:
                return True
        return False

    mentions_known_invoice = _mentions_any(
        ("owner_invoice_id", "other_invoice_id", "foreign_invoice_id")
    )
    mentions_known_user = _mentions_any(
        ("owner_user_id", "intruder_user_id", "admin_user_id")
    )
    valid = (
        any(term in text for term in ("idor", "bola", "authorization", "owner", "tenant"))
        and mentions_known_invoice
        and mentions_known_user
    )
    tied_to_policy = any(term in text for term in ("policy", "owner", "tenant", "billing_admin"))
    return {"valid": valid, "tied_to_policy": tied_to_policy}
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def run_visible_tests(state: CyberSecurityOWASPState) -> dict:
    """Run the visible checks: public health route, owner read and admin read.

    Returns:
        ``{"passed": bool, "checks": {name: bool}}`` where ``passed`` is true
        only if every simulated request returned status 200.
    """
    facts = state.hidden_facts

    health = simulate_request(state, "GET", "/health")
    owner_read = simulate_request(
        state, "GET", f"/invoices/{facts['owner_invoice_id']}", facts["owner_user_id"]
    )
    admin_read = simulate_request(
        state, "GET", f"/invoices/{facts['other_invoice_id']}", facts["admin_user_id"]
    )

    checks = {
        "health_public": health["status"] == 200,
        "owner_can_read": owner_read["status"] == 200,
        "admin_can_read": admin_read["status"] == 200,
    }
    return {"passed": all(checks.values()), "checks": checks}
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def run_hidden_security_tests(state: CyberSecurityOWASPState) -> dict:
    """Check that unauthorized invoice reads are rejected with status 403.

    Two requests are simulated: the owner user reading the other invoice, and
    the admin user reading the foreign invoice.

    Returns:
        ``{"passed": bool, "checks": {name: bool}}``.
    """
    facts = state.hidden_facts

    same_tenant_probe = simulate_request(
        state, "GET", f"/invoices/{facts['other_invoice_id']}", facts["owner_user_id"]
    )
    cross_tenant_probe = simulate_request(
        state, "GET", f"/invoices/{facts['foreign_invoice_id']}", facts["admin_user_id"]
    )

    checks = {
        "same_tenant_other_user_blocked": same_tenant_probe["status"] == 403,
        "cross_tenant_blocked": cross_tenant_probe["status"] == 403,
    }
    return {"passed": all(checks.values()), "checks": checks}
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def run_hidden_regression_tests(state: CyberSecurityOWASPState) -> dict:
    """Check that legitimate reads still return status 200 after patching.

    Two requests are simulated: the owner user reading the owner invoice, and
    the admin user reading the other invoice.

    Returns:
        ``{"passed": bool, "checks": {name: bool}}``.
    """
    facts = state.hidden_facts

    owner_read = simulate_request(
        state, "GET", f"/invoices/{facts['owner_invoice_id']}", facts["owner_user_id"]
    )
    admin_read = simulate_request(
        state, "GET", f"/invoices/{facts['other_invoice_id']}", facts["admin_user_id"]
    )

    checks = {
        "owner_preserved": owner_read["status"] == 200,
        "admin_preserved": admin_read["status"] == 200,
    }
    return {"passed": all(checks.values()), "checks": checks}
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def run_public_route_tests(state: CyberSecurityOWASPState) -> dict:
    """Check that the ``/health`` route still answers with status 200.

    Returns:
        ``{"passed": bool, "checks": {"health_public": bool}}``.
    """
    health = simulate_request(state, "GET", "/health")
    checks = {"health_public": health["status"] == 200}
    return {"passed": all(checks.values()), "checks": checks}
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def patch_quality(state: CyberSecurityOWASPState) -> dict:
    """Summarize the source-level quality heuristics for the patched route file.

    Returns:
        A dict with ``secure_pattern``, ``hardcoded`` and ``deny_all`` flags,
        plus ``passed``, which is true only when the secure pattern is present
        and neither hard-coding nor deny-all was detected.
    """
    source = _route_source(state)
    has_secure_pattern = _source_is_secure(source)
    has_hardcoding = _source_has_hardcoding(state, source)
    is_deny_all = _source_is_deny_all(source)

    passed = has_secure_pattern and not has_hardcoding and not is_deny_all
    return {
        "passed": passed,
        "secure_pattern": has_secure_pattern,
        "hardcoded": has_hardcoding,
        "deny_all": is_deny_all,
    }
|