Upload 17 files
Browse files- Dockerfile +30 -0
- README.md +115 -5
- __init__.py +4 -0
- client.py +45 -0
- grader.py +254 -0
- inference.py +234 -0
- init.py +4 -0
- models.py +38 -0
- openenv.yaml +6 -0
- pyproject.toml +26 -0
- requirements.txt +6 -0
- server/__init__.py +1 -0
- server/app.py +47 -0
- server/environment.py +281 -0
- server/init.py +3 -0
- task_validation.py +127 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim AS builder
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 6 |
+
PYTHONUNBUFFERED=1
|
| 7 |
+
|
| 8 |
+
COPY requirements.txt ./
|
| 9 |
+
|
| 10 |
+
RUN python -m pip install --no-cache-dir --upgrade pip
|
| 11 |
+
RUN python -m venv /app/.venv
|
| 12 |
+
RUN /app/.venv/bin/pip install --no-cache-dir -r requirements.txt
|
| 13 |
+
|
| 14 |
+
FROM python:3.11-slim AS runtime
|
| 15 |
+
|
| 16 |
+
WORKDIR /app
|
| 17 |
+
|
| 18 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 19 |
+
PYTHONUNBUFFERED=1 \
|
| 20 |
+
PATH="/app/.venv/bin:${PATH}"
|
| 21 |
+
|
| 22 |
+
COPY --from=builder /app/.venv /app/.venv
|
| 23 |
+
COPY . /app
|
| 24 |
+
|
| 25 |
+
EXPOSE 8000
|
| 26 |
+
|
| 27 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 \
|
| 28 |
+
CMD python -c "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://127.0.0.1:8000/health', timeout=3).status==200 else 1)"
|
| 29 |
+
|
| 30 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -1,11 +1,121 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: yellow
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
-
short_description: model
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: redteampentestlab
|
| 3 |
+
emoji: "🛡️"
|
| 4 |
+
colorFrom: red
|
| 5 |
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
pinned: false
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# redteampentestlab
|
| 12 |
+
|
| 13 |
+
redteampentestlab is an OpenEnv-compatible reinforcement learning environment for automated penetration testing simulation. The agent must solve realistic pentest chains by executing actions in the correct order and collecting CTF-style flags.
|
| 14 |
+
|
| 15 |
+
## Environment Description
|
| 16 |
+
|
| 17 |
+
The environment exposes a FastAPI server through OpenEnv and simulates three pentesting missions:
|
| 18 |
+
|
| 19 |
+
1. Easy: Web Application Recon
|
| 20 |
+
2. Medium: SQLi to RCE
|
| 21 |
+
3. Hard: APT Multi-Stage Compromise
|
| 22 |
+
|
| 23 |
+
Each mission has:
|
| 24 |
+
|
| 25 |
+
- A target host or network
|
| 26 |
+
- A required ordered action chain
|
| 27 |
+
- Step-level rewards for partial progress
|
| 28 |
+
- A completion reward and a hidden flag
|
| 29 |
+
|
| 30 |
+
The reward design is shaped for RL training signals and remains strictly between 0 and 1.
|
| 31 |
+
|
| 32 |
+
## Action Space
|
| 33 |
+
|
| 34 |
+
The action model accepts one of the following values:
|
| 35 |
+
|
| 36 |
+
- scan
|
| 37 |
+
- enumerate
|
| 38 |
+
- exploit
|
| 39 |
+
- escalate
|
| 40 |
+
- c2
|
| 41 |
+
- cleanup
|
| 42 |
+
|
| 43 |
+
## Observation Space
|
| 44 |
+
|
| 45 |
+
Each step returns an observation with:
|
| 46 |
+
|
| 47 |
+
- target_ip: current host or subnet under assessment
|
| 48 |
+
- current_state: BRIEFING, IN_PROGRESS, SUCCESS, INVALID, ORDER_VIOLATION, or REPEAT
|
| 49 |
+
- output: realistic pentest tool-style output for the executed action
|
| 50 |
+
- difficulty: easy, medium, or hard
|
| 51 |
+
- reward: scalar reward signal (strictly 0 < reward < 1)
|
| 52 |
+
- done: episode termination flag
|
| 53 |
+
|
| 54 |
+
## State Space
|
| 55 |
+
|
| 56 |
+
Environment state includes:
|
| 57 |
+
|
| 58 |
+
- episode: episode counter
|
| 59 |
+
- task: active task name
|
| 60 |
+
- progress: normalized task completion value between 0.0 and 1.0
|
| 61 |
+
|
| 62 |
+
## Setup Instructions
|
| 63 |
+
|
| 64 |
+
### Option A: pip
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
pip install -r requirements.txt
|
| 68 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
### Option B: uv
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
uv sync
|
| 75 |
+
uv run uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
### Validate OpenEnv
|
| 79 |
+
|
| 80 |
+
```bash
|
| 81 |
+
openenv validate
|
| 82 |
+
openenv validate --url http://localhost:8000 --json --verbose
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
### Validate Decimal Bounds
|
| 86 |
+
|
| 87 |
+
```bash
|
| 88 |
+
python task_validation.py
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
## Inference and Grading
|
| 92 |
+
|
| 93 |
+
Run baseline inference:
|
| 94 |
+
|
| 95 |
+
```bash
|
| 96 |
+
python inference.py
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
Run grader:
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
python inference.py > out.txt && python grader.py out.txt
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
Inference also writes a structured pentest report to pentest_report.json.
|
| 106 |
+
|
| 107 |
+
## Environment Variables
|
| 108 |
+
|
| 109 |
+
- API_BASE_URL (default: https://api.openai.com/v1) - API endpoint for the LLM
|
| 110 |
+
- MODEL_NAME (default: o3-mini) - Model identifier used for inference (OpenAI o3-mini)
|
| 111 |
+
- OPENAI_API_KEY (required) - OpenAI API key; if not set, falls back to HF_TOKEN
|
| 112 |
+
- HF_TOKEN (required if OPENAI_API_KEY not set) - Alternative API key environment variable
|
| 113 |
+
|
| 114 |
+
**Note:** At least one of OPENAI_API_KEY or HF_TOKEN must be set, or the inference will fail at startup.
|
| 115 |
+
|
| 116 |
+
## Docker
|
| 117 |
+
|
| 118 |
+
```bash
|
| 119 |
+
docker build -t redteampentestlab .
|
| 120 |
+
docker run -p 8000:8000 redteampentestlab
|
| 121 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .client import RedteampentestlabEnv
|
| 2 |
+
from .models import RedTeamAction, RedTeamObservation
|
| 3 |
+
|
| 4 |
+
__all__ = ["RedteampentestlabEnv", "RedTeamAction", "RedTeamObservation"]
|
client.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict
|
| 2 |
+
|
| 3 |
+
try:
|
| 4 |
+
from openenv.core import EnvClient
|
| 5 |
+
except Exception:
|
| 6 |
+
try:
|
| 7 |
+
from openenv.core.client import EnvClient
|
| 8 |
+
except Exception:
|
| 9 |
+
class EnvClient: # type: ignore[no-redef]
|
| 10 |
+
def __class_getitem__(cls, _item):
|
| 11 |
+
return cls
|
| 12 |
+
|
| 13 |
+
def __init__(self, *args, **kwargs):
|
| 14 |
+
self.base_url = kwargs.get("base_url")
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
from openenv.core.env_server import State
|
| 18 |
+
except Exception:
|
| 19 |
+
from pydantic import BaseModel as State
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
from models import RedTeamAction, RedTeamObservation, RedTeamState
|
| 23 |
+
except Exception:
|
| 24 |
+
from .models import RedTeamAction, RedTeamObservation, RedTeamState
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class RedteampentestlabEnv(EnvClient[RedTeamAction, RedTeamObservation, State]):
|
| 28 |
+
env_name = "redteampentestlab"
|
| 29 |
+
action_type = RedTeamAction
|
| 30 |
+
observation_type = RedTeamObservation
|
| 31 |
+
|
| 32 |
+
def _step_payload(self, action: RedTeamAction) -> Dict[str, Any]:
|
| 33 |
+
if hasattr(action, "model_dump"):
|
| 34 |
+
return action.model_dump()
|
| 35 |
+
return {"action": getattr(action, "action", str(action))}
|
| 36 |
+
|
| 37 |
+
def _parse_result(self, result: Dict[str, Any]) -> RedTeamObservation:
|
| 38 |
+
if hasattr(RedTeamObservation, "model_validate"):
|
| 39 |
+
return RedTeamObservation.model_validate(result)
|
| 40 |
+
return RedTeamObservation(**result)
|
| 41 |
+
|
| 42 |
+
def _parse_state(self, state_payload: Dict[str, Any]) -> State:
|
| 43 |
+
if hasattr(RedTeamState, "model_validate"):
|
| 44 |
+
return RedTeamState.model_validate(state_payload)
|
| 45 |
+
return RedTeamState(**state_payload)
|
grader.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Grader for RedTeam PentestLab - scores STRICTLY inside (0, 1) exclusive."""
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import re
|
| 5 |
+
import sys
|
| 6 |
+
from typing import Dict, List, Tuple
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
SCORE_FLOOR = 0.10
|
| 10 |
+
SCORE_CEIL = 0.90
|
| 11 |
+
TASK_IDS = ["alpha", "bravo", "charlie"]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def strict_clamp(score: float) -> float:
|
| 15 |
+
"""
|
| 16 |
+
Clamp score to STRICTLY inside (0, 1).
|
| 17 |
+
|
| 18 |
+
This is the ONLY function that sets score bounds.
|
| 19 |
+
Every score - per-step, per-task, overall - passes through here.
|
| 20 |
+
Uses wide margins (0.10 to 0.90) to survive float rounding in any context.
|
| 21 |
+
Never asserts. Never raises. Always returns a valid float.
|
| 22 |
+
"""
|
| 23 |
+
try:
|
| 24 |
+
s = float(score)
|
| 25 |
+
except (TypeError, ValueError):
|
| 26 |
+
return SCORE_FLOOR
|
| 27 |
+
|
| 28 |
+
if s != s:
|
| 29 |
+
return SCORE_FLOOR
|
| 30 |
+
if s == float("inf"):
|
| 31 |
+
return SCORE_CEIL
|
| 32 |
+
if s == float("-inf"):
|
| 33 |
+
return SCORE_FLOOR
|
| 34 |
+
|
| 35 |
+
s = max(SCORE_FLOOR, min(SCORE_CEIL, s))
|
| 36 |
+
|
| 37 |
+
if s <= 0:
|
| 38 |
+
return SCORE_FLOOR
|
| 39 |
+
if s >= 1:
|
| 40 |
+
return SCORE_CEIL
|
| 41 |
+
|
| 42 |
+
s = round(s, 4)
|
| 43 |
+
|
| 44 |
+
if s <= 0:
|
| 45 |
+
return SCORE_FLOOR
|
| 46 |
+
if s >= 1:
|
| 47 |
+
return SCORE_CEIL
|
| 48 |
+
|
| 49 |
+
return s
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def parse_inference_output(output: str) -> List[Dict]:
|
| 53 |
+
"""Parse inference.py stdout into one record per [START]..[END] block."""
|
| 54 |
+
tasks: List[Dict] = []
|
| 55 |
+
current: Dict = {}
|
| 56 |
+
active = False
|
| 57 |
+
|
| 58 |
+
for raw_line in output.splitlines():
|
| 59 |
+
line = raw_line.strip()
|
| 60 |
+
|
| 61 |
+
if line.startswith("[START]"):
|
| 62 |
+
m = re.search(r"task=(\S+)\s+env=(\S+)\s+model=(\S+)", line)
|
| 63 |
+
if m:
|
| 64 |
+
current = {
|
| 65 |
+
"task": m.group(1),
|
| 66 |
+
"env": m.group(2),
|
| 67 |
+
"model": m.group(3),
|
| 68 |
+
"success": False,
|
| 69 |
+
"steps": 0,
|
| 70 |
+
"rewards": [],
|
| 71 |
+
"step_details": [],
|
| 72 |
+
}
|
| 73 |
+
active = True
|
| 74 |
+
|
| 75 |
+
elif line.startswith("[STEP]") and active:
|
| 76 |
+
m = re.search(
|
| 77 |
+
r"step=(\S+)\s+action=(\w+)\s+reward=([\d.eE+-]+)\s+done=(\w+)\s+error=(\S+)",
|
| 78 |
+
line,
|
| 79 |
+
)
|
| 80 |
+
if m:
|
| 81 |
+
try:
|
| 82 |
+
rew = float(m.group(3))
|
| 83 |
+
except ValueError:
|
| 84 |
+
rew = 0.10
|
| 85 |
+
current["step_details"].append(
|
| 86 |
+
{
|
| 87 |
+
"step": m.group(1),
|
| 88 |
+
"action": m.group(2),
|
| 89 |
+
"reward": rew,
|
| 90 |
+
"done": m.group(4).lower() == "true",
|
| 91 |
+
"error": None if m.group(5).lower() == "null" else m.group(5),
|
| 92 |
+
}
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
elif line.startswith("[END]") and active:
|
| 96 |
+
m = re.search(r"success=(\w+)(?:\s+steps=\d+)?\s+rewards=([\d.,\s.eE+-]*)", line)
|
| 97 |
+
if m:
|
| 98 |
+
current["success"] = m.group(1).lower() == "true"
|
| 99 |
+
raw_rewards = m.group(2) or ""
|
| 100 |
+
parsed_rewards: List[float] = []
|
| 101 |
+
for tok in raw_rewards.split(","):
|
| 102 |
+
tok = tok.strip()
|
| 103 |
+
if not tok:
|
| 104 |
+
continue
|
| 105 |
+
try:
|
| 106 |
+
parsed_rewards.append(float(tok))
|
| 107 |
+
except ValueError:
|
| 108 |
+
continue
|
| 109 |
+
current["rewards"] = parsed_rewards
|
| 110 |
+
current["steps"] = len(parsed_rewards)
|
| 111 |
+
tasks.append(current)
|
| 112 |
+
current = {}
|
| 113 |
+
active = False
|
| 114 |
+
|
| 115 |
+
return tasks
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def make_fallback_task(task_id: str) -> Dict:
|
| 119 |
+
return {
|
| 120 |
+
"task": task_id,
|
| 121 |
+
"env": "redteam_pentest",
|
| 122 |
+
"model": "unknown",
|
| 123 |
+
"success": False,
|
| 124 |
+
"steps": 0,
|
| 125 |
+
"rewards": [],
|
| 126 |
+
"step_details": [],
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def grade_task(data: Dict) -> Tuple[float, Dict]:
|
| 131 |
+
"""
|
| 132 |
+
Grade one task. Returns (score, details) where score is strictly in (0, 1).
|
| 133 |
+
|
| 134 |
+
Scoring breakdown (designed so theoretical max < 0.90, min > 0.10):
|
| 135 |
+
Base: 0.35 (success) or 0.15 (failure)
|
| 136 |
+
Reward bonus: up to 0.30 (scaled to max_possible=0.80)
|
| 137 |
+
Chain penalty: up to -0.09 (0.03 per negative-reward step, max 3)
|
| 138 |
+
Max possible: 0.65
|
| 139 |
+
Min possible: 0.06 before strict clamp
|
| 140 |
+
"""
|
| 141 |
+
success = bool(data.get("success", False))
|
| 142 |
+
rewards = data.get("rewards", []) or []
|
| 143 |
+
step_details = data.get("step_details", []) or []
|
| 144 |
+
|
| 145 |
+
score = 0.35 if success else 0.15
|
| 146 |
+
|
| 147 |
+
total_reward = sum(max(0, r) for r in rewards)
|
| 148 |
+
reward_bonus = min((total_reward / 0.80) * 0.30, 0.30) if total_reward > 0 else 0
|
| 149 |
+
score += reward_bonus
|
| 150 |
+
|
| 151 |
+
violations = sum(1 for s in step_details if float(s.get("reward", 0)) < 0)
|
| 152 |
+
score -= min(violations * 0.03, 0.09)
|
| 153 |
+
|
| 154 |
+
score = strict_clamp(score)
|
| 155 |
+
details = {
|
| 156 |
+
"success": success,
|
| 157 |
+
"steps_taken": len(rewards),
|
| 158 |
+
"total_reward": round(sum(rewards), 4) if rewards else 0,
|
| 159 |
+
"violations": violations,
|
| 160 |
+
"final_score": score,
|
| 161 |
+
}
|
| 162 |
+
return score, details
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def _run() -> None:
|
| 166 |
+
output = ""
|
| 167 |
+
|
| 168 |
+
if len(sys.argv) >= 2:
|
| 169 |
+
output_file = sys.argv[1]
|
| 170 |
+
try:
|
| 171 |
+
with open(output_file, "r", encoding="utf-8") as f:
|
| 172 |
+
output = f.read()
|
| 173 |
+
except OSError as e:
|
| 174 |
+
print(f"WARNING: unable to read '{output_file}': {e}", file=sys.stderr)
|
| 175 |
+
output = ""
|
| 176 |
+
else:
|
| 177 |
+
try:
|
| 178 |
+
output = sys.stdin.read()
|
| 179 |
+
except Exception:
|
| 180 |
+
output = ""
|
| 181 |
+
|
| 182 |
+
try:
|
| 183 |
+
tasks = parse_inference_output(output)
|
| 184 |
+
except Exception as e:
|
| 185 |
+
print(f"WARNING: parse error ({e}); using fallback tasks", file=sys.stderr)
|
| 186 |
+
tasks = []
|
| 187 |
+
|
| 188 |
+
while len(tasks) < 3:
|
| 189 |
+
idx = len(tasks)
|
| 190 |
+
tid = TASK_IDS[idx] if idx < len(TASK_IDS) else f"task_{idx}"
|
| 191 |
+
tasks.append(make_fallback_task(tid))
|
| 192 |
+
|
| 193 |
+
graded: List[Tuple[Dict, float, Dict]] = []
|
| 194 |
+
for i, task_data in enumerate(tasks[:3]):
|
| 195 |
+
try:
|
| 196 |
+
score, details = grade_task(task_data)
|
| 197 |
+
except Exception as e:
|
| 198 |
+
print(f"WARNING: grading error on task {i}: {e}", file=sys.stderr)
|
| 199 |
+
score = SCORE_FLOOR
|
| 200 |
+
details = {"final_score": SCORE_FLOOR, "success": False}
|
| 201 |
+
|
| 202 |
+
score = strict_clamp(score)
|
| 203 |
+
if not (0 < score < 1):
|
| 204 |
+
print(f"WARNING: out-of-range score {score} on task {i}; forcing floor", file=sys.stderr)
|
| 205 |
+
score = SCORE_FLOOR
|
| 206 |
+
|
| 207 |
+
details["final_score"] = strict_clamp(score)
|
| 208 |
+
graded.append((task_data, strict_clamp(score), details))
|
| 209 |
+
|
| 210 |
+
overall = strict_clamp(sum(score for _, score, _ in graded) / 3.0)
|
| 211 |
+
|
| 212 |
+
for i, (_, score, _) in enumerate(graded):
|
| 213 |
+
tid = TASK_IDS[i] if i < len(TASK_IDS) else f"task_{i}"
|
| 214 |
+
out_score = strict_clamp(score)
|
| 215 |
+
print(f"TASK_SCORE:{tid}:{out_score}")
|
| 216 |
+
|
| 217 |
+
print(f"OVERALL_SCORE:{overall}")
|
| 218 |
+
|
| 219 |
+
json_tasks = []
|
| 220 |
+
for i, (_, score, _) in enumerate(graded):
|
| 221 |
+
tid = TASK_IDS[i] if i < len(TASK_IDS) else f"task_{i}"
|
| 222 |
+
json_tasks.append({"task_id": tid, "score": strict_clamp(score)})
|
| 223 |
+
|
| 224 |
+
payload = {
|
| 225 |
+
"overall_score": strict_clamp(overall),
|
| 226 |
+
"tasks": json_tasks,
|
| 227 |
+
}
|
| 228 |
+
print(f"JSON_OUTPUT:{json.dumps(payload)}")
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def main() -> None:
|
| 232 |
+
try:
|
| 233 |
+
_run()
|
| 234 |
+
except Exception as e:
|
| 235 |
+
print(f"WARNING: unhandled grader exception: {e}", file=sys.stderr)
|
| 236 |
+
fallback_payload = {
|
| 237 |
+
"overall_score": SCORE_FLOOR,
|
| 238 |
+
"tasks": [
|
| 239 |
+
{"task_id": "alpha", "score": SCORE_FLOOR},
|
| 240 |
+
{"task_id": "bravo", "score": SCORE_FLOOR},
|
| 241 |
+
{"task_id": "charlie", "score": SCORE_FLOOR},
|
| 242 |
+
],
|
| 243 |
+
}
|
| 244 |
+
print("TASK_SCORE:alpha:0.1")
|
| 245 |
+
print("TASK_SCORE:bravo:0.1")
|
| 246 |
+
print("TASK_SCORE:charlie:0.1")
|
| 247 |
+
print("OVERALL_SCORE:0.1")
|
| 248 |
+
print(f"JSON_OUTPUT:{json.dumps(fallback_payload)}")
|
| 249 |
+
finally:
|
| 250 |
+
sys.exit(0)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
if __name__ == "__main__":
|
| 254 |
+
main()
|
inference.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
import sys
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
+
from typing import Dict, List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
|
| 11 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
from server.environment import RedTeamPentestEnvironment
|
| 15 |
+
except Exception:
|
| 16 |
+
from .server.environment import RedTeamPentestEnvironment
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from models import RedTeamAction
|
| 20 |
+
except Exception:
|
| 21 |
+
from .models import RedTeamAction
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# LLM Configuration with OpenAI defaults
|
| 25 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
|
| 26 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "o3-mini")
|
| 27 |
+
|
| 28 |
+
# API Key: prioritize OPENAI_API_KEY, fallback to HF_TOKEN
|
| 29 |
+
API_KEY = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN")
|
| 30 |
+
|
| 31 |
+
if not API_KEY:
|
| 32 |
+
raise ValueError(
|
| 33 |
+
"API key is required. Set either OPENAI_API_KEY or HF_TOKEN environment variable."
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
BENCHMARK = "redteam_pentest"
|
| 37 |
+
TASK_TOKENS = ["task_1", "task_2", "task_3"]
|
| 38 |
+
|
| 39 |
+
TASKS: List[Dict[str, object]] = [
|
| 40 |
+
{"index": 0, "required_steps": ["scan", "enumerate", "exploit"]},
|
| 41 |
+
{"index": 1, "required_steps": ["scan", "enumerate", "exploit", "escalate"]},
|
| 42 |
+
{"index": 2, "required_steps": ["scan", "enumerate", "exploit", "escalate", "c2", "cleanup"]},
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _normalize_reward(value: object) -> float:
|
| 47 |
+
try:
|
| 48 |
+
reward = float(value)
|
| 49 |
+
except (TypeError, ValueError):
|
| 50 |
+
return 0.10
|
| 51 |
+
if reward != reward:
|
| 52 |
+
return 0.10
|
| 53 |
+
return max(0.10, min(0.90, reward))
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _normalize_error(error: Optional[str]) -> str:
|
| 57 |
+
if not error:
|
| 58 |
+
return "null"
|
| 59 |
+
return "_".join(str(error).strip().split()) or "null"
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def log_start(task_id: str, env_name: str, model_name: str) -> None:
|
| 63 |
+
print(f"[START] task={task_id} env={env_name} model={model_name}", flush=True)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def log_step(step_num: int, action: str, reward: float, done: bool, error: Optional[str] = None) -> None:
|
| 67 |
+
err = _normalize_error(error)
|
| 68 |
+
print(
|
| 69 |
+
f"[STEP] step={step_num} action={action} reward={_normalize_reward(reward):.2f} "
|
| 70 |
+
f"done={str(done).lower()} error={err}",
|
| 71 |
+
flush=True,
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def log_end(success: bool, rewards: List[float]) -> None:
|
| 76 |
+
safe_rewards = rewards if rewards else [0.10]
|
| 77 |
+
rewards_str = ",".join(f"{_normalize_reward(r):.2f}" for r in safe_rewards)
|
| 78 |
+
print(f"[END] success={str(success).lower()} steps={len(safe_rewards)} rewards={rewards_str}", flush=True)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
async def run_task(
|
| 82 |
+
client: Optional[OpenAI],
|
| 83 |
+
env: RedTeamPentestEnvironment,
|
| 84 |
+
task_meta: Dict[str, object],
|
| 85 |
+
global_step: int,
|
| 86 |
+
) -> Tuple[List[float], int, bool, Dict[str, object]]:
|
| 87 |
+
task_id = TASK_TOKENS[int(task_meta["index"])]
|
| 88 |
+
episode_id = f"episode-{task_id}"
|
| 89 |
+
log_start(task_id, BENCHMARK, MODEL_NAME)
|
| 90 |
+
|
| 91 |
+
task_rewards: List[float] = []
|
| 92 |
+
task_success = False
|
| 93 |
+
actions_taken: List[str] = []
|
| 94 |
+
states_seen: List[str] = []
|
| 95 |
+
flags_found: List[str] = []
|
| 96 |
+
|
| 97 |
+
try:
|
| 98 |
+
env.task_index = int(task_meta["index"])
|
| 99 |
+
env.reset(task_index=int(task_meta["index"]), episode_id=episode_id)
|
| 100 |
+
completed_steps: List[str] = []
|
| 101 |
+
required_steps = list(task_meta["required_steps"])
|
| 102 |
+
max_steps = len(required_steps) + 2
|
| 103 |
+
|
| 104 |
+
for _ in range(max_steps):
|
| 105 |
+
remaining = [a for a in required_steps if a not in completed_steps]
|
| 106 |
+
if not remaining:
|
| 107 |
+
task_success = True
|
| 108 |
+
break
|
| 109 |
+
|
| 110 |
+
action_str = remaining[0]
|
| 111 |
+
|
| 112 |
+
if client is not None:
|
| 113 |
+
try:
|
| 114 |
+
user_prompt = f"Next pentest phase from {remaining}. Reply with one word only."
|
| 115 |
+
client.chat.completions.create(
|
| 116 |
+
model=MODEL_NAME,
|
| 117 |
+
messages=[
|
| 118 |
+
{
|
| 119 |
+
"role": "system",
|
| 120 |
+
"content": "You are a penetration tester. Reply with one action word only.",
|
| 121 |
+
},
|
| 122 |
+
{"role": "user", "content": user_prompt},
|
| 123 |
+
],
|
| 124 |
+
temperature=0,
|
| 125 |
+
max_tokens=16,
|
| 126 |
+
timeout=8,
|
| 127 |
+
)
|
| 128 |
+
except Exception:
|
| 129 |
+
pass
|
| 130 |
+
obs = env.step(RedTeamAction(action=action_str), episode_id=episode_id)
|
| 131 |
+
|
| 132 |
+
reward = 0.10
|
| 133 |
+
try:
|
| 134 |
+
if getattr(obs, "reward", None) is not None:
|
| 135 |
+
reward = float(obs.reward)
|
| 136 |
+
reward = max(0.10, min(0.90, reward))
|
| 137 |
+
except (TypeError, ValueError):
|
| 138 |
+
reward = 0.10
|
| 139 |
+
|
| 140 |
+
done = bool(getattr(obs, "done", False))
|
| 141 |
+
current_state = str(getattr(obs, "current_state", ""))
|
| 142 |
+
output_text = str(getattr(obs, "output", ""))
|
| 143 |
+
|
| 144 |
+
for flag in re.findall(r"FLAG\{[^\}]+\}", output_text):
|
| 145 |
+
if flag not in flags_found:
|
| 146 |
+
flags_found.append(flag)
|
| 147 |
+
|
| 148 |
+
if current_state not in ("INVALID", "ORDER_VIOLATION", "REPEAT") and action_str not in completed_steps:
|
| 149 |
+
completed_steps.append(action_str)
|
| 150 |
+
actions_taken.append(action_str)
|
| 151 |
+
states_seen.append(current_state)
|
| 152 |
+
|
| 153 |
+
log_step(global_step, action_str, reward, done)
|
| 154 |
+
task_rewards.append(_normalize_reward(reward))
|
| 155 |
+
global_step += 1
|
| 156 |
+
|
| 157 |
+
if done:
|
| 158 |
+
task_success = True
|
| 159 |
+
break
|
| 160 |
+
|
| 161 |
+
except Exception as e:
|
| 162 |
+
print(f"# task error: {e}", flush=True)
|
| 163 |
+
|
| 164 |
+
log_end(task_success, task_rewards if task_rewards else [0.10])
|
| 165 |
+
task_report = {
|
| 166 |
+
"task_id": task_id,
|
| 167 |
+
"episode_id": episode_id,
|
| 168 |
+
"required_steps": required_steps if "required_steps" in locals() else [],
|
| 169 |
+
"actions_taken": actions_taken,
|
| 170 |
+
"states_seen": states_seen,
|
| 171 |
+
"rewards": task_rewards if task_rewards else [0.10],
|
| 172 |
+
"success": task_success,
|
| 173 |
+
"ctf_solved": len(flags_found) > 0,
|
| 174 |
+
"flags_found": flags_found,
|
| 175 |
+
}
|
| 176 |
+
return task_rewards if task_rewards else [0.10], global_step, task_success, task_report
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
async def main() -> None:
|
| 180 |
+
client: Optional[OpenAI]
|
| 181 |
+
try:
|
| 182 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY, timeout=30)
|
| 183 |
+
except Exception as e:
|
| 184 |
+
print(f"# Warning: Failed to initialize OpenAI client: {e}", flush=True)
|
| 185 |
+
client = None
|
| 186 |
+
|
| 187 |
+
env = RedTeamPentestEnvironment()
|
| 188 |
+
global_step = 1
|
| 189 |
+
report_tasks: List[Dict[str, object]] = []
|
| 190 |
+
|
| 191 |
+
for task_meta in TASKS:
|
| 192 |
+
try:
|
| 193 |
+
_, global_step, _, task_report = await run_task(client, env, task_meta, global_step)
|
| 194 |
+
report_tasks.append(task_report)
|
| 195 |
+
except Exception as e:
|
| 196 |
+
task_idx = int(task_meta.get("index", 0))
|
| 197 |
+
fallback_task_id = TASK_TOKENS[task_idx]
|
| 198 |
+
log_start(fallback_task_id, BENCHMARK, MODEL_NAME)
|
| 199 |
+
print(f"# task wrapper error: {e}", flush=True)
|
| 200 |
+
log_end(False, [0.10])
|
| 201 |
+
report_tasks.append(
|
| 202 |
+
{
|
| 203 |
+
"task_id": fallback_task_id,
|
| 204 |
+
"episode_id": f"episode-{fallback_task_id}",
|
| 205 |
+
"required_steps": list(task_meta.get("required_steps", [])),
|
| 206 |
+
"actions_taken": [],
|
| 207 |
+
"states_seen": [],
|
| 208 |
+
"rewards": [0.10],
|
| 209 |
+
"success": False,
|
| 210 |
+
"ctf_solved": False,
|
| 211 |
+
"flags_found": [],
|
| 212 |
+
}
|
| 213 |
+
)
|
| 214 |
+
|
| 215 |
+
summary = {
|
| 216 |
+
"environment": "redteampentestlab",
|
| 217 |
+
"benchmark": BENCHMARK,
|
| 218 |
+
"model": MODEL_NAME,
|
| 219 |
+
"generated_at": datetime.now(timezone.utc).isoformat(),
|
| 220 |
+
"tasks": report_tasks,
|
| 221 |
+
"overall": {
|
| 222 |
+
"tasks_total": len(report_tasks),
|
| 223 |
+
"tasks_success": sum(1 for t in report_tasks if t.get("success") is True),
|
| 224 |
+
"ctf_solved": sum(1 for t in report_tasks if t.get("ctf_solved") is True),
|
| 225 |
+
"total_reward": round(sum(sum(float(r) for r in t.get("rewards", [])) for t in report_tasks), 4),
|
| 226 |
+
},
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
with open("pentest_report.json", "w", encoding="utf-8") as f:
|
| 230 |
+
json.dump(summary, f, indent=2)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
if __name__ == "__main__":
|
| 234 |
+
asyncio.run(main())
|
init.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .client import RedteampentestlabEnv
|
| 2 |
+
from .models import RedTeamAction, RedTeamObservation
|
| 3 |
+
|
| 4 |
+
__all__ = ["RedteampentestlabEnv", "RedTeamAction", "RedTeamObservation"]
|
models.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Literal
|
| 2 |
+
|
| 3 |
+
from pydantic import Field
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
from openenv.core.env_server import Action, Observation, State
|
| 7 |
+
except Exception:
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
class Action(BaseModel):
|
| 11 |
+
pass
|
| 12 |
+
|
| 13 |
+
class Observation(BaseModel):
|
| 14 |
+
reward: float = 0.1
|
| 15 |
+
done: bool = False
|
| 16 |
+
|
| 17 |
+
class State(BaseModel):
|
| 18 |
+
pass
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class RedTeamAction(Action):
|
| 22 |
+
action: Literal["scan", "enumerate", "exploit", "escalate", "c2", "cleanup"]
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class RedTeamObservation(Observation):
|
| 26 |
+
target_ip: str = Field(description="Target host or network currently under assessment.")
|
| 27 |
+
current_state: str = Field(description="Current simulator state label, such as BRIEFING or SUCCESS.")
|
| 28 |
+
output: str = Field(description="Detailed command output and analysis text from the simulation step.")
|
| 29 |
+
difficulty: str = Field(description="Task difficulty level: easy, medium, or hard.")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class RedTeamState(State):
|
| 33 |
+
episode: int = Field(description="Current episode counter.")
|
| 34 |
+
task: str = Field(description="Current task name.")
|
| 35 |
+
progress: float = Field(description="Normalized completion progress from 0.0 to 1.0.")
|
| 36 |
+
|
| 37 |
+
def __call__(self) -> "RedTeamState":
|
| 38 |
+
return self
|
openenv.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: redteampentestlab
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
pyproject.toml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-redteampentestlab"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Automated penetration testing simulation environment for OpenEnv"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"openenv-core[core]>=0.2.2",
|
| 12 |
+
"fastapi>=0.100.0",
|
| 13 |
+
"uvicorn>=0.23.0",
|
| 14 |
+
"pydantic>=2.0.0",
|
| 15 |
+
"openai>=1.0.0",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
[project.optional-dependencies]
|
| 19 |
+
dev = ["pytest>=8.0.0", "pytest-cov>=4.0.0"]
|
| 20 |
+
|
| 21 |
+
[project.scripts]
|
| 22 |
+
server = "server.app:main"
|
| 23 |
+
|
| 24 |
+
[tool.setuptools.packages.find]
|
| 25 |
+
where = ["."]
|
| 26 |
+
include = ["*"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
fastapi>=0.100.0
|
| 3 |
+
uvicorn[standard]>=0.23.0
|
| 4 |
+
pydantic>=2.0.0
|
| 5 |
+
openai>=1.0.0
|
| 6 |
+
httpx>=0.24.0
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__all__ = []
|
server/app.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
try:
|
| 2 |
+
from openenv.core.env_server.http_server import create_app
|
| 3 |
+
except Exception as exc:
|
| 4 |
+
raise RuntimeError(f"Failed to import OpenEnv HTTP server integration: {exc}")
|
| 5 |
+
|
| 6 |
+
try:
|
| 7 |
+
from models import RedTeamAction, RedTeamObservation
|
| 8 |
+
except Exception:
|
| 9 |
+
from ..models import RedTeamAction, RedTeamObservation
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
from server.environment import RedTeamPentestEnvironment
|
| 13 |
+
except Exception:
|
| 14 |
+
from .environment import RedTeamPentestEnvironment
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
app = create_app(
|
| 18 |
+
RedTeamPentestEnvironment,
|
| 19 |
+
RedTeamAction,
|
| 20 |
+
RedTeamObservation,
|
| 21 |
+
env_name="redteampentestlab",
|
| 22 |
+
max_concurrent_envs=4,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@app.get("/")
|
| 27 |
+
def root():
|
| 28 |
+
return {
|
| 29 |
+
"status": "ok",
|
| 30 |
+
"service": "redteampentestlab",
|
| 31 |
+
"routes": ["/reset", "/step", "/state", "/health"],
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@app.get("/health")
|
| 36 |
+
def health():
|
| 37 |
+
return {"status": "healthy"}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
|
| 41 |
+
import uvicorn
|
| 42 |
+
|
| 43 |
+
uvicorn.run(app, host=host, port=port)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
if __name__ == "__main__":
|
| 47 |
+
main()
|
server/environment.py
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from threading import Lock
|
| 4 |
+
from typing import Any, Dict, Generic, List, Optional, TypeVar
|
| 5 |
+
|
| 6 |
+
try:
|
| 7 |
+
from openenv.core.env_server import Environment
|
| 8 |
+
except Exception:
|
| 9 |
+
A = TypeVar("A")
|
| 10 |
+
O = TypeVar("O")
|
| 11 |
+
S = TypeVar("S")
|
| 12 |
+
|
| 13 |
+
class Environment(Generic[A, O, S]):
|
| 14 |
+
pass
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
from models import RedTeamAction, RedTeamObservation, RedTeamState
|
| 18 |
+
except Exception:
|
| 19 |
+
from ..models import RedTeamAction, RedTeamObservation, RedTeamState
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
STEP_REWARDS = {
|
| 23 |
+
"easy": {"base": 0.12, "completion_bonus": 0.06},
|
| 24 |
+
"medium": {"base": 0.10, "completion_bonus": 0.05},
|
| 25 |
+
"hard": {"base": 0.07, "completion_bonus": 0.04},
|
| 26 |
+
}
|
| 27 |
+
CHAIN_BONUS = 0.01
|
| 28 |
+
PENALTY_WRONG_ORDER = -0.05
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def safe_reward(r: float) -> float:
|
| 32 |
+
"""
|
| 33 |
+
Return a reward value STRICTLY inside (0.0, 1.0).
|
| 34 |
+
|
| 35 |
+
This is called on EVERY reward value before it enters an observation.
|
| 36 |
+
No reward ever touches 0.0 or 1.0.
|
| 37 |
+
"""
|
| 38 |
+
try:
|
| 39 |
+
v = float(r)
|
| 40 |
+
except (TypeError, ValueError):
|
| 41 |
+
return 0.10
|
| 42 |
+
if v != v:
|
| 43 |
+
return 0.10
|
| 44 |
+
v = max(0.10, min(0.90, v))
|
| 45 |
+
if v <= 0:
|
| 46 |
+
return 0.10
|
| 47 |
+
if v >= 1:
|
| 48 |
+
return 0.90
|
| 49 |
+
return round(v, 3)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
TASKS: List[Dict[str, Any]] = [
|
| 53 |
+
{
|
| 54 |
+
"name": "Web Application Recon",
|
| 55 |
+
"difficulty": "easy",
|
| 56 |
+
"target_ip": "192.168.10.5",
|
| 57 |
+
"required_steps": ["scan", "enumerate", "exploit"],
|
| 58 |
+
"scenario": "Apache 2.4.29 with CVE-2021-41773 path traversal, exposed /admin panel",
|
| 59 |
+
"hidden_flag": "FLAG{w3b_app_0wn3d_192.168.10.5}",
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "SQLi to RCE",
|
| 63 |
+
"difficulty": "medium",
|
| 64 |
+
"target_ip": "10.10.20.15",
|
| 65 |
+
"required_steps": ["scan", "enumerate", "exploit", "escalate"],
|
| 66 |
+
"scenario": "Node.js app with unsanitized SQL inputs + weak sudo policy (python3 NOPASSWD)",
|
| 67 |
+
"hidden_flag": "FLAG{sql_t0_r00t_10.10.20.15}",
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"name": "APT Multi-Stage Compromise",
|
| 71 |
+
"difficulty": "hard",
|
| 72 |
+
"target_ip": "172.16.50.0/24",
|
| 73 |
+
"required_steps": ["scan", "enumerate", "exploit", "escalate", "c2", "cleanup"],
|
| 74 |
+
"scenario": "Corporate network, Log4Shell on DMZ, lateral movement via pass-the-hash to DC",
|
| 75 |
+
"hidden_flag": "FLAG{apt_full_chain_172.16.50.0}",
|
| 76 |
+
},
|
| 77 |
+
]
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
ACTION_OUTPUTS: Dict[int, Dict[str, str]] = {
|
| 81 |
+
0: {
|
| 82 |
+
"scan": "nmap -sV -p- 192.168.10.5\n22/tcp open ssh OpenSSH 7.6p1\n80/tcp open http Apache httpd 2.4.29\n443/tcp open ssl/http Apache 2.4.29\nFinding: legacy Apache build likely vulnerable to CVE-2021-41773.",
|
| 83 |
+
"enumerate": "gobuster dir -u http://192.168.10.5 -w /usr/share/seclists/Discovery/Web-Content/common.txt\n/admin (Status: 200)\n/cgi-bin/.%2e/%2e%2e/%2e%2e/etc/passwd (Status: 200)\nFinding: path traversal primitive confirmed with readable sensitive files.",
|
| 84 |
+
"exploit": "curl --path-as-is http://192.168.10.5/cgi-bin/.%2e/%2e%2e/%2e%2e/etc/passwd\nroot:x:0:0:root:/root:/bin/bash\nwww-data:x:33:33:www-data:/var/www:/usr/sbin/nologin\nPost-exploitation: web root credentials extracted from /var/www/html/.env.",
|
| 85 |
+
},
|
| 86 |
+
1: {
|
| 87 |
+
"scan": "nmap -sC -sV 10.10.20.15\n22/tcp open ssh OpenSSH 8.2p1\n3000/tcp open http Node.js Express\n3306/tcp open mysql MySQL 5.7\nFinding: exposed app and database service indicate attack surface for injection.",
|
| 88 |
+
"enumerate": "sqlmap -u 'http://10.10.20.15:3000/login?user=admin&pass=test' --batch\nParameter 'user' appears to be injectable\nDBMS: MySQL >= 5.0\nDatabase dump includes hash for app_admin and local service credentials.",
|
| 89 |
+
"exploit": "sqlmap --os-shell simulation\nwhoami\nnodeapp\ncat /etc/sudoers.d/nodeapp\nnodeapp ALL=(ALL) NOPASSWD:/usr/bin/python3\nExploit path confirmed for local privilege escalation.",
|
| 90 |
+
"escalate": "sudo /usr/bin/python3 -c 'import os; os.system(\"/bin/bash -p -c id\")'\nuid=0(root) gid=0(root) groups=0(root)\nRoot shell simulation established on 10.10.20.15.",
|
| 91 |
+
},
|
| 92 |
+
2: {
|
| 93 |
+
"scan": "nmap -sV 172.16.50.0/24\n172.16.50.10 dmz-web log4j app\n172.16.50.20 fileserver smb signing disabled\n172.16.50.30 dc01 ldap/kerberos\nFinding: DMZ host with vulnerable Java service and flat east-west reachability.",
|
| 94 |
+
"enumerate": "ldapsearch and bloodhound-style graphing simulation\nService account svc_backup reuses NTLM hash across hosts\nDomain trust map indicates privileged path to dc01 via fileserver pivot.",
|
| 95 |
+
"exploit": "log4shell payload sent to dmz-web\nRCE callback received on 172.16.50.10\nMeterpreter-like foothold created; harvested lsass material queued for relay.",
|
| 96 |
+
"escalate": "pass-the-hash simulation against fileserver and dc01\nwmiexec.py -hashes aad3b435b51404eeaad3b435b51404ee:5f4dcc3b5aa765d61d8327deb882cf99\nAdministrator context acquired on dc01.",
|
| 97 |
+
"c2": "Cobalt Strike team server beacon check-in\nBeacon ID 4242 active, jitter 35%, sleep 45s\nEncrypted C2 channel established with delegated token operations.",
|
| 98 |
+
"cleanup": "wevtutil cl Security && wevtutil cl System (simulated)\nTemporary payloads shredded, persistence keys removed, operator notes archived.\nBlue-team detection surface reduced for controlled exercise closure.",
|
| 99 |
+
},
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class RedTeamPentestEnvironment(Environment[RedTeamAction, RedTeamObservation, RedTeamState]):
|
| 104 |
+
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 105 |
+
_shared_lock = Lock()
|
| 106 |
+
_global_episode_counter: int = 0
|
| 107 |
+
_default_task_index: int = 0
|
| 108 |
+
_session_states: Dict[str, Dict[str, Any]] = {}
|
| 109 |
+
_default_session_key = "__default__"
|
| 110 |
+
|
| 111 |
+
def __init__(self) -> None:
|
| 112 |
+
with self._shared_lock:
|
| 113 |
+
self.task_index = int(self.__class__._default_task_index) % len(TASKS)
|
| 114 |
+
self.episode = int(self.__class__._global_episode_counter)
|
| 115 |
+
self.current_task = TASKS[self.task_index]
|
| 116 |
+
self.completed_steps = []
|
| 117 |
+
self.mistakes = 0
|
| 118 |
+
|
| 119 |
+
def _resolve_session_key(self, episode_id: Optional[str], kwargs: Dict[str, Any]) -> str:
|
| 120 |
+
raw_id = episode_id if episode_id is not None else kwargs.get("episode_id")
|
| 121 |
+
if raw_id is None:
|
| 122 |
+
return self.__class__._default_session_key
|
| 123 |
+
normalized = str(raw_id).strip()
|
| 124 |
+
return normalized if normalized else self.__class__._default_session_key
|
| 125 |
+
|
| 126 |
+
def _ensure_session(self, session_key: str) -> Dict[str, Any]:
|
| 127 |
+
session = self.__class__._session_states.get(session_key)
|
| 128 |
+
if session is None:
|
| 129 |
+
session = {
|
| 130 |
+
"task_index": int(self.__class__._default_task_index) % len(TASKS),
|
| 131 |
+
"episode": int(self.__class__._global_episode_counter),
|
| 132 |
+
"completed_steps": [],
|
| 133 |
+
"mistakes": 0,
|
| 134 |
+
}
|
| 135 |
+
self.__class__._session_states[session_key] = session
|
| 136 |
+
return session
|
| 137 |
+
|
| 138 |
+
def _hydrate_from_session(self, session: Dict[str, Any]) -> None:
|
| 139 |
+
self.task_index = int(session["task_index"]) % len(TASKS)
|
| 140 |
+
self.current_task = TASKS[self.task_index]
|
| 141 |
+
self.episode = int(session["episode"])
|
| 142 |
+
self.completed_steps = session["completed_steps"]
|
| 143 |
+
self.mistakes = int(session["mistakes"])
|
| 144 |
+
|
| 145 |
+
@property
|
| 146 |
+
def state(self) -> RedTeamState:
|
| 147 |
+
required = self.current_task["required_steps"]
|
| 148 |
+
raw_progress = len(self.completed_steps) / len(required) if required else 0.1
|
| 149 |
+
progress = max(0.1, min(0.9, raw_progress))
|
| 150 |
+
return RedTeamState(
|
| 151 |
+
episode=self.episode,
|
| 152 |
+
task=self.current_task["name"],
|
| 153 |
+
progress=round(progress, 3),
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
def _make_observation(self, current_state: str, output: str, reward: float, done: bool) -> RedTeamObservation:
|
| 157 |
+
return RedTeamObservation(
|
| 158 |
+
target_ip=self.current_task["target_ip"],
|
| 159 |
+
current_state=current_state,
|
| 160 |
+
output=output,
|
| 161 |
+
difficulty=self.current_task["difficulty"],
|
| 162 |
+
reward=safe_reward(reward),
|
| 163 |
+
done=done,
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
def reset(self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs: Any) -> RedTeamObservation:
|
| 167 |
+
with self._shared_lock:
|
| 168 |
+
session_key = self._resolve_session_key(episode_id, kwargs)
|
| 169 |
+
session = self._ensure_session(session_key)
|
| 170 |
+
|
| 171 |
+
if "task_index" in kwargs:
|
| 172 |
+
session["task_index"] = int(kwargs["task_index"]) % len(TASKS)
|
| 173 |
+
else:
|
| 174 |
+
session["task_index"] = int(session["task_index"]) % len(TASKS)
|
| 175 |
+
|
| 176 |
+
if session_key == self.__class__._default_session_key:
|
| 177 |
+
self.__class__._default_task_index = int(session["task_index"])
|
| 178 |
+
|
| 179 |
+
session["completed_steps"] = []
|
| 180 |
+
session["mistakes"] = 0
|
| 181 |
+
|
| 182 |
+
self.__class__._global_episode_counter += 1
|
| 183 |
+
session["episode"] = self.__class__._global_episode_counter
|
| 184 |
+
|
| 185 |
+
self._hydrate_from_session(session)
|
| 186 |
+
|
| 187 |
+
# Avoid unbounded growth from arbitrary client-provided session ids.
|
| 188 |
+
if len(self.__class__._session_states) > 2048:
|
| 189 |
+
keys = [k for k in self.__class__._session_states if k != self.__class__._default_session_key]
|
| 190 |
+
for key in keys[:512]:
|
| 191 |
+
self.__class__._session_states.pop(key, None)
|
| 192 |
+
|
| 193 |
+
briefing = (
|
| 194 |
+
f"Mission: {self.current_task['name']}\n"
|
| 195 |
+
f"Target: {self.current_task['target_ip']}\n"
|
| 196 |
+
f"Scenario: {self.current_task['scenario']}\n"
|
| 197 |
+
f"Required sequence: {' -> '.join(self.current_task['required_steps'])}\n"
|
| 198 |
+
"Objective: Execute each phase in order, collect evidence, and complete the chain."
|
| 199 |
+
)
|
| 200 |
+
return self._make_observation("BRIEFING", briefing, safe_reward(0.10), False)
|
| 201 |
+
|
| 202 |
+
def _valid_action_output(self, action_name: str, done: bool) -> str:
|
| 203 |
+
task_outputs = ACTION_OUTPUTS.get(self.task_index, {})
|
| 204 |
+
base = task_outputs.get(action_name, f"Executed {action_name} successfully.")
|
| 205 |
+
if done:
|
| 206 |
+
return f"{base}\nObjective complete. Capture: {self.current_task['hidden_flag']}"
|
| 207 |
+
return base
|
| 208 |
+
|
| 209 |
+
def step(self, action: RedTeamAction, **kwargs: Any) -> RedTeamObservation:
|
| 210 |
+
with self._shared_lock:
|
| 211 |
+
session_key = self._resolve_session_key(None, kwargs)
|
| 212 |
+
session = self._ensure_session(session_key)
|
| 213 |
+
self._hydrate_from_session(session)
|
| 214 |
+
|
| 215 |
+
if not getattr(self, "current_task", None):
|
| 216 |
+
return self.reset(**kwargs)
|
| 217 |
+
|
| 218 |
+
action_name = getattr(action, "action", None)
|
| 219 |
+
if action_name is None:
|
| 220 |
+
session["mistakes"] = int(session["mistakes"]) + 1
|
| 221 |
+
self._hydrate_from_session(session)
|
| 222 |
+
return self._make_observation(
|
| 223 |
+
"INVALID",
|
| 224 |
+
"Malformed action payload. Expected one of: scan, enumerate, exploit, escalate, c2, cleanup.",
|
| 225 |
+
safe_reward(0.10),
|
| 226 |
+
False,
|
| 227 |
+
)
|
| 228 |
+
|
| 229 |
+
required_steps = self.current_task["required_steps"]
|
| 230 |
+
|
| 231 |
+
if action_name not in required_steps:
|
| 232 |
+
session["mistakes"] = int(session["mistakes"]) + 1
|
| 233 |
+
self._hydrate_from_session(session)
|
| 234 |
+
return self._make_observation(
|
| 235 |
+
"INVALID",
|
| 236 |
+
f"Action '{action_name}' is not part of this mission plan. Follow: {' -> '.join(required_steps)}.",
|
| 237 |
+
safe_reward(0.10),
|
| 238 |
+
False,
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
if action_name in self.completed_steps:
|
| 242 |
+
return self._make_observation(
|
| 243 |
+
"REPEAT",
|
| 244 |
+
f"Action '{action_name}' was already completed. Continue with the next required phase.",
|
| 245 |
+
safe_reward(0.10),
|
| 246 |
+
False,
|
| 247 |
+
)
|
| 248 |
+
|
| 249 |
+
expected_action = required_steps[len(self.completed_steps)]
|
| 250 |
+
if action_name != expected_action:
|
| 251 |
+
session["mistakes"] = int(session["mistakes"]) + 1
|
| 252 |
+
self._hydrate_from_session(session)
|
| 253 |
+
return self._make_observation(
|
| 254 |
+
"ORDER_VIOLATION",
|
| 255 |
+
f"Out-of-order action. Expected '{expected_action}' but received '{action_name}'.",
|
| 256 |
+
safe_reward(PENALTY_WRONG_ORDER),
|
| 257 |
+
False,
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
session["completed_steps"].append(action_name)
|
| 261 |
+
self._hydrate_from_session(session)
|
| 262 |
+
difficulty = self.current_task["difficulty"]
|
| 263 |
+
base = STEP_REWARDS[difficulty]["base"]
|
| 264 |
+
|
| 265 |
+
# Chain bonus scales with progression when the chain is clean.
|
| 266 |
+
step_position = len(self.completed_steps)
|
| 267 |
+
reward = base + (CHAIN_BONUS * step_position if self.mistakes == 0 else 0)
|
| 268 |
+
|
| 269 |
+
done = len(self.completed_steps) == len(required_steps)
|
| 270 |
+
if done:
|
| 271 |
+
reward += STEP_REWARDS[difficulty]["completion_bonus"]
|
| 272 |
+
|
| 273 |
+
return self._make_observation(
|
| 274 |
+
"SUCCESS" if done else "IN_PROGRESS",
|
| 275 |
+
self._valid_action_output(action_name, done),
|
| 276 |
+
safe_reward(reward),
|
| 277 |
+
done,
|
| 278 |
+
)
|
| 279 |
+
|
| 280 |
+
def close(self) -> None:
|
| 281 |
+
return None
|
server/init.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .app import app
|
| 2 |
+
|
| 3 |
+
__all__ = ["app"]
|
task_validation.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import re
|
| 5 |
+
import sys
|
| 6 |
+
import tokenize
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from decimal import Decimal, InvalidOperation
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Iterator, List, Sequence
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
SOURCE_EXTENSIONS = {".py"}
|
| 14 |
+
TEXT_EXTENSIONS = {".json", ".yaml", ".yml", ".txt"}
|
| 15 |
+
SKIP_DIRS = {".git", ".venv", "venv", "__pycache__", ".mypy_cache", ".pytest_cache", ".ruff_cache"}
|
| 16 |
+
DECIMAL_PATTERN = re.compile(
|
| 17 |
+
r"(?<![\w.])[+-]?(?:\d+\.\d*|\.\d+|\d+(?:\.\d*)?[eE][+-]?\d+)(?![\w.])"
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass(frozen=True)
|
| 22 |
+
class Finding:
|
| 23 |
+
path: Path
|
| 24 |
+
line: int
|
| 25 |
+
token: str
|
| 26 |
+
value: str
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def is_decimal_token(token: str) -> bool:
|
| 30 |
+
return "." in token or "e" in token.lower()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def parse_decimal(token: str) -> Decimal | None:
|
| 34 |
+
try:
|
| 35 |
+
return Decimal(token)
|
| 36 |
+
except (InvalidOperation, ValueError):
|
| 37 |
+
return None
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def boundary_check(token: str) -> bool:
|
| 41 |
+
value = parse_decimal(token)
|
| 42 |
+
return value is not None and value in {Decimal(0), Decimal(1)}
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def scan_python_file(path: Path) -> List[Finding]:
|
| 46 |
+
findings: List[Finding] = []
|
| 47 |
+
try:
|
| 48 |
+
with tokenize.open(path) as handle:
|
| 49 |
+
tokens = tokenize.generate_tokens(handle.readline)
|
| 50 |
+
for tok_type, tok_str, start, _, _ in tokens:
|
| 51 |
+
if tok_type != tokenize.NUMBER:
|
| 52 |
+
continue
|
| 53 |
+
if not is_decimal_token(tok_str):
|
| 54 |
+
continue
|
| 55 |
+
if boundary_check(tok_str):
|
| 56 |
+
value = parse_decimal(tok_str)
|
| 57 |
+
findings.append(Finding(path=path, line=start[0], token=tok_str, value=str(value)))
|
| 58 |
+
except (OSError, SyntaxError, tokenize.TokenError) as exc:
|
| 59 |
+
findings.append(Finding(path=path, line=1, token="<parse-error>", value=str(exc)))
|
| 60 |
+
return findings
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def scan_text_file(path: Path) -> List[Finding]:
|
| 64 |
+
findings: List[Finding] = []
|
| 65 |
+
try:
|
| 66 |
+
text = path.read_text(encoding="utf-8")
|
| 67 |
+
except OSError as exc:
|
| 68 |
+
return [Finding(path=path, line=1, token="<read-error>", value=str(exc))]
|
| 69 |
+
|
| 70 |
+
for line_number, line in enumerate(text.splitlines(), start=1):
|
| 71 |
+
stripped = line.lstrip()
|
| 72 |
+
if path.suffix in {".yaml", ".yml"} and stripped.startswith("#"):
|
| 73 |
+
continue
|
| 74 |
+
for match in DECIMAL_PATTERN.finditer(line):
|
| 75 |
+
token = match.group(0)
|
| 76 |
+
if boundary_check(token):
|
| 77 |
+
value = parse_decimal(token)
|
| 78 |
+
findings.append(Finding(path=path, line=line_number, token=token, value=str(value)))
|
| 79 |
+
return findings
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def iter_target_files(root: Path) -> Iterator[Path]:
|
| 83 |
+
for path in root.rglob("*"):
|
| 84 |
+
if any(part in SKIP_DIRS for part in path.parts):
|
| 85 |
+
continue
|
| 86 |
+
if not path.is_file():
|
| 87 |
+
continue
|
| 88 |
+
if path.suffix in SOURCE_EXTENSIONS or path.suffix in TEXT_EXTENSIONS:
|
| 89 |
+
yield path
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def collect_findings(root: Path) -> List[Finding]:
|
| 93 |
+
findings: List[Finding] = []
|
| 94 |
+
for path in sorted(iter_target_files(root)):
|
| 95 |
+
if path.suffix in SOURCE_EXTENSIONS:
|
| 96 |
+
findings.extend(scan_python_file(path))
|
| 97 |
+
else:
|
| 98 |
+
findings.extend(scan_text_file(path))
|
| 99 |
+
return findings
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def format_findings(findings: Sequence[Finding], root: Path) -> str:
|
| 103 |
+
lines = []
|
| 104 |
+
for finding in findings:
|
| 105 |
+
lines.append(f"{finding.path.relative_to(root)}:{finding.line}: boundary decimal {finding.token} -> {finding.value}")
|
| 106 |
+
return "\n".join(lines)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def main(argv: Sequence[str] | None = None) -> int:
|
| 110 |
+
parser = argparse.ArgumentParser(description="Validate that decimal literals do not touch 0 or 1.")
|
| 111 |
+
parser.add_argument("path", nargs="?", default=".", help="Repository path to scan")
|
| 112 |
+
args = parser.parse_args(argv)
|
| 113 |
+
|
| 114 |
+
root = Path(args.path).resolve()
|
| 115 |
+
findings = collect_findings(root)
|
| 116 |
+
|
| 117 |
+
if findings:
|
| 118 |
+
print("Task validation failed: boundary-touching decimals found.", file=sys.stderr)
|
| 119 |
+
print(format_findings(findings, root), file=sys.stderr)
|
| 120 |
+
return 1
|
| 121 |
+
|
| 122 |
+
print("Task validation passed: no decimal literals touch 0 or 1.")
|
| 123 |
+
return 0
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
|
| 127 |
+
raise SystemExit(main())
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|