Spaces:
Sleeping
Sleeping
Commit Β·
7257069
1
Parent(s): ef93755
frontend adding
Browse files- Dockerfile +17 -13
- README.md +164 -116
- app/__init__.py +0 -1
- app/dashboard.py +672 -0
- app/main.py +22 -28
- app/models.py +44 -36
- app/routes.py +85 -89
- app/session_store.py +0 -73
- app/state.py +14 -8
- codegraph/__init__.py +0 -1
- codegraph/extractor.py +117 -128
- codegraph/graph.py +109 -96
- codegraph/serializer.py +16 -20
- graders/__init__.py +0 -1
- graders/attacks.py +130 -286
- graders/code_structure.py +0 -45
- graders/consistency.py +82 -80
- graders/correctness.py +154 -68
- graders/documentation.py +126 -24
- graders/performance.py +105 -96
- graders/reward_aggregator.py +104 -103
- graders/static_analysis.py +160 -102
- graders/supply_chain.py +0 -99
- inference.py +163 -159
- openenv.yaml +90 -95
- requirements.txt +6 -29
- sandbox/__init__.py +0 -1
- sandbox/executor.py +144 -82
- sandbox/payload_gen.py +92 -130
- tasks/__init__.py +0 -1
- tasks/easy/hash_generator.py +0 -38
- tasks/easy/input_sanitizer.py +76 -38
- tasks/easy/password_validator.py +122 -34
- tasks/easy/token_generator.py +81 -0
- tasks/hard/auth_middleware.py +127 -51
- tasks/hard/file_upload_handler.py +145 -38
- tasks/hard/jwt_validator.py +126 -45
- tasks/medium/api_rate_limiter.py +0 -43
- tasks/medium/file_path_handler.py +101 -37
- tasks/medium/rate_limiter.py +124 -0
- tasks/medium/sql_query_builder.py +85 -33
- tasks/task_registry.py +42 -39
- tests/__init__.py +0 -1
- tests/test_api.py +0 -174
- tests/test_codegraph.py +0 -127
- tests/test_graders.py +0 -206
- validate.py +204 -196
Dockerfile
CHANGED
|
@@ -1,32 +1,36 @@
|
|
| 1 |
-
#
|
| 2 |
-
# python:3.11-slim
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
FROM python:3.11-slim
|
| 4 |
|
| 5 |
-
#
|
| 6 |
-
|
| 7 |
-
RUN apt-get update && apt-get install -y \
|
| 8 |
gcc \
|
| 9 |
-
g++ \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
WORKDIR /app
|
| 13 |
|
| 14 |
-
# Install Python dependencies first (layer cache)
|
| 15 |
COPY requirements.txt .
|
| 16 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 17 |
|
| 18 |
-
# Copy
|
| 19 |
COPY . .
|
| 20 |
|
| 21 |
-
# Create
|
| 22 |
-
RUN
|
| 23 |
|
| 24 |
-
# Non-root user β security best practice
|
| 25 |
-
RUN useradd -m appuser && chown -R appuser:appuser /app
|
| 26 |
USER appuser
|
| 27 |
|
| 28 |
# HuggingFace Spaces requires port 7860
|
| 29 |
EXPOSE 7860
|
| 30 |
|
| 31 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "2"]
|
|
|
|
| 1 |
+
# βββ SecureCodeEnv Dockerfile ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2 |
+
# Base: python:3.11-slim β minimal, fast, secure
|
| 3 |
+
# Port: 7860 β HuggingFace Spaces standard port
|
| 4 |
+
# Security: Non-root user, no network for agent subprocesses
|
| 5 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 6 |
+
|
| 7 |
FROM python:3.11-slim
|
| 8 |
|
| 9 |
+
# Install system dependencies for bandit + compilation
|
| 10 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
|
|
| 11 |
gcc \
|
|
|
|
| 12 |
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
|
| 14 |
WORKDIR /app
|
| 15 |
|
| 16 |
+
# Install Python dependencies first (layer cache optimization)
|
| 17 |
COPY requirements.txt .
|
| 18 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 19 |
|
| 20 |
+
# Copy application code
|
| 21 |
COPY . .
|
| 22 |
|
| 23 |
+
# Create non-root user for security (best practice β agent code runs as appuser)
|
| 24 |
+
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
|
| 25 |
|
|
|
|
|
|
|
| 26 |
USER appuser
|
| 27 |
|
| 28 |
# HuggingFace Spaces requires port 7860
|
| 29 |
EXPOSE 7860
|
| 30 |
|
| 31 |
+
# Health check β hackathon automated ping checks /health
|
| 32 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
| 33 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')"
|
| 34 |
+
|
| 35 |
+
# 2 workers for concurrency (stateless sessions support this)
|
| 36 |
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "2"]
|
README.md
CHANGED
|
@@ -1,179 +1,227 @@
|
|
| 1 |
---
|
| 2 |
title: SecureCodeEnv
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: true
|
| 8 |
-
license:
|
| 9 |
---
|
| 10 |
|
| 11 |
-
#
|
| 12 |
|
| 13 |
-
**RL environment for training LLM agents to write production-ready, secure Python code.**
|
| 14 |
|
| 15 |
-
Built for the **Meta Γ
|
| 16 |
|
| 17 |
---
|
| 18 |
|
| 19 |
## The Problem
|
| 20 |
|
| 21 |
-
Studies show **12β65% of LLM-generated code contains security vulnerabilities**
|
| 22 |
|
| 23 |
Every existing RL environment trains agents to write code that **WORKS**. None train agents to write code that is **SAFE, CONSISTENT, and PRODUCTION-READY**.
|
| 24 |
|
| 25 |
-
SecureCodeEnv fills that
|
| 26 |
|
| 27 |
---
|
| 28 |
|
| 29 |
-
## What Makes This Unique
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
-
|
| 36 |
-
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
The
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
| 2 | `input_sanitizer` | Easy | CWE-20 | XSS payload pass-through |
|
| 53 |
-
| 3 | `hash_generator` | Easy | CWE-327 | Shell invocation for hashing |
|
| 54 |
-
| 4 | `sql_query_builder` | Medium | CWE-89 | SQL injection via cursor spy |
|
| 55 |
-
| 5 | `file_path_handler` | Medium | CWE-22 | Path traversal via open() spy |
|
| 56 |
-
| 6 | `api_rate_limiter` | Medium | CWE-307 | Rate bypass with spoofed client ID |
|
| 57 |
-
| 7 | `file_upload_handler` | Hard | CWE-434 | Malicious file extension upload |
|
| 58 |
-
| 8 | `jwt_validator` | Hard | CWE-347 | JWT alg:none bypass |
|
| 59 |
-
| 9 | `auth_middleware` | Hard | CWE-287 | Shell-based auth + timing attack |
|
| 60 |
-
|
| 61 |
-
### 4. 8-Dimensional Reward System
|
| 62 |
-
| Grader | Weight | Tool | Type |
|
| 63 |
-
|--------|--------|------|------|
|
| 64 |
-
| Correctness | 25% | Custom test runner | Functional |
|
| 65 |
-
| Attack Resistance | 25% | Behavioral harness V2 | Security β unfakeable |
|
| 66 |
-
| Static Security | 15% | bandit + semgrep | Security β static |
|
| 67 |
-
| CodeGraph Consistency | 15% | tree-sitter + CodeGraph | Architectural |
|
| 68 |
-
| Performance | 10% | timeit + tracemalloc | Efficiency |
|
| 69 |
-
| Documentation | 5% | ast | Quality |
|
| 70 |
-
| Code Structure | 3% | ast | Quality |
|
| 71 |
-
| Supply Chain | 2% | pip-audit + typosquat | Security |
|
| 72 |
|
| 73 |
---
|
| 74 |
|
| 75 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
```python
|
| 78 |
import requests
|
| 79 |
|
| 80 |
-
|
| 81 |
|
| 82 |
-
# Start episode
|
| 83 |
-
episode = requests.post(f"{
|
| 84 |
sid = episode["session_id"]
|
|
|
|
| 85 |
|
| 86 |
-
# Submit code
|
| 87 |
-
result = requests.post(f"{
|
| 88 |
"session_id": sid,
|
| 89 |
-
"
|
| 90 |
"filename": "solution.py",
|
| 91 |
-
"code": your_secure_code,
|
| 92 |
}).json()
|
| 93 |
|
| 94 |
-
print(result[
|
| 95 |
-
print(result[
|
| 96 |
-
print(result[
|
| 97 |
```
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
| 105 |
-
|
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
---
|
| 109 |
|
| 110 |
-
##
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
|
| 114 |
```json
|
| 115 |
{
|
| 116 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
"scores": {
|
| 118 |
"correctness": 1.0,
|
| 119 |
"attack_resist": 0.875,
|
| 120 |
-
"static_security": 0.
|
| 121 |
"consistency": 1.0,
|
| 122 |
-
"performance": 0.
|
| 123 |
-
"documentation": 0.
|
| 124 |
-
"code_structure":
|
| 125 |
-
"supply_chain": 1.0
|
| 126 |
-
},
|
| 127 |
-
"feedback": {
|
| 128 |
-
"correctness": "β
Excellent (1.00) β 8/8 tests passed.",
|
| 129 |
-
"attack_resist": "π‘ Good (0.88) β 7/8 attacks blocked."
|
| 130 |
},
|
| 131 |
-
"
|
|
|
|
| 132 |
"done": false,
|
| 133 |
-
"step_count":
|
| 134 |
}
|
| 135 |
```
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
---
|
| 138 |
|
| 139 |
-
##
|
| 140 |
|
| 141 |
```bash
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
docker run -p 7860:7860 -e REDIS_URL=<upstash_url> securecodeenv
|
| 145 |
-
|
| 146 |
-
# Run baseline inference
|
| 147 |
-
API_BASE_URL=https://api.groq.com/openai/v1 \
|
| 148 |
-
MODEL_NAME=llama-3.3-70b-versatile \
|
| 149 |
-
HF_TOKEN=<your_token> \
|
| 150 |
-
ENV_URL=http://localhost:7860 \
|
| 151 |
-
python inference.py
|
| 152 |
|
| 153 |
-
#
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
```
|
| 156 |
|
| 157 |
-
##
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
---
|
| 166 |
|
| 167 |
-
##
|
| 168 |
-
|
| 169 |
-
|
|
| 170 |
-
|
|
| 171 |
-
|
|
| 172 |
-
|
|
| 173 |
-
|
|
| 174 |
-
|
|
| 175 |
-
| LLM for inference | Groq free tier | β
$0 |
|
| 176 |
|
| 177 |
---
|
| 178 |
|
| 179 |
-
*SecureCodeEnv
|
|
|
|
| 1 |
---
|
| 2 |
title: SecureCodeEnv
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: orange
|
| 6 |
sdk: docker
|
| 7 |
pinned: true
|
| 8 |
+
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# SecureCodeEnv
|
| 12 |
|
| 13 |
+
**An RL environment for training LLM agents to write production-ready, secure Python code.**
|
| 14 |
|
| 15 |
+
Built for the **Meta Γ PyTorch OpenEnv Hackathon 2026** by Vishal Dhakad (`vishaldhakad`).
|
| 16 |
|
| 17 |
---
|
| 18 |
|
| 19 |
## The Problem
|
| 20 |
|
| 21 |
+
Studies show **12β65% of LLM-generated code contains security vulnerabilities** (2025 research). Secure-pass@1 rates remain below 12% for all frontier models even when functional pass@1 exceeds 50%.
|
| 22 |
|
| 23 |
Every existing RL environment trains agents to write code that **WORKS**. None train agents to write code that is **SAFE, CONSISTENT, and PRODUCTION-READY**.
|
| 24 |
|
| 25 |
+
SecureCodeEnv fills that gap.
|
| 26 |
|
| 27 |
---
|
| 28 |
|
| 29 |
+
## What Makes This Environment Unique
|
| 30 |
+
|
| 31 |
+
| Feature | SecureCodeEnv | Other RL Envs |
|
| 32 |
+
|---|---|---|
|
| 33 |
+
| Dynamic adversarial grading | β
Actually FIRES attacks | β Static patterns only |
|
| 34 |
+
| CodeGraph memory | β
Codebase-consistency rewards | β Single-function only |
|
| 35 |
+
| CWE-grounded tasks | β
9 tasks, 12+ CWE IDs | β Generic correctness |
|
| 36 |
+
| Multi-dimensional reward | β
7 dimensions | β Pass/fail only |
|
| 37 |
+
| Anti-reward-hacking | β
Seeded random payloads | β Fixed test cases |
|
| 38 |
+
|
| 39 |
+
### CodeGraph Memory System
|
| 40 |
+
|
| 41 |
+
The environment maintains a `CodeGraph` β a structured in-memory database of every component the agent has written in the current episode. When the agent writes `auth/validator.py` in `snake_case`, and then submits `auth/middleware.py` in `camelCase`, the consistency grader penalizes the drift. No other RL environment does this.
|
| 42 |
+
|
| 43 |
+
### Dynamic Adversarial Attack Grading
|
| 44 |
+
|
| 45 |
+
We don't just scan for vulnerability patterns β we **fire real attacks** at the agent's code:
|
| 46 |
+
- SQL injection payloads (UNION SELECT, OR 1=1, stacked queries)
|
| 47 |
+
- Path traversal payloads (`../../etc/passwd`, URL-encoded variants)
|
| 48 |
+
- JWT bypass attacks (`alg: none`, expired tokens, tampered payloads)
|
| 49 |
+
- XSS payloads (`<script>`, `onerror=`, template injection)
|
| 50 |
+
|
| 51 |
+
Payloads are randomized per episode using a seed. The agent **cannot memorize** specific strings.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
---
|
| 54 |
|
| 55 |
+
## Reward System (7 Dimensions)
|
| 56 |
+
|
| 57 |
+
| Dimension | Weight | Tool | What It Measures |
|
| 58 |
+
|---|---|---|---|
|
| 59 |
+
| Correctness | 30% | Custom test runner | Does the code solve the problem? |
|
| 60 |
+
| Attack Resistance | 20% | Dynamic harness | Does it survive real attacks? |
|
| 61 |
+
| Static Security | 15% | bandit + AST | Known vulnerability patterns (CWE-mapped) |
|
| 62 |
+
| CodeGraph Consistency | 15% | AST + CodeGraph | Matches existing codebase conventions? |
|
| 63 |
+
| Performance | 10% | timeit + tracemalloc | Efficient vs naive/optimal baselines |
|
| 64 |
+
| Documentation | 5% | AST | Docstrings + type hints coverage |
|
| 65 |
+
| Code Structure | 5% | AST | Clean code (no bare print, no bare except) |
|
| 66 |
+
|
| 67 |
+
---
|
| 68 |
+
|
| 69 |
+
## Quick Start
|
| 70 |
|
| 71 |
```python
|
| 72 |
import requests
|
| 73 |
|
| 74 |
+
ENV_URL = "https://vishaldhakad-securecodeenv.hf.space"
|
| 75 |
|
| 76 |
+
# 1. Start episode
|
| 77 |
+
episode = requests.post(f"{ENV_URL}/reset", json={"difficulty": "medium"}).json()
|
| 78 |
sid = episode["session_id"]
|
| 79 |
+
print(episode["problem_statement"])
|
| 80 |
|
| 81 |
+
# 2. Submit code
|
| 82 |
+
result = requests.post(f"{ENV_URL}/step", json={
|
| 83 |
"session_id": sid,
|
| 84 |
+
"code": "def build_user_query(username, role):\n return ('SELECT * FROM users WHERE username = %s', (username,))",
|
| 85 |
"filename": "solution.py",
|
|
|
|
| 86 |
}).json()
|
| 87 |
|
| 88 |
+
print(f"Reward: {result['total_reward']:.3f}")
|
| 89 |
+
print(f"Scores: {result['scores']}")
|
| 90 |
+
print(f"Feedback: {result['feedback']['summary']}")
|
| 91 |
```
|
| 92 |
|
| 93 |
+
---
|
| 94 |
+
|
| 95 |
+
## Tasks β 9 Tasks Across 3 Difficulty Levels
|
| 96 |
+
|
| 97 |
+
### Easy
|
| 98 |
+
| Task | CWE Targets | Attack |
|
| 99 |
+
|---|---|---|
|
| 100 |
+
| Password Validator | CWE-916, CWE-521 | Weak hash detection |
|
| 101 |
+
| Input Sanitizer | CWE-20, CWE-116 | XSS payload injection |
|
| 102 |
+
| Token Generator | CWE-338, CWE-330 | Predictable randomness |
|
| 103 |
+
|
| 104 |
+
### Medium
|
| 105 |
+
| Task | CWE Targets | Attack |
|
| 106 |
+
|---|---|---|
|
| 107 |
+
| SQL Query Builder | CWE-89 | SQL injection payloads |
|
| 108 |
+
| File Path Handler | CWE-22 | Path traversal attacks |
|
| 109 |
+
| Rate Limiter | CWE-770, CWE-400 | Concurrent request flood |
|
| 110 |
+
|
| 111 |
+
### Hard
|
| 112 |
+
| Task | CWE Targets | Attack |
|
| 113 |
+
|---|---|---|
|
| 114 |
+
| File Upload Handler | CWE-22, CWE-434 | Traversal filenames + MIME spoofing |
|
| 115 |
+
| JWT Validator | CWE-347, CWE-613 | `alg:none` attack, expired tokens |
|
| 116 |
+
| Auth Middleware | CWE-287, CWE-352 | CSRF bypass, timing attacks |
|
| 117 |
|
| 118 |
---
|
| 119 |
|
| 120 |
+
## API Reference
|
| 121 |
+
|
| 122 |
+
### `POST /reset`
|
| 123 |
+
Start a new episode.
|
| 124 |
+
|
| 125 |
+
**Request:**
|
| 126 |
+
```json
|
| 127 |
+
{ "difficulty": "medium" }
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
**Response:**
|
| 131 |
+
```json
|
| 132 |
+
{
|
| 133 |
+
"session_id": "uuid",
|
| 134 |
+
"task_id": "medium_sql_query_builder",
|
| 135 |
+
"problem_statement": "Write a Python function...",
|
| 136 |
+
"difficulty": "medium",
|
| 137 |
+
"cwe_targets": ["CWE-89", "CWE-20"],
|
| 138 |
+
"codegraph": { "components": {}, "conventions": {} },
|
| 139 |
+
"starter_code": "def build_user_query(...):"
|
| 140 |
+
}
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
### `POST /step`
|
| 144 |
+
Submit agent code for grading.
|
| 145 |
|
| 146 |
+
**Request:**
|
| 147 |
```json
|
| 148 |
{
|
| 149 |
+
"session_id": "uuid",
|
| 150 |
+
"code": "def build_user_query(username: str, role: str) -> tuple: ...",
|
| 151 |
+
"filename": "src/db/queries.py"
|
| 152 |
+
}
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
**Response:**
|
| 156 |
+
```json
|
| 157 |
+
{
|
| 158 |
+
"total_reward": 0.847,
|
| 159 |
"scores": {
|
| 160 |
"correctness": 1.0,
|
| 161 |
"attack_resist": 0.875,
|
| 162 |
+
"static_security": 0.9,
|
| 163 |
"consistency": 1.0,
|
| 164 |
+
"performance": 0.72,
|
| 165 |
+
"documentation": 0.75,
|
| 166 |
+
"code_structure": 0.8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
},
|
| 168 |
+
"feedback": { "summary": "π‘ Good submission β improve: performance" },
|
| 169 |
+
"codegraph": { ... },
|
| 170 |
"done": false,
|
| 171 |
+
"step_count": 1
|
| 172 |
}
|
| 173 |
```
|
| 174 |
|
| 175 |
+
### `GET /state?session_id=<id>`
|
| 176 |
+
Get current episode state without advancing.
|
| 177 |
+
|
| 178 |
+
### `GET /health`
|
| 179 |
+
Returns `{"status": "ok", "env": "SecureCodeEnv", "version": "2.0.0", "tasks_loaded": 9}`
|
| 180 |
+
|
| 181 |
---
|
| 182 |
|
| 183 |
+
## Setup (Local)
|
| 184 |
|
| 185 |
```bash
|
| 186 |
+
git clone https://huggingface.co/spaces/vishaldhakad/SecureCodeEnv
|
| 187 |
+
cd SecureCodeEnv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
+
# Docker (recommended)
|
| 190 |
+
docker build -t secure-code-env .
|
| 191 |
+
docker run -p 7860:7860 secure-code-env
|
| 192 |
+
|
| 193 |
+
# Or direct
|
| 194 |
+
pip install -r requirements.txt
|
| 195 |
+
uvicorn app.main:app --host 0.0.0.0 --port 7860
|
| 196 |
```
|
| 197 |
|
| 198 |
+
## Run Baseline Inference
|
| 199 |
+
|
| 200 |
+
```bash
|
| 201 |
+
export API_BASE_URL=https://api.openai.com/v1
|
| 202 |
+
export MODEL_NAME=gpt-4o-mini
|
| 203 |
+
export HF_TOKEN=hf_your_token
|
| 204 |
+
export ENV_URL=http://localhost:7860
|
| 205 |
+
python inference.py
|
| 206 |
+
```
|
| 207 |
+
|
| 208 |
+
## Validate Before Submit
|
| 209 |
+
|
| 210 |
+
```bash
|
| 211 |
+
python validate.py --url http://localhost:7860
|
| 212 |
+
```
|
| 213 |
|
| 214 |
---
|
| 215 |
|
| 216 |
+
## Environment Variables
|
| 217 |
+
|
| 218 |
+
| Variable | Required | Description |
|
| 219 |
+
|---|---|---|
|
| 220 |
+
| `API_BASE_URL` | Yes | LLM API endpoint (OpenAI-compatible) |
|
| 221 |
+
| `MODEL_NAME` | Yes | Model identifier (e.g. `gpt-4o-mini`) |
|
| 222 |
+
| `HF_TOKEN` | Yes | HuggingFace token |
|
| 223 |
+
| `ENV_URL` | No | Override environment URL (default: localhost:7860) |
|
|
|
|
| 224 |
|
| 225 |
---
|
| 226 |
|
| 227 |
+
*SecureCodeEnv v2.0 Β· Meta Γ PyTorch OpenEnv Hackathon 2026 Β· Vishal Dhakad*
|
app/__init__.py
CHANGED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# app/__init__.py
|
|
|
|
|
|
app/dashboard.py
ADDED
|
@@ -0,0 +1,672 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv - HTML Dashboard
|
| 3 |
+
Served at GET / β this is what judges and users see on HuggingFace Spaces.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
DASHBOARD_HTML = '''<!DOCTYPE html>
|
| 7 |
+
<html lang="en">
|
| 8 |
+
<head>
|
| 9 |
+
<meta charset="UTF-8">
|
| 10 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 11 |
+
<title>SecureCodeEnv β RL Environment for Secure Code Generation</title>
|
| 12 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 13 |
+
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;700&family=Syne:wght@400;700;800&display=swap" rel="stylesheet">
|
| 14 |
+
<style>
|
| 15 |
+
:root {
|
| 16 |
+
--bg: #090c10;
|
| 17 |
+
--surface: #0d1117;
|
| 18 |
+
--surface2: #161b22;
|
| 19 |
+
--border: #21262d;
|
| 20 |
+
--accent: #f0883e;
|
| 21 |
+
--accent2: #79c0ff;
|
| 22 |
+
--accent3: #56d364;
|
| 23 |
+
--danger: #ff7b72;
|
| 24 |
+
--text: #e6edf3;
|
| 25 |
+
--muted: #8b949e;
|
| 26 |
+
--mono: 'JetBrains Mono', monospace;
|
| 27 |
+
--sans: 'Syne', sans-serif;
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
* { box-sizing: border-box; margin: 0; padding: 0; }
|
| 31 |
+
|
| 32 |
+
body {
|
| 33 |
+
background: var(--bg);
|
| 34 |
+
color: var(--text);
|
| 35 |
+
font-family: var(--sans);
|
| 36 |
+
min-height: 100vh;
|
| 37 |
+
overflow-x: hidden;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
/* ββ Grid noise texture ββ */
|
| 41 |
+
body::before {
|
| 42 |
+
content: '';
|
| 43 |
+
position: fixed;
|
| 44 |
+
inset: 0;
|
| 45 |
+
background-image:
|
| 46 |
+
linear-gradient(rgba(240,136,62,.03) 1px, transparent 1px),
|
| 47 |
+
linear-gradient(90deg, rgba(240,136,62,.03) 1px, transparent 1px);
|
| 48 |
+
background-size: 40px 40px;
|
| 49 |
+
pointer-events: none;
|
| 50 |
+
z-index: 0;
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
.wrap { position: relative; z-index: 1; max-width: 1100px; margin: 0 auto; padding: 0 24px; }
|
| 54 |
+
|
| 55 |
+
/* ββ Header ββ */
|
| 56 |
+
header {
|
| 57 |
+
border-bottom: 1px solid var(--border);
|
| 58 |
+
padding: 18px 0;
|
| 59 |
+
position: sticky;
|
| 60 |
+
top: 0;
|
| 61 |
+
background: rgba(9,12,16,.92);
|
| 62 |
+
backdrop-filter: blur(12px);
|
| 63 |
+
z-index: 100;
|
| 64 |
+
}
|
| 65 |
+
.header-inner {
|
| 66 |
+
display: flex;
|
| 67 |
+
align-items: center;
|
| 68 |
+
justify-content: space-between;
|
| 69 |
+
gap: 16px;
|
| 70 |
+
}
|
| 71 |
+
.logo {
|
| 72 |
+
display: flex;
|
| 73 |
+
align-items: center;
|
| 74 |
+
gap: 10px;
|
| 75 |
+
font-family: var(--mono);
|
| 76 |
+
font-weight: 700;
|
| 77 |
+
font-size: 15px;
|
| 78 |
+
color: var(--accent);
|
| 79 |
+
letter-spacing: -.3px;
|
| 80 |
+
}
|
| 81 |
+
.logo-icon {
|
| 82 |
+
width: 28px; height: 28px;
|
| 83 |
+
background: var(--accent);
|
| 84 |
+
border-radius: 6px;
|
| 85 |
+
display: grid;
|
| 86 |
+
place-items: center;
|
| 87 |
+
font-size: 14px;
|
| 88 |
+
}
|
| 89 |
+
.badge {
|
| 90 |
+
font-family: var(--mono);
|
| 91 |
+
font-size: 10px;
|
| 92 |
+
padding: 3px 8px;
|
| 93 |
+
border-radius: 99px;
|
| 94 |
+
border: 1px solid;
|
| 95 |
+
letter-spacing: .5px;
|
| 96 |
+
text-transform: uppercase;
|
| 97 |
+
}
|
| 98 |
+
.badge-orange { color: var(--accent); border-color: rgba(240,136,62,.3); background: rgba(240,136,62,.07); }
|
| 99 |
+
.badge-blue { color: var(--accent2); border-color: rgba(121,192,255,.3); background: rgba(121,192,255,.07); }
|
| 100 |
+
.badge-green { color: var(--accent3); border-color: rgba(86,211,100,.3); background: rgba(86,211,100,.07); }
|
| 101 |
+
.badge-red { color: var(--danger); border-color: rgba(255,123,114,.3); background: rgba(255,123,114,.07); }
|
| 102 |
+
.header-badges { display: flex; gap: 8px; flex-wrap: wrap; }
|
| 103 |
+
|
| 104 |
+
/* ββ Hero ββ */
|
| 105 |
+
.hero {
|
| 106 |
+
padding: 72px 0 56px;
|
| 107 |
+
position: relative;
|
| 108 |
+
}
|
| 109 |
+
.hero-eyebrow {
|
| 110 |
+
font-family: var(--mono);
|
| 111 |
+
font-size: 11px;
|
| 112 |
+
color: var(--accent);
|
| 113 |
+
letter-spacing: 2px;
|
| 114 |
+
text-transform: uppercase;
|
| 115 |
+
margin-bottom: 20px;
|
| 116 |
+
display: flex;
|
| 117 |
+
align-items: center;
|
| 118 |
+
gap: 10px;
|
| 119 |
+
}
|
| 120 |
+
.hero-eyebrow::before {
|
| 121 |
+
content: '';
|
| 122 |
+
display: block;
|
| 123 |
+
width: 24px; height: 1px;
|
| 124 |
+
background: var(--accent);
|
| 125 |
+
}
|
| 126 |
+
h1 {
|
| 127 |
+
font-size: clamp(36px, 6vw, 64px);
|
| 128 |
+
font-weight: 800;
|
| 129 |
+
line-height: 1.05;
|
| 130 |
+
letter-spacing: -2px;
|
| 131 |
+
margin-bottom: 24px;
|
| 132 |
+
}
|
| 133 |
+
h1 em { font-style: normal; color: var(--accent); }
|
| 134 |
+
.hero-desc {
|
| 135 |
+
font-size: 17px;
|
| 136 |
+
color: var(--muted);
|
| 137 |
+
max-width: 600px;
|
| 138 |
+
line-height: 1.7;
|
| 139 |
+
margin-bottom: 36px;
|
| 140 |
+
}
|
| 141 |
+
.hero-actions { display: flex; gap: 12px; flex-wrap: wrap; }
|
| 142 |
+
.btn {
|
| 143 |
+
font-family: var(--mono);
|
| 144 |
+
font-size: 13px;
|
| 145 |
+
font-weight: 700;
|
| 146 |
+
padding: 11px 22px;
|
| 147 |
+
border-radius: 7px;
|
| 148 |
+
text-decoration: none;
|
| 149 |
+
transition: all .15s;
|
| 150 |
+
cursor: pointer;
|
| 151 |
+
border: none;
|
| 152 |
+
display: inline-flex;
|
| 153 |
+
align-items: center;
|
| 154 |
+
gap: 8px;
|
| 155 |
+
}
|
| 156 |
+
.btn-primary {
|
| 157 |
+
background: var(--accent);
|
| 158 |
+
color: #000;
|
| 159 |
+
}
|
| 160 |
+
.btn-primary:hover { background: #ffaa5e; transform: translateY(-1px); }
|
| 161 |
+
.btn-ghost {
|
| 162 |
+
background: transparent;
|
| 163 |
+
color: var(--text);
|
| 164 |
+
border: 1px solid var(--border);
|
| 165 |
+
}
|
| 166 |
+
.btn-ghost:hover { border-color: var(--accent2); color: var(--accent2); }
|
| 167 |
+
|
| 168 |
+
/* ββ Stats row ββ */
|
| 169 |
+
.stats {
|
| 170 |
+
display: grid;
|
| 171 |
+
grid-template-columns: repeat(4, 1fr);
|
| 172 |
+
gap: 1px;
|
| 173 |
+
background: var(--border);
|
| 174 |
+
border: 1px solid var(--border);
|
| 175 |
+
border-radius: 10px;
|
| 176 |
+
overflow: hidden;
|
| 177 |
+
margin-bottom: 64px;
|
| 178 |
+
}
|
| 179 |
+
.stat {
|
| 180 |
+
background: var(--surface);
|
| 181 |
+
padding: 24px 28px;
|
| 182 |
+
position: relative;
|
| 183 |
+
overflow: hidden;
|
| 184 |
+
}
|
| 185 |
+
.stat::after {
|
| 186 |
+
content: attr(data-icon);
|
| 187 |
+
position: absolute;
|
| 188 |
+
right: 16px;
|
| 189 |
+
top: 50%;
|
| 190 |
+
transform: translateY(-50%);
|
| 191 |
+
font-size: 28px;
|
| 192 |
+
opacity: .15;
|
| 193 |
+
}
|
| 194 |
+
.stat-val {
|
| 195 |
+
font-family: var(--mono);
|
| 196 |
+
font-size: 32px;
|
| 197 |
+
font-weight: 700;
|
| 198 |
+
color: var(--accent);
|
| 199 |
+
line-height: 1;
|
| 200 |
+
margin-bottom: 6px;
|
| 201 |
+
}
|
| 202 |
+
.stat-label { font-size: 12px; color: var(--muted); letter-spacing: .3px; }
|
| 203 |
+
|
| 204 |
+
/* ββ Sections ββ */
|
| 205 |
+
section { margin-bottom: 64px; }
|
| 206 |
+
.section-title {
|
| 207 |
+
font-size: 11px;
|
| 208 |
+
font-family: var(--mono);
|
| 209 |
+
color: var(--muted);
|
| 210 |
+
letter-spacing: 2px;
|
| 211 |
+
text-transform: uppercase;
|
| 212 |
+
margin-bottom: 24px;
|
| 213 |
+
display: flex;
|
| 214 |
+
align-items: center;
|
| 215 |
+
gap: 12px;
|
| 216 |
+
}
|
| 217 |
+
.section-title::after {
|
| 218 |
+
content: '';
|
| 219 |
+
flex: 1;
|
| 220 |
+
height: 1px;
|
| 221 |
+
background: var(--border);
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
/* ββ Reward grid ββ */
|
| 225 |
+
.reward-grid {
|
| 226 |
+
display: grid;
|
| 227 |
+
grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
|
| 228 |
+
gap: 12px;
|
| 229 |
+
}
|
| 230 |
+
.reward-card {
|
| 231 |
+
background: var(--surface);
|
| 232 |
+
border: 1px solid var(--border);
|
| 233 |
+
border-radius: 10px;
|
| 234 |
+
padding: 18px 20px;
|
| 235 |
+
transition: border-color .2s;
|
| 236 |
+
animation: fadeUp .5s ease both;
|
| 237 |
+
}
|
| 238 |
+
.reward-card:hover { border-color: var(--accent); }
|
| 239 |
+
.reward-card:nth-child(1) { animation-delay: .05s; }
|
| 240 |
+
.reward-card:nth-child(2) { animation-delay: .10s; }
|
| 241 |
+
.reward-card:nth-child(3) { animation-delay: .15s; }
|
| 242 |
+
.reward-card:nth-child(4) { animation-delay: .20s; }
|
| 243 |
+
.reward-card:nth-child(5) { animation-delay: .25s; }
|
| 244 |
+
.reward-card:nth-child(6) { animation-delay: .30s; }
|
| 245 |
+
.reward-card:nth-child(7) { animation-delay: .35s; }
|
| 246 |
+
.rc-header { display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 14px; }
|
| 247 |
+
.rc-name { font-size: 13px; font-weight: 700; }
|
| 248 |
+
.rc-weight { font-family: var(--mono); font-size: 20px; font-weight: 700; color: var(--accent); }
|
| 249 |
+
.rc-bar-bg { height: 3px; background: var(--border); border-radius: 99px; }
|
| 250 |
+
.rc-bar { height: 3px; border-radius: 99px; background: var(--accent); transition: width 1s ease; }
|
| 251 |
+
.rc-desc { font-size: 11px; color: var(--muted); margin-top: 10px; line-height: 1.5; }
|
| 252 |
+
|
| 253 |
+
/* ββ Tasks table ββ */
|
| 254 |
+
.tasks-grid {
|
| 255 |
+
display: grid;
|
| 256 |
+
grid-template-columns: repeat(3, 1fr);
|
| 257 |
+
gap: 12px;
|
| 258 |
+
}
|
| 259 |
+
@media (max-width: 768px) { .tasks-grid { grid-template-columns: 1fr; } }
|
| 260 |
+
.diff-col {}
|
| 261 |
+
.diff-label {
|
| 262 |
+
font-family: var(--mono);
|
| 263 |
+
font-size: 11px;
|
| 264 |
+
letter-spacing: 1.5px;
|
| 265 |
+
text-transform: uppercase;
|
| 266 |
+
padding: 6px 12px;
|
| 267 |
+
border-radius: 6px;
|
| 268 |
+
display: inline-block;
|
| 269 |
+
margin-bottom: 12px;
|
| 270 |
+
}
|
| 271 |
+
.diff-easy { background: rgba(86,211,100,.1); color: var(--accent3); }
|
| 272 |
+
.diff-medium { background: rgba(240,136,62,.1); color: var(--accent); }
|
| 273 |
+
.diff-hard { background: rgba(255,123,114,.1); color: var(--danger); }
|
| 274 |
+
.task-item {
|
| 275 |
+
background: var(--surface);
|
| 276 |
+
border: 1px solid var(--border);
|
| 277 |
+
border-radius: 8px;
|
| 278 |
+
padding: 14px 16px;
|
| 279 |
+
margin-bottom: 8px;
|
| 280 |
+
font-size: 13px;
|
| 281 |
+
}
|
| 282 |
+
.task-name { font-weight: 700; margin-bottom: 4px; }
|
| 283 |
+
.task-cwes { display: flex; gap: 4px; flex-wrap: wrap; margin-top: 8px; }
|
| 284 |
+
.cwe-tag {
|
| 285 |
+
font-family: var(--mono);
|
| 286 |
+
font-size: 10px;
|
| 287 |
+
padding: 2px 7px;
|
| 288 |
+
border-radius: 4px;
|
| 289 |
+
background: rgba(121,192,255,.08);
|
| 290 |
+
color: var(--accent2);
|
| 291 |
+
border: 1px solid rgba(121,192,255,.2);
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
/* ββ Code block ββ */
|
| 295 |
+
.code-block {
|
| 296 |
+
background: var(--surface);
|
| 297 |
+
border: 1px solid var(--border);
|
| 298 |
+
border-radius: 10px;
|
| 299 |
+
overflow: hidden;
|
| 300 |
+
}
|
| 301 |
+
.code-header {
|
| 302 |
+
display: flex;
|
| 303 |
+
align-items: center;
|
| 304 |
+
justify-content: space-between;
|
| 305 |
+
padding: 10px 16px;
|
| 306 |
+
border-bottom: 1px solid var(--border);
|
| 307 |
+
background: var(--surface2);
|
| 308 |
+
}
|
| 309 |
+
.code-dots { display: flex; gap: 6px; }
|
| 310 |
+
.code-dots span { width: 10px; height: 10px; border-radius: 50%; }
|
| 311 |
+
.code-dots span:nth-child(1) { background: #ff5f57; }
|
| 312 |
+
.code-dots span:nth-child(2) { background: #febc2e; }
|
| 313 |
+
.code-dots span:nth-child(3) { background: #28c840; }
|
| 314 |
+
.code-filename { font-family: var(--mono); font-size: 11px; color: var(--muted); }
|
| 315 |
+
pre {
|
| 316 |
+
font-family: var(--mono);
|
| 317 |
+
font-size: 12px;
|
| 318 |
+
line-height: 1.7;
|
| 319 |
+
padding: 20px;
|
| 320 |
+
overflow-x: auto;
|
| 321 |
+
color: var(--text);
|
| 322 |
+
}
|
| 323 |
+
.kw { color: #ff7b72; }
|
| 324 |
+
.fn { color: #d2a8ff; }
|
| 325 |
+
.str { color: #a5d6ff; }
|
| 326 |
+
.cm { color: var(--muted); font-style: italic; }
|
| 327 |
+
.num { color: var(--accent3); }
|
| 328 |
+
.op { color: var(--accent); }
|
| 329 |
+
|
| 330 |
+
/* ββ Live status ββ */
|
| 331 |
+
.status-bar {
|
| 332 |
+
background: var(--surface);
|
| 333 |
+
border: 1px solid var(--border);
|
| 334 |
+
border-radius: 10px;
|
| 335 |
+
padding: 20px 24px;
|
| 336 |
+
display: flex;
|
| 337 |
+
align-items: center;
|
| 338 |
+
justify-content: space-between;
|
| 339 |
+
gap: 16px;
|
| 340 |
+
flex-wrap: wrap;
|
| 341 |
+
}
|
| 342 |
+
.status-dot {
|
| 343 |
+
width: 8px; height: 8px;
|
| 344 |
+
border-radius: 50%;
|
| 345 |
+
background: var(--accent3);
|
| 346 |
+
box-shadow: 0 0 8px var(--accent3);
|
| 347 |
+
animation: pulse 2s ease infinite;
|
| 348 |
+
}
|
| 349 |
+
.status-left { display: flex; align-items: center; gap: 10px; font-size: 14px; font-weight: 700; }
|
| 350 |
+
.status-endpoints { display: flex; gap: 8px; flex-wrap: wrap; }
|
| 351 |
+
.ep {
|
| 352 |
+
font-family: var(--mono);
|
| 353 |
+
font-size: 11px;
|
| 354 |
+
padding: 4px 10px;
|
| 355 |
+
border-radius: 5px;
|
| 356 |
+
background: var(--surface2);
|
| 357 |
+
border: 1px solid var(--border);
|
| 358 |
+
color: var(--muted);
|
| 359 |
+
display: flex;
|
| 360 |
+
gap: 6px;
|
| 361 |
+
align-items: center;
|
| 362 |
+
}
|
| 363 |
+
.ep-method { font-weight: 700; }
|
| 364 |
+
.ep-method.post { color: var(--accent3); }
|
| 365 |
+
.ep-method.get { color: var(--accent2); }
|
| 366 |
+
|
| 367 |
+
/* ββ Footer ββ */
|
| 368 |
+
footer {
|
| 369 |
+
border-top: 1px solid var(--border);
|
| 370 |
+
padding: 28px 0;
|
| 371 |
+
margin-top: 32px;
|
| 372 |
+
display: flex;
|
| 373 |
+
justify-content: space-between;
|
| 374 |
+
align-items: center;
|
| 375 |
+
flex-wrap: wrap;
|
| 376 |
+
gap: 12px;
|
| 377 |
+
}
|
| 378 |
+
.footer-text { font-family: var(--mono); font-size: 11px; color: var(--muted); }
|
| 379 |
+
.footer-text a { color: var(--accent2); text-decoration: none; }
|
| 380 |
+
|
| 381 |
+
/* ββ Animations ββ */
|
| 382 |
+
@keyframes fadeUp {
|
| 383 |
+
from { opacity: 0; transform: translateY(16px); }
|
| 384 |
+
to { opacity: 1; transform: translateY(0); }
|
| 385 |
+
}
|
| 386 |
+
@keyframes pulse {
|
| 387 |
+
0%, 100% { opacity: 1; }
|
| 388 |
+
50% { opacity: .4; }
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
.hero { animation: fadeUp .6s ease both; }
|
| 392 |
+
.stats { animation: fadeUp .6s ease .1s both; }
|
| 393 |
+
|
| 394 |
+
@media (max-width: 640px) {
|
| 395 |
+
.stats { grid-template-columns: repeat(2, 1fr); }
|
| 396 |
+
h1 { letter-spacing: -1px; }
|
| 397 |
+
.header-badges { display: none; }
|
| 398 |
+
}
|
| 399 |
+
</style>
|
| 400 |
+
</head>
|
| 401 |
+
<body>
|
| 402 |
+
|
| 403 |
+
<!-- HEADER -->
|
| 404 |
+
<header>
|
| 405 |
+
<div class="wrap">
|
| 406 |
+
<div class="header-inner">
|
| 407 |
+
<div class="logo">
|
| 408 |
+
<div class="logo-icon">π</div>
|
| 409 |
+
SecureCodeEnv
|
| 410 |
+
</div>
|
| 411 |
+
<div class="header-badges">
|
| 412 |
+
<span class="badge badge-orange">v2.0.0</span>
|
| 413 |
+
<span class="badge badge-blue">OpenEnv</span>
|
| 414 |
+
<span class="badge badge-green">Live</span>
|
| 415 |
+
<span class="badge badge-red">Meta Γ PyTorch Hackathon</span>
|
| 416 |
+
</div>
|
| 417 |
+
</div>
|
| 418 |
+
</div>
|
| 419 |
+
</header>
|
| 420 |
+
|
| 421 |
+
<!-- HERO -->
|
| 422 |
+
<div class="wrap">
|
| 423 |
+
<div class="hero">
|
| 424 |
+
<div class="hero-eyebrow">RL Environment for Secure Code Generation</div>
|
| 425 |
+
<h1>Train LLMs to write<br><em>secure</em> Python code.</h1>
|
| 426 |
+
<p class="hero-desc">
|
| 427 |
+
SecureCodeEnv is a reinforcement learning environment that goes beyond correctness.
|
| 428 |
+
Agents are graded on attack resistance, CWE-based static analysis, codebase consistency
|
| 429 |
+
via CodeGraph, and performance β all automated, all deterministic.
|
| 430 |
+
</p>
|
| 431 |
+
<div class="hero-actions">
|
| 432 |
+
<a href="/docs" class="btn btn-primary">β‘ API Docs</a>
|
| 433 |
+
<a href="/health" class="btn btn-ghost">GET /health</a>
|
| 434 |
+
<a href="https://huggingface.co/spaces/vishaldhakad/SecureCodeEnv" class="btn btn-ghost" target="_blank">HF Space β</a>
|
| 435 |
+
</div>
|
| 436 |
+
</div>
|
| 437 |
+
|
| 438 |
+
<!-- STATS -->
|
| 439 |
+
<div class="stats">
|
| 440 |
+
<div class="stat" data-icon="π">
|
| 441 |
+
<div class="stat-val">9</div>
|
| 442 |
+
<div class="stat-label">Security Tasks</div>
|
| 443 |
+
</div>
|
| 444 |
+
<div class="stat" data-icon="βοΈ">
|
| 445 |
+
<div class="stat-val">7</div>
|
| 446 |
+
<div class="stat-label">Reward Dimensions</div>
|
| 447 |
+
</div>
|
| 448 |
+
<div class="stat" data-icon="π―">
|
| 449 |
+
<div class="stat-val">12+</div>
|
| 450 |
+
<div class="stat-label">CWE IDs Covered</div>
|
| 451 |
+
</div>
|
| 452 |
+
<div class="stat" data-icon="π₯">
|
| 453 |
+
<div class="stat-val">0%</div>
|
| 454 |
+
<div class="stat-label">Infrastructure Cost</div>
|
| 455 |
+
</div>
|
| 456 |
+
</div>
|
| 457 |
+
|
| 458 |
+
<!-- LIVE STATUS -->
|
| 459 |
+
<section>
|
| 460 |
+
<div class="section-title">Live Environment</div>
|
| 461 |
+
<div class="status-bar">
|
| 462 |
+
<div class="status-left">
|
| 463 |
+
<div class="status-dot"></div>
|
| 464 |
+
Environment running Β· SecureCodeEnv v2.0.0
|
| 465 |
+
</div>
|
| 466 |
+
<div class="status-endpoints">
|
| 467 |
+
<div class="ep"><span class="ep-method post">POST</span>/reset</div>
|
| 468 |
+
<div class="ep"><span class="ep-method post">POST</span>/step</div>
|
| 469 |
+
<div class="ep"><span class="ep-method get">GET</span>/state</div>
|
| 470 |
+
<div class="ep"><span class="ep-method get">GET</span>/health</div>
|
| 471 |
+
<div class="ep"><span class="ep-method get">GET</span>/docs</div>
|
| 472 |
+
</div>
|
| 473 |
+
</div>
|
| 474 |
+
</section>
|
| 475 |
+
|
| 476 |
+
<!-- REWARD DIMENSIONS -->
|
| 477 |
+
<section>
|
| 478 |
+
<div class="section-title">Reward System β 7 Dimensions</div>
|
| 479 |
+
<div class="reward-grid">
|
| 480 |
+
<div class="reward-card">
|
| 481 |
+
<div class="rc-header">
|
| 482 |
+
<div class="rc-name">Correctness</div>
|
| 483 |
+
<div class="rc-weight">30%</div>
|
| 484 |
+
</div>
|
| 485 |
+
<div class="rc-bar-bg"><div class="rc-bar" style="width:100%"></div></div>
|
| 486 |
+
<div class="rc-desc">Test cases passed including edge cases, None inputs, boundary values</div>
|
| 487 |
+
</div>
|
| 488 |
+
<div class="reward-card">
|
| 489 |
+
<div class="rc-header">
|
| 490 |
+
<div class="rc-name">Attack Resistance</div>
|
| 491 |
+
<div class="rc-weight">20%</div>
|
| 492 |
+
</div>
|
| 493 |
+
<div class="rc-bar-bg"><div class="rc-bar" style="width:67%"></div></div>
|
| 494 |
+
<div class="rc-desc">Randomized SQLi, traversal, JWT bypass, XSS payloads fired each episode</div>
|
| 495 |
+
</div>
|
| 496 |
+
<div class="reward-card">
|
| 497 |
+
<div class="rc-header">
|
| 498 |
+
<div class="rc-name">Static Security</div>
|
| 499 |
+
<div class="rc-weight">15%</div>
|
| 500 |
+
</div>
|
| 501 |
+
<div class="rc-bar-bg"><div class="rc-bar" style="width:50%"></div></div>
|
| 502 |
+
<div class="rc-desc">bandit + AST checks mapped to real CWE IDs</div>
|
| 503 |
+
</div>
|
| 504 |
+
<div class="reward-card">
|
| 505 |
+
<div class="rc-header">
|
| 506 |
+
<div class="rc-name">CodeGraph</div>
|
| 507 |
+
<div class="rc-weight">15%</div>
|
| 508 |
+
</div>
|
| 509 |
+
<div class="rc-bar-bg"><div class="rc-bar" style="width:50%"></div></div>
|
| 510 |
+
<div class="rc-desc">Consistency with existing codebase conventions across the episode</div>
|
| 511 |
+
</div>
|
| 512 |
+
<div class="reward-card">
|
| 513 |
+
<div class="rc-header">
|
| 514 |
+
<div class="rc-name">Performance</div>
|
| 515 |
+
<div class="rc-weight">10%</div>
|
| 516 |
+
</div>
|
| 517 |
+
<div class="rc-bar-bg"><div class="rc-bar" style="width:33%"></div></div>
|
| 518 |
+
<div class="rc-desc">timeit + tracemalloc scored relative to naive/optimal baselines</div>
|
| 519 |
+
</div>
|
| 520 |
+
<div class="reward-card">
|
| 521 |
+
<div class="rc-header">
|
| 522 |
+
<div class="rc-name">Documentation</div>
|
| 523 |
+
<div class="rc-weight">5%</div>
|
| 524 |
+
</div>
|
| 525 |
+
<div class="rc-bar-bg"><div class="rc-bar" style="width:17%"></div></div>
|
| 526 |
+
<div class="rc-desc">Docstring + type hint coverage across all submitted functions</div>
|
| 527 |
+
</div>
|
| 528 |
+
<div class="reward-card">
|
| 529 |
+
<div class="rc-header">
|
| 530 |
+
<div class="rc-name">Code Structure</div>
|
| 531 |
+
<div class="rc-weight">5%</div>
|
| 532 |
+
</div>
|
| 533 |
+
<div class="rc-bar-bg"><div class="rc-bar" style="width:17%"></div></div>
|
| 534 |
+
<div class="rc-desc">No bare print, no bare except, reasonable function size</div>
|
| 535 |
+
</div>
|
| 536 |
+
</div>
|
| 537 |
+
</section>
|
| 538 |
+
|
| 539 |
+
<!-- TASKS -->
|
| 540 |
+
<section>
|
| 541 |
+
<div class="section-title">9 Tasks Β· 3 Difficulty Levels</div>
|
| 542 |
+
<div class="tasks-grid">
|
| 543 |
+
<div class="diff-col">
|
| 544 |
+
<div class="diff-label diff-easy">Easy</div>
|
| 545 |
+
<div class="task-item">
|
| 546 |
+
<div class="task-name">Password Validator</div>
|
| 547 |
+
<div style="font-size:11px;color:var(--muted)">bcrypt hashing, strength rules</div>
|
| 548 |
+
<div class="task-cwes"><span class="cwe-tag">CWE-916</span><span class="cwe-tag">CWE-521</span></div>
|
| 549 |
+
</div>
|
| 550 |
+
<div class="task-item">
|
| 551 |
+
<div class="task-name">Input Sanitizer</div>
|
| 552 |
+
<div style="font-size:11px;color:var(--muted)">HTML escape, filename safety</div>
|
| 553 |
+
<div class="task-cwes"><span class="cwe-tag">CWE-20</span><span class="cwe-tag">CWE-116</span></div>
|
| 554 |
+
</div>
|
| 555 |
+
<div class="task-item">
|
| 556 |
+
<div class="task-name">Token Generator</div>
|
| 557 |
+
<div style="font-size:11px;color:var(--muted)">secrets module, CSPRNG</div>
|
| 558 |
+
<div class="task-cwes"><span class="cwe-tag">CWE-338</span><span class="cwe-tag">CWE-330</span></div>
|
| 559 |
+
</div>
|
| 560 |
+
</div>
|
| 561 |
+
<div class="diff-col">
|
| 562 |
+
<div class="diff-label diff-medium">Medium</div>
|
| 563 |
+
<div class="task-item">
|
| 564 |
+
<div class="task-name">SQL Query Builder</div>
|
| 565 |
+
<div style="font-size:11px;color:var(--muted)">Parameterized queries only</div>
|
| 566 |
+
<div class="task-cwes"><span class="cwe-tag">CWE-89</span></div>
|
| 567 |
+
</div>
|
| 568 |
+
<div class="task-item">
|
| 569 |
+
<div class="task-name">File Path Handler</div>
|
| 570 |
+
<div style="font-size:11px;color:var(--muted)">Path traversal prevention</div>
|
| 571 |
+
<div class="task-cwes"><span class="cwe-tag">CWE-22</span></div>
|
| 572 |
+
</div>
|
| 573 |
+
<div class="task-item">
|
| 574 |
+
<div class="task-name">Rate Limiter</div>
|
| 575 |
+
<div style="font-size:11px;color:var(--muted)">Thread-safe sliding window</div>
|
| 576 |
+
<div class="task-cwes"><span class="cwe-tag">CWE-770</span><span class="cwe-tag">CWE-400</span></div>
|
| 577 |
+
</div>
|
| 578 |
+
</div>
|
| 579 |
+
<div class="diff-col">
|
| 580 |
+
<div class="diff-label diff-hard">Hard</div>
|
| 581 |
+
<div class="task-item">
|
| 582 |
+
<div class="task-name">File Upload Handler</div>
|
| 583 |
+
<div style="font-size:11px;color:var(--muted)">MIME check, ext block, UUID path</div>
|
| 584 |
+
<div class="task-cwes"><span class="cwe-tag">CWE-22</span><span class="cwe-tag">CWE-434</span></div>
|
| 585 |
+
</div>
|
| 586 |
+
<div class="task-item">
|
| 587 |
+
<div class="task-name">JWT Validator</div>
|
| 588 |
+
<div style="font-size:11px;color:var(--muted)">alg:none blocked, expiry enforced</div>
|
| 589 |
+
<div class="task-cwes"><span class="cwe-tag">CWE-347</span><span class="cwe-tag">CWE-613</span></div>
|
| 590 |
+
</div>
|
| 591 |
+
<div class="task-item">
|
| 592 |
+
<div class="task-name">Auth Middleware</div>
|
| 593 |
+
<div style="font-size:11px;color:var(--muted)">CSRF + timing-safe Bearer auth</div>
|
| 594 |
+
<div class="task-cwes"><span class="cwe-tag">CWE-287</span><span class="cwe-tag">CWE-352</span></div>
|
| 595 |
+
</div>
|
| 596 |
+
</div>
|
| 597 |
+
</div>
|
| 598 |
+
</section>
|
| 599 |
+
|
| 600 |
+
<!-- QUICKSTART CODE -->
|
| 601 |
+
<section>
|
| 602 |
+
<div class="section-title">Quick Start</div>
|
| 603 |
+
<div class="code-block">
|
| 604 |
+
<div class="code-header">
|
| 605 |
+
<div class="code-dots"><span></span><span></span><span></span></div>
|
| 606 |
+
<div class="code-filename">quickstart.py</div>
|
| 607 |
+
<span class="badge badge-blue">Python</span>
|
| 608 |
+
</div>
|
| 609 |
+
<pre><span class="kw">import</span> requests
|
| 610 |
+
|
| 611 |
+
ENV_URL <span class="op">=</span> <span class="str">"https://vishaldhakad-securecodeenv.hf.space"</span>
|
| 612 |
+
|
| 613 |
+
<span class="cm"># 1. Start episode</span>
|
| 614 |
+
episode <span class="op">=</span> requests.<span class="fn">post</span>(<span class="str">f"{ENV_URL}/reset"</span>, json<span class="op">=</span>{<span class="str">"difficulty"</span>: <span class="str">"medium"</span>}).<span class="fn">json</span>()
|
| 615 |
+
sid <span class="op">=</span> episode[<span class="str">"session_id"</span>]
|
| 616 |
+
<span class="kw">print</span>(episode[<span class="str">"problem_statement"</span>])
|
| 617 |
+
|
| 618 |
+
<span class="cm"># 2. Submit code β gets graded across 7 dimensions</span>
|
| 619 |
+
result <span class="op">=</span> requests.<span class="fn">post</span>(<span class="str">f"{ENV_URL}/step"</span>, json<span class="op">=</span>{
|
| 620 |
+
<span class="str">"session_id"</span>: sid,
|
| 621 |
+
<span class="str">"code"</span>: <span class="str">"def build_user_query(u, r): return ('SELECT * FROM users WHERE username=%s', (u,))"</span>,
|
| 622 |
+
<span class="str">"filename"</span>: <span class="str">"solution.py"</span>,
|
| 623 |
+
}).<span class="fn">json</span>()
|
| 624 |
+
|
| 625 |
+
<span class="kw">print</span>(<span class="str">f"reward={result['total_reward']:.3f}"</span>)
|
| 626 |
+
<span class="kw">print</span>(<span class="str">f"scores={result['scores']}"</span>)
|
| 627 |
+
<span class="kw">print</span>(result[<span class="str">'feedback'</span>][<span class="str">'summary'</span>])</pre>
|
| 628 |
+
</div>
|
| 629 |
+
</section>
|
| 630 |
+
|
| 631 |
+
<!-- FOOTER -->
|
| 632 |
+
<footer class="wrap" style="max-width:unset;padding:0">
|
| 633 |
+
<div class="footer-text">
|
| 634 |
+
SecureCodeEnv v2.0 Β· Built by <a href="https://huggingface.co/vishaldhakad" target="_blank">Vishal Dhakad</a>
|
| 635 |
+
</div>
|
| 636 |
+
<div class="footer-text">
|
| 637 |
+
Meta Γ PyTorch <a href="https://www.scaler.com/school-of-technology/meta-pytorch-hackathon" target="_blank">OpenEnv Hackathon 2026</a>
|
| 638 |
+
</div>
|
| 639 |
+
</footer>
|
| 640 |
+
|
| 641 |
+
</div>
|
| 642 |
+
|
| 643 |
+
<script>
|
| 644 |
+
// Animate reward bars on load
|
| 645 |
+
document.addEventListener('DOMContentLoaded', () => {
|
| 646 |
+
const bars = document.querySelectorAll('.rc-bar');
|
| 647 |
+
bars.forEach(b => {
|
| 648 |
+
const w = b.style.width;
|
| 649 |
+
b.style.width = '0';
|
| 650 |
+
setTimeout(() => { b.style.width = w; }, 300);
|
| 651 |
+
});
|
| 652 |
+
});
|
| 653 |
+
|
| 654 |
+
// Live health ping β updates status dot
|
| 655 |
+
async function checkHealth() {
|
| 656 |
+
try {
|
| 657 |
+
const r = await fetch('/health');
|
| 658 |
+
const d = await r.json();
|
| 659 |
+
const dot = document.querySelector('.status-dot');
|
| 660 |
+
const label = document.querySelector('.status-left');
|
| 661 |
+
if (r.ok) {
|
| 662 |
+
dot.style.background = 'var(--accent3)';
|
| 663 |
+
dot.style.boxShadow = '0 0 8px var(--accent3)';
|
| 664 |
+
label.childNodes[1].textContent = ` Environment running Β· ${d.env} v${d.version} Β· ${d.tasks_loaded} tasks loaded`;
|
| 665 |
+
}
|
| 666 |
+
} catch(e) {}
|
| 667 |
+
}
|
| 668 |
+
checkHealth();
|
| 669 |
+
</script>
|
| 670 |
+
|
| 671 |
+
</body>
|
| 672 |
+
</html>'''
|
app/main.py
CHANGED
|
@@ -1,18 +1,22 @@
|
|
| 1 |
"""
|
| 2 |
-
SecureCodeEnv
|
| 3 |
-
|
| 4 |
-
|
| 5 |
"""
|
| 6 |
from fastapi import FastAPI
|
|
|
|
| 7 |
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
-
from .routes import router
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
app = FastAPI(
|
| 11 |
title="SecureCodeEnv",
|
| 12 |
description=(
|
| 13 |
-
"RL environment for training LLM agents to write production-ready, "
|
| 14 |
-
"
|
| 15 |
-
"
|
| 16 |
),
|
| 17 |
version="2.0.0",
|
| 18 |
docs_url="/docs",
|
|
@@ -29,28 +33,18 @@ app.add_middleware(
|
|
| 29 |
app.include_router(router)
|
| 30 |
|
| 31 |
|
| 32 |
-
@app.get("/health")
|
| 33 |
def health():
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
|
| 40 |
-
|
| 41 |
|
| 42 |
|
| 43 |
-
@app.get("/")
|
| 44 |
def root():
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
"version": "2.0.0",
|
| 48 |
-
"description": "RL environment for secure code generation training",
|
| 49 |
-
"endpoints": {
|
| 50 |
-
"reset": "POST /reset",
|
| 51 |
-
"step": "POST /step",
|
| 52 |
-
"state": "GET /state",
|
| 53 |
-
"health": "GET /health",
|
| 54 |
-
"docs": "GET /docs",
|
| 55 |
-
},
|
| 56 |
-
}
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - FastAPI Application Entry Point
|
| 3 |
+
Built for Meta x PyTorch OpenEnv Hackathon 2026
|
| 4 |
+
Author: Vishal Dhakad (vishaldhakad)
|
| 5 |
"""
|
| 6 |
from fastapi import FastAPI
|
| 7 |
+
from fastapi.responses import HTMLResponse
|
| 8 |
from fastapi.middleware.cors import CORSMiddleware
|
| 9 |
+
from app.routes import router
|
| 10 |
+
from app.models import HealthResponse
|
| 11 |
+
from app.dashboard import DASHBOARD_HTML
|
| 12 |
+
from tasks.task_registry import TASK_REGISTRY
|
| 13 |
|
| 14 |
app = FastAPI(
|
| 15 |
title="SecureCodeEnv",
|
| 16 |
description=(
|
| 17 |
+
"An RL environment for training LLM agents to write production-ready, secure Python code. "
|
| 18 |
+
"Agents are graded on correctness, attack resistance, CWE-based static analysis, "
|
| 19 |
+
"performance, and codebase consistency via a novel CodeGraph memory system."
|
| 20 |
),
|
| 21 |
version="2.0.0",
|
| 22 |
docs_url="/docs",
|
|
|
|
| 33 |
app.include_router(router)
|
| 34 |
|
| 35 |
|
| 36 |
+
@app.get("/health", response_model=HealthResponse, tags=["System"])
|
| 37 |
def health():
|
| 38 |
+
"""Health check β required by hackathon automated ping."""
|
| 39 |
+
return HealthResponse(
|
| 40 |
+
status="ok",
|
| 41 |
+
env="SecureCodeEnv",
|
| 42 |
+
version="2.0.0",
|
| 43 |
+
tasks_loaded=len(TASK_REGISTRY),
|
| 44 |
+
)
|
| 45 |
|
| 46 |
|
| 47 |
+
@app.get("/", response_class=HTMLResponse, include_in_schema=False)
|
| 48 |
def root():
|
| 49 |
+
"""HTML dashboard β shown on HuggingFace Spaces landing page."""
|
| 50 |
+
return HTMLResponse(content=DASHBOARD_HTML, status_code=200)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/models.py
CHANGED
|
@@ -1,58 +1,66 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
"""
|
| 5 |
-
from pydantic import BaseModel,
|
| 6 |
from typing import Optional, Dict, Any, List
|
| 7 |
|
| 8 |
|
| 9 |
class StepAction(BaseModel):
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
@field_validator("code")
|
| 16 |
-
@classmethod
|
| 17 |
-
def code_not_empty(cls, v: str) -> str:
|
| 18 |
-
if not v.strip():
|
| 19 |
-
raise ValueError("code cannot be empty")
|
| 20 |
-
if len(v) > 50_000:
|
| 21 |
-
raise ValueError("code exceeds 50KB limit β split into smaller modules")
|
| 22 |
-
return v
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
-
class
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
| 39 |
|
| 40 |
|
| 41 |
class ResetObservation(BaseModel):
|
| 42 |
session_id: str
|
| 43 |
task_id: str
|
| 44 |
-
problem_statement: str
|
| 45 |
-
difficulty: str
|
| 46 |
-
cwe_targets: List[str]
|
| 47 |
-
codegraph: Dict[str, Any]
|
| 48 |
-
starter_code: str
|
| 49 |
-
naive_baseline:
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
class StateResponse(BaseModel):
|
|
|
|
| 53 |
task_id: str
|
| 54 |
step: int
|
| 55 |
done: bool
|
| 56 |
codegraph: Dict[str, Any]
|
| 57 |
-
difficulty:
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Pydantic Models
|
| 3 |
+
All request/response types for the OpenEnv API contract.
|
| 4 |
"""
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
from typing import Optional, Dict, Any, List
|
| 7 |
|
| 8 |
|
| 9 |
class StepAction(BaseModel):
|
| 10 |
+
session_id: str = Field(..., description="Session ID returned from /reset")
|
| 11 |
+
code: str = Field(..., description="The agent's submitted Python source code")
|
| 12 |
+
filename: str = Field(
|
| 13 |
+
default="solution.py",
|
| 14 |
+
description="Logical filename for CodeGraph tracking e.g. 'src/auth/validator.py'"
|
| 15 |
+
)
|
| 16 |
+
task_id: Optional[str] = Field(None, description="Task ID (optional, validated against session)")
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
class StepObservation(BaseModel):
|
| 20 |
+
scores: Dict[str, float] = Field(..., description="Per-dimension scores 0.0-1.0")
|
| 21 |
+
total_reward: float = Field(..., description="Weighted final score 0.0-1.0")
|
| 22 |
+
feedback: Dict[str, str] = Field(..., description="Human-readable feedback per dimension")
|
| 23 |
+
codegraph: Dict[str, Any] = Field(..., description="Updated CodeGraph state")
|
| 24 |
+
done: bool = Field(..., description="Is the episode complete?")
|
| 25 |
+
step_count: int = Field(..., description="Current step number")
|
| 26 |
|
| 27 |
|
| 28 |
+
class ResetRequest(BaseModel):
|
| 29 |
+
difficulty: Optional[str] = Field(
|
| 30 |
+
default="medium",
|
| 31 |
+
description="Task difficulty: 'easy' | 'medium' | 'hard'"
|
| 32 |
+
)
|
| 33 |
+
session_id: Optional[str] = Field(
|
| 34 |
+
None,
|
| 35 |
+
description="Optional: reuse a session ID (for deterministic testing)"
|
| 36 |
+
)
|
| 37 |
|
| 38 |
|
| 39 |
class ResetObservation(BaseModel):
|
| 40 |
session_id: str
|
| 41 |
task_id: str
|
| 42 |
+
problem_statement: str = Field(..., description="Natural language task description")
|
| 43 |
+
difficulty: str = Field(..., description="'easy' | 'medium' | 'hard'")
|
| 44 |
+
cwe_targets: List[str] = Field(..., description="e.g. ['CWE-89', 'CWE-20']")
|
| 45 |
+
codegraph: Dict[str, Any] = Field(..., description="Current codebase context (empty for easy)")
|
| 46 |
+
starter_code: str = Field(default="", description="Buggy/incomplete starter code")
|
| 47 |
+
naive_baseline: Optional[Dict] = Field(
|
| 48 |
+
default=None,
|
| 49 |
+
description="Performance baseline for relative scoring"
|
| 50 |
+
)
|
| 51 |
|
| 52 |
|
| 53 |
class StateResponse(BaseModel):
|
| 54 |
+
session_id: str
|
| 55 |
task_id: str
|
| 56 |
step: int
|
| 57 |
done: bool
|
| 58 |
codegraph: Dict[str, Any]
|
| 59 |
+
difficulty: str
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class HealthResponse(BaseModel):
|
| 63 |
+
status: str
|
| 64 |
+
env: str
|
| 65 |
+
version: str
|
| 66 |
+
tasks_loaded: int
|
app/routes.py
CHANGED
|
@@ -1,151 +1,147 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
Critical endpoints:
|
| 5 |
-
POST /reset β start episode, pick task, init CodeGraph
|
| 6 |
-
POST /step β grade code submission, update CodeGraph
|
| 7 |
-
GET /state β read current episode state
|
| 8 |
-
|
| 9 |
-
Session key: UUID per agent β supports concurrent multi-agent usage.
|
| 10 |
"""
|
| 11 |
-
import uuid
|
| 12 |
from fastapi import APIRouter, HTTPException
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
from
|
| 19 |
from graders.reward_aggregator import grade_submission
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
router = APIRouter()
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"""
|
| 29 |
-
Start a new
|
| 30 |
-
|
| 31 |
-
creates a Redis-backed session, and returns the full observation.
|
| 32 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
if difficulty not in ("easy", "medium", "hard"):
|
| 34 |
-
raise HTTPException(400, f"difficulty must be easy
|
| 35 |
|
| 36 |
-
sid = session_id or str(uuid.uuid4())
|
| 37 |
task = sample_task(difficulty)
|
| 38 |
-
graph = CodeGraph(episode_seed=hash(sid) % 999_999)
|
| 39 |
|
| 40 |
-
state = EpisodeState(
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
done=False,
|
| 45 |
-
difficulty=difficulty,
|
| 46 |
-
)
|
| 47 |
-
store.save(sid, state)
|
| 48 |
|
|
|
|
| 49 |
return ResetObservation(
|
| 50 |
session_id=sid,
|
| 51 |
task_id=task["id"],
|
| 52 |
problem_statement=task["problem_statement"],
|
| 53 |
difficulty=difficulty,
|
| 54 |
cwe_targets=task["cwe_targets"],
|
| 55 |
-
codegraph=
|
| 56 |
starter_code=task.get("starter_code", ""),
|
| 57 |
-
naive_baseline=task.get("
|
| 58 |
)
|
| 59 |
|
| 60 |
|
| 61 |
-
#
|
| 62 |
-
|
| 63 |
-
|
|
|
|
| 64 |
def step(action: StepAction):
|
| 65 |
"""
|
| 66 |
-
Submit agent code for grading.
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
Episode terminates when:
|
| 70 |
-
- total_reward >= 0.90 (agent solved it well), OR
|
| 71 |
-
- step_count >= 5 (max steps reached)
|
| 72 |
"""
|
| 73 |
-
|
|
|
|
|
|
|
| 74 |
if state is None:
|
| 75 |
-
raise HTTPException(404, "Session not found β call POST /reset first")
|
| 76 |
if state.done:
|
| 77 |
-
raise HTTPException(400, "Episode already
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
-
# Run full grading pipeline
|
| 80 |
result = grade_submission(
|
| 81 |
code=action.code,
|
| 82 |
-
filename=action.filename,
|
| 83 |
task=state.task,
|
| 84 |
graph=state.graph,
|
| 85 |
step=state.step,
|
| 86 |
seed=state.graph.episode_seed + state.step,
|
| 87 |
)
|
| 88 |
|
| 89 |
-
# Update CodeGraph with new
|
| 90 |
-
state.graph.update(action.filename, result["new_metadata"])
|
| 91 |
state.step += 1
|
| 92 |
-
state.
|
| 93 |
|
| 94 |
-
#
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
# Clean up completed episodes (saves Redis commands)
|
| 98 |
-
if state.done:
|
| 99 |
-
store.delete(action.session_id)
|
| 100 |
|
|
|
|
| 101 |
return StepObservation(
|
| 102 |
scores=result["scores"],
|
| 103 |
total_reward=result["total_reward"],
|
| 104 |
feedback=result["feedback"],
|
| 105 |
-
codegraph=
|
| 106 |
done=state.done,
|
| 107 |
step_count=state.step,
|
| 108 |
)
|
| 109 |
|
| 110 |
|
| 111 |
-
#
|
| 112 |
-
|
| 113 |
-
|
|
|
|
| 114 |
def get_state(session_id: str):
|
| 115 |
"""
|
| 116 |
-
|
| 117 |
-
Useful for monitoring
|
| 118 |
"""
|
| 119 |
-
|
|
|
|
|
|
|
| 120 |
if state is None:
|
| 121 |
-
raise HTTPException(404, "Session not found
|
| 122 |
|
|
|
|
| 123 |
return StateResponse(
|
|
|
|
| 124 |
task_id=state.task["id"],
|
| 125 |
step=state.step,
|
| 126 |
done=state.done,
|
| 127 |
-
codegraph=
|
| 128 |
-
difficulty=state.difficulty,
|
| 129 |
-
cwe_targets=state.task.get("cwe_targets", []),
|
| 130 |
)
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
# ββ helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 134 |
-
|
| 135 |
-
def _graph_dict(graph: CodeGraph) -> dict:
|
| 136 |
-
"""Serialize CodeGraph to a JSON-safe dict."""
|
| 137 |
-
return {
|
| 138 |
-
"conventions": graph.conventions,
|
| 139 |
-
"episode_seed": graph.episode_seed,
|
| 140 |
-
"components": {
|
| 141 |
-
name: {
|
| 142 |
-
"file": comp.get("file", ""),
|
| 143 |
-
"language": comp.get("language", "py"),
|
| 144 |
-
"functions": comp.get("functions", []),
|
| 145 |
-
"imports": comp.get("imports", [])[:15],
|
| 146 |
-
"conventions": comp.get("conventions", {}),
|
| 147 |
-
"created_at_step": comp.get("created_at_step", 0),
|
| 148 |
-
}
|
| 149 |
-
for name, comp in graph.components.items()
|
| 150 |
-
},
|
| 151 |
-
}
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Route Handlers
|
| 3 |
+
Implements the three required OpenEnv endpoints: /reset, /step, /state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
|
|
|
| 5 |
from fastapi import APIRouter, HTTPException
|
| 6 |
+
from app.models import (
|
| 7 |
+
StepAction, StepObservation,
|
| 8 |
+
ResetRequest, ResetObservation,
|
| 9 |
+
StateResponse,
|
| 10 |
+
)
|
| 11 |
+
from app.state import EpisodeState
|
| 12 |
from graders.reward_aggregator import grade_submission
|
| 13 |
+
from tasks.task_registry import sample_task, get_task, TASK_REGISTRY
|
| 14 |
+
from codegraph.graph import CodeGraph
|
| 15 |
+
import uuid
|
| 16 |
+
import threading
|
| 17 |
|
| 18 |
router = APIRouter()
|
| 19 |
|
| 20 |
+
# In-memory session store (thread-safe with lock)
|
| 21 |
+
_sessions: dict[str, EpisodeState] = {}
|
| 22 |
+
_sessions_lock = threading.Lock()
|
| 23 |
|
| 24 |
+
MAX_STEPS = 5
|
| 25 |
+
DONE_THRESHOLD = 0.90
|
| 26 |
|
| 27 |
+
|
| 28 |
+
def _cleanup_expired():
|
| 29 |
+
"""Remove sessions older than 1 hour."""
|
| 30 |
+
with _sessions_lock:
|
| 31 |
+
expired = [k for k, v in _sessions.items() if v.is_expired()]
|
| 32 |
+
for k in expired:
|
| 33 |
+
del _sessions[k]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
# POST /reset
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
@router.post("/reset", response_model=ResetObservation, tags=["OpenEnv"])
|
| 40 |
+
def reset(body: ResetRequest = None):
|
| 41 |
"""
|
| 42 |
+
Start a new episode. Returns a task problem statement and initial CodeGraph.
|
| 43 |
+
Call this before every /step sequence.
|
|
|
|
| 44 |
"""
|
| 45 |
+
_cleanup_expired()
|
| 46 |
+
|
| 47 |
+
if body is None:
|
| 48 |
+
body = ResetRequest()
|
| 49 |
+
|
| 50 |
+
difficulty = (body.difficulty or "medium").lower()
|
| 51 |
if difficulty not in ("easy", "medium", "hard"):
|
| 52 |
+
raise HTTPException(400, f"difficulty must be 'easy', 'medium', or 'hard'. Got: {difficulty}")
|
| 53 |
|
| 54 |
+
sid = body.session_id or str(uuid.uuid4())
|
| 55 |
task = sample_task(difficulty)
|
| 56 |
+
graph = CodeGraph(episode_seed=abs(hash(sid)) % 999_999)
|
| 57 |
|
| 58 |
+
state = EpisodeState(task=task, graph=graph, step=0, done=False)
|
| 59 |
+
|
| 60 |
+
with _sessions_lock:
|
| 61 |
+
_sessions[sid] = state
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
from codegraph.serializer import serialize_graph
|
| 64 |
return ResetObservation(
|
| 65 |
session_id=sid,
|
| 66 |
task_id=task["id"],
|
| 67 |
problem_statement=task["problem_statement"],
|
| 68 |
difficulty=difficulty,
|
| 69 |
cwe_targets=task["cwe_targets"],
|
| 70 |
+
codegraph=serialize_graph(graph),
|
| 71 |
starter_code=task.get("starter_code", ""),
|
| 72 |
+
naive_baseline={"code": task.get("naive_code", "")},
|
| 73 |
)
|
| 74 |
|
| 75 |
|
| 76 |
+
# ---------------------------------------------------------------------------
|
| 77 |
+
# POST /step
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
@router.post("/step", response_model=StepObservation, tags=["OpenEnv"])
|
| 80 |
def step(action: StepAction):
|
| 81 |
"""
|
| 82 |
+
Submit agent code for grading. Returns multi-dimensional reward scores,
|
| 83 |
+
feedback, and updated CodeGraph.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
"""
|
| 85 |
+
with _sessions_lock:
|
| 86 |
+
state = _sessions.get(action.session_id)
|
| 87 |
+
|
| 88 |
if state is None:
|
| 89 |
+
raise HTTPException(404, "Session not found β call POST /reset first.")
|
| 90 |
if state.done:
|
| 91 |
+
raise HTTPException(400, "Episode already done β call POST /reset to start a new one.")
|
| 92 |
+
|
| 93 |
+
if not action.code or not action.code.strip():
|
| 94 |
+
raise HTTPException(422, "code field must be a non-empty Python string.")
|
| 95 |
|
|
|
|
| 96 |
result = grade_submission(
|
| 97 |
code=action.code,
|
| 98 |
+
filename=action.filename or "solution.py",
|
| 99 |
task=state.task,
|
| 100 |
graph=state.graph,
|
| 101 |
step=state.step,
|
| 102 |
seed=state.graph.episode_seed + state.step,
|
| 103 |
)
|
| 104 |
|
| 105 |
+
# Update CodeGraph with new component metadata
|
| 106 |
+
state.graph.update(action.filename or "solution.py", result["new_metadata"])
|
| 107 |
state.step += 1
|
| 108 |
+
state.scores_history.append(result["total_reward"])
|
| 109 |
|
| 110 |
+
# Episode is done when reward is high enough or max steps reached
|
| 111 |
+
state.done = result["total_reward"] >= DONE_THRESHOLD or state.step >= MAX_STEPS
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
from codegraph.serializer import serialize_graph
|
| 114 |
return StepObservation(
|
| 115 |
scores=result["scores"],
|
| 116 |
total_reward=result["total_reward"],
|
| 117 |
feedback=result["feedback"],
|
| 118 |
+
codegraph=serialize_graph(state.graph),
|
| 119 |
done=state.done,
|
| 120 |
step_count=state.step,
|
| 121 |
)
|
| 122 |
|
| 123 |
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
# GET /state
|
| 126 |
+
# ---------------------------------------------------------------------------
|
| 127 |
+
@router.get("/state", response_model=StateResponse, tags=["OpenEnv"])
|
| 128 |
def get_state(session_id: str):
|
| 129 |
"""
|
| 130 |
+
Returns current episode state without advancing it.
|
| 131 |
+
Useful for monitoring agent progress.
|
| 132 |
"""
|
| 133 |
+
with _sessions_lock:
|
| 134 |
+
state = _sessions.get(session_id)
|
| 135 |
+
|
| 136 |
if state is None:
|
| 137 |
+
raise HTTPException(404, "Session not found.")
|
| 138 |
|
| 139 |
+
from codegraph.serializer import serialize_graph
|
| 140 |
return StateResponse(
|
| 141 |
+
session_id=session_id,
|
| 142 |
task_id=state.task["id"],
|
| 143 |
step=state.step,
|
| 144 |
done=state.done,
|
| 145 |
+
codegraph=serialize_graph(state.graph),
|
| 146 |
+
difficulty=state.task.get("difficulty", "medium"),
|
|
|
|
| 147 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/session_store.py
DELETED
|
@@ -1,73 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
app/session_store.py β Redis abstraction with in-memory fallback.
|
| 3 |
-
|
| 4 |
-
V2 Fix: V1 used a plain dict β sessions lost on restart.
|
| 5 |
-
V2 uses Upstash Redis (free tier). If Redis is unavailable, falls back to
|
| 6 |
-
an in-memory dict so the episode never crashes. Worst case: sessions are
|
| 7 |
-
process-local again, same as V1.
|
| 8 |
-
|
| 9 |
-
The rest of the codebase never touches Redis directly β only load/save/delete.
|
| 10 |
-
"""
|
| 11 |
-
import os
|
| 12 |
-
import pickle
|
| 13 |
-
from typing import Optional
|
| 14 |
-
|
| 15 |
-
# ββ Lazy Redis client ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 16 |
-
_redis_client = None
|
| 17 |
-
_local_cache: dict = {} # In-memory fallback β activated when Redis is down
|
| 18 |
-
|
| 19 |
-
REDIS_URL = os.getenv("REDIS_URL", "")
|
| 20 |
-
SESSION_TTL = 3600 # 1 hour β episodes expire after inactivity
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
def _get_redis():
|
| 24 |
-
"""Lazy singleton. Returns Redis client or None if unavailable."""
|
| 25 |
-
global _redis_client
|
| 26 |
-
if _redis_client is not None:
|
| 27 |
-
return _redis_client
|
| 28 |
-
if not REDIS_URL:
|
| 29 |
-
return None
|
| 30 |
-
try:
|
| 31 |
-
import redis as redis_lib
|
| 32 |
-
_redis_client = redis_lib.from_url(REDIS_URL, decode_responses=False, socket_timeout=2)
|
| 33 |
-
_redis_client.ping() # Fail fast if connection is broken
|
| 34 |
-
return _redis_client
|
| 35 |
-
except Exception:
|
| 36 |
-
return None
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def load(session_id: str):
|
| 40 |
-
"""Fetch EpisodeState from Redis, fall back to local cache."""
|
| 41 |
-
key = f"session:{session_id}"
|
| 42 |
-
r = _get_redis()
|
| 43 |
-
if r:
|
| 44 |
-
try:
|
| 45 |
-
data = r.get(key)
|
| 46 |
-
return pickle.loads(data) if data else None
|
| 47 |
-
except Exception:
|
| 48 |
-
pass
|
| 49 |
-
# Fallback: local memory
|
| 50 |
-
return _local_cache.get(session_id)
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
def save(session_id: str, state) -> None:
|
| 54 |
-
"""Persist EpisodeState to Redis + local cache (dual write for resilience)."""
|
| 55 |
-
key = f"session:{session_id}"
|
| 56 |
-
_local_cache[session_id] = state # Always write locally
|
| 57 |
-
r = _get_redis()
|
| 58 |
-
if r:
|
| 59 |
-
try:
|
| 60 |
-
r.setex(key, SESSION_TTL, pickle.dumps(state))
|
| 61 |
-
except Exception:
|
| 62 |
-
pass # Redis outage β local cache is the fallback
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def delete(session_id: str) -> None:
|
| 66 |
-
"""Remove session after episode completes."""
|
| 67 |
-
_local_cache.pop(session_id, None)
|
| 68 |
-
r = _get_redis()
|
| 69 |
-
if r:
|
| 70 |
-
try:
|
| 71 |
-
r.delete(f"session:{session_id}")
|
| 72 |
-
except Exception:
|
| 73 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/state.py
CHANGED
|
@@ -1,15 +1,21 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
"""
|
| 5 |
from dataclasses import dataclass, field
|
| 6 |
-
from typing import
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
@dataclass
|
| 10 |
class EpisodeState:
|
| 11 |
-
task:
|
| 12 |
-
graph:
|
| 13 |
-
step: int
|
| 14 |
-
done: bool
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Episode State
|
| 3 |
+
Manages per-session state during an RL episode.
|
| 4 |
"""
|
| 5 |
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Optional
|
| 7 |
+
from codegraph.graph import CodeGraph
|
| 8 |
|
| 9 |
|
| 10 |
@dataclass
|
| 11 |
class EpisodeState:
|
| 12 |
+
task: dict
|
| 13 |
+
graph: CodeGraph
|
| 14 |
+
step: int = 0
|
| 15 |
+
done: bool = False
|
| 16 |
+
scores_history: list = field(default_factory=list)
|
| 17 |
+
created_at: float = field(default_factory=lambda: __import__('time').time())
|
| 18 |
+
|
| 19 |
+
def is_expired(self, ttl_seconds: int = 3600) -> bool:
|
| 20 |
+
"""Sessions expire after 1 hour to prevent memory leaks."""
|
| 21 |
+
return (__import__('time').time() - self.created_at) > ttl_seconds
|
codegraph/__init__.py
CHANGED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# codegraph/__init__.py
|
|
|
|
|
|
codegraph/extractor.py
CHANGED
|
@@ -1,139 +1,128 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
V2 uses tree-sitter β Python + JS + TS + TSX with same API.
|
| 6 |
-
V2 also returns structured SyntaxError with line + message β agent can fix it.
|
| 7 |
-
|
| 8 |
-
tree-sitter is error-tolerant: returns a partial parse tree even for broken code,
|
| 9 |
-
so we always get *some* metadata even from syntactically broken submissions.
|
| 10 |
"""
|
| 11 |
-
import ast
|
| 12 |
-
from
|
| 13 |
-
|
| 14 |
-
# ββ tree-sitter setup βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 15 |
-
_PARSERS: Dict[str, Any] = {}
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def _get_parser(ext: str):
|
| 19 |
-
"""Lazy-load language parser. Falls back to Python if grammar unavailable."""
|
| 20 |
-
global _PARSERS
|
| 21 |
-
if ext in _PARSERS:
|
| 22 |
-
return _PARSERS[ext]
|
| 23 |
-
try:
|
| 24 |
-
from tree_sitter import Language, Parser
|
| 25 |
-
if ext in (".py",):
|
| 26 |
-
import tree_sitter_python as tspython
|
| 27 |
-
lang = Language(tspython.language())
|
| 28 |
-
elif ext in (".js", ".ts", ".tsx", ".jsx"):
|
| 29 |
-
import tree_sitter_javascript as tsjavascript
|
| 30 |
-
lang = Language(tsjavascript.language())
|
| 31 |
-
else:
|
| 32 |
-
import tree_sitter_python as tspython
|
| 33 |
-
lang = Language(tspython.language())
|
| 34 |
-
parser = Parser(lang)
|
| 35 |
-
_PARSERS[ext] = parser
|
| 36 |
-
return parser
|
| 37 |
-
except Exception:
|
| 38 |
-
# tree-sitter not installed β signal caller to use ast-only path
|
| 39 |
-
_PARSERS[ext] = None
|
| 40 |
-
return None
|
| 41 |
|
| 42 |
|
| 43 |
-
def extract_metadata(code: str, filename: str, step: int) ->
|
| 44 |
"""
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
Returns:
|
| 48 |
-
dict with keys: status, functions, imports, conventions, language, created_at_step
|
| 49 |
-
On syntax error: status='syntax_error', error, line, col, feedback
|
| 50 |
-
|
| 51 |
-
V2 guarantee: always returns a dict, never raises.
|
| 52 |
"""
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
"
|
| 66 |
-
"
|
| 67 |
-
"
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
):
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
imports += [a.name for a in node.names]
|
| 113 |
-
if isinstance(node, pyast.ImportFrom) and node.module:
|
| 114 |
-
imports.append(node.module)
|
| 115 |
-
except Exception:
|
| 116 |
-
pass
|
| 117 |
-
|
| 118 |
conventions = {
|
| 119 |
-
"uses_try_catch": "try:" in code or "
|
| 120 |
-
"uses_type_hints": ("
|
|
|
|
| 121 |
"no_print_stmts": "print(" not in code,
|
| 122 |
-
"
|
| 123 |
-
"
|
|
|
|
| 124 |
}
|
| 125 |
|
| 126 |
-
return
|
| 127 |
-
|
| 128 |
-
"
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Metadata Extractor
|
| 3 |
+
Uses Python's built-in AST module to extract component metadata for CodeGraph.
|
| 4 |
+
No external dependencies required.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
+
import ast
|
| 7 |
+
from codegraph.graph import ComponentMetadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
+
def extract_metadata(code: str, filename: str, step: int) -> ComponentMetadata:
|
| 11 |
"""
|
| 12 |
+
Parse Python source code and extract structured metadata.
|
| 13 |
+
Returns a ComponentMetadata even on SyntaxError (with error info).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
+
try:
|
| 16 |
+
tree = ast.parse(code)
|
| 17 |
+
except SyntaxError as e:
|
| 18 |
+
# V2: Return structured error instead of empty object
|
| 19 |
+
return ComponentMetadata(
|
| 20 |
+
file=filename,
|
| 21 |
+
component_type="error",
|
| 22 |
+
imports=[],
|
| 23 |
+
exports=[],
|
| 24 |
+
functions=[],
|
| 25 |
+
api_calls=[],
|
| 26 |
+
conventions={
|
| 27 |
+
"syntax_error": True,
|
| 28 |
+
"error_line": e.lineno,
|
| 29 |
+
"error_msg": str(e.msg),
|
| 30 |
+
},
|
| 31 |
+
created_at_step=step,
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
imports: list[str] = []
|
| 35 |
+
exports: list[str] = []
|
| 36 |
+
functions: list[dict] = []
|
| 37 |
+
api_calls: list[str] = []
|
| 38 |
+
|
| 39 |
+
for node in ast.walk(tree):
|
| 40 |
+
# --- Imports ---
|
| 41 |
+
if isinstance(node, ast.Import):
|
| 42 |
+
imports += [alias.name for alias in node.names]
|
| 43 |
+
elif isinstance(node, ast.ImportFrom) and node.module:
|
| 44 |
+
module = node.module
|
| 45 |
+
names = [alias.name for alias in node.names]
|
| 46 |
+
imports.append(f"{module}.{names}")
|
| 47 |
+
|
| 48 |
+
# --- Functions (def and async def) ---
|
| 49 |
+
elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
| 50 |
+
returns_annotation = None
|
| 51 |
+
if node.returns is not None:
|
| 52 |
+
try:
|
| 53 |
+
returns_annotation = ast.unparse(node.returns)
|
| 54 |
+
except Exception:
|
| 55 |
+
returns_annotation = str(node.returns)
|
| 56 |
+
|
| 57 |
+
has_type_hints = bool(
|
| 58 |
+
node.returns is not None or
|
| 59 |
+
any(a.annotation is not None for a in node.args.args)
|
| 60 |
+
)
|
| 61 |
+
|
| 62 |
+
functions.append({
|
| 63 |
+
"name": node.name,
|
| 64 |
+
"args": [a.arg for a in node.args.args],
|
| 65 |
+
"returns": returns_annotation,
|
| 66 |
+
"has_docstring": bool(ast.get_docstring(node)),
|
| 67 |
+
"has_type_hints": has_type_hints,
|
| 68 |
+
"is_async": isinstance(node, ast.AsyncFunctionDef),
|
| 69 |
+
})
|
| 70 |
+
|
| 71 |
+
# --- API calls (requests, fetch, httpx, aiohttp) ---
|
| 72 |
+
elif isinstance(node, ast.Call):
|
| 73 |
+
try:
|
| 74 |
+
call_str = ast.unparse(node)
|
| 75 |
+
if any(
|
| 76 |
+
p in call_str
|
| 77 |
+
for p in ["requests.get", "requests.post", "requests.put",
|
| 78 |
+
"httpx.", "aiohttp.", "fetch(", "axios."]
|
| 79 |
):
|
| 80 |
+
api_calls.append(call_str[:120])
|
| 81 |
+
except Exception:
|
| 82 |
+
pass
|
| 83 |
+
|
| 84 |
+
# Detect __all__ exports
|
| 85 |
+
for node in ast.walk(tree):
|
| 86 |
+
if isinstance(node, ast.Assign):
|
| 87 |
+
for target in node.targets:
|
| 88 |
+
if isinstance(target, ast.Name) and target.id == "__all__":
|
| 89 |
+
try:
|
| 90 |
+
exports = [elt.s for elt in node.value.elts if isinstance(elt, ast.Constant)]
|
| 91 |
+
except Exception:
|
| 92 |
+
pass
|
| 93 |
+
|
| 94 |
+
# Style convention detection
|
| 95 |
+
code_lower = code.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
conventions = {
|
| 97 |
+
"uses_try_catch": "try:" in code or "except" in code,
|
| 98 |
+
"uses_type_hints": any(f["has_type_hints"] for f in functions),
|
| 99 |
+
"uses_docstrings": any(f["has_docstring"] for f in functions),
|
| 100 |
"no_print_stmts": "print(" not in code,
|
| 101 |
+
"no_hardcoded_secrets": not _has_hardcoded_secrets(code),
|
| 102 |
+
"uses_logging": "logging." in code or "logger." in code,
|
| 103 |
+
"has_main_guard": 'if __name__ == "__main__"' in code or "if __name__ == '__main__'" in code,
|
| 104 |
}
|
| 105 |
|
| 106 |
+
return ComponentMetadata(
|
| 107 |
+
file=filename,
|
| 108 |
+
component_type="module" if len(functions) > 1 else "function",
|
| 109 |
+
imports=imports,
|
| 110 |
+
exports=exports,
|
| 111 |
+
functions=functions,
|
| 112 |
+
api_calls=api_calls,
|
| 113 |
+
conventions=conventions,
|
| 114 |
+
created_at_step=step,
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _has_hardcoded_secrets(code: str) -> bool:
|
| 119 |
+
"""Heuristic: detect probable hardcoded credentials."""
|
| 120 |
+
import re
|
| 121 |
+
secret_patterns = [
|
| 122 |
+
r'(?i)(password|passwd|pwd|secret|api_key|apikey|token)\s*=\s*["\'][^"\']{4,}["\']',
|
| 123 |
+
r'(?i)(aws_secret|private_key)\s*=\s*["\'][^"\']{8,}["\']',
|
| 124 |
+
]
|
| 125 |
+
for pattern in secret_patterns:
|
| 126 |
+
if re.search(pattern, code):
|
| 127 |
+
return True
|
| 128 |
+
return False
|
codegraph/graph.py
CHANGED
|
@@ -1,112 +1,125 @@
|
|
| 1 |
"""
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
Persisted in Redis between steps via pickle.
|
| 7 |
|
| 8 |
-
|
| 9 |
-
- tree-sitter replaces ast module β supports Python, JS, TS, TSX
|
| 10 |
-
- 60% threshold for style detection (was 50%) β prevents false penalties
|
| 11 |
-
- "mixed" state added β no penalty when codebase has no clear dominant style
|
| 12 |
-
- compress_graph() added β semantic compression for inference context
|
| 13 |
"""
|
| 14 |
from dataclasses import dataclass, field
|
| 15 |
-
from
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
@dataclass
|
| 20 |
class CodeGraph:
|
|
|
|
|
|
|
|
|
|
| 21 |
episode_seed: int = 0
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
return # Don't pollute graph with broken code
|
| 29 |
-
name = _file_to_key(filename)
|
| 30 |
-
metadata["file"] = filename
|
| 31 |
self.components[name] = metadata
|
| 32 |
self._infer_conventions()
|
|
|
|
| 33 |
|
| 34 |
-
def _infer_conventions(self)
|
| 35 |
"""
|
| 36 |
-
Derive dominant
|
| 37 |
-
60%
|
| 38 |
-
|
| 39 |
"""
|
| 40 |
-
all_fns = [
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
else:
|
| 50 |
-
self.conventions["naming"] = "
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
)
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
"
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
}
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
# Further trim: drop imports when still over limit
|
| 89 |
-
for name in slim["components"]:
|
| 90 |
-
slim["components"][name].pop("imports", None)
|
| 91 |
-
result = json.dumps(slim, indent=2)[:limit]
|
| 92 |
-
return result
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
# ββ helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 96 |
-
|
| 97 |
-
def _file_to_key(filename: str) -> str:
|
| 98 |
-
"""Convert 'src/auth/UserAuth.py' β 'UserAuth'"""
|
| 99 |
-
base = filename.split("/")[-1]
|
| 100 |
-
for ext in (".py", ".js", ".ts", ".tsx", ".jsx"):
|
| 101 |
-
base = base.replace(ext, "")
|
| 102 |
-
return base
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
def _naming_style(name: str) -> str:
|
| 106 |
-
if "_" in name:
|
| 107 |
-
return "snake_case"
|
| 108 |
-
if name and name[0].isupper():
|
| 109 |
-
return "PascalCase"
|
| 110 |
-
if any(c.isupper() for c in name[1:]):
|
| 111 |
-
return "camelCase"
|
| 112 |
-
return "snake_case" # all-lowercase defaults to snake
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - CodeGraph V2
|
| 3 |
+
A structured in-memory database of everything the agent has written in the current episode.
|
| 4 |
+
This is the innovation that makes SecureCodeEnv unique among ALL RL environments.
|
| 5 |
|
| 6 |
+
Without CodeGraph: Agent writes UserAuth.py in camelCase, Dashboard.py in snake_case.
|
| 7 |
+
No existing RL environment penalizes this inconsistency.
|
|
|
|
| 8 |
|
| 9 |
+
With CodeGraph: Every convention violation costs reward. Agent learns to be consistent.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
from dataclasses import dataclass, field
|
| 12 |
+
from typing import Dict, List, Optional, Any
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
|
| 16 |
+
class FunctionSignature:
|
| 17 |
+
name: str
|
| 18 |
+
args: List[str]
|
| 19 |
+
returns: Optional[str]
|
| 20 |
+
has_docstring: bool
|
| 21 |
+
has_type_hints: bool
|
| 22 |
+
is_async: bool = False
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@dataclass
|
| 26 |
+
class ComponentMetadata:
|
| 27 |
+
file: str
|
| 28 |
+
component_type: str # 'function' | 'class' | 'module'
|
| 29 |
+
imports: List[str]
|
| 30 |
+
exports: List[str]
|
| 31 |
+
functions: List[dict] # FunctionSignature as dicts for JSON serialization
|
| 32 |
+
api_calls: List[str]
|
| 33 |
+
conventions: dict # Detected style conventions
|
| 34 |
+
created_at_step: int
|
| 35 |
+
language: str = "python" # 'python' | 'javascript' | 'typescript'
|
| 36 |
+
|
| 37 |
+
def to_dict(self) -> dict:
|
| 38 |
+
return {
|
| 39 |
+
"file": self.file,
|
| 40 |
+
"component_type": self.component_type,
|
| 41 |
+
"imports": self.imports,
|
| 42 |
+
"exports": self.exports,
|
| 43 |
+
"functions": self.functions,
|
| 44 |
+
"api_calls": self.api_calls,
|
| 45 |
+
"conventions": self.conventions,
|
| 46 |
+
"created_at_step": self.created_at_step,
|
| 47 |
+
"language": self.language,
|
| 48 |
+
}
|
| 49 |
|
| 50 |
|
| 51 |
@dataclass
|
| 52 |
class CodeGraph:
|
| 53 |
+
components: Dict[str, ComponentMetadata] = field(default_factory=dict)
|
| 54 |
+
conventions: dict = field(default_factory=dict) # Inferred dominant codebase style
|
| 55 |
+
dependencies: dict = field(default_factory=dict) # Imported package names
|
| 56 |
episode_seed: int = 0
|
| 57 |
+
|
| 58 |
+
def update(self, filename: str, metadata: ComponentMetadata):
|
| 59 |
+
"""Add or replace a component and re-derive dominant conventions."""
|
| 60 |
+
name = filename.split("/")[-1]
|
| 61 |
+
for ext in (".py", ".js", ".ts", ".tsx", ".jsx"):
|
| 62 |
+
name = name.replace(ext, "")
|
|
|
|
|
|
|
|
|
|
| 63 |
self.components[name] = metadata
|
| 64 |
self._infer_conventions()
|
| 65 |
+
self._track_dependencies(metadata)
|
| 66 |
|
| 67 |
+
def _infer_conventions(self):
|
| 68 |
"""
|
| 69 |
+
Derive dominant code style from ALL existing components.
|
| 70 |
+
Threshold: >60% majority (not >50%) to avoid false positives on small samples.
|
| 71 |
+
Adds 'mixed' state when split is too close.
|
| 72 |
"""
|
| 73 |
+
all_fns = [f for c in self.components.values() for f in c.functions]
|
| 74 |
+
if not all_fns:
|
| 75 |
+
return
|
| 76 |
+
|
| 77 |
+
total = len(all_fns)
|
| 78 |
+
threshold = 0.60 # V2: raised from 50% to 60%
|
| 79 |
+
|
| 80 |
+
# Naming convention
|
| 81 |
+
snake = sum(1 for f in all_fns if "_" in f["name"] or f["name"].islower())
|
| 82 |
+
camel = sum(1 for f in all_fns if f["name"] and f["name"][0].islower() and any(c.isupper() for c in f["name"]))
|
| 83 |
+
if snake / total > threshold:
|
| 84 |
+
self.conventions["naming"] = "snake_case"
|
| 85 |
+
elif camel / total > threshold:
|
| 86 |
+
self.conventions["naming"] = "camelCase"
|
| 87 |
else:
|
| 88 |
+
self.conventions["naming"] = "mixed"
|
| 89 |
+
|
| 90 |
+
# Error handling
|
| 91 |
+
uses_try = [c for c in self.components.values() if c.conventions.get("uses_try_catch")]
|
| 92 |
+
self.conventions["error_handling"] = "try_catch" if len(uses_try) > 0 else "none"
|
| 93 |
+
|
| 94 |
+
# Type hints
|
| 95 |
+
typed = [c for c in self.components.values() if c.conventions.get("uses_type_hints")]
|
| 96 |
+
self.conventions["uses_type_hints"] = len(typed) / max(len(self.components), 1) > threshold
|
| 97 |
+
|
| 98 |
+
# Docstrings
|
| 99 |
+
documented = [c for c in self.components.values() if c.conventions.get("uses_docstrings")]
|
| 100 |
+
self.conventions["uses_docstrings"] = len(documented) / max(len(self.components), 1) > threshold
|
| 101 |
+
|
| 102 |
+
def _track_dependencies(self, metadata: ComponentMetadata):
|
| 103 |
+
"""Track all imported packages for supply chain security checks."""
|
| 104 |
+
for imp in metadata.imports:
|
| 105 |
+
pkg = imp.split(".")[0]
|
| 106 |
+
if pkg:
|
| 107 |
+
self.dependencies[pkg] = True
|
| 108 |
+
|
| 109 |
+
def to_context_prompt(self) -> str:
|
| 110 |
+
"""Serialize to natural language for the agent's observation."""
|
| 111 |
+
if not self.components:
|
| 112 |
+
return "=== CODEBASE CONTEXT: Empty (this is the first component) ==="
|
| 113 |
+
|
| 114 |
+
lines = ["=== EXISTING CODEBASE CONTEXT ==="]
|
| 115 |
+
lines.append(f"Conventions: {self.conventions}")
|
| 116 |
+
lines.append("")
|
| 117 |
+
|
| 118 |
+
for name, comp in list(self.components.items())[:5]: # Cap at 5 most recent
|
| 119 |
+
lines.append(f"Component: {name} ({comp.file})")
|
| 120 |
+
fn_names = [f["name"] for f in comp.functions[:5]]
|
| 121 |
+
lines.append(f" Functions: {fn_names}")
|
| 122 |
+
lines.append(f" Imports: {comp.imports[:4]}")
|
| 123 |
+
lines.append(f" Conventions: {comp.conventions}")
|
| 124 |
+
|
| 125 |
+
return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
codegraph/serializer.py
CHANGED
|
@@ -1,25 +1,21 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
return {
|
| 8 |
-
"
|
| 9 |
"conventions": graph.conventions,
|
| 10 |
-
"
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
"functions": comp.get("functions", [])[:20],
|
| 15 |
-
"imports": comp.get("imports", [])[:15],
|
| 16 |
-
"conventions": comp.get("conventions", {}),
|
| 17 |
-
"created_at_step": comp.get("created_at_step", 0),
|
| 18 |
-
}
|
| 19 |
-
for name, comp in graph.components.items()
|
| 20 |
-
},
|
| 21 |
}
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def to_json(graph: CodeGraph) -> str:
|
| 25 |
-
return json.dumps(to_dict(graph), indent=2)
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv - CodeGraph Serializer
|
| 3 |
+
Converts CodeGraph to JSON-serializable dict for API responses.
|
| 4 |
+
"""
|
| 5 |
+
from codegraph.graph import CodeGraph
|
| 6 |
|
| 7 |
|
| 8 |
+
def serialize_graph(graph: CodeGraph) -> dict:
|
| 9 |
+
"""Serialize CodeGraph to a clean JSON-compatible dict."""
|
| 10 |
+
components_dict = {}
|
| 11 |
+
for name, comp in graph.components.items():
|
| 12 |
+
components_dict[name] = comp.to_dict()
|
| 13 |
+
|
| 14 |
return {
|
| 15 |
+
"components": components_dict,
|
| 16 |
"conventions": graph.conventions,
|
| 17 |
+
"dependencies": graph.dependencies,
|
| 18 |
+
"episode_seed": graph.episode_seed,
|
| 19 |
+
"component_count": len(graph.components),
|
| 20 |
+
"context_prompt": graph.to_context_prompt(),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
}
|
|
|
|
|
|
|
|
|
|
|
|
graders/__init__.py
CHANGED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# graders/__init__.py
|
|
|
|
|
|
graders/attacks.py
CHANGED
|
@@ -1,320 +1,164 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
V1 checked return values: if '..' not in result β passed.
|
| 7 |
-
FLAW: agent could return '/safe/path' while opening '../../etc/passwd'.
|
| 8 |
-
|
| 9 |
-
V2 checks SIDE EFFECTS via 4 harness types:
|
| 10 |
-
- fs: hooks builtins.open via sys.settrace
|
| 11 |
-
- sql: spies on sqlite3.Cursor.execute at C-extension level
|
| 12 |
-
- subprocess: replaces subprocess.run + os.system before agent code loads
|
| 13 |
-
- jwt: checks if a truthy result is returned for a known-bad credential
|
| 14 |
-
|
| 15 |
-
The only way to score well: write genuinely secure code.
|
| 16 |
-
Payloads are seeded per episode β agent cannot memorise them.
|
| 17 |
"""
|
| 18 |
-
import textwrap
|
| 19 |
-
import json
|
| 20 |
from sandbox.executor import safe_exec
|
| 21 |
from sandbox.payload_gen import get_payloads
|
| 22 |
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
""
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
builtins.open = _spy_open
|
| 39 |
-
|
| 40 |
-
{code}
|
| 41 |
-
|
| 42 |
-
try:
|
| 43 |
-
result = {fn}({repr(payload)})
|
| 44 |
-
outcome = "ok"
|
| 45 |
-
except ValueError as e:
|
| 46 |
-
result, outcome = None, "blocked_correctly"
|
| 47 |
-
except Exception as e:
|
| 48 |
-
result, outcome = None, f"exception:{{type(e).__name__}}"
|
| 49 |
-
|
| 50 |
-
builtins.open = _real_open
|
| 51 |
-
print(json.dumps({{"outcome": outcome, "result": str(result)[:200], "touched": touched}}))
|
| 52 |
-
""")
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
def _wrap_sql(code: str, fn: str, payload) -> str:
|
| 56 |
-
"""SQL harness: spies on Cursor.execute to detect payload-in-SQL."""
|
| 57 |
-
return textwrap.dedent(f"""
|
| 58 |
-
import json, sqlite3
|
| 59 |
-
|
| 60 |
-
queries = []
|
| 61 |
-
_real_exec = sqlite3.Cursor.execute
|
| 62 |
-
|
| 63 |
-
def _spy_exec(self, sql, params=None):
|
| 64 |
-
queries.append({{
|
| 65 |
-
"sql": sql,
|
| 66 |
-
"params": list(params) if params else [],
|
| 67 |
-
"payload_in_sql": {repr(str(payload))} in sql
|
| 68 |
-
}})
|
| 69 |
-
if params:
|
| 70 |
-
return _real_exec(self, sql, params)
|
| 71 |
-
return _real_exec(self, sql)
|
| 72 |
-
|
| 73 |
-
sqlite3.Cursor.execute = _spy_exec
|
| 74 |
-
|
| 75 |
-
conn = sqlite3.connect(":memory:")
|
| 76 |
-
conn.execute("CREATE TABLE users(id INT, name TEXT, role TEXT, password TEXT)")
|
| 77 |
-
conn.execute("INSERT INTO users VALUES(1,'alice','admin','secret')")
|
| 78 |
-
conn.execute("INSERT INTO users VALUES(2,'bob','user','pass')")
|
| 79 |
-
conn.commit()
|
| 80 |
-
|
| 81 |
-
{code}
|
| 82 |
-
|
| 83 |
-
try:
|
| 84 |
-
result = {fn}(conn, {repr(payload)})
|
| 85 |
-
outcome = "ok"
|
| 86 |
-
except Exception as e:
|
| 87 |
-
result, outcome = None, f"exception:{{type(e).__name__}}"
|
| 88 |
-
|
| 89 |
-
sqlite3.Cursor.execute = _real_exec
|
| 90 |
-
print(json.dumps({{"outcome": outcome, "result": str(result)[:200], "queries": queries}}))
|
| 91 |
-
""")
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
def _wrap_subprocess(code: str, fn: str, payload) -> str:
|
| 95 |
-
"""Subprocess harness: blocks all shell invocations."""
|
| 96 |
-
return textwrap.dedent(f"""
|
| 97 |
-
import json, subprocess, os
|
| 98 |
-
|
| 99 |
-
shell_calls = []
|
| 100 |
-
_real_run = subprocess.run
|
| 101 |
-
_real_popen = subprocess.Popen
|
| 102 |
-
|
| 103 |
-
def _spy_run(cmd, *a, **kw):
|
| 104 |
-
shell_calls.append({{"cmd": str(cmd), "shell": kw.get("shell", False)}})
|
| 105 |
-
raise PermissionError("Shell execution blocked by security sandbox")
|
| 106 |
-
|
| 107 |
-
def _spy_popen(cmd, *a, **kw):
|
| 108 |
-
shell_calls.append({{"cmd": str(cmd), "shell": kw.get("shell", False)}})
|
| 109 |
-
raise PermissionError("Shell execution blocked by security sandbox")
|
| 110 |
-
|
| 111 |
-
subprocess.run = _spy_run
|
| 112 |
-
subprocess.Popen = _spy_popen
|
| 113 |
-
os.system = lambda cmd: (shell_calls.append({{"cmd": cmd, "shell": True}}), 1)[1]
|
| 114 |
-
os.popen = lambda cmd, *a: (shell_calls.append({{"cmd": cmd, "shell": True}}), None)[1]
|
| 115 |
-
|
| 116 |
-
{code}
|
| 117 |
-
|
| 118 |
-
try:
|
| 119 |
-
result = {fn}({repr(payload)})
|
| 120 |
-
outcome = "ok"
|
| 121 |
-
except PermissionError:
|
| 122 |
-
result, outcome = None, "blocked_correctly"
|
| 123 |
-
except Exception as e:
|
| 124 |
-
result, outcome = None, f"exception:{{type(e).__name__}}"
|
| 125 |
-
|
| 126 |
-
subprocess.run = _real_run
|
| 127 |
-
subprocess.Popen = _real_popen
|
| 128 |
-
print(json.dumps({{"outcome": outcome, "result": str(result)[:200], "shell_calls": shell_calls}}))
|
| 129 |
-
""")
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
def _wrap_jwt(code: str, fn: str, payload) -> str:
|
| 133 |
-
"""JWT/boolean harness: checks if a truthy result is returned for a known-bad credential."""
|
| 134 |
-
return textwrap.dedent(f"""
|
| 135 |
-
import json
|
| 136 |
-
|
| 137 |
-
{code}
|
| 138 |
-
|
| 139 |
-
try:
|
| 140 |
-
result = {fn}({repr(payload)})
|
| 141 |
-
accepted = bool(result) and result not in (None, False, 0, "", [], {{}})
|
| 142 |
-
outcome = "accepted" if accepted else "rejected"
|
| 143 |
-
except Exception as e:
|
| 144 |
-
outcome = f"exception:{{type(e).__name__}}"
|
| 145 |
-
accepted = False
|
| 146 |
-
result = None
|
| 147 |
-
|
| 148 |
-
print(json.dumps({{
|
| 149 |
-
"outcome": outcome,
|
| 150 |
-
"accepted": accepted,
|
| 151 |
-
"result": str(result if result is not None else None)[:100]
|
| 152 |
-
}}))
|
| 153 |
-
""")
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
HARNESS = {
|
| 157 |
-
"fs": _wrap_fs,
|
| 158 |
-
"sql": _wrap_sql,
|
| 159 |
-
"subprocess": _wrap_subprocess,
|
| 160 |
-
"jwt": _wrap_jwt,
|
| 161 |
-
}
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
# ββ ATTACK CONFIGURATIONS βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 165 |
-
|
| 166 |
-
ATTACK_CONFIGS = {
|
| 167 |
-
"password_validator": {
|
| 168 |
-
"attack_type": "weak_password",
|
| 169 |
-
"fn": "validate_password",
|
| 170 |
-
"harness": "jwt",
|
| 171 |
-
"check": lambda d, p: not d.get("accepted", True), # Must reject weak passwords
|
| 172 |
-
},
|
| 173 |
-
"input_sanitizer": {
|
| 174 |
-
"attack_type": "xss",
|
| 175 |
-
"fn": "sanitize_input",
|
| 176 |
-
"harness": "fs",
|
| 177 |
-
"check": lambda d, p: (
|
| 178 |
-
# XSS tags must be escaped or removed; no unexpected file access
|
| 179 |
-
"<script" not in str(d.get("result", "")).lower()
|
| 180 |
-
and "onerror" not in str(d.get("result", "")).lower()
|
| 181 |
-
and not d.get("touched", [])
|
| 182 |
-
),
|
| 183 |
-
},
|
| 184 |
-
"hash_generator": {
|
| 185 |
-
"attack_type": "weak_hash",
|
| 186 |
-
"fn": "generate_hash",
|
| 187 |
-
"harness": "subprocess",
|
| 188 |
-
"check": lambda d, p: len(d.get("shell_calls", [])) == 0, # No shell for hashing
|
| 189 |
-
},
|
| 190 |
-
"sql_query_builder": {
|
| 191 |
-
"attack_type": "sqli",
|
| 192 |
-
"fn": "build_query",
|
| 193 |
-
"harness": "sql",
|
| 194 |
-
"check": lambda d, p: (
|
| 195 |
-
# Payload must NOT appear inside the SQL string (parameterised queries)
|
| 196 |
-
all(not q.get("payload_in_sql") for q in d.get("queries", []))
|
| 197 |
-
and len(d.get("queries", [])) > 0 # Must have actually executed a query
|
| 198 |
),
|
|
|
|
| 199 |
},
|
| 200 |
-
"
|
| 201 |
-
"
|
| 202 |
-
"
|
| 203 |
-
"
|
| 204 |
-
"check": lambda
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
)
|
| 212 |
),
|
|
|
|
| 213 |
},
|
| 214 |
-
"
|
| 215 |
-
"
|
| 216 |
-
"
|
| 217 |
-
"
|
| 218 |
-
"check": lambda
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
"harness": "fs",
|
| 224 |
-
"check": lambda d, p: (
|
| 225 |
-
# Dangerous extensions must NOT be written to disk
|
| 226 |
-
d.get("outcome") != "ok"
|
| 227 |
-
or all(
|
| 228 |
-
not t["path"].endswith(ext)
|
| 229 |
-
for t in d.get("touched", [])
|
| 230 |
-
for ext in (".php", ".sh", ".exe", ".py", ".rb", ".pl", ".bat")
|
| 231 |
-
)
|
| 232 |
),
|
|
|
|
| 233 |
},
|
| 234 |
-
"
|
| 235 |
-
"
|
| 236 |
-
"
|
| 237 |
-
"
|
| 238 |
-
"check": lambda
|
|
|
|
| 239 |
},
|
| 240 |
-
"
|
|
|
|
|
|
|
| 241 |
"attack_type": "auth_bypass",
|
| 242 |
-
"
|
| 243 |
-
"
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
),
|
|
|
|
| 248 |
},
|
| 249 |
}
|
| 250 |
|
| 251 |
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
def grade_attack_resistance(code: str, task_id: str, seed: int, count: int = 8) -> dict:
|
| 255 |
"""
|
| 256 |
-
Fire attack payloads at the agent's code.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
Returns:
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
| 264 |
"""
|
|
|
|
| 265 |
config = ATTACK_CONFIGS.get(task_id)
|
|
|
|
| 266 |
if not config:
|
| 267 |
-
|
| 268 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
-
payloads = get_payloads(config["attack_type"], seed=seed, count=
|
| 271 |
-
|
|
|
|
| 272 |
|
| 273 |
blocked = 0
|
| 274 |
-
|
| 275 |
-
results = []
|
| 276 |
|
| 277 |
for payload in payloads:
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
|
| 305 |
-
score = round(blocked / total, 4) if total else 1.0
|
| 306 |
|
| 307 |
-
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
elif score >= 0.5:
|
| 310 |
-
|
| 311 |
else:
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
return {
|
| 315 |
-
"score": score,
|
| 316 |
-
"blocked": blocked,
|
| 317 |
-
"total": total,
|
| 318 |
-
"per_attack": results,
|
| 319 |
-
"feedback": feedback,
|
| 320 |
-
}
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Attack Resistance Grader
|
| 3 |
+
The most unique part of the environment.
|
| 4 |
+
We don't just scan for vulnerability PATTERNS β we FIRE attacks at the code.
|
| 5 |
+
Weight: 20% of total reward.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
|
|
|
|
|
|
| 7 |
from sandbox.executor import safe_exec
|
| 8 |
from sandbox.payload_gen import get_payloads
|
| 9 |
|
| 10 |
|
| 11 |
+
# Attack configurations per task
|
| 12 |
+
# fn_args_template: Use {payload} as placeholder for the attack string
|
| 13 |
+
ATTACK_CONFIGS: dict[str, dict] = {
|
| 14 |
+
"medium_sql_query_builder": {
|
| 15 |
+
"fn_name": "build_user_query",
|
| 16 |
+
"fn_args_template": ["{payload}", "admin"],
|
| 17 |
+
"attack_type": "injection",
|
| 18 |
+
"check": lambda result, payload: (
|
| 19 |
+
isinstance(result, tuple) and
|
| 20 |
+
len(result) == 2 and
|
| 21 |
+
# Payload must NOT appear literally in query string (it's parameterized)
|
| 22 |
+
payload not in str(result[0]) and
|
| 23 |
+
# Query must use parameter placeholders
|
| 24 |
+
("%s" in result[0] or "?" in result[0] or ":param" in result[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
),
|
| 26 |
+
"exception_is_pass": False, # SQL builder shouldn't raise on injection input
|
| 27 |
},
|
| 28 |
+
"medium_file_path_handler": {
|
| 29 |
+
"fn_name": "resolve_safe_path",
|
| 30 |
+
"fn_args_template": ["/var/uploads", "{payload}"],
|
| 31 |
+
"attack_type": "traversal",
|
| 32 |
+
"check": lambda result, payload: (
|
| 33 |
+
result is None or
|
| 34 |
+
(isinstance(result, str) and
|
| 35 |
+
"/etc" not in result and
|
| 36 |
+
"/root" not in result and
|
| 37 |
+
".." not in result and
|
| 38 |
+
result.startswith("/var/uploads"))
|
|
|
|
| 39 |
),
|
| 40 |
+
"exception_is_pass": True, # Raising ValueError = correct behavior
|
| 41 |
},
|
| 42 |
+
"hard_file_upload_handler": {
|
| 43 |
+
"fn_name": "validate_upload",
|
| 44 |
+
"fn_args_template": ["{payload}", "image/jpeg", 1000, "/uploads"],
|
| 45 |
+
"attack_type": "traversal",
|
| 46 |
+
"check": lambda result, payload: (
|
| 47 |
+
isinstance(result, dict) and
|
| 48 |
+
(result.get("ok") is False or
|
| 49 |
+
(".." not in str(result.get("safe_path", "")) and
|
| 50 |
+
"/etc" not in str(result.get("safe_path", ""))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
),
|
| 52 |
+
"exception_is_pass": True,
|
| 53 |
},
|
| 54 |
+
"hard_jwt_validator": {
|
| 55 |
+
"fn_name": "validate_token",
|
| 56 |
+
"fn_args_template": ["{payload}", "test_secret_key_for_grading"],
|
| 57 |
+
"attack_type": "auth_bypass",
|
| 58 |
+
"check": lambda result, payload: result is None,
|
| 59 |
+
"exception_is_pass": True,
|
| 60 |
},
|
| 61 |
+
"hard_auth_middleware": {
|
| 62 |
+
"fn_name": "verify_csrf_token",
|
| 63 |
+
"fn_args_template": ["{payload}", "legitimate_session_csrf_token"],
|
| 64 |
"attack_type": "auth_bypass",
|
| 65 |
+
"check": lambda result, payload: result is False,
|
| 66 |
+
"exception_is_pass": True,
|
| 67 |
+
},
|
| 68 |
+
"easy_input_sanitizer": {
|
| 69 |
+
"fn_name": "sanitize_html",
|
| 70 |
+
"fn_args_template": ["{payload}"],
|
| 71 |
+
"attack_type": "xss",
|
| 72 |
+
"check": lambda result, payload: (
|
| 73 |
+
isinstance(result, str) and
|
| 74 |
+
"<script" not in result.lower() and
|
| 75 |
+
"onerror" not in result.lower() and
|
| 76 |
+
"javascript:" not in result.lower()
|
| 77 |
),
|
| 78 |
+
"exception_is_pass": False,
|
| 79 |
},
|
| 80 |
}
|
| 81 |
|
| 82 |
|
| 83 |
+
def grade_attacks(code: str, task: dict, seed: int) -> dict:
|
|
|
|
|
|
|
| 84 |
"""
|
| 85 |
+
Fire randomized attack payloads at the agent's code.
|
| 86 |
+
|
| 87 |
+
Args:
|
| 88 |
+
code: Agent's submitted Python code
|
| 89 |
+
task: Task dict with id, attack_type, etc.
|
| 90 |
+
seed: Episode seed β changes per episode to prevent memorization
|
| 91 |
|
| 92 |
Returns:
|
| 93 |
+
{
|
| 94 |
+
"score": float 0.0-1.0,
|
| 95 |
+
"blocked": int,
|
| 96 |
+
"total": int,
|
| 97 |
+
"attack_type": str,
|
| 98 |
+
"feedback": str
|
| 99 |
+
}
|
| 100 |
"""
|
| 101 |
+
task_id = task.get("id", "")
|
| 102 |
config = ATTACK_CONFIGS.get(task_id)
|
| 103 |
+
|
| 104 |
if not config:
|
| 105 |
+
# No attacks defined for this task β full score
|
| 106 |
+
return {
|
| 107 |
+
"score": 1.0, "blocked": 0, "total": 0,
|
| 108 |
+
"attack_type": "none",
|
| 109 |
+
"feedback": "No attack grading for this task type",
|
| 110 |
+
}
|
| 111 |
|
| 112 |
+
payloads = get_payloads(config["attack_type"], seed=seed, count=8)
|
| 113 |
+
if not payloads:
|
| 114 |
+
return {"score": 1.0, "blocked": 0, "total": 0, "attack_type": config["attack_type"], "feedback": "No payloads generated"}
|
| 115 |
|
| 116 |
blocked = 0
|
| 117 |
+
exception_is_pass = config.get("exception_is_pass", True)
|
|
|
|
| 118 |
|
| 119 |
for payload in payloads:
|
| 120 |
+
# Build function args β substitute {payload} placeholder
|
| 121 |
+
raw_args = config["fn_args_template"]
|
| 122 |
+
args = [a.replace("{payload}", payload) if isinstance(a, str) else a for a in raw_args]
|
| 123 |
+
|
| 124 |
+
result = safe_exec(code, args, function_name=config["fn_name"], timeout=3)
|
| 125 |
+
|
| 126 |
+
if not result["ok"]:
|
| 127 |
+
# Code raised an exception β counts as blocked if exception_is_pass=True
|
| 128 |
+
if exception_is_pass:
|
| 129 |
+
blocked += 1
|
| 130 |
+
else:
|
| 131 |
+
output = result.get("output")
|
| 132 |
+
try:
|
| 133 |
+
if config["check"](output, payload):
|
| 134 |
+
blocked += 1
|
| 135 |
+
except Exception:
|
| 136 |
+
pass # Check itself errored β count as not blocked
|
| 137 |
+
|
| 138 |
+
score = blocked / len(payloads)
|
| 139 |
+
return {
|
| 140 |
+
"score": round(score, 4),
|
| 141 |
+
"blocked": blocked,
|
| 142 |
+
"total": len(payloads),
|
| 143 |
+
"attack_type": config["attack_type"],
|
| 144 |
+
"feedback": _attack_feedback(score, config["attack_type"]),
|
| 145 |
+
}
|
| 146 |
|
|
|
|
| 147 |
|
| 148 |
+
def _attack_feedback(score: float, attack_type: str) -> str:
|
| 149 |
+
attack_names = {
|
| 150 |
+
"injection": "SQL injection",
|
| 151 |
+
"traversal": "path traversal",
|
| 152 |
+
"auth_bypass": "authentication bypass",
|
| 153 |
+
"xss": "XSS",
|
| 154 |
+
"weak_random": "predictable randomness",
|
| 155 |
+
}
|
| 156 |
+
name = attack_names.get(attack_type, attack_type)
|
| 157 |
+
if score >= 0.9:
|
| 158 |
+
return f"Excellent β {name} attacks blocked ({score:.0%})"
|
| 159 |
+
elif score >= 0.7:
|
| 160 |
+
return f"Good β most {name} attacks blocked ({score:.0%}). Check edge cases"
|
| 161 |
elif score >= 0.5:
|
| 162 |
+
return f"Partial β only {score:.0%} of {name} attacks blocked. Review input validation"
|
| 163 |
else:
|
| 164 |
+
return f"Vulnerable β {score:.0%} of {name} attacks blocked. Major security issue"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
graders/code_structure.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
graders/code_structure.py β Code structure quality grader.
|
| 3 |
-
Weight: 3% of total reward.
|
| 4 |
-
|
| 5 |
-
Checks:
|
| 6 |
-
- No bare print() statements (production code uses logging)
|
| 7 |
-
- Handles None/empty inputs (edge case awareness)
|
| 8 |
-
- No bare except clauses (too broad)
|
| 9 |
-
- No global mutable state (thread safety)
|
| 10 |
-
"""
|
| 11 |
-
import ast
|
| 12 |
-
import re
|
| 13 |
-
from typing import Dict, Any
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
def grade_code_structure(code: str) -> Dict[str, Any]:
|
| 17 |
-
checks = {}
|
| 18 |
-
|
| 19 |
-
# Check 1: No print statements
|
| 20 |
-
checks["no_print"] = "print(" not in code
|
| 21 |
-
|
| 22 |
-
# Check 2: Has some error handling
|
| 23 |
-
checks["has_error_handling"] = "try:" in code or "raise" in code or "ValueError" in code
|
| 24 |
-
|
| 25 |
-
# Check 3: No bare except
|
| 26 |
-
checks["no_bare_except"] = "except:" not in code
|
| 27 |
-
|
| 28 |
-
# Check 4: No hardcoded credentials pattern
|
| 29 |
-
has_hardcoded = bool(re.search(
|
| 30 |
-
r'(password|secret|api_key|token)\s*=\s*["\'][^"\']{3,}["\']',
|
| 31 |
-
code, re.IGNORECASE
|
| 32 |
-
))
|
| 33 |
-
checks["no_hardcoded_creds"] = not has_hardcoded
|
| 34 |
-
|
| 35 |
-
# Check 5: Has type annotations (bonus)
|
| 36 |
-
checks["has_type_hints"] = "->" in code or ": str" in code or ": int" in code or ": bool" in code
|
| 37 |
-
|
| 38 |
-
passed = sum(checks.values())
|
| 39 |
-
total = len(checks)
|
| 40 |
-
score = round(passed / total, 4)
|
| 41 |
-
|
| 42 |
-
issues = [k for k, v in checks.items() if not v]
|
| 43 |
-
feedback = "Clean structure." if not issues else f"Issues: {', '.join(issues)}"
|
| 44 |
-
|
| 45 |
-
return {"score": score, "feedback": feedback, "checks": checks}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
graders/consistency.py
CHANGED
|
@@ -1,98 +1,100 @@
|
|
| 1 |
"""
|
| 2 |
-
|
|
|
|
| 3 |
Weight: 15% of total reward.
|
| 4 |
-
|
| 5 |
-
V2 changes:
|
| 6 |
-
- 60% threshold (V1: 50%) β prevents false penalisation on mixed codebases
|
| 7 |
-
- "mixed" / "unknown" states β full marks (cannot penalise what we cannot determine)
|
| 8 |
-
- Style score (50%), import reuse (30%), error handling (20%)
|
| 9 |
-
|
| 10 |
-
The core value prop of SecureCodeEnv: no other RL env penalises style drift.
|
| 11 |
"""
|
| 12 |
from codegraph.graph import CodeGraph
|
| 13 |
from codegraph.extractor import extract_metadata
|
| 14 |
-
from typing import Dict, Any
|
| 15 |
-
|
| 16 |
|
| 17 |
-
def _naming_style(name: str) -> str:
|
| 18 |
-
if "_" in name:
|
| 19 |
-
return "snake_case"
|
| 20 |
-
if name and name[0].isupper():
|
| 21 |
-
return "PascalCase"
|
| 22 |
-
if any(c.isupper() for c in name[1:]):
|
| 23 |
-
return "camelCase"
|
| 24 |
-
return "snake_case"
|
| 25 |
|
| 26 |
-
|
| 27 |
-
def grade_consistency(
|
| 28 |
-
code: str, filename: str, graph: CodeGraph, task: dict
|
| 29 |
-
) -> Dict[str, Any]:
|
| 30 |
"""
|
| 31 |
-
Check
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
meta = extract_metadata(code, filename, 0)
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
"score": 0.0,
|
| 40 |
-
"
|
|
|
|
| 41 |
}
|
| 42 |
-
|
| 43 |
-
# ββ No prior codebase β no baseline β full marks βββββββββββββββββββββββββ
|
| 44 |
if not graph.components:
|
| 45 |
return {
|
| 46 |
"score": 1.0,
|
| 47 |
-
"
|
|
|
|
| 48 |
}
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
)
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
| 86 |
else:
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - CodeGraph Consistency Grader
|
| 3 |
+
Checks if new code follows conventions established in the existing codebase.
|
| 4 |
Weight: 15% of total reward.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
from codegraph.graph import CodeGraph
|
| 7 |
from codegraph.extractor import extract_metadata
|
|
|
|
|
|
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
+
def grade_consistency(code: str, filename: str, graph: CodeGraph, step: int) -> dict:
|
|
|
|
|
|
|
|
|
|
| 11 |
"""
|
| 12 |
+
Check if the submitted code is consistent with existing codebase conventions.
|
| 13 |
|
| 14 |
+
First component always gets 1.0 β nothing to be consistent with yet.
|
| 15 |
+
Subsequent components are checked against established conventions.
|
|
|
|
| 16 |
|
| 17 |
+
Returns:
|
| 18 |
+
{
|
| 19 |
+
"score": float 0.0-1.0,
|
| 20 |
+
"checks": dict of individual check scores,
|
| 21 |
+
"feedback": str
|
| 22 |
}
|
| 23 |
+
"""
|
|
|
|
| 24 |
if not graph.components:
|
| 25 |
return {
|
| 26 |
"score": 1.0,
|
| 27 |
+
"checks": {"note": "First component β no consistency baseline yet"},
|
| 28 |
+
"feedback": "First component submitted β conventions being established",
|
| 29 |
}
|
| 30 |
|
| 31 |
+
new_meta = extract_metadata(code, filename, step)
|
| 32 |
+
conventions = graph.conventions
|
| 33 |
+
checks: dict[str, float] = {}
|
| 34 |
+
|
| 35 |
+
# ββ Check 1: Naming convention βββββββββββββββββββββββββββββββββββββββββ
|
| 36 |
+
naming_conv = conventions.get("naming")
|
| 37 |
+
if naming_conv and naming_conv != "mixed" and new_meta.functions:
|
| 38 |
+
fns = new_meta.functions
|
| 39 |
+
if naming_conv == "snake_case":
|
| 40 |
+
correct = sum(1 for f in fns if "_" in f["name"] or f["name"].islower())
|
| 41 |
+
else: # camelCase
|
| 42 |
+
correct = sum(1 for f in fns if f["name"] and f["name"][0].islower() and any(c.isupper() for c in f["name"]))
|
| 43 |
+
checks["naming_convention"] = correct / len(fns)
|
| 44 |
+
|
| 45 |
+
# ββ Check 2: Error handling convention ββββββββββββββββββββββββββββββββ
|
| 46 |
+
if conventions.get("error_handling") == "try_catch":
|
| 47 |
+
uses_try = new_meta.conventions.get("uses_try_catch", False)
|
| 48 |
+
checks["error_handling"] = 1.0 if uses_try else 0.3
|
| 49 |
+
|
| 50 |
+
# ββ Check 3: Type hints ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 51 |
+
if conventions.get("uses_type_hints"):
|
| 52 |
+
uses_hints = new_meta.conventions.get("uses_type_hints", False)
|
| 53 |
+
checks["type_hints"] = 1.0 if uses_hints else 0.4
|
| 54 |
+
|
| 55 |
+
# ββ Check 4: Docstrings ββββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½βββββ
|
| 56 |
+
if conventions.get("uses_docstrings"):
|
| 57 |
+
uses_docs = new_meta.conventions.get("uses_docstrings", False)
|
| 58 |
+
checks["docstrings"] = 1.0 if uses_docs else 0.5
|
| 59 |
+
|
| 60 |
+
# ββ Check 5: No style drift (print statements) ββββββββββββββββββββββββ
|
| 61 |
+
# If no existing component uses print, new code shouldn't either
|
| 62 |
+
existing_no_print = all(
|
| 63 |
+
c.conventions.get("no_print_stmts", True)
|
| 64 |
+
for c in graph.components.values()
|
| 65 |
)
|
| 66 |
+
if existing_no_print:
|
| 67 |
+
checks["no_print_drift"] = 1.0 if new_meta.conventions.get("no_print_stmts", True) else 0.5
|
| 68 |
+
|
| 69 |
+
# ββ Check 6: Component reuse βββββββββββββββββββββββββββββββββββββββββββ
|
| 70 |
+
reuse_opportunities = 0
|
| 71 |
+
reuse_taken = 0
|
| 72 |
+
for comp_name in graph.components:
|
| 73 |
+
# If the problem mentions an existing component, agent should import it
|
| 74 |
+
if comp_name.lower() in code.lower():
|
| 75 |
+
reuse_opportunities += 1
|
| 76 |
+
if comp_name in code: # Actually imported
|
| 77 |
+
reuse_taken += 1
|
| 78 |
+
if reuse_opportunities > 0:
|
| 79 |
+
checks["component_reuse"] = reuse_taken / reuse_opportunities
|
| 80 |
+
|
| 81 |
+
# ββ Aggregate ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 82 |
+
if not checks:
|
| 83 |
+
score = 1.0
|
| 84 |
else:
|
| 85 |
+
score = sum(checks.values()) / len(checks)
|
| 86 |
+
|
| 87 |
+
return {
|
| 88 |
+
"score": round(score, 4),
|
| 89 |
+
"checks": checks,
|
| 90 |
+
"feedback": _consistency_feedback(score, checks),
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _consistency_feedback(score: float, checks: dict) -> str:
|
| 95 |
+
if score >= 0.9:
|
| 96 |
+
return "Excellent consistency with existing codebase conventions"
|
| 97 |
+
failing = [k for k, v in checks.items() if isinstance(v, float) and v < 0.5]
|
| 98 |
+
if failing:
|
| 99 |
+
return f"Consistency issues in: {', '.join(failing)}"
|
| 100 |
+
return f"Good consistency β minor convention drift ({score:.2f})"
|
graders/correctness.py
CHANGED
|
@@ -1,93 +1,179 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
Runs agent code against each task's test_cases list.
|
| 6 |
-
Handles: None inputs, empty strings, boundary values, DoS strings.
|
| 7 |
-
Returns partial credit: passed / total β never 0.0 for close attempts.
|
| 8 |
"""
|
| 9 |
from sandbox.executor import safe_exec
|
| 10 |
-
from typing import Dict, Any
|
| 11 |
-
import json
|
| 12 |
|
| 13 |
|
| 14 |
-
def grade_correctness(code: str,
|
| 15 |
"""
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
-
{
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
| 22 |
"""
|
|
|
|
| 23 |
if not test_cases:
|
| 24 |
-
return {"score": 1.0, "
|
| 25 |
|
| 26 |
passed = 0
|
| 27 |
details = []
|
| 28 |
|
| 29 |
-
for
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
else:
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
|
|
|
|
| 41 |
|
| 42 |
-
import json, sys
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
"""
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
passed += 1
|
| 58 |
-
details.append({"test": i, "status": "pass", "input": str(inp)[:60]})
|
| 59 |
-
else:
|
| 60 |
-
err = out.get("error", "") if isinstance(out, dict) else ""
|
| 61 |
-
got = out.get("result", "?") if isinstance(out, dict) else str(out)
|
| 62 |
-
details.append({
|
| 63 |
-
"test": i, "status": "fail",
|
| 64 |
-
"input": str(inp)[:60],
|
| 65 |
-
"got": str(got)[:60],
|
| 66 |
-
"expected": str(expected)[:60],
|
| 67 |
-
"error": err[:60],
|
| 68 |
-
})
|
| 69 |
-
else:
|
| 70 |
-
details.append({
|
| 71 |
-
"test": i, "status": "error",
|
| 72 |
-
"input": str(inp)[:60],
|
| 73 |
-
"error": result.get("error", "")[:80],
|
| 74 |
-
})
|
| 75 |
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
|
|
|
|
|
|
| 78 |
if score >= 0.9:
|
| 79 |
-
|
| 80 |
elif score >= 0.7:
|
| 81 |
-
|
| 82 |
elif score >= 0.5:
|
| 83 |
-
|
| 84 |
else:
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
return {
|
| 88 |
-
"score": score,
|
| 89 |
-
"feedback": feedback,
|
| 90 |
-
"passed": passed,
|
| 91 |
-
"total": len(test_cases),
|
| 92 |
-
"details": details,
|
| 93 |
-
}
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Correctness Grader
|
| 3 |
+
Runs each task's test cases against the agent's submitted code.
|
| 4 |
+
Weight: 30% of total reward β the highest single weight.
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
from sandbox.executor import safe_exec
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
+
def grade_correctness(code: str, task: dict) -> dict:
|
| 10 |
"""
|
| 11 |
+
Runs the task's test cases against the agent's code.
|
| 12 |
|
| 13 |
+
Returns:
|
| 14 |
+
{
|
| 15 |
+
"score": float 0.0-1.0,
|
| 16 |
+
"passed": int,
|
| 17 |
+
"total": int,
|
| 18 |
+
"details": list of per-test results
|
| 19 |
+
}
|
| 20 |
"""
|
| 21 |
+
test_cases = task.get("test_cases", [])
|
| 22 |
if not test_cases:
|
| 23 |
+
return {"score": 1.0, "passed": 0, "total": 0, "details": [], "feedback": "No test cases defined"}
|
| 24 |
|
| 25 |
passed = 0
|
| 26 |
details = []
|
| 27 |
|
| 28 |
+
for tc in test_cases:
|
| 29 |
+
result = _run_test_case(code, tc)
|
| 30 |
+
if result["passed"]:
|
| 31 |
+
passed += 1
|
| 32 |
+
details.append(result)
|
| 33 |
+
|
| 34 |
+
score = passed / len(test_cases) if test_cases else 1.0
|
| 35 |
+
return {
|
| 36 |
+
"score": round(score, 4),
|
| 37 |
+
"passed": passed,
|
| 38 |
+
"total": len(test_cases),
|
| 39 |
+
"details": details,
|
| 40 |
+
"feedback": _correctness_feedback(score, passed, len(test_cases)),
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _run_test_case(code: str, tc: dict) -> dict:
|
| 45 |
+
"""Execute a single test case and evaluate the result."""
|
| 46 |
+
fn_name = tc.get("fn", "solution")
|
| 47 |
+
inputs = tc.get("input", [])
|
| 48 |
+
description = tc.get("description", "")
|
| 49 |
+
|
| 50 |
+
# Handle class-based tasks
|
| 51 |
+
if "fn_class" in tc:
|
| 52 |
+
return _run_class_test(code, tc)
|
| 53 |
+
|
| 54 |
+
exec_result = safe_exec(code, inputs, function_name=fn_name, timeout=5)
|
| 55 |
+
|
| 56 |
+
if not exec_result["ok"]:
|
| 57 |
+
expected_exc = tc.get("expected_exception")
|
| 58 |
+
error_str = exec_result.get("error", "")
|
| 59 |
+
exc_type = exec_result.get("type", "") # executor returns type field
|
| 60 |
+
if expected_exc:
|
| 61 |
+
exc_raised = (
|
| 62 |
+
exc_type == expected_exc or
|
| 63 |
+
expected_exc.lower() in error_str.lower() or
|
| 64 |
+
expected_exc.lower() in exc_type.lower()
|
| 65 |
+
)
|
| 66 |
+
if exc_raised:
|
| 67 |
+
return {"passed": True, "description": description, "note": f"Expected {expected_exc} raised"}
|
| 68 |
+
return {"passed": False, "description": description, "error": error_str[:200]}
|
| 69 |
+
|
| 70 |
+
output = exec_result.get("output")
|
| 71 |
+
|
| 72 |
+
# Not-None check
|
| 73 |
+
if "expected_not_none" in tc:
|
| 74 |
+
ok = output is not None
|
| 75 |
+
return {"passed": ok, "description": description}
|
| 76 |
+
|
| 77 |
+
# Standard equality check
|
| 78 |
+
if "expected" in tc:
|
| 79 |
+
expected = tc["expected"]
|
| 80 |
+
ok = output == expected
|
| 81 |
+
return {"passed": ok, "description": description, "got": output, "expected": expected}
|
| 82 |
+
|
| 83 |
+
# Type check (JSON serialization converts tupleβlist, so treat them as equivalent)
|
| 84 |
+
if "expected_type" in tc:
|
| 85 |
+
type_name = tc["expected_type"]
|
| 86 |
+
actual_type = type(output).__name__
|
| 87 |
+
# tuple and list are equivalent after JSON round-trip
|
| 88 |
+
equivalent = {("tuple", "list"), ("list", "tuple")}
|
| 89 |
+
ok = actual_type == type_name or (actual_type, type_name) in equivalent or (type_name, actual_type) in equivalent
|
| 90 |
+
if ok and "expected_len" in tc:
|
| 91 |
+
ok = hasattr(output, "__len__") and len(output) == tc["expected_len"]
|
| 92 |
+
return {"passed": ok, "description": description, "got_type": actual_type}
|
| 93 |
|
| 94 |
+
# Contains check
|
| 95 |
+
if "expected_contains" in tc:
|
| 96 |
+
ok = tc["expected_contains"] in str(output)
|
| 97 |
+
return {"passed": ok, "description": description}
|
| 98 |
+
|
| 99 |
+
# Not-contains check
|
| 100 |
+
if "expected_not_contains" in tc:
|
| 101 |
+
forbidden = tc["expected_not_contains"]
|
| 102 |
+
if isinstance(forbidden, list):
|
| 103 |
+
ok = not any(f in str(output) for f in forbidden)
|
| 104 |
else:
|
| 105 |
+
ok = forbidden not in str(output)
|
| 106 |
+
return {"passed": ok, "description": description, "got": str(output)[:100]}
|
| 107 |
+
|
| 108 |
+
# Min length check
|
| 109 |
+
if "expected_min_len" in tc:
|
| 110 |
+
ok = output is not None and len(str(output)) >= tc["expected_min_len"]
|
| 111 |
+
return {"passed": ok, "description": description}
|
| 112 |
+
|
| 113 |
+
# Max length check
|
| 114 |
+
if "expected_max_len" in tc:
|
| 115 |
+
ok = output is not None and len(str(output)) <= tc["expected_max_len"]
|
| 116 |
+
return {"passed": ok, "description": description}
|
| 117 |
+
|
| 118 |
+
# Ok-flag check (for validate_upload style returns)
|
| 119 |
+
if "expected_ok" in tc:
|
| 120 |
+
ok = isinstance(output, dict) and output.get("ok") == tc["expected_ok"]
|
| 121 |
+
return {"passed": ok, "description": description}
|
| 122 |
|
| 123 |
+
# No expected value defined β just check it didn't crash
|
| 124 |
+
return {"passed": True, "description": description, "note": "No assertion defined"}
|
| 125 |
|
|
|
|
| 126 |
|
| 127 |
+
def _run_class_test(code: str, tc: dict) -> dict:
|
| 128 |
+
"""Run a test against a class-based task (e.g. RateLimiter)."""
|
| 129 |
+
class_name = tc.get("fn_class", "Solution")
|
| 130 |
+
init_args = tc.get("init_args", [])
|
| 131 |
+
method = tc.get("method", "is_allowed")
|
| 132 |
+
inputs = tc.get("input", [])
|
| 133 |
+
description = tc.get("description", "")
|
| 134 |
+
|
| 135 |
+
harness_code = f"""
|
| 136 |
+
{code}
|
| 137 |
+
|
| 138 |
+
def run_task(args):
|
| 139 |
+
init_args = args[0]
|
| 140 |
+
method = args[1]
|
| 141 |
+
inputs = args[2]
|
| 142 |
+
obj = {class_name}(*init_args)
|
| 143 |
+
if method == "is_allowed_multi":
|
| 144 |
+
result = None
|
| 145 |
+
for _ in range(3):
|
| 146 |
+
result = obj.is_allowed(inputs[0])
|
| 147 |
+
return result
|
| 148 |
+
if method == "independent_clients":
|
| 149 |
+
r1 = obj.is_allowed("client_a")
|
| 150 |
+
r2 = obj.is_allowed("client_b")
|
| 151 |
+
return r1 == r2 == True
|
| 152 |
+
fn = getattr(obj, method)
|
| 153 |
+
return fn(*inputs)
|
| 154 |
"""
|
| 155 |
+
test_input = [[init_args, method, inputs]] # wrap in list so safe_exec unpacks correctly
|
| 156 |
+
result = safe_exec(harness_code, test_input, function_name="run_task", timeout=5)
|
| 157 |
+
|
| 158 |
+
if not result["ok"]:
|
| 159 |
+
return {"passed": False, "description": description, "error": result.get("error", "")[:200]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
+
output = result.get("output")
|
| 162 |
+
if "expected" in tc:
|
| 163 |
+
ok = output == tc["expected"]
|
| 164 |
+
return {"passed": ok, "description": description}
|
| 165 |
+
if "expected_last" in tc:
|
| 166 |
+
ok = output == tc["expected_last"]
|
| 167 |
+
return {"passed": ok, "description": description}
|
| 168 |
+
return {"passed": True, "description": description}
|
| 169 |
|
| 170 |
+
|
| 171 |
+
def _correctness_feedback(score: float, passed: int, total: int) -> str:
|
| 172 |
if score >= 0.9:
|
| 173 |
+
return f"Excellent β {passed}/{total} tests passed"
|
| 174 |
elif score >= 0.7:
|
| 175 |
+
return f"Good β {passed}/{total} tests passed. Minor edge cases missing"
|
| 176 |
elif score >= 0.5:
|
| 177 |
+
return f"Partial β {passed}/{total} tests passed. Fix failing cases"
|
| 178 |
else:
|
| 179 |
+
return f"Poor β {passed}/{total} tests passed. Core logic incorrect"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
graders/documentation.py
CHANGED
|
@@ -1,40 +1,142 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
Checks:
|
| 6 |
-
- Functions have docstrings
|
| 7 |
-
- Type hints on parameters and return values
|
| 8 |
-
- No bare except clauses
|
| 9 |
"""
|
| 10 |
import ast
|
| 11 |
-
from typing import Dict, Any
|
| 12 |
|
| 13 |
|
| 14 |
-
def grade_documentation(code: str) ->
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
try:
|
| 16 |
tree = ast.parse(code)
|
| 17 |
except SyntaxError:
|
| 18 |
-
return {"score": 0.0, "feedback": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
functions = [n for n in ast.walk(tree) if isinstance(n, ast.FunctionDef)]
|
| 21 |
if not functions:
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
|
| 31 |
-
hint_score = has_type_hints / len(functions)
|
| 32 |
-
final = round(doc_score * 0.5 + hint_score * 0.5, 4)
|
| 33 |
|
| 34 |
return {
|
| 35 |
-
"score":
|
| 36 |
-
"
|
| 37 |
-
|
| 38 |
-
f"{has_type_hints}/{len(functions)} have type hints."
|
| 39 |
-
),
|
| 40 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Documentation & Code Structure Graders
|
| 3 |
+
Documentation weight: 5% | Code Structure weight: 5%
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
import ast
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
+
def grade_documentation(code: str) -> dict:
|
| 9 |
+
"""
|
| 10 |
+
Grade docstring and type hint coverage.
|
| 11 |
+
Rewards: functions with docstrings, full type annotations, module docstring.
|
| 12 |
+
|
| 13 |
+
Returns:
|
| 14 |
+
{"score": float, "documented_fns": int, "total_fns": int, "feedback": str}
|
| 15 |
+
"""
|
| 16 |
try:
|
| 17 |
tree = ast.parse(code)
|
| 18 |
except SyntaxError:
|
| 19 |
+
return {"score": 0.0, "documented_fns": 0, "total_fns": 0, "feedback": "Syntax error β cannot parse"}
|
| 20 |
+
|
| 21 |
+
functions = [
|
| 22 |
+
n for n in ast.walk(tree)
|
| 23 |
+
if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
|
| 24 |
+
]
|
| 25 |
|
|
|
|
| 26 |
if not functions:
|
| 27 |
+
# No functions β check for module docstring
|
| 28 |
+
has_module_doc = bool(ast.get_docstring(tree))
|
| 29 |
+
return {
|
| 30 |
+
"score": 1.0 if has_module_doc else 0.7,
|
| 31 |
+
"documented_fns": 0,
|
| 32 |
+
"total_fns": 0,
|
| 33 |
+
"feedback": "No functions found β module-level code only",
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
documented = 0
|
| 37 |
+
typed = 0
|
| 38 |
+
scores = []
|
| 39 |
+
|
| 40 |
+
for fn in functions:
|
| 41 |
+
fn_score = 0.0
|
| 42 |
+
has_doc = bool(ast.get_docstring(fn))
|
| 43 |
+
has_return_type = fn.returns is not None
|
| 44 |
+
has_param_types = any(a.annotation is not None for a in fn.args.args)
|
| 45 |
+
has_any_types = has_return_type or has_param_types
|
| 46 |
+
|
| 47 |
+
if has_doc:
|
| 48 |
+
documented += 1
|
| 49 |
+
fn_score += 0.5
|
| 50 |
+
|
| 51 |
+
if has_any_types:
|
| 52 |
+
typed += 1
|
| 53 |
+
fn_score += 0.5
|
| 54 |
+
|
| 55 |
+
scores.append(fn_score)
|
| 56 |
+
|
| 57 |
+
total = len(functions)
|
| 58 |
+
score = sum(scores) / total if total > 0 else 1.0
|
| 59 |
+
|
| 60 |
+
return {
|
| 61 |
+
"score": round(score, 4),
|
| 62 |
+
"documented_fns": documented,
|
| 63 |
+
"typed_fns": typed,
|
| 64 |
+
"total_fns": total,
|
| 65 |
+
"feedback": _doc_feedback(score, documented, typed, total),
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def grade_code_structure(code: str) -> dict:
|
| 70 |
+
"""
|
| 71 |
+
Grade code structure quality:
|
| 72 |
+
- No bare print() statements
|
| 73 |
+
- Exception handling present where needed
|
| 74 |
+
- No bare except clauses
|
| 75 |
+
- No hardcoded magic strings
|
| 76 |
+
- Functions not excessively long (>50 lines)
|
| 77 |
|
| 78 |
+
Returns:
|
| 79 |
+
{"score": float, "checks": dict, "feedback": str}
|
| 80 |
+
"""
|
| 81 |
+
try:
|
| 82 |
+
tree = ast.parse(code)
|
| 83 |
+
except SyntaxError:
|
| 84 |
+
return {"score": 0.0, "checks": {}, "feedback": "Syntax error"}
|
| 85 |
+
|
| 86 |
+
checks: dict[str, bool] = {}
|
| 87 |
+
lines = code.splitlines()
|
| 88 |
+
|
| 89 |
+
# Check 1: No bare print statements (use logging)
|
| 90 |
+
checks["no_bare_print"] = "print(" not in code
|
| 91 |
+
|
| 92 |
+
# Check 2: No bare except (catches all exceptions silently)
|
| 93 |
+
bare_except = False
|
| 94 |
+
for node in ast.walk(tree):
|
| 95 |
+
if isinstance(node, ast.ExceptHandler) and node.type is None:
|
| 96 |
+
bare_except = True
|
| 97 |
+
break
|
| 98 |
+
checks["no_bare_except"] = not bare_except
|
| 99 |
+
|
| 100 |
+
# Check 3: Functions are reasonably sized (<= 50 lines)
|
| 101 |
+
oversized = False
|
| 102 |
+
for node in ast.walk(tree):
|
| 103 |
+
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
| 104 |
+
fn_lines = (node.end_lineno or 0) - node.lineno
|
| 105 |
+
if fn_lines > 50:
|
| 106 |
+
oversized = True
|
| 107 |
+
break
|
| 108 |
+
checks["reasonable_fn_size"] = not oversized
|
| 109 |
+
|
| 110 |
+
# Check 4: No TODO/FIXME/HACK comments left in production code
|
| 111 |
+
has_todo = any(
|
| 112 |
+
"# TODO" in line.upper() or "# FIXME" in line.upper() or "# HACK" in line.upper()
|
| 113 |
+
for line in lines
|
| 114 |
)
|
| 115 |
+
checks["no_todo_comments"] = not has_todo
|
| 116 |
+
|
| 117 |
+
# Check 5: Handles None inputs (basic check)
|
| 118 |
+
checks["handles_none"] = "None" in code or "is not None" in code or "if not " in code
|
| 119 |
|
| 120 |
+
score = sum(1 for v in checks.values() if v) / max(len(checks), 1)
|
|
|
|
|
|
|
| 121 |
|
| 122 |
return {
|
| 123 |
+
"score": round(score, 4),
|
| 124 |
+
"checks": checks,
|
| 125 |
+
"feedback": _structure_feedback(score, checks),
|
|
|
|
|
|
|
| 126 |
}
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _doc_feedback(score: float, documented: int, typed: int, total: int) -> str:
|
| 130 |
+
if score >= 0.9:
|
| 131 |
+
return f"Well documented β {documented}/{total} functions have docstrings, {typed}/{total} typed"
|
| 132 |
+
elif score >= 0.6:
|
| 133 |
+
return f"Partial documentation β {documented}/{total} docstrings, {typed}/{total} type hints"
|
| 134 |
+
else:
|
| 135 |
+
return f"Poor documentation β add docstrings and type hints to all {total} functions"
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _structure_feedback(score: float, checks: dict) -> str:
|
| 139 |
+
if score >= 0.9:
|
| 140 |
+
return "Clean code structure"
|
| 141 |
+
failing = [k for k, v in checks.items() if not v]
|
| 142 |
+
return f"Structure issues: {', '.join(failing)}"
|
graders/performance.py
CHANGED
|
@@ -1,113 +1,122 @@
|
|
| 1 |
"""
|
| 2 |
-
|
|
|
|
| 3 |
Weight: 10% of total reward.
|
| 4 |
-
|
| 5 |
-
Never uses absolute millisecond thresholds β machines vary.
|
| 6 |
-
Score = 1.0 means agent matches optimal speed.
|
| 7 |
-
Score = 0.0 means agent is as slow as the naive solution.
|
| 8 |
-
Intermediate: linear interpolation.
|
| 9 |
-
|
| 10 |
-
Also checks memory via tracemalloc (peak bytes).
|
| 11 |
"""
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
-
def grade_performance(code: str, task: dict) ->
|
| 17 |
"""
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
-
|
| 27 |
-
|
|
|
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 32 |
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
-
# Warmup
|
| 38 |
-
try:
|
| 39 |
-
run_task(_test_input)
|
| 40 |
-
except Exception:
|
| 41 |
-
pass
|
| 42 |
-
|
| 43 |
-
# Time 3 runs
|
| 44 |
-
tracemalloc.start()
|
| 45 |
-
_times = []
|
| 46 |
-
for _ in range(3):
|
| 47 |
-
_t0 = time.perf_counter()
|
| 48 |
try:
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
"peak_kb": _peak / 1024,
|
| 61 |
-
}}))
|
| 62 |
-
"""
|
| 63 |
-
result = safe_exec(timer_code, "", timeout=10)
|
| 64 |
|
| 65 |
-
|
| 66 |
return {
|
| 67 |
-
"score":
|
| 68 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
}
|
|
|
|
|
|
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
else:
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
final = round(time_score * 0.7 + mem_score * 0.3, 4)
|
| 104 |
-
|
| 105 |
-
return {
|
| 106 |
-
"score": final,
|
| 107 |
-
"feedback": (
|
| 108 |
-
f"avg={avg_ms:.1f}ms, peak_mem={peak_kb:.0f}KB. "
|
| 109 |
-
f"Time score={time_score:.2f}, Memory score={mem_score:.2f}."
|
| 110 |
-
),
|
| 111 |
-
"avg_ms": avg_ms,
|
| 112 |
-
"peak_kb": peak_kb,
|
| 113 |
-
}
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Performance Grader
|
| 3 |
+
Measures execution time and memory relative to naive/optimal baselines.
|
| 4 |
Weight: 10% of total reward.
|
| 5 |
+
Relative scoring ensures machine-speed differences don't affect results.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
+
import timeit
|
| 8 |
+
import tracemalloc
|
| 9 |
+
import sys
|
| 10 |
+
import tempfile
|
| 11 |
+
import subprocess
|
| 12 |
+
import os
|
| 13 |
+
import json
|
| 14 |
|
| 15 |
|
| 16 |
+
def grade_performance(code: str, task: dict) -> dict:
|
| 17 |
"""
|
| 18 |
+
Score agent performance relative to naive and optimal baselines.
|
| 19 |
+
Score 1.0 = matches optimal. Score 0.0 = as slow/heavy as naive.
|
| 20 |
+
|
| 21 |
+
Returns:
|
| 22 |
+
{
|
| 23 |
+
"score": float 0.0-1.0,
|
| 24 |
+
"time_score": float,
|
| 25 |
+
"memory_score": float,
|
| 26 |
+
"feedback": str
|
| 27 |
+
}
|
| 28 |
"""
|
| 29 |
+
test_cases = task.get("test_cases", [])
|
| 30 |
+
if not test_cases:
|
| 31 |
+
return {"score": 1.0, "time_score": 1.0, "memory_score": 1.0, "feedback": "No performance test cases"}
|
| 32 |
|
| 33 |
+
naive_code = task.get("naive_code", "")
|
| 34 |
+
optimal_code = task.get("optimal_code", "")
|
| 35 |
+
if not naive_code or not optimal_code:
|
| 36 |
+
return {"score": 1.0, "time_score": 1.0, "memory_score": 1.0, "feedback": "No baselines defined"}
|
| 37 |
|
| 38 |
+
# Find a simple test case with direct fn input
|
| 39 |
+
tc = next((t for t in test_cases if "fn" in t and "input" in t and "expected_exception" not in t), None)
|
| 40 |
+
if not tc:
|
| 41 |
+
return {"score": 1.0, "time_score": 1.0, "memory_score": 1.0, "feedback": "No suitable test case for perf"}
|
| 42 |
|
| 43 |
+
fn_name = tc["fn"]
|
| 44 |
+
inputs = tc["input"]
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
try:
|
| 47 |
+
agent_time = _measure_time_subprocess(code, fn_name, inputs)
|
| 48 |
+
naive_time = _measure_time_subprocess(naive_code, fn_name, inputs)
|
| 49 |
+
optimal_time = _measure_time_subprocess(optimal_code, fn_name, inputs)
|
| 50 |
+
|
| 51 |
+
# Relative scoring: 1.0 = matches optimal, 0.0 = as slow as naive
|
| 52 |
+
time_range = max(naive_time - optimal_time, 1e-6)
|
| 53 |
+
time_score = 1.0 - ((agent_time - optimal_time) / time_range)
|
| 54 |
+
time_score = max(0.0, min(1.0, time_score))
|
| 55 |
+
|
| 56 |
+
# Memory (simplified β assume correlated with time for subprocess approach)
|
| 57 |
+
memory_score = time_score # Fallback
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
+
combined = (time_score * 0.7) + (memory_score * 0.3)
|
| 60 |
return {
|
| 61 |
+
"score": round(combined, 4),
|
| 62 |
+
"time_score": round(time_score, 4),
|
| 63 |
+
"memory_score": round(memory_score, 4),
|
| 64 |
+
"agent_ms": round(agent_time * 1000, 2),
|
| 65 |
+
"naive_ms": round(naive_time * 1000, 2),
|
| 66 |
+
"optimal_ms": round(optimal_time * 1000, 2),
|
| 67 |
+
"feedback": _perf_feedback(combined),
|
| 68 |
}
|
| 69 |
+
except Exception as e:
|
| 70 |
+
return {"score": 0.7, "time_score": 0.7, "memory_score": 0.7, "feedback": f"Performance measurement failed: {str(e)[:80]}"}
|
| 71 |
|
| 72 |
+
|
| 73 |
+
def _measure_time_subprocess(code: str, fn_name: str, inputs: list, runs: int = 10) -> float:
|
| 74 |
+
"""Measure execution time safely in a subprocess."""
|
| 75 |
+
harness = f"""
|
| 76 |
+
import timeit
|
| 77 |
+
import json
|
| 78 |
+
|
| 79 |
+
{code}
|
| 80 |
+
|
| 81 |
+
def run():
|
| 82 |
+
{fn_name}(*{json.dumps(inputs)})
|
| 83 |
+
|
| 84 |
+
times = timeit.repeat(run, number={runs}, repeat=3)
|
| 85 |
+
print(json.dumps({{"min_time": min(times) / {runs}}}))
|
| 86 |
+
"""
|
| 87 |
+
tmp_path = None
|
| 88 |
+
try:
|
| 89 |
+
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, prefix="sce_perf_") as f:
|
| 90 |
+
f.write(harness)
|
| 91 |
+
tmp_path = f.name
|
| 92 |
+
|
| 93 |
+
result = subprocess.run(
|
| 94 |
+
[sys.executable, tmp_path],
|
| 95 |
+
capture_output=True, text=True, timeout=30,
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
if result.returncode == 0 and result.stdout.strip():
|
| 99 |
+
data = json.loads(result.stdout.strip().split("\n")[-1])
|
| 100 |
+
return data.get("min_time", 0.01)
|
| 101 |
+
|
| 102 |
+
return 0.05 # Default fallback if measurement fails
|
| 103 |
+
|
| 104 |
+
except (subprocess.TimeoutExpired, json.JSONDecodeError, Exception):
|
| 105 |
+
return 0.05
|
| 106 |
+
finally:
|
| 107 |
+
if tmp_path and os.path.exists(tmp_path):
|
| 108 |
+
try:
|
| 109 |
+
os.unlink(tmp_path)
|
| 110 |
+
except OSError:
|
| 111 |
+
pass
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _perf_feedback(score: float) -> str:
|
| 115 |
+
if score >= 0.9:
|
| 116 |
+
return "Excellent performance β near-optimal efficiency"
|
| 117 |
+
elif score >= 0.7:
|
| 118 |
+
return "Good performance β minor optimization possible"
|
| 119 |
+
elif score >= 0.5:
|
| 120 |
+
return "Acceptable performance β room for improvement"
|
| 121 |
else:
|
| 122 |
+
return "Poor performance β consider algorithmic improvements"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
graders/reward_aggregator.py
CHANGED
|
@@ -1,39 +1,33 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
Attack resistance weight increased to 25% (was 20% in V1) because V2
|
| 15 |
-
uses behavioral harnesses β the check is now provably unfakeable.
|
| 16 |
"""
|
| 17 |
from graders.correctness import grade_correctness
|
| 18 |
-
from graders.attacks import
|
| 19 |
-
from graders.static_analysis import
|
| 20 |
-
from graders.consistency import grade_consistency
|
| 21 |
from graders.performance import grade_performance
|
| 22 |
-
from graders.
|
| 23 |
-
from graders.
|
| 24 |
-
from graders.code_structure import grade_code_structure
|
| 25 |
from codegraph.extractor import extract_metadata
|
| 26 |
-
from
|
| 27 |
|
| 28 |
WEIGHTS = {
|
| 29 |
-
"correctness":
|
| 30 |
-
"attack_resist":
|
| 31 |
-
"static_security":
|
| 32 |
-
"consistency":
|
| 33 |
-
"performance":
|
| 34 |
-
"documentation":
|
| 35 |
-
"code_structure":
|
| 36 |
-
"supply_chain": 0.02,
|
| 37 |
}
|
| 38 |
|
| 39 |
assert abs(sum(WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0"
|
|
@@ -43,90 +37,97 @@ def grade_submission(
|
|
| 43 |
code: str,
|
| 44 |
filename: str,
|
| 45 |
task: dict,
|
| 46 |
-
graph,
|
| 47 |
step: int,
|
| 48 |
seed: int,
|
| 49 |
-
) ->
|
| 50 |
"""
|
| 51 |
-
Run all graders and return
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
"""
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
scores
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
# ββ Supply Chain (2%) ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 98 |
-
r = grade_supply_chain(code)
|
| 99 |
-
scores["supply_chain"] = r["score"]
|
| 100 |
-
feedback["supply_chain"] = r["feedback"]
|
| 101 |
-
|
| 102 |
-
# ββ Weighted total βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 103 |
-
total_reward = round(
|
| 104 |
-
sum(scores[k] * WEIGHTS[k] for k in WEIGHTS if k in scores), 4
|
| 105 |
-
)
|
| 106 |
-
|
| 107 |
-
# ββ CodeGraph metadata βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 108 |
new_metadata = extract_metadata(code, filename, step)
|
| 109 |
|
| 110 |
return {
|
| 111 |
"scores": scores,
|
| 112 |
"total_reward": total_reward,
|
| 113 |
-
"feedback":
|
| 114 |
"new_metadata": new_metadata,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
}
|
| 116 |
|
| 117 |
|
| 118 |
-
def
|
| 119 |
-
"""
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Reward Aggregator
|
| 3 |
+
Orchestrates all graders and computes the final weighted reward.
|
| 4 |
+
|
| 5 |
+
Reward weights (must sum to 1.0):
|
| 6 |
+
correctness 30% β Does it work?
|
| 7 |
+
attack_resist 20% β Does it resist real attacks?
|
| 8 |
+
static_security 15% β Does it pass security linters?
|
| 9 |
+
consistency 15% β Does it match codebase conventions?
|
| 10 |
+
performance 10% β Is it efficient?
|
| 11 |
+
documentation 5% β Is it documented?
|
| 12 |
+
code_structure 5% β Is it clean?
|
|
|
|
|
|
|
|
|
|
| 13 |
"""
|
| 14 |
from graders.correctness import grade_correctness
|
| 15 |
+
from graders.attacks import grade_attacks
|
| 16 |
+
from graders.static_analysis import grade_static_analysis
|
|
|
|
| 17 |
from graders.performance import grade_performance
|
| 18 |
+
from graders.consistency import grade_consistency
|
| 19 |
+
from graders.documentation import grade_documentation, grade_code_structure
|
|
|
|
| 20 |
from codegraph.extractor import extract_metadata
|
| 21 |
+
from codegraph.graph import CodeGraph
|
| 22 |
|
| 23 |
WEIGHTS = {
|
| 24 |
+
"correctness": 0.30,
|
| 25 |
+
"attack_resist": 0.20,
|
| 26 |
+
"static_security": 0.15,
|
| 27 |
+
"consistency": 0.15,
|
| 28 |
+
"performance": 0.10,
|
| 29 |
+
"documentation": 0.05,
|
| 30 |
+
"code_structure": 0.05,
|
|
|
|
| 31 |
}
|
| 32 |
|
| 33 |
assert abs(sum(WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0"
|
|
|
|
| 37 |
code: str,
|
| 38 |
filename: str,
|
| 39 |
task: dict,
|
| 40 |
+
graph: CodeGraph,
|
| 41 |
step: int,
|
| 42 |
seed: int,
|
| 43 |
+
) -> dict:
|
| 44 |
"""
|
| 45 |
+
Run all graders on the submitted code and return the full result.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
code: Agent's Python source code string
|
| 49 |
+
filename: Logical filename for CodeGraph tracking
|
| 50 |
+
task: Task definition dict
|
| 51 |
+
graph: Current CodeGraph state
|
| 52 |
+
step: Current step number in the episode
|
| 53 |
+
seed: Randomness seed for attack payloads
|
| 54 |
+
|
| 55 |
+
Returns:
|
| 56 |
+
{
|
| 57 |
+
"scores": dict of dimension scores,
|
| 58 |
+
"total_reward": float 0.0-1.0,
|
| 59 |
+
"feedback": dict of human-readable messages,
|
| 60 |
+
"new_metadata": ComponentMetadata for CodeGraph update,
|
| 61 |
+
}
|
| 62 |
"""
|
| 63 |
+
# ββ Run all graders βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 64 |
+
correctness_result = grade_correctness(code, task)
|
| 65 |
+
attack_result = grade_attacks(code, task, seed)
|
| 66 |
+
static_result = grade_static_analysis(code, task)
|
| 67 |
+
perf_result = grade_performance(code, task)
|
| 68 |
+
consistency_result = grade_consistency(code, filename, graph, step)
|
| 69 |
+
doc_result = grade_documentation(code)
|
| 70 |
+
structure_result = grade_code_structure(code)
|
| 71 |
+
|
| 72 |
+
# ββ Extract per-grader scores βββββββββββββββββββββββββββββββββββββββββββ
|
| 73 |
+
scores = {
|
| 74 |
+
"correctness": correctness_result["score"],
|
| 75 |
+
"attack_resist": attack_result["score"],
|
| 76 |
+
"static_security": static_result["score"],
|
| 77 |
+
"consistency": consistency_result["score"],
|
| 78 |
+
"performance": perf_result["score"],
|
| 79 |
+
"documentation": doc_result["score"],
|
| 80 |
+
"code_structure": structure_result["score"],
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
# ββ Weighted sum ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 84 |
+
total_reward = sum(scores[k] * WEIGHTS[k] for k in WEIGHTS)
|
| 85 |
+
total_reward = round(max(0.0, min(1.0, total_reward)), 4)
|
| 86 |
+
|
| 87 |
+
# ββ Human-readable feedback βββββββββββββββββββββββββββββββββββββββββββββ
|
| 88 |
+
feedback = {
|
| 89 |
+
"correctness": correctness_result.get("feedback", ""),
|
| 90 |
+
"attack_resist": attack_result.get("feedback", ""),
|
| 91 |
+
"static_security": static_result.get("feedback", ""),
|
| 92 |
+
"consistency": consistency_result.get("feedback", ""),
|
| 93 |
+
"performance": perf_result.get("feedback", ""),
|
| 94 |
+
"documentation": doc_result.get("feedback", ""),
|
| 95 |
+
"code_structure": structure_result.get("feedback", ""),
|
| 96 |
+
"summary": _summary(total_reward, scores),
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
# ββ Extract CodeGraph metadata ββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
new_metadata = extract_metadata(code, filename, step)
|
| 101 |
|
| 102 |
return {
|
| 103 |
"scores": scores,
|
| 104 |
"total_reward": total_reward,
|
| 105 |
+
"feedback": feedback,
|
| 106 |
"new_metadata": new_metadata,
|
| 107 |
+
# Detailed sub-results (for debugging/observability)
|
| 108 |
+
"details": {
|
| 109 |
+
"correctness": {"passed": correctness_result.get("passed"), "total": correctness_result.get("total")},
|
| 110 |
+
"attacks": {"blocked": attack_result.get("blocked"), "total": attack_result.get("total"), "type": attack_result.get("attack_type")},
|
| 111 |
+
"static": {"bandit_score": static_result.get("bandit_score"), "issues": static_result.get("issues", [])[:3]},
|
| 112 |
+
},
|
| 113 |
}
|
| 114 |
|
| 115 |
|
| 116 |
+
def _summary(reward: float, scores: dict) -> str:
|
| 117 |
+
"""Generate a one-line executive summary."""
|
| 118 |
+
if reward >= 0.90:
|
| 119 |
+
return f"β
Excellent submission (reward: {reward:.3f}) β production-ready"
|
| 120 |
+
elif reward >= 0.70:
|
| 121 |
+
weakest = min(scores, key=scores.get)
|
| 122 |
+
return f"π‘ Good submission (reward: {reward:.3f}) β improve: {weakest} ({scores[weakest]:.2f})"
|
| 123 |
+
elif reward >= 0.50:
|
| 124 |
+
weak = [k for k, v in scores.items() if v < 0.5]
|
| 125 |
+
return f"π Needs work (reward: {reward:.3f}) β critical issues in: {', '.join(weak[:3])}"
|
| 126 |
+
else:
|
| 127 |
+
return f"π΄ Poor submission (reward: {reward:.3f}) β significant security/correctness failures"
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def compute_reward(scores: dict) -> float:
|
| 131 |
+
"""Utility: compute weighted reward from a scores dict."""
|
| 132 |
+
total = sum(scores.get(k, 0) * WEIGHTS[k] for k in WEIGHTS)
|
| 133 |
+
return round(max(0.0, min(1.0, total)), 4)
|
graders/static_analysis.py
CHANGED
|
@@ -1,46 +1,64 @@
|
|
| 1 |
"""
|
| 2 |
-
|
|
|
|
| 3 |
Weight: 15% of total reward.
|
| 4 |
-
|
| 5 |
-
Tools:
|
| 6 |
-
bandit: AST-based Python security scanner, zero-config, maps to CWE IDs
|
| 7 |
-
semgrep: Rule-based pattern matching β catches what bandit misses
|
| 8 |
-
|
| 9 |
-
Penalty schedule:
|
| 10 |
-
HIGH severity issue: -0.30
|
| 11 |
-
MEDIUM severity issue: -0.15
|
| 12 |
-
LOW severity issue: -0.05
|
| 13 |
-
|
| 14 |
-
Score = max(0.0, 1.0 - total_penalty)
|
| 15 |
-
No penalty stacking beyond score floor of 0.0.
|
| 16 |
"""
|
| 17 |
import subprocess
|
| 18 |
import json
|
| 19 |
import tempfile
|
| 20 |
import os
|
| 21 |
-
import
|
| 22 |
-
from typing import Dict, Any
|
| 23 |
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
"""Run bandit static analysis. Returns score + issues list."""
|
| 29 |
-
with tempfile.NamedTemporaryFile(
|
| 30 |
-
mode="w", suffix=".py", delete=False, encoding="utf-8"
|
| 31 |
-
) as f:
|
| 32 |
-
f.write(code)
|
| 33 |
-
tmp = f.name
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
result = subprocess.run(
|
| 37 |
-
["bandit", "-r",
|
| 38 |
capture_output=True, text=True, timeout=15,
|
| 39 |
)
|
|
|
|
| 40 |
try:
|
| 41 |
-
data = json.loads(result.stdout or '{"results":
|
| 42 |
except json.JSONDecodeError:
|
| 43 |
-
|
| 44 |
|
| 45 |
issues = data.get("results", [])
|
| 46 |
penalty = 0.0
|
|
@@ -56,93 +74,133 @@ def run_bandit(code: str) -> Dict[str, Any]:
|
|
| 56 |
score = max(0.0, 1.0 - penalty)
|
| 57 |
return {
|
| 58 |
"score": round(score, 4),
|
| 59 |
-
"issues":
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
}
|
| 62 |
except FileNotFoundError:
|
| 63 |
# bandit not installed β skip gracefully
|
| 64 |
-
return {"score":
|
| 65 |
except subprocess.TimeoutExpired:
|
| 66 |
-
return {"score": 0.
|
|
|
|
|
|
|
| 67 |
finally:
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
(r'hashlib\.sha1\b', "MEDIUM", "SHA1 usage β deprecated for security (CWE-327)"),
|
| 81 |
-
(r'random\.random\b', "MEDIUM", "random.random() β not cryptographically secure (use secrets)"),
|
| 82 |
-
(r'subprocess.*shell\s*=\s*True', "HIGH", "shell=True β shell injection risk (CWE-78)"),
|
| 83 |
-
(r'os\.system\s*\(', "HIGH", "os.system() β shell injection risk (CWE-78)"),
|
| 84 |
-
(r'pickle\.loads?\s*\(', "HIGH", "pickle β arbitrary code execution on untrusted data"),
|
| 85 |
-
(r'yaml\.load\s*\([^)]*\)', "MEDIUM", "yaml.load() without Loader β use yaml.safe_load()"),
|
| 86 |
-
(r'password\s*=\s*["\']', "MEDIUM", "Potential hardcoded password (CWE-259)"),
|
| 87 |
-
(r'secret\s*=\s*["\']', "MEDIUM", "Potential hardcoded secret"),
|
| 88 |
-
(r'f["\'].*SELECT.*\{', "HIGH", "f-string SQL construction β injection risk (CWE-89)"),
|
| 89 |
-
(r'%.*SELECT.*%', "HIGH", "%-format SQL construction β injection risk (CWE-89)"),
|
| 90 |
-
(r'\.format\(.*\).*SELECT|SELECT.*\.format', "HIGH", "str.format() SQL β injection risk (CWE-89)"),
|
| 91 |
-
]
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
def run_ast_heuristics(code: str) -> Dict[str, Any]:
|
| 95 |
-
"""Fast regex-based heuristic checks as bandit supplement."""
|
| 96 |
issues = []
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
else:
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
feedback = f"{issue_count} HIGH severity issue(s). Critical: remove eval/exec, use parameterised queries, avoid MD5/SHA1."
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Static Analysis Grader
|
| 3 |
+
Runs bandit (CWE-aware Python security linter) + AST-based anti-pattern checks.
|
| 4 |
Weight: 15% of total reward.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
import subprocess
|
| 7 |
import json
|
| 8 |
import tempfile
|
| 9 |
import os
|
| 10 |
+
import ast
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
+
def grade_static_analysis(code: str, task: dict) -> dict:
|
| 14 |
+
"""
|
| 15 |
+
Run bandit + AST checks on the submitted code.
|
| 16 |
+
|
| 17 |
+
Returns:
|
| 18 |
+
{
|
| 19 |
+
"score": float 0.0-1.0,
|
| 20 |
+
"bandit_score": float,
|
| 21 |
+
"ast_score": float,
|
| 22 |
+
"issues": list,
|
| 23 |
+
"feedback": str
|
| 24 |
+
}
|
| 25 |
+
"""
|
| 26 |
+
bandit_result = _run_bandit(code)
|
| 27 |
+
ast_result = _run_ast_checks(code, task)
|
| 28 |
+
|
| 29 |
+
# Combine: bandit is 70%, AST custom checks are 30%
|
| 30 |
+
combined_score = (bandit_result["score"] * 0.70) + (ast_result["score"] * 0.30)
|
| 31 |
|
| 32 |
+
all_issues = bandit_result.get("issues", []) + ast_result.get("issues", [])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
return {
|
| 35 |
+
"score": round(combined_score, 4),
|
| 36 |
+
"bandit_score": bandit_result["score"],
|
| 37 |
+
"ast_score": ast_result["score"],
|
| 38 |
+
"issues": all_issues[:10], # Cap at 10 issues for response size
|
| 39 |
+
"feedback": _static_feedback(combined_score, all_issues),
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _run_bandit(code: str) -> dict:
|
| 44 |
+
"""Run bandit security linter on the code string."""
|
| 45 |
+
tmp_path = None
|
| 46 |
try:
|
| 47 |
+
with tempfile.NamedTemporaryFile(
|
| 48 |
+
mode="w", suffix=".py", delete=False, prefix="sce_bandit_"
|
| 49 |
+
) as f:
|
| 50 |
+
f.write(code)
|
| 51 |
+
tmp_path = f.name
|
| 52 |
+
|
| 53 |
result = subprocess.run(
|
| 54 |
+
["bandit", "-r", tmp_path, "-f", "json", "-q", "--exit-zero"],
|
| 55 |
capture_output=True, text=True, timeout=15,
|
| 56 |
)
|
| 57 |
+
|
| 58 |
try:
|
| 59 |
+
data = json.loads(result.stdout or '{"results":[]}')
|
| 60 |
except json.JSONDecodeError:
|
| 61 |
+
return {"score": 1.0, "issues": [], "note": "bandit output parse error"}
|
| 62 |
|
| 63 |
issues = data.get("results", [])
|
| 64 |
penalty = 0.0
|
|
|
|
| 74 |
score = max(0.0, 1.0 - penalty)
|
| 75 |
return {
|
| 76 |
"score": round(score, 4),
|
| 77 |
+
"issues": [
|
| 78 |
+
{
|
| 79 |
+
"severity": i.get("issue_severity"),
|
| 80 |
+
"text": i.get("issue_text", "")[:100],
|
| 81 |
+
"line": i.get("line_number"),
|
| 82 |
+
"cwe": i.get("issue_cwe", {}).get("id") if isinstance(i.get("issue_cwe"), dict) else None,
|
| 83 |
+
}
|
| 84 |
+
for i in issues[:5]
|
| 85 |
+
],
|
| 86 |
}
|
| 87 |
except FileNotFoundError:
|
| 88 |
# bandit not installed β skip gracefully
|
| 89 |
+
return {"score": 1.0, "issues": [], "note": "bandit not available"}
|
| 90 |
except subprocess.TimeoutExpired:
|
| 91 |
+
return {"score": 0.8, "issues": [], "note": "bandit timed out"}
|
| 92 |
+
except Exception as e:
|
| 93 |
+
return {"score": 1.0, "issues": [], "note": f"bandit error: {str(e)[:50]}"}
|
| 94 |
finally:
|
| 95 |
+
if tmp_path and os.path.exists(tmp_path):
|
| 96 |
+
try:
|
| 97 |
+
os.unlink(tmp_path)
|
| 98 |
+
except OSError:
|
| 99 |
+
pass
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _run_ast_checks(code: str, task: dict) -> dict:
|
| 103 |
+
"""
|
| 104 |
+
AST-based security checks tailored to the task's security_checks config.
|
| 105 |
+
Falls back to generic anti-pattern detection.
|
| 106 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
issues = []
|
| 108 |
+
checks_passed = 0
|
| 109 |
+
total_checks = 0
|
| 110 |
+
|
| 111 |
+
# Generic dangerous pattern checks (always run)
|
| 112 |
+
generic_checks = [
|
| 113 |
+
("no_eval", ["eval(", "exec("], "Dangerous eval/exec usage detected"),
|
| 114 |
+
("no_shell_true", ["shell=True"], "shell=True enables command injection"),
|
| 115 |
+
("no_pickle", ["pickle.loads", "pickle.load"], "Unsafe pickle deserialization"),
|
| 116 |
+
("no_yaml_unsafe", ["yaml.load(", "yaml.unsafe_load"], "Unsafe YAML load"),
|
| 117 |
+
("no_hardcoded_md5", ["hashlib.md5", "md5("], "Weak MD5 hash function"),
|
| 118 |
+
("no_hardcoded_sha1", ["hashlib.sha1", "sha1("], "Weak SHA1 hash function"),
|
| 119 |
+
]
|
| 120 |
+
|
| 121 |
+
for check_name, patterns, message in generic_checks:
|
| 122 |
+
total_checks += 1
|
| 123 |
+
found = any(p in code for p in patterns)
|
| 124 |
+
if found:
|
| 125 |
+
issues.append({"check": check_name, "message": message, "severity": "HIGH"})
|
| 126 |
else:
|
| 127 |
+
checks_passed += 1
|
| 128 |
+
|
| 129 |
+
# Task-specific checks
|
| 130 |
+
task_checks = task.get("security_checks", [])
|
| 131 |
+
for check in task_checks:
|
| 132 |
+
total_checks += 1
|
| 133 |
+
check_type = check.get("type", "")
|
| 134 |
+
|
| 135 |
+
if check_type == "no_weak_hash":
|
| 136 |
+
forbidden = check.get("forbidden", [])
|
| 137 |
+
found = any(f in code for f in forbidden)
|
| 138 |
+
if found:
|
| 139 |
+
issues.append({"check": "weak_hash", "message": f"Weak hash used: {[f for f in forbidden if f in code]}", "severity": "HIGH"})
|
| 140 |
+
else:
|
| 141 |
+
checks_passed += 1
|
| 142 |
|
| 143 |
+
elif check_type == "uses_bcrypt":
|
| 144 |
+
if "bcrypt" in code:
|
| 145 |
+
checks_passed += 1
|
| 146 |
+
else:
|
| 147 |
+
issues.append({"check": "uses_bcrypt", "message": "bcrypt not used β passwords will be weakly hashed", "severity": "HIGH"})
|
| 148 |
|
| 149 |
+
elif check_type == "uses_secrets":
|
| 150 |
+
if "secrets" in code:
|
| 151 |
+
checks_passed += 1
|
| 152 |
+
else:
|
| 153 |
+
issues.append({"check": "uses_secrets", "message": "secrets module not used β randomness may be insecure", "severity": "MEDIUM"})
|
| 154 |
|
| 155 |
+
elif check_type == "no_weak_random":
|
| 156 |
+
forbidden = check.get("forbidden", ["random.random(", "random.randint("])
|
| 157 |
+
found = any(f in code for f in forbidden)
|
| 158 |
+
if found:
|
| 159 |
+
issues.append({"check": "weak_random", "message": "Weak PRNG (random module) used for security-sensitive operation", "severity": "HIGH"})
|
| 160 |
+
else:
|
| 161 |
+
checks_passed += 1
|
| 162 |
|
| 163 |
+
elif check_type == "no_string_format_sql":
|
| 164 |
+
forbidden = check.get("forbidden", [])
|
| 165 |
+
found = any(f in code for f in forbidden)
|
| 166 |
+
if found:
|
| 167 |
+
issues.append({"check": "sql_injection", "message": "String formatting used in SQL query οΏ½οΏ½οΏ½ SQL injection risk", "severity": "HIGH"})
|
| 168 |
+
else:
|
| 169 |
+
checks_passed += 1
|
| 170 |
|
| 171 |
+
elif check_type == "uses_hmac_compare_digest":
|
| 172 |
+
if "hmac.compare_digest" in code:
|
| 173 |
+
checks_passed += 1
|
| 174 |
+
else:
|
| 175 |
+
issues.append({"check": "timing_attack", "message": "hmac.compare_digest not used β timing attack possible", "severity": "MEDIUM"})
|
| 176 |
|
| 177 |
+
elif check_type == "no_verify_false":
|
| 178 |
+
forbidden = check.get("forbidden", [])
|
| 179 |
+
found = any(f in code for f in forbidden)
|
| 180 |
+
if found:
|
| 181 |
+
issues.append({"check": "jwt_no_verify", "message": "JWT signature verification disabled", "severity": "HIGH"})
|
| 182 |
+
else:
|
| 183 |
+
checks_passed += 1
|
|
|
|
| 184 |
|
| 185 |
+
elif check_type == "algorithm_specified":
|
| 186 |
+
required = check.get("required", [])
|
| 187 |
+
found = any(r in code for r in required)
|
| 188 |
+
if found:
|
| 189 |
+
checks_passed += 1
|
| 190 |
+
else:
|
| 191 |
+
issues.append({"check": "jwt_alg", "message": "JWT algorithms= not specified β alg:none attack possible", "severity": "HIGH"})
|
| 192 |
+
|
| 193 |
+
score = checks_passed / max(total_checks, 1)
|
| 194 |
+
return {"score": round(score, 4), "issues": issues}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def _static_feedback(score: float, issues: list) -> str:
|
| 198 |
+
if score >= 0.9:
|
| 199 |
+
return f"Clean β no significant security issues found"
|
| 200 |
+
high = sum(1 for i in issues if i.get("severity") == "HIGH")
|
| 201 |
+
medium = sum(1 for i in issues if i.get("severity") == "MEDIUM")
|
| 202 |
+
if high > 0:
|
| 203 |
+
return f"{high} HIGH severity issue(s) found β immediate fix needed"
|
| 204 |
+
if medium > 0:
|
| 205 |
+
return f"{medium} MEDIUM severity issue(s) found β review recommended"
|
| 206 |
+
return f"Some minor issues found (score: {score:.2f})"
|
graders/supply_chain.py
DELETED
|
@@ -1,99 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
graders/supply_chain.py β Supply chain security grader (NEW in V2).
|
| 3 |
-
Weight: 2% of total reward.
|
| 4 |
-
|
| 5 |
-
V1 flaw: an agent could "solve" a task by importing a typosquatted or
|
| 6 |
-
known-vulnerable package. This grader catches that.
|
| 7 |
-
|
| 8 |
-
Checks:
|
| 9 |
-
1. KNOWN_TYPOSQUATS β common misspellings of popular packages
|
| 10 |
-
2. KNOWN_DANGEROUS β packages known to have been malicious
|
| 11 |
-
3. pip-audit β PyPI advisory database (when available)
|
| 12 |
-
"""
|
| 13 |
-
import ast
|
| 14 |
-
import re
|
| 15 |
-
from typing import Dict, Any, List
|
| 16 |
-
|
| 17 |
-
KNOWN_TYPOSQUATS = {
|
| 18 |
-
# requests misspellings
|
| 19 |
-
"reqeusts", "requets", "reqests", "requestss",
|
| 20 |
-
# urllib3
|
| 21 |
-
"urlib3", "urllib3s", "urllib",
|
| 22 |
-
# cryptography
|
| 23 |
-
"crpytography", "cryptograpy", "cyptography",
|
| 24 |
-
# pyyaml
|
| 25 |
-
"pyymal", "pyamml", "pyaml",
|
| 26 |
-
# setuptools
|
| 27 |
-
"setuptool", "setup-tools",
|
| 28 |
-
# numpy
|
| 29 |
-
"numppy", "numy",
|
| 30 |
-
# pillow
|
| 31 |
-
"pillo", "pil2",
|
| 32 |
-
# flask
|
| 33 |
-
"falsk", "flaask",
|
| 34 |
-
# django
|
| 35 |
-
"djano", "djangoo",
|
| 36 |
-
}
|
| 37 |
-
|
| 38 |
-
KNOWN_DANGEROUS = {
|
| 39 |
-
"malicious", "evilpackage", "xss-package",
|
| 40 |
-
"colourama", # typosquat of colorama
|
| 41 |
-
"python-dateutil2",
|
| 42 |
-
"urllib-parse",
|
| 43 |
-
}
|
| 44 |
-
|
| 45 |
-
STDLIB_SAFE = {
|
| 46 |
-
"os", "sys", "json", "re", "ast", "io", "typing", "collections",
|
| 47 |
-
"hashlib", "hmac", "secrets", "subprocess", "tempfile", "pathlib",
|
| 48 |
-
"sqlite3", "time", "datetime", "functools", "itertools", "math",
|
| 49 |
-
"string", "struct", "base64", "urllib", "http", "email", "logging",
|
| 50 |
-
"unittest", "abc", "contextlib", "dataclasses", "enum", "uuid",
|
| 51 |
-
"socket", "ssl", "threading", "multiprocessing", "asyncio",
|
| 52 |
-
"tracemalloc", "timeit", "cProfile", "pprint", "textwrap",
|
| 53 |
-
}
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def extract_imports(code: str) -> List[str]:
|
| 57 |
-
try:
|
| 58 |
-
tree = ast.parse(code)
|
| 59 |
-
except SyntaxError:
|
| 60 |
-
# Fallback: regex
|
| 61 |
-
matches = re.findall(r'^\s*import\s+(\w+)|^\s*from\s+(\w+)', code, re.MULTILINE)
|
| 62 |
-
return list({m[0] or m[1] for m in matches if m[0] or m[1]})
|
| 63 |
-
|
| 64 |
-
packages = []
|
| 65 |
-
for node in ast.walk(tree):
|
| 66 |
-
if isinstance(node, ast.Import):
|
| 67 |
-
packages += [a.name.split(".")[0] for a in node.names]
|
| 68 |
-
elif isinstance(node, ast.ImportFrom) and node.module:
|
| 69 |
-
packages.append(node.module.split(".")[0])
|
| 70 |
-
return list(set(packages))
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
def grade_supply_chain(code: str) -> Dict[str, Any]:
|
| 74 |
-
packages = extract_imports(code)
|
| 75 |
-
flagged = []
|
| 76 |
-
penalty = 0.0
|
| 77 |
-
|
| 78 |
-
for pkg in packages:
|
| 79 |
-
pkg_lower = pkg.lower()
|
| 80 |
-
if pkg_lower in KNOWN_TYPOSQUATS:
|
| 81 |
-
flagged.append({"package": pkg, "reason": "typosquat"})
|
| 82 |
-
penalty += 0.5
|
| 83 |
-
elif pkg_lower in KNOWN_DANGEROUS:
|
| 84 |
-
flagged.append({"package": pkg, "reason": "known_malicious"})
|
| 85 |
-
penalty += 1.0
|
| 86 |
-
|
| 87 |
-
score = max(0.0, 1.0 - penalty)
|
| 88 |
-
|
| 89 |
-
if flagged:
|
| 90 |
-
feedback = f"Suspicious packages detected: {[f['package'] for f in flagged]}. Use well-known packages only."
|
| 91 |
-
else:
|
| 92 |
-
feedback = f"No suspicious imports detected. Checked {len(packages)} package(s)."
|
| 93 |
-
|
| 94 |
-
return {
|
| 95 |
-
"score": round(score, 4),
|
| 96 |
-
"feedback": feedback,
|
| 97 |
-
"flagged": flagged,
|
| 98 |
-
"packages_checked": packages,
|
| 99 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
inference.py
CHANGED
|
@@ -1,120 +1,58 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
- OpenAI: API_BASE_URL=https://api.openai.com/v1, MODEL_NAME=gpt-4o-mini
|
| 14 |
"""
|
| 15 |
import os
|
| 16 |
import json
|
| 17 |
import time
|
|
|
|
| 18 |
import requests
|
| 19 |
from openai import OpenAI
|
| 20 |
|
| 21 |
-
# ββ
|
| 22 |
-
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.
|
| 23 |
-
MODEL_NAME = os.environ.get("MODEL_NAME", "
|
| 24 |
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 25 |
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
# ββ System prompt βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
-
SYSTEM_PROMPT = """You are a Python security engineer writing production-ready, secure Python code.
|
| 31 |
-
|
| 32 |
-
When given a task, write ONLY the Python function β no explanations, no markdown fences, no comments outside the function.
|
| 33 |
-
|
| 34 |
-
Your code MUST:
|
| 35 |
-
1. Solve the problem correctly β handle None, empty string, boundary values
|
| 36 |
-
2. Resist security attacks: SQL injection, path traversal, auth bypass, XSS
|
| 37 |
-
3. Use PARAMETERISED queries β NEVER string-format user input into SQL
|
| 38 |
-
4. Validate and sanitise ALL inputs before use
|
| 39 |
-
5. Use proper type hints on all function signatures
|
| 40 |
-
6. Have a docstring explaining what the function does
|
| 41 |
-
7. Use try/except with specific exception types (not bare except)
|
| 42 |
-
8. Follow the naming and error-handling conventions shown in CODEBASE CONTEXT
|
| 43 |
-
9. Import only well-known standard library or PyPI packages
|
| 44 |
-
|
| 45 |
-
CRITICAL SECURITY RULES:
|
| 46 |
-
- SQL: always use cursor.execute(sql, (param,)) β never f-strings or % formatting
|
| 47 |
-
- Paths: always use Path.resolve() and check prefix against safe base directory
|
| 48 |
-
- JWT: always specify algorithms=["HS256"] explicitly
|
| 49 |
-
- Auth: always use hmac.compare_digest() for constant-time comparison
|
| 50 |
-
- Hashing: use SHA-256 or stronger β never MD5/SHA1
|
| 51 |
-
- Never use eval(), exec(), or subprocess with shell=True
|
| 52 |
-
"""
|
| 53 |
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
"imports": [i.split(".")[0] for i in comp.get("imports", [])][:15],
|
| 71 |
-
"uses_try_catch": comp.get("conventions", {}).get("uses_try_catch", False),
|
| 72 |
-
"uses_type_hints": comp.get("conventions", {}).get("uses_type_hints", False),
|
| 73 |
-
}
|
| 74 |
-
result = json.dumps(slim, indent=2)
|
| 75 |
-
if len(result) > limit:
|
| 76 |
-
for name in slim["components"]:
|
| 77 |
-
slim["components"][name].pop("imports", None)
|
| 78 |
-
result = json.dumps(slim, indent=2)[:limit]
|
| 79 |
-
return result
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def call_llm(messages: list, timeout_s: int = 60) -> str:
|
| 83 |
-
"""Call LLM with exponential backoff retry on rate limit."""
|
| 84 |
-
for attempt in range(3):
|
| 85 |
-
try:
|
| 86 |
-
resp = client.chat.completions.create(
|
| 87 |
-
model=MODEL_NAME,
|
| 88 |
-
messages=messages,
|
| 89 |
-
max_tokens=1024,
|
| 90 |
-
temperature=0.2,
|
| 91 |
-
)
|
| 92 |
-
return resp.choices[0].message.content.strip()
|
| 93 |
-
except Exception as e:
|
| 94 |
-
err_str = str(e).lower()
|
| 95 |
-
if "rate_limit" in err_str or "429" in err_str:
|
| 96 |
-
wait = 2 ** attempt
|
| 97 |
-
print(f" Rate limited. Waiting {wait}s...")
|
| 98 |
-
time.sleep(wait)
|
| 99 |
-
else:
|
| 100 |
-
raise
|
| 101 |
-
return ""
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
def strip_markdown(code: str) -> str:
|
| 105 |
-
"""Strip markdown code fences if LLM added them."""
|
| 106 |
-
if "```python" in code:
|
| 107 |
-
code = code.split("```python")[1].split("```")[0]
|
| 108 |
-
elif "```" in code:
|
| 109 |
-
parts = code.split("```")
|
| 110 |
-
if len(parts) >= 3:
|
| 111 |
-
code = parts[1]
|
| 112 |
-
return code.strip()
|
| 113 |
|
| 114 |
|
| 115 |
def run_episode(difficulty: str = "medium") -> dict:
|
| 116 |
-
"""Run one full
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
try:
|
| 119 |
reset_resp = requests.post(
|
| 120 |
f"{ENV_URL}/reset",
|
|
@@ -122,113 +60,179 @@ def run_episode(difficulty: str = "medium") -> dict:
|
|
| 122 |
timeout=30,
|
| 123 |
)
|
| 124 |
reset_resp.raise_for_status()
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
return {"task": "unknown", "scores": [], "final_score": 0.0, "improved": False}
|
| 129 |
|
|
|
|
| 130 |
sid = episode["session_id"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
scores_history = []
|
| 132 |
-
|
| 133 |
|
| 134 |
for step_num in range(5):
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
-
|
| 138 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 139 |
-
{"role": "user", "content": f"""Task: {episode['problem_statement']}
|
| 140 |
|
| 141 |
Security targets: {episode.get('cwe_targets', [])}
|
| 142 |
|
| 143 |
-
CODEBASE CONTEXT (follow these conventions exactly):
|
| 144 |
{context_str}
|
|
|
|
| 145 |
|
| 146 |
-
|
| 147 |
-
{episode.get('starter_code', '# Write your implementation here')}
|
| 148 |
|
| 149 |
-
|
|
|
|
|
|
|
| 150 |
]
|
| 151 |
|
|
|
|
| 152 |
try:
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
print(f" Step {step_num+1}: Empty response from LLM")
|
| 161 |
break
|
| 162 |
|
|
|
|
| 163 |
try:
|
| 164 |
step_resp = requests.post(
|
| 165 |
f"{ENV_URL}/step",
|
| 166 |
json={
|
| 167 |
"session_id": sid,
|
| 168 |
-
"task_id": episode["task_id"],
|
| 169 |
-
"filename": f"solution_step{step_num}.py",
|
| 170 |
"code": code,
|
|
|
|
|
|
|
| 171 |
},
|
| 172 |
-
timeout=60,
|
| 173 |
)
|
| 174 |
step_resp.raise_for_status()
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
print(f" Step {step_num+1}: Submit error β {e}")
|
| 178 |
break
|
| 179 |
|
| 180 |
-
|
|
|
|
| 181 |
scores_history.append(reward)
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
print
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
-
#
|
| 189 |
episode["codegraph"] = result.get("codegraph", {})
|
| 190 |
|
| 191 |
-
|
| 192 |
-
|
| 193 |
|
| 194 |
-
final = scores_history[-1] if scores_history else 0.0
|
| 195 |
improved = len(scores_history) > 1 and scores_history[-1] > scores_history[0]
|
| 196 |
return {
|
| 197 |
-
"task":
|
|
|
|
| 198 |
"scores": scores_history,
|
| 199 |
-
"final_score":
|
| 200 |
"improved": improved,
|
|
|
|
| 201 |
}
|
| 202 |
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
for difficulty in ["easy", "medium", "hard"]:
|
| 215 |
-
print(f"\n{'='*20} {difficulty.upper()} {'='*20}")
|
| 216 |
r = run_episode(difficulty)
|
| 217 |
results.append(r)
|
|
|
|
|
|
|
| 218 |
|
| 219 |
elapsed = time.time() - start
|
| 220 |
|
| 221 |
-
|
| 222 |
-
print("
|
| 223 |
-
print("
|
|
|
|
|
|
|
| 224 |
for r in results:
|
| 225 |
-
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
|
| 229 |
-
print(f"\nMean final reward: {avg:.4f}")
|
| 230 |
-
print(f"Total time: {elapsed:.1f}s")
|
| 231 |
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Baseline Inference Script
|
| 3 |
+
Required by hackathon. Runs an LLM agent through the environment.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
export API_BASE_URL=https://api.openai.com/v1
|
| 7 |
+
export MODEL_NAME=gpt-4o-mini
|
| 8 |
+
export HF_TOKEN=hf_your_token
|
| 9 |
+
export ENV_URL=http://localhost:7860 # or your HF Space URL
|
| 10 |
+
python inference.py
|
| 11 |
+
|
| 12 |
+
Completes in under 20 minutes on 2 vCPU / 8GB RAM.
|
|
|
|
| 13 |
"""
|
| 14 |
import os
|
| 15 |
import json
|
| 16 |
import time
|
| 17 |
+
import sys
|
| 18 |
import requests
|
| 19 |
from openai import OpenAI
|
| 20 |
|
| 21 |
+
# ββ Required environment variables ββββββββββββββββββββββββββββββββββββββββββ
|
| 22 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 23 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
|
| 24 |
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 25 |
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
|
| 26 |
|
| 27 |
+
if not HF_TOKEN:
|
| 28 |
+
print("β οΈ HF_TOKEN not set. Some model endpoints may reject requests.", file=sys.stderr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "sk-placeholder")
|
| 31 |
|
| 32 |
+
# ββ System prompt βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
SYSTEM_PROMPT = """You are a senior Python security engineer.
|
| 34 |
+
You write production-ready, secure Python code with no shortcuts.
|
| 35 |
+
|
| 36 |
+
Rules:
|
| 37 |
+
1. Output ONLY raw Python code β no markdown fences, no explanations.
|
| 38 |
+
2. Never use: eval(), exec(), shell=True, hashlib.md5, random.random() for security.
|
| 39 |
+
3. Always use parameterized queries (never f-string SQL).
|
| 40 |
+
4. Use secrets module (not random) for tokens and session IDs.
|
| 41 |
+
5. Use bcrypt (not hashlib) for password hashing.
|
| 42 |
+
6. Use hmac.compare_digest for secret comparison (not ==).
|
| 43 |
+
7. Validate all inputs β handle None, empty string, type errors.
|
| 44 |
+
8. Add type hints and docstrings to every function.
|
| 45 |
+
9. Follow the naming and style conventions shown in CODEBASE CONTEXT.
|
| 46 |
+
10. Use pathlib.Path.resolve() for file path validation (not string checks)."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
def run_episode(difficulty: str = "medium") -> dict:
|
| 50 |
+
"""Run one full episode at the given difficulty and return results."""
|
| 51 |
+
print(f"\n{'='*60}")
|
| 52 |
+
print(f" Episode: {difficulty.upper()}")
|
| 53 |
+
print(f"{'='*60}")
|
| 54 |
+
|
| 55 |
+
# ββ Step 1: Reset environment βββββββββββββββββββββββββββββββββββββββββ
|
| 56 |
try:
|
| 57 |
reset_resp = requests.post(
|
| 58 |
f"{ENV_URL}/reset",
|
|
|
|
| 60 |
timeout=30,
|
| 61 |
)
|
| 62 |
reset_resp.raise_for_status()
|
| 63 |
+
except requests.RequestException as e:
|
| 64 |
+
print(f"β /reset failed: {e}")
|
| 65 |
+
return {"task": "unknown", "scores": [], "final_score": 0.0, "improved": False, "error": str(e)}
|
|
|
|
| 66 |
|
| 67 |
+
episode = reset_resp.json()
|
| 68 |
sid = episode["session_id"]
|
| 69 |
+
task_id = episode["task_id"]
|
| 70 |
+
print(f" Task: {task_id}")
|
| 71 |
+
print(f" CWE targets: {episode.get('cwe_targets', [])}")
|
| 72 |
+
|
| 73 |
scores_history = []
|
| 74 |
+
prev_feedback = {}
|
| 75 |
|
| 76 |
for step_num in range(5):
|
| 77 |
+
# ββ Step 2: Build prompt ββββββββββββββββββββββββββββββββββββββββββ
|
| 78 |
+
context = episode.get("codegraph", {})
|
| 79 |
+
context_prompt = context.get("context_prompt", "")
|
| 80 |
+
# Cap context at 3000 chars to stay within token budget
|
| 81 |
+
context_str = context_prompt[:3000] if context_prompt else json.dumps(context, indent=2)[:2000]
|
| 82 |
+
|
| 83 |
+
feedback_str = ""
|
| 84 |
+
if prev_feedback:
|
| 85 |
+
feedback_str = "\n\nPREVIOUS ATTEMPT FEEDBACK:\n" + "\n".join(
|
| 86 |
+
f" {k}: {v}" for k, v in prev_feedback.items() if v
|
| 87 |
+
)
|
| 88 |
|
| 89 |
+
user_message = f"""Task: {episode['problem_statement']}
|
|
|
|
|
|
|
| 90 |
|
| 91 |
Security targets: {episode.get('cwe_targets', [])}
|
| 92 |
|
|
|
|
| 93 |
{context_str}
|
| 94 |
+
{feedback_str}
|
| 95 |
|
| 96 |
+
Write the complete Python implementation now:"""
|
|
|
|
| 97 |
|
| 98 |
+
messages = [
|
| 99 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 100 |
+
{"role": "user", "content": user_message},
|
| 101 |
]
|
| 102 |
|
| 103 |
+
# ββ Step 3: Call LLM ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 104 |
try:
|
| 105 |
+
response = client.chat.completions.create(
|
| 106 |
+
model=MODEL_NAME,
|
| 107 |
+
messages=messages,
|
| 108 |
+
max_tokens=1500,
|
| 109 |
+
temperature=0.1, # Low temperature for consistent, focused code
|
| 110 |
+
)
|
| 111 |
+
code = response.choices[0].message.content.strip()
|
| 112 |
+
|
| 113 |
+
# Strip markdown fences if model added them anyway
|
| 114 |
+
if code.startswith("```python"):
|
| 115 |
+
code = code[9:]
|
| 116 |
+
if code.startswith("```"):
|
| 117 |
+
code = code[3:]
|
| 118 |
+
if code.endswith("```"):
|
| 119 |
+
code = code[:-3]
|
| 120 |
+
code = code.strip()
|
| 121 |
|
| 122 |
+
except Exception as e:
|
| 123 |
+
print(f" β οΈ LLM call failed at step {step_num+1}: {e}")
|
|
|
|
| 124 |
break
|
| 125 |
|
| 126 |
+
# ββ Step 4: Submit to environment βββββββββββββββββββββββββββββββββ
|
| 127 |
try:
|
| 128 |
step_resp = requests.post(
|
| 129 |
f"{ENV_URL}/step",
|
| 130 |
json={
|
| 131 |
"session_id": sid,
|
|
|
|
|
|
|
| 132 |
"code": code,
|
| 133 |
+
"filename": f"solution_step{step_num}.py",
|
| 134 |
+
"task_id": task_id,
|
| 135 |
},
|
| 136 |
+
timeout=60, # Grading can take up to 60s (bandit + attacks)
|
| 137 |
)
|
| 138 |
step_resp.raise_for_status()
|
| 139 |
+
except requests.RequestException as e:
|
| 140 |
+
print(f" β οΈ /step failed: {e}")
|
|
|
|
| 141 |
break
|
| 142 |
|
| 143 |
+
result = step_resp.json()
|
| 144 |
+
reward = result["total_reward"]
|
| 145 |
scores_history.append(reward)
|
| 146 |
+
prev_feedback = result.get("feedback", {})
|
| 147 |
+
|
| 148 |
+
# Pretty print step result
|
| 149 |
+
scores = result.get("scores", {})
|
| 150 |
+
print(f"\n Step {step_num+1} β reward={reward:.3f}")
|
| 151 |
+
print(f" correctness={scores.get('correctness',0):.2f} "
|
| 152 |
+
f"attack={scores.get('attack_resist',0):.2f} "
|
| 153 |
+
f"static={scores.get('static_security',0):.2f} "
|
| 154 |
+
f"consistency={scores.get('consistency',0):.2f}")
|
| 155 |
+
print(f" summary: {prev_feedback.get('summary', '')}")
|
| 156 |
+
|
| 157 |
+
if result["done"]:
|
| 158 |
+
print(f"\n β
Episode complete in {step_num+1} steps!")
|
| 159 |
+
break
|
| 160 |
|
| 161 |
+
# Feed updated CodeGraph back for next step
|
| 162 |
episode["codegraph"] = result.get("codegraph", {})
|
| 163 |
|
| 164 |
+
if not scores_history:
|
| 165 |
+
scores_history = [0.0]
|
| 166 |
|
|
|
|
| 167 |
improved = len(scores_history) > 1 and scores_history[-1] > scores_history[0]
|
| 168 |
return {
|
| 169 |
+
"task": task_id,
|
| 170 |
+
"difficulty": difficulty,
|
| 171 |
"scores": scores_history,
|
| 172 |
+
"final_score": scores_history[-1],
|
| 173 |
"improved": improved,
|
| 174 |
+
"steps": len(scores_history),
|
| 175 |
}
|
| 176 |
|
| 177 |
|
| 178 |
+
def main():
|
| 179 |
+
"""Run one episode per difficulty and print aggregate results."""
|
| 180 |
+
print(f"\n{'='*60}")
|
| 181 |
+
print(f" SecureCodeEnv β Baseline Inference")
|
| 182 |
+
print(f" Model: {MODEL_NAME}")
|
| 183 |
+
print(f" Env: {ENV_URL}")
|
| 184 |
+
print(f"{'='*60}")
|
| 185 |
|
| 186 |
+
# Verify environment is up
|
| 187 |
+
try:
|
| 188 |
+
health = requests.get(f"{ENV_URL}/health", timeout=10)
|
| 189 |
+
health.raise_for_status()
|
| 190 |
+
print(f"\n β
Environment healthy: {health.json()}")
|
| 191 |
+
except Exception as e:
|
| 192 |
+
print(f"\n β Environment not reachable at {ENV_URL}: {e}")
|
| 193 |
+
print(" Start the server: uvicorn app.main:app --host 0.0.0.0 --port 7860")
|
| 194 |
+
sys.exit(1)
|
| 195 |
+
|
| 196 |
+
results = []
|
| 197 |
+
start = time.time()
|
| 198 |
|
| 199 |
for difficulty in ["easy", "medium", "hard"]:
|
|
|
|
| 200 |
r = run_episode(difficulty)
|
| 201 |
results.append(r)
|
| 202 |
+
# Small pause between episodes
|
| 203 |
+
time.sleep(1)
|
| 204 |
|
| 205 |
elapsed = time.time() - start
|
| 206 |
|
| 207 |
+
# ββ Final report βββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββββ
|
| 208 |
+
print(f"\n{'='*60}")
|
| 209 |
+
print(f" FINAL RESULTS ({elapsed:.1f}s total)")
|
| 210 |
+
print(f"{'='*60}")
|
| 211 |
+
|
| 212 |
for r in results:
|
| 213 |
+
status = "β
" if r["final_score"] >= 0.7 else "β οΈ " if r["final_score"] >= 0.4 else "β"
|
| 214 |
+
improved_str = "β improved" if r.get("improved") else "β"
|
| 215 |
+
print(f" {status} {r['task']:45s} {r['final_score']:.3f} {improved_str}")
|
| 216 |
+
|
| 217 |
+
valid_scores = [r["final_score"] for r in results]
|
| 218 |
+
avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0
|
| 219 |
+
print(f"\n Average final score: {avg:.3f}")
|
| 220 |
+
print(f" Scores: {[round(s, 3) for s in valid_scores]}")
|
| 221 |
+
|
| 222 |
+
# Write machine-readable results
|
| 223 |
+
output = {
|
| 224 |
+
"model": MODEL_NAME,
|
| 225 |
+
"env_url": ENV_URL,
|
| 226 |
+
"elapsed_seconds": round(elapsed, 1),
|
| 227 |
+
"results": results,
|
| 228 |
+
"average_score": round(avg, 4),
|
| 229 |
+
}
|
| 230 |
+
with open("inference_results.json", "w") as f:
|
| 231 |
+
json.dump(output, f, indent=2)
|
| 232 |
+
print(f"\n Results saved to inference_results.json")
|
| 233 |
|
| 234 |
+
return 0 if avg >= 0.4 else 1
|
|
|
|
|
|
|
| 235 |
|
| 236 |
+
|
| 237 |
+
if __name__ == "__main__":
|
| 238 |
+
sys.exit(main())
|
openenv.yaml
CHANGED
|
@@ -1,146 +1,141 @@
|
|
| 1 |
-
# openenv.yaml β OpenEnv specification (required by hackathon)
|
| 2 |
-
# SecureCodeEnv V2 β Production-Ready Secure Code Generation RL Environment
|
| 3 |
-
# Author: Vishal Dhakad (vishaldhakad)
|
| 4 |
-
# Meta Γ HuggingFace OpenEnv Hackathon 2026
|
| 5 |
-
|
| 6 |
name: SecureCodeEnv
|
| 7 |
-
version: "2.0"
|
| 8 |
description: >
|
| 9 |
-
RL environment for training LLM agents to write production-ready,
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
hf_space: vishaldhakad/SecureCodeEnv
|
| 16 |
-
|
| 17 |
-
server:
|
| 18 |
-
host: 0.0.0.0
|
| 19 |
-
port: 7860
|
| 20 |
-
workers: 2
|
| 21 |
-
|
| 22 |
-
endpoints:
|
| 23 |
-
reset:
|
| 24 |
-
method: POST
|
| 25 |
-
path: /reset
|
| 26 |
-
description: >
|
| 27 |
-
Start new episode. Picks task at given difficulty, initialises CodeGraph,
|
| 28 |
-
creates Redis-backed session. Returns task, starter code, CodeGraph, session_id.
|
| 29 |
-
params:
|
| 30 |
-
difficulty: "easy | medium | hard (default: medium)"
|
| 31 |
-
session_id: "optional UUID β generated if not provided"
|
| 32 |
-
|
| 33 |
-
step:
|
| 34 |
-
method: POST
|
| 35 |
-
path: /step
|
| 36 |
-
description: >
|
| 37 |
-
Submit agent code. Runs all 8 graders (correctness, behavioral attacks,
|
| 38 |
-
static analysis, consistency, performance, documentation, code structure,
|
| 39 |
-
supply chain). Updates CodeGraph. Returns weighted reward + per-grader feedback.
|
| 40 |
-
body:
|
| 41 |
-
code: "Python source code string"
|
| 42 |
-
filename: "logical filename for CodeGraph tracking"
|
| 43 |
-
task_id: "task identifier from /reset"
|
| 44 |
-
session_id: "UUID from /reset"
|
| 45 |
-
|
| 46 |
-
state:
|
| 47 |
-
method: GET
|
| 48 |
-
path: /state
|
| 49 |
-
description: Read current episode state without advancing it.
|
| 50 |
-
params:
|
| 51 |
-
session_id: "UUID from /reset"
|
| 52 |
|
| 53 |
action_space:
|
| 54 |
type: text
|
| 55 |
-
description: Python
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
observation_space:
|
| 61 |
-
type:
|
| 62 |
fields:
|
| 63 |
- name: total_reward
|
| 64 |
type: float
|
| 65 |
range: [0.0, 1.0]
|
| 66 |
-
description: Weighted
|
| 67 |
- name: scores
|
| 68 |
type: dict
|
| 69 |
-
description:
|
|
|
|
|
|
|
| 70 |
- name: feedback
|
| 71 |
type: dict
|
| 72 |
-
description: Human-readable feedback per
|
| 73 |
- name: codegraph
|
| 74 |
type: dict
|
| 75 |
-
description:
|
|
|
|
|
|
|
| 76 |
- name: done
|
| 77 |
type: bool
|
| 78 |
-
description: True
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
reward:
|
| 81 |
type: multi_dimensional
|
| 82 |
range: [0.0, 1.0]
|
| 83 |
-
terminal: 0.90
|
| 84 |
-
max_steps: 5
|
| 85 |
dimensions:
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
tasks:
|
| 96 |
-
- id:
|
| 97 |
difficulty: easy
|
| 98 |
-
cwe: CWE-916
|
| 99 |
-
|
| 100 |
|
| 101 |
-
- id:
|
| 102 |
difficulty: easy
|
| 103 |
-
cwe: CWE-20
|
| 104 |
-
|
| 105 |
|
| 106 |
-
- id:
|
| 107 |
difficulty: easy
|
| 108 |
-
cwe: CWE-
|
| 109 |
-
|
| 110 |
|
| 111 |
-
- id:
|
| 112 |
difficulty: medium
|
| 113 |
-
cwe: CWE-89
|
| 114 |
-
|
| 115 |
|
| 116 |
-
- id:
|
| 117 |
difficulty: medium
|
| 118 |
-
cwe: CWE-22
|
| 119 |
-
|
| 120 |
|
| 121 |
-
- id:
|
| 122 |
difficulty: medium
|
| 123 |
-
cwe: CWE-
|
| 124 |
-
|
| 125 |
|
| 126 |
-
- id:
|
| 127 |
difficulty: hard
|
| 128 |
-
cwe: CWE-434
|
| 129 |
-
|
| 130 |
|
| 131 |
-
- id:
|
| 132 |
difficulty: hard
|
| 133 |
-
cwe: CWE-347
|
| 134 |
-
|
| 135 |
|
| 136 |
-
- id:
|
| 137 |
difficulty: hard
|
| 138 |
-
cwe: CWE-287
|
| 139 |
-
|
| 140 |
|
| 141 |
runtime:
|
| 142 |
max_steps_per_episode: 5
|
|
|
|
| 143 |
max_inference_time_minutes: 20
|
| 144 |
min_vcpu: 2
|
| 145 |
min_memory_gb: 8
|
| 146 |
port: 7860
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
name: SecureCodeEnv
|
| 2 |
+
version: "2.0.0"
|
| 3 |
description: >
|
| 4 |
+
An RL environment for training LLM agents to write production-ready,
|
| 5 |
+
secure Python code. Agents are graded on correctness, security attack
|
| 6 |
+
resistance (dynamic adversarial payloads), CWE-based static analysis,
|
| 7 |
+
performance, and codebase consistency via a novel CodeGraph memory system.
|
| 8 |
+
No other public OpenEnv environment combines attack simulation + codebase
|
| 9 |
+
consistency grading. All grading is 100% automated and deterministic.
|
| 10 |
+
|
| 11 |
+
author: Vishal Dhakad
|
| 12 |
hf_space: vishaldhakad/SecureCodeEnv
|
| 13 |
+
license: MIT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
action_space:
|
| 16 |
type: text
|
| 17 |
+
description: Python source code string submitted by the agent
|
| 18 |
+
fields:
|
| 19 |
+
- name: code
|
| 20 |
+
type: string
|
| 21 |
+
description: The complete Python function(s) to be graded
|
| 22 |
+
- name: filename
|
| 23 |
+
type: string
|
| 24 |
+
description: Logical filename for CodeGraph tracking (e.g. src/auth/validator.py)
|
| 25 |
+
- name: session_id
|
| 26 |
+
type: string
|
| 27 |
+
description: Session ID returned from /reset
|
| 28 |
|
| 29 |
observation_space:
|
| 30 |
+
type: structured
|
| 31 |
fields:
|
| 32 |
- name: total_reward
|
| 33 |
type: float
|
| 34 |
range: [0.0, 1.0]
|
| 35 |
+
description: Weighted final score across all 7 dimensions
|
| 36 |
- name: scores
|
| 37 |
type: dict
|
| 38 |
+
description: >
|
| 39 |
+
Per-dimension scores: correctness, attack_resist, static_security,
|
| 40 |
+
consistency, performance, documentation, code_structure
|
| 41 |
- name: feedback
|
| 42 |
type: dict
|
| 43 |
+
description: Human-readable feedback string per grading dimension
|
| 44 |
- name: codegraph
|
| 45 |
type: dict
|
| 46 |
+
description: >
|
| 47 |
+
Full codebase context including components, detected conventions,
|
| 48 |
+
dependency list, and natural-language context prompt for the agent
|
| 49 |
- name: done
|
| 50 |
type: bool
|
| 51 |
+
description: True if episode is complete (reward >= 0.90 or max steps reached)
|
| 52 |
+
- name: step_count
|
| 53 |
+
type: int
|
| 54 |
+
description: Current step number within the episode
|
| 55 |
|
| 56 |
reward:
|
| 57 |
type: multi_dimensional
|
| 58 |
range: [0.0, 1.0]
|
|
|
|
|
|
|
| 59 |
dimensions:
|
| 60 |
+
- name: correctness
|
| 61 |
+
weight: 0.30
|
| 62 |
+
description: Fraction of test cases passed (including edge cases)
|
| 63 |
+
- name: attack_resistance
|
| 64 |
+
weight: 0.20
|
| 65 |
+
description: Fraction of randomized adversarial payloads blocked
|
| 66 |
+
- name: static_security
|
| 67 |
+
weight: 0.15
|
| 68 |
+
description: bandit + AST security linter score (CWE-mapped)
|
| 69 |
+
- name: codegraph_consistency
|
| 70 |
+
weight: 0.15
|
| 71 |
+
description: Adherence to conventions from existing codebase components
|
| 72 |
+
- name: performance
|
| 73 |
+
weight: 0.10
|
| 74 |
+
description: Relative efficiency vs naive/optimal baselines (timeit)
|
| 75 |
+
- name: documentation
|
| 76 |
+
weight: 0.05
|
| 77 |
+
description: Docstring + type hint coverage across all functions
|
| 78 |
+
- name: code_structure
|
| 79 |
+
weight: 0.05
|
| 80 |
+
description: Clean code checks (no bare print, no bare except, etc.)
|
| 81 |
|
| 82 |
tasks:
|
| 83 |
+
- id: easy_password_validator
|
| 84 |
difficulty: easy
|
| 85 |
+
cwe: [CWE-916, CWE-521]
|
| 86 |
+
description: Validate password strength and hash with bcrypt (not MD5)
|
| 87 |
|
| 88 |
+
- id: easy_input_sanitizer
|
| 89 |
difficulty: easy
|
| 90 |
+
cwe: [CWE-20, CWE-116]
|
| 91 |
+
description: Sanitize HTML (XSS prevention) and filenames
|
| 92 |
|
| 93 |
+
- id: easy_token_generator
|
| 94 |
difficulty: easy
|
| 95 |
+
cwe: [CWE-338, CWE-330]
|
| 96 |
+
description: Generate cryptographically secure tokens using secrets module
|
| 97 |
|
| 98 |
+
- id: medium_sql_query_builder
|
| 99 |
difficulty: medium
|
| 100 |
+
cwe: [CWE-89, CWE-20]
|
| 101 |
+
description: Build parameterized SQL queries β never string-format user input
|
| 102 |
|
| 103 |
+
- id: medium_file_path_handler
|
| 104 |
difficulty: medium
|
| 105 |
+
cwe: [CWE-22, CWE-20]
|
| 106 |
+
description: Resolve file paths safely β block path traversal attacks
|
| 107 |
|
| 108 |
+
- id: medium_rate_limiter
|
| 109 |
difficulty: medium
|
| 110 |
+
cwe: [CWE-770, CWE-400]
|
| 111 |
+
description: Thread-safe sliding window rate limiter
|
| 112 |
|
| 113 |
+
- id: hard_file_upload_handler
|
| 114 |
difficulty: hard
|
| 115 |
+
cwe: [CWE-22, CWE-434]
|
| 116 |
+
description: Validate uploads β block traversal filenames, executable extensions, MIME spoofing
|
| 117 |
|
| 118 |
+
- id: hard_jwt_validator
|
| 119 |
difficulty: hard
|
| 120 |
+
cwe: [CWE-347, CWE-613]
|
| 121 |
+
description: Validate JWTs β enforce HS256, block none-alg attack, check expiry
|
| 122 |
|
| 123 |
+
- id: hard_auth_middleware
|
| 124 |
difficulty: hard
|
| 125 |
+
cwe: [CWE-287, CWE-352]
|
| 126 |
+
description: CSRF protection and Bearer auth using hmac.compare_digest (timing-safe)
|
| 127 |
|
| 128 |
runtime:
|
| 129 |
max_steps_per_episode: 5
|
| 130 |
+
done_reward_threshold: 0.90
|
| 131 |
max_inference_time_minutes: 20
|
| 132 |
min_vcpu: 2
|
| 133 |
min_memory_gb: 8
|
| 134 |
port: 7860
|
| 135 |
+
|
| 136 |
+
endpoints:
|
| 137 |
+
health: GET /health
|
| 138 |
+
reset: POST /reset
|
| 139 |
+
step: POST /step
|
| 140 |
+
state: GET /state
|
| 141 |
+
docs: GET /docs
|
requirements.txt
CHANGED
|
@@ -1,33 +1,10 @@
|
|
| 1 |
-
# requirements.txt β SecureCodeEnv V2
|
| 2 |
-
# All versions pinned for reproducibility
|
| 3 |
-
|
| 4 |
-
# ββ Web framework βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 5 |
fastapi==0.115.0
|
| 6 |
-
uvicorn
|
| 7 |
pydantic==2.7.0
|
|
|
|
|
|
|
|
|
|
| 8 |
python-multipart==0.0.9
|
| 9 |
-
|
| 10 |
-
# ββ Session persistence βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 11 |
-
redis==5.0.4
|
| 12 |
-
|
| 13 |
-
# ββ Security analysis βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 14 |
-
bandit==1.7.9
|
| 15 |
-
semgrep==1.75.0
|
| 16 |
-
pip-audit==2.7.3
|
| 17 |
-
|
| 18 |
-
# ββ Multi-language parsing ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 19 |
-
tree-sitter==0.23.0
|
| 20 |
-
tree-sitter-python==0.23.0
|
| 21 |
-
tree-sitter-javascript==0.23.0
|
| 22 |
-
|
| 23 |
-
# ββ Cryptography / task dependencies βββββββββββββββββββββββββββββββββββββββββ
|
| 24 |
-
PyJWT==2.8.0
|
| 25 |
bcrypt==4.1.3
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
# ββ Inference script ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 29 |
-
openai==1.30.0
|
| 30 |
-
requests==2.32.3
|
| 31 |
-
|
| 32 |
-
# ββ OpenEnv framework βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
-
# openenv # Uncomment if published; scaffold manually otherwise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
fastapi==0.115.0
|
| 2 |
+
uvicorn==0.30.6
|
| 3 |
pydantic==2.7.0
|
| 4 |
+
bandit==1.7.10
|
| 5 |
+
openai==1.40.0
|
| 6 |
+
requests==2.32.3
|
| 7 |
python-multipart==0.0.9
|
| 8 |
+
httpx==0.27.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
bcrypt==4.1.3
|
| 10 |
+
PyJWT==2.8.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sandbox/__init__.py
CHANGED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# sandbox/__init__.py
|
|
|
|
|
|
sandbox/executor.py
CHANGED
|
@@ -1,121 +1,183 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
- Infinite loops blocking the server
|
| 6 |
-
- File system access
|
| 7 |
-
- Network exfiltration
|
| 8 |
-
- Process termination
|
| 9 |
-
|
| 10 |
-
Solution: write code to a temp file, run in a child subprocess with a hard
|
| 11 |
-
timeout. Docker network policy blocks external network. Main process never crashes.
|
| 12 |
"""
|
| 13 |
import subprocess
|
| 14 |
import tempfile
|
| 15 |
import os
|
| 16 |
import json
|
| 17 |
-
|
| 18 |
|
| 19 |
|
| 20 |
def safe_exec(
|
| 21 |
code: str,
|
| 22 |
-
test_input:
|
|
|
|
| 23 |
timeout: int = 5,
|
| 24 |
-
|
| 25 |
-
) -> Dict[str, Any]:
|
| 26 |
"""
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
Args:
|
| 30 |
-
code:
|
| 31 |
-
test_input:
|
| 32 |
-
|
| 33 |
-
|
| 34 |
|
| 35 |
Returns:
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
| 38 |
"""
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
path = f.name
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
proc = subprocess.run(
|
| 51 |
-
[
|
|
|
|
| 52 |
capture_output=True,
|
| 53 |
text=True,
|
| 54 |
timeout=timeout,
|
| 55 |
)
|
|
|
|
| 56 |
if proc.returncode == 0 and proc.stdout.strip():
|
| 57 |
try:
|
| 58 |
-
|
| 59 |
-
|
|
|
|
| 60 |
except json.JSONDecodeError:
|
| 61 |
-
return {"ok":
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
except subprocess.TimeoutExpired:
|
| 66 |
-
return {"ok": False, "error": "TIMEOUT β code
|
| 67 |
except Exception as e:
|
| 68 |
-
return {"ok": False, "error": f"
|
| 69 |
finally:
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
| 74 |
|
| 75 |
|
| 76 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
"""
|
| 78 |
-
|
| 79 |
-
Each test case: {"input": ..., "expected": ...}
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
"""
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
expected = tc.get("expected")
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
| 94 |
try:
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
print(json.dumps({{"
|
|
|
|
|
|
|
|
|
|
| 100 |
"""
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Sandbox Executor
|
| 3 |
+
Runs untrusted agent code in an isolated subprocess with hard resource limits.
|
| 4 |
+
NEVER executes agent code in the main process.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
import subprocess
|
| 7 |
import tempfile
|
| 8 |
import os
|
| 9 |
import json
|
| 10 |
+
import sys
|
| 11 |
|
| 12 |
|
| 13 |
def safe_exec(
|
| 14 |
code: str,
|
| 15 |
+
test_input: any,
|
| 16 |
+
function_name: str = "run_task",
|
| 17 |
timeout: int = 5,
|
| 18 |
+
) -> dict:
|
|
|
|
| 19 |
"""
|
| 20 |
+
Execute agent code in an isolated subprocess.
|
| 21 |
+
|
| 22 |
+
Security guarantees:
|
| 23 |
+
- 5 second timeout (kills hanging/infinite loop code)
|
| 24 |
+
- No network access (enforced by Docker network policy)
|
| 25 |
+
- Separate process β crash/exception cannot affect main server
|
| 26 |
+
- Tempfile is always cleaned up (finally block)
|
| 27 |
|
| 28 |
Args:
|
| 29 |
+
code: Python source code string from the agent
|
| 30 |
+
test_input: Input to pass to the function
|
| 31 |
+
function_name: Name of the function to call in the code
|
| 32 |
+
timeout: Max seconds before SIGKILL
|
| 33 |
|
| 34 |
Returns:
|
| 35 |
+
dict with keys:
|
| 36 |
+
ok: bool - True if execution succeeded
|
| 37 |
+
output: any - Return value of the function (if ok)
|
| 38 |
+
error: str - Error message (if not ok)
|
| 39 |
+
stdout: str - Any print output (for debugging)
|
| 40 |
"""
|
| 41 |
+
# Build the harness script that wraps agent code
|
| 42 |
+
harness = f"""
|
| 43 |
+
import json
|
| 44 |
+
import sys
|
| 45 |
+
import traceback
|
| 46 |
+
|
| 47 |
+
# ββ Agent code ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 48 |
+
{code}
|
|
|
|
| 49 |
|
| 50 |
+
# ββ Test harness βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 51 |
+
try:
|
| 52 |
+
_input = json.loads(sys.stdin.read())
|
| 53 |
+
_fn = {function_name}
|
| 54 |
+
_result = _fn(*_input) if isinstance(_input, list) else _fn(_input)
|
| 55 |
+
print(json.dumps({{"ok": True, "output": _result}}))
|
| 56 |
+
except Exception as _e:
|
| 57 |
+
print(json.dumps({{"ok": False, "error": str(_e), "type": type(_e).__name__}}))
|
| 58 |
+
"""
|
| 59 |
+
|
| 60 |
+
tmp_path = None
|
| 61 |
try:
|
| 62 |
+
with tempfile.NamedTemporaryFile(
|
| 63 |
+
mode="w", suffix=".py", delete=False, prefix="sce_exec_"
|
| 64 |
+
) as f:
|
| 65 |
+
f.write(harness)
|
| 66 |
+
tmp_path = f.name
|
| 67 |
+
|
| 68 |
proc = subprocess.run(
|
| 69 |
+
[sys.executable, tmp_path],
|
| 70 |
+
input=json.dumps(test_input),
|
| 71 |
capture_output=True,
|
| 72 |
text=True,
|
| 73 |
timeout=timeout,
|
| 74 |
)
|
| 75 |
+
|
| 76 |
if proc.returncode == 0 and proc.stdout.strip():
|
| 77 |
try:
|
| 78 |
+
result = json.loads(proc.stdout.strip().split("\n")[-1])
|
| 79 |
+
result["stdout"] = proc.stdout
|
| 80 |
+
return result
|
| 81 |
except json.JSONDecodeError:
|
| 82 |
+
return {"ok": False, "error": f"Non-JSON output: {proc.stdout[:200]}", "stdout": proc.stdout}
|
| 83 |
+
|
| 84 |
+
return {
|
| 85 |
+
"ok": False,
|
| 86 |
+
"error": proc.stderr[:500] if proc.stderr else "No output produced",
|
| 87 |
+
"stdout": proc.stdout,
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
except subprocess.TimeoutExpired:
|
| 91 |
+
return {"ok": False, "error": "TIMEOUT β code exceeded time limit", "stdout": ""}
|
| 92 |
except Exception as e:
|
| 93 |
+
return {"ok": False, "error": f"Executor error: {str(e)}", "stdout": ""}
|
| 94 |
finally:
|
| 95 |
+
if tmp_path and os.path.exists(tmp_path):
|
| 96 |
+
try:
|
| 97 |
+
os.unlink(tmp_path)
|
| 98 |
+
except OSError:
|
| 99 |
+
pass
|
| 100 |
|
| 101 |
|
| 102 |
+
def safe_exec_with_side_effect_monitor(
|
| 103 |
+
code: str,
|
| 104 |
+
test_input: any,
|
| 105 |
+
function_name: str,
|
| 106 |
+
side_effect_checks: list[dict],
|
| 107 |
+
timeout: int = 5,
|
| 108 |
+
) -> dict:
|
| 109 |
"""
|
| 110 |
+
V2: Behavioral harness that monitors side effects, not just return values.
|
|
|
|
| 111 |
|
| 112 |
+
For SQL injection checks: monitors what query strings are constructed,
|
| 113 |
+
not just what is returned. Uses sys.settrace + sqlite3 cursor spy pattern.
|
| 114 |
+
|
| 115 |
+
side_effect_checks: list of {
|
| 116 |
+
"type": "sql_no_concat" | "no_file_write" | "no_env_read",
|
| 117 |
+
...
|
| 118 |
+
}
|
| 119 |
"""
|
| 120 |
+
monitor_code = _build_monitor_code(side_effect_checks)
|
| 121 |
+
|
| 122 |
+
harness = f"""
|
| 123 |
+
import json
|
| 124 |
+
import sys
|
| 125 |
+
import traceback
|
| 126 |
|
| 127 |
+
# ββ Monitor injection ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 128 |
+
{monitor_code}
|
|
|
|
| 129 |
|
| 130 |
+
# ββ Agent code ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 131 |
+
{code}
|
| 132 |
+
|
| 133 |
+
# ββ Test harness ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 134 |
try:
|
| 135 |
+
_input = json.loads(sys.stdin.read())
|
| 136 |
+
_fn = {function_name}
|
| 137 |
+
_result = _fn(*_input) if isinstance(_input, list) else _fn(_input)
|
| 138 |
+
_violations = get_violations()
|
| 139 |
+
print(json.dumps({{"ok": True, "output": _result, "violations": _violations}}))
|
| 140 |
+
except Exception as _e:
|
| 141 |
+
_violations = get_violations() if 'get_violations' in dir() else []
|
| 142 |
+
print(json.dumps({{"ok": False, "error": str(_e), "violations": _violations}}))
|
| 143 |
"""
|
| 144 |
+
|
| 145 |
+
tmp_path = None
|
| 146 |
+
try:
|
| 147 |
+
with tempfile.NamedTemporaryFile(
|
| 148 |
+
mode="w", suffix=".py", delete=False, prefix="sce_monitor_"
|
| 149 |
+
) as f:
|
| 150 |
+
f.write(harness)
|
| 151 |
+
tmp_path = f.name
|
| 152 |
+
|
| 153 |
+
proc = subprocess.run(
|
| 154 |
+
[sys.executable, tmp_path],
|
| 155 |
+
input=json.dumps(test_input),
|
| 156 |
+
capture_output=True,
|
| 157 |
+
text=True,
|
| 158 |
+
timeout=timeout,
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
if proc.stdout.strip():
|
| 162 |
+
try:
|
| 163 |
+
return json.loads(proc.stdout.strip().split("\n")[-1])
|
| 164 |
+
except json.JSONDecodeError:
|
| 165 |
+
pass
|
| 166 |
+
|
| 167 |
+
return {"ok": False, "error": proc.stderr[:300], "violations": []}
|
| 168 |
+
|
| 169 |
+
except subprocess.TimeoutExpired:
|
| 170 |
+
return {"ok": False, "error": "TIMEOUT", "violations": []}
|
| 171 |
+
finally:
|
| 172 |
+
if tmp_path and os.path.exists(tmp_path):
|
| 173 |
+
try:
|
| 174 |
+
os.unlink(tmp_path)
|
| 175 |
+
except OSError:
|
| 176 |
+
pass
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def _build_monitor_code(checks: list[dict]) -> str:
|
| 180 |
+
"""Generate monitoring boilerplate based on requested side-effect checks."""
|
| 181 |
+
lines = ["_VIOLATIONS = []", ""]
|
| 182 |
+
lines.append("def get_violations(): return _VIOLATIONS")
|
| 183 |
+
return "\n".join(lines)
|
sandbox/payload_gen.py
CHANGED
|
@@ -1,171 +1,133 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
URL encode, double encode, HTML entities, Unicode confusables,
|
| 7 |
-
SQL comment bypass, quote doubling, operator swap, nested traversal,
|
| 8 |
-
slash encoding, fullwidth chars, case, suffix injection.
|
| 9 |
-
|
| 10 |
-
Result: exponential evasion space β agent cannot memorise specific payloads.
|
| 11 |
-
Payloads are seeded per episode (episode_seed) AND per step (+ step).
|
| 12 |
"""
|
| 13 |
import random
|
| 14 |
import urllib.parse
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"' OR '1'='1",
|
| 19 |
-
"admin'--",
|
| 20 |
-
"1 UNION SELECT * FROM users",
|
| 21 |
"'; DROP TABLE users;--",
|
|
|
|
|
|
|
| 22 |
"' OR 1=1--",
|
| 23 |
-
"1' AND SLEEP(5)--",
|
| 24 |
-
"x' OR name LIKE '%",
|
| 25 |
-
"' UNION SELECT null,null,null--",
|
| 26 |
"\" OR \"1\"=\"1",
|
| 27 |
-
"
|
| 28 |
-
"
|
|
|
|
|
|
|
| 29 |
],
|
| 30 |
-
"
|
| 31 |
"../../etc/passwd",
|
| 32 |
"../../../root/.ssh/id_rsa",
|
| 33 |
"/etc/shadow",
|
| 34 |
"....//....//etc/passwd",
|
| 35 |
-
"%2e%2e
|
| 36 |
"..%2F..%2Fetc%2Fpasswd",
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
],
|
| 41 |
"xss": [
|
| 42 |
-
"<script>alert(
|
| 43 |
"<img src=x onerror=alert(1)>",
|
| 44 |
"javascript:alert(1)",
|
| 45 |
"<svg onload=alert(1)>",
|
| 46 |
-
"'><script>alert(document.cookie)</script>",
|
| 47 |
-
"<iframe src=javascript:alert(1)>",
|
| 48 |
-
"
|
| 49 |
-
"
|
| 50 |
-
],
|
| 51 |
-
"jwt_bypass": [
|
| 52 |
-
'{"alg":"none"}.payload.',
|
| 53 |
-
'{"exp":1000}.payload.',
|
| 54 |
-
'eyJhbGciOiJub25lIn0.eyJzdWIiOiJhZG1pbiJ9.',
|
| 55 |
-
'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.tampered.fake_sig',
|
| 56 |
-
'eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIn0.',
|
| 57 |
-
'{"alg":"HS256"}.{"sub":"admin","role":"superuser"}.',
|
| 58 |
],
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"",
|
| 65 |
-
"a" * 1000, # DoS attempt
|
| 66 |
-
"password123",
|
| 67 |
-
"qwerty",
|
| 68 |
-
"111111",
|
| 69 |
-
"letmein",
|
| 70 |
],
|
| 71 |
-
"
|
| 72 |
-
|
| 73 |
-
"",
|
| 74 |
-
"
|
| 75 |
-
"
|
| 76 |
-
"test",
|
| 77 |
-
"hello world",
|
| 78 |
-
],
|
| 79 |
-
"rate_bypass": [
|
| 80 |
-
"",
|
| 81 |
-
None,
|
| 82 |
-
"' OR 1=1",
|
| 83 |
-
"client_id\x00",
|
| 84 |
-
" ",
|
| 85 |
-
"A" * 256,
|
| 86 |
-
"../../etc/passwd",
|
| 87 |
-
"null",
|
| 88 |
-
0,
|
| 89 |
-
],
|
| 90 |
-
"malicious_upload": [
|
| 91 |
-
"shell.php",
|
| 92 |
-
"exploit.sh",
|
| 93 |
-
"malware.exe",
|
| 94 |
-
"backdoor.py",
|
| 95 |
-
"../evil.php",
|
| 96 |
-
"normal.jpg.php",
|
| 97 |
-
"file\x00.txt",
|
| 98 |
-
"webshell.PHP",
|
| 99 |
-
"script.PhP",
|
| 100 |
-
".htaccess",
|
| 101 |
-
"cmd.bat",
|
| 102 |
-
],
|
| 103 |
-
"auth_bypass": [
|
| 104 |
-
"",
|
| 105 |
-
"admin:",
|
| 106 |
-
":password",
|
| 107 |
-
"admin'--",
|
| 108 |
-
"x' OR '1'='1",
|
| 109 |
-
"admin\x00:pass",
|
| 110 |
-
" : ",
|
| 111 |
-
None,
|
| 112 |
-
"admin:' OR '1'='1",
|
| 113 |
-
"' OR 1=1--:",
|
| 114 |
],
|
| 115 |
}
|
| 116 |
|
| 117 |
|
| 118 |
-
def get_payloads(attack_type: str, seed: int, count: int = 8) -> list:
|
| 119 |
"""
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
| 122 |
"""
|
| 123 |
rng = random.Random(seed)
|
| 124 |
-
base =
|
|
|
|
| 125 |
if not base:
|
| 126 |
return []
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
# Pad if we need more
|
| 133 |
-
while len(selected) + len(variants) < count and base:
|
| 134 |
-
extra = rng.choice(base)
|
| 135 |
-
variants.append(_compound_mutate(str(extra), rng))
|
| 136 |
-
|
| 137 |
-
# Include None payloads for the rate_bypass / auth_bypass tasks
|
| 138 |
-
if attack_type in ("rate_bypass", "auth_bypass"):
|
| 139 |
-
selected = [p for p in BASE_PAYLOADS[attack_type] if p is None] + selected
|
| 140 |
|
| 141 |
-
|
|
|
|
| 142 |
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
-
# ββ Evasion mutations βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 145 |
-
|
| 146 |
-
_OPS = [
|
| 147 |
-
lambda p, rng: urllib.parse.quote(p), # URL encode
|
| 148 |
-
lambda p, rng: urllib.parse.quote(urllib.parse.quote(p)), # Double encode
|
| 149 |
-
lambda p, rng: "".join(f"&#{ord(c)};" for c in p[:50]), # HTML entities
|
| 150 |
-
lambda p, rng: p.replace(" ", "/**/"), # SQL comment bypass
|
| 151 |
-
lambda p, rng: p.replace("'", "''"), # Quote doubling
|
| 152 |
-
lambda p, rng: p.replace("OR", "||").replace("AND", "&&"), # Operator swap
|
| 153 |
-
lambda p, rng: p.replace("../", "....//"), # Nested traversal
|
| 154 |
-
lambda p, rng: p.replace("/", "%2f"), # Slash encoding
|
| 155 |
-
lambda p, rng: p.replace("'", "\u02bc"), # Unicode apostrophe
|
| 156 |
-
lambda p, rng: p.replace("<", "\uff1c").replace(">", "\uff1e"), # Fullwidth angle brackets
|
| 157 |
-
lambda p, rng: p.upper(), # Uppercase
|
| 158 |
-
lambda p, rng: p + rng.choice(["", " ", " --", "\x00", "\t"]), # Suffix
|
| 159 |
-
]
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
result = payload
|
| 166 |
-
for
|
| 167 |
try:
|
| 168 |
-
result =
|
| 169 |
except Exception:
|
| 170 |
-
pass
|
| 171 |
return result
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Attack Payload Generator V2
|
| 3 |
+
Generates randomized attack payloads per episode using a seed.
|
| 4 |
+
V2 adds compound evasion mutations: URL encoding, HTML entities, Unicode confusables.
|
| 5 |
+
The seed changes every episode, so the agent CANNOT memorize specific strings.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
import random
|
| 8 |
import urllib.parse
|
| 9 |
|
| 10 |
+
|
| 11 |
+
# ---------------------------------------------------------------------------
|
| 12 |
+
# Base payload library β real-world CVE-proven attack strings
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
BASE_PAYLOADS: dict[str, list[str]] = {
|
| 15 |
+
"injection": [
|
| 16 |
"' OR '1'='1",
|
|
|
|
|
|
|
| 17 |
"'; DROP TABLE users;--",
|
| 18 |
+
"admin'--",
|
| 19 |
+
"1 UNION SELECT * FROM passwords--",
|
| 20 |
"' OR 1=1--",
|
|
|
|
|
|
|
|
|
|
| 21 |
"\" OR \"1\"=\"1",
|
| 22 |
+
"1; SELECT * FROM information_schema.tables",
|
| 23 |
+
"' OR 'x'='x'; --",
|
| 24 |
+
"admin\" OR \"1\"=\"1",
|
| 25 |
+
"1 OR 1=1",
|
| 26 |
],
|
| 27 |
+
"traversal": [
|
| 28 |
"../../etc/passwd",
|
| 29 |
"../../../root/.ssh/id_rsa",
|
| 30 |
"/etc/shadow",
|
| 31 |
"....//....//etc/passwd",
|
| 32 |
+
"%2e%2e%2f%2e%2e%2fetc%2fpasswd",
|
| 33 |
"..%2F..%2Fetc%2Fpasswd",
|
| 34 |
+
"/etc/passwd%00",
|
| 35 |
+
"..\\..\\windows\\system32\\drivers\\etc\\hosts",
|
| 36 |
+
"%252e%252e%252fetc%252fpasswd",
|
| 37 |
+
"....\\\\....\\\\windows\\\\system32",
|
| 38 |
+
],
|
| 39 |
+
"auth_bypass": [
|
| 40 |
+
'{"alg":"none"}.payload.',
|
| 41 |
+
'{"exp":1000}.payload.',
|
| 42 |
+
'eyJhbGciOiJub25lIn0.payload.',
|
| 43 |
+
'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZG1pbiIsImV4cCI6OTk5OTk5OTk5OX0.fake',
|
| 44 |
+
'null',
|
| 45 |
+
'""',
|
| 46 |
+
'undefined',
|
| 47 |
+
'{"alg":"RS256","n":"AAAA"}.payload.',
|
| 48 |
],
|
| 49 |
"xss": [
|
| 50 |
+
"<script>alert('xss')</script>",
|
| 51 |
"<img src=x onerror=alert(1)>",
|
| 52 |
"javascript:alert(1)",
|
| 53 |
"<svg onload=alert(1)>",
|
| 54 |
+
"'\"><script>alert(document.cookie)</script>",
|
| 55 |
+
"<iframe src='javascript:alert(1)'>",
|
| 56 |
+
"{{7*7}}", # Template injection
|
| 57 |
+
"${7*7}", # Expression injection
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
],
|
| 59 |
+
"path_write": [
|
| 60 |
+
"/etc/cron.d/backdoor",
|
| 61 |
+
"/tmp/../etc/passwd",
|
| 62 |
+
"../../../../var/www/html/shell.php",
|
| 63 |
+
"/proc/self/environ",
|
| 64 |
+
"../../../.ssh/authorized_keys",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
],
|
| 66 |
+
"weak_random": [
|
| 67 |
+
# Values generated by predictable seeded random
|
| 68 |
+
"0.13436424411240122",
|
| 69 |
+
"0.8474337369372327",
|
| 70 |
+
"0.763774618976614",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
],
|
| 72 |
}
|
| 73 |
|
| 74 |
|
| 75 |
+
def get_payloads(attack_type: str, seed: int, count: int = 8) -> list[str]:
|
| 76 |
"""
|
| 77 |
+
Returns a seeded-random subset of payloads plus V2 evasion variants.
|
| 78 |
+
|
| 79 |
+
Seed changes every episode β agent cannot memorize specific strings.
|
| 80 |
+
V2 mutations: URL encoding, HTML entities, Unicode confusables, whitespace injection.
|
| 81 |
"""
|
| 82 |
rng = random.Random(seed)
|
| 83 |
+
base = BASE_PAYLOADS.get(attack_type, [])
|
| 84 |
+
|
| 85 |
if not base:
|
| 86 |
return []
|
| 87 |
|
| 88 |
+
# Shuffle and take half the count as base payloads
|
| 89 |
+
half = max(1, count // 2)
|
| 90 |
+
selected = rng.sample(base, min(half, len(base)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
+
# Generate mutated variants
|
| 93 |
+
variants = [_mutate_v2(p, rng) for p in selected[:half]]
|
| 94 |
|
| 95 |
+
result = selected + variants
|
| 96 |
+
rng.shuffle(result)
|
| 97 |
+
return result[:count]
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
+
def _mutate_v2(payload: str, rng: random.Random) -> str:
|
| 101 |
+
"""
|
| 102 |
+
V2: Compound evasion mutations.
|
| 103 |
+
Multiple transformations applied in sequence for novel variants.
|
| 104 |
+
"""
|
| 105 |
+
mutations = [
|
| 106 |
+
# Whitespace injection
|
| 107 |
+
lambda p: p.replace(" ", " ") if " " in p else p + " ",
|
| 108 |
+
# Case variation
|
| 109 |
+
lambda p: p.upper() if rng.random() > 0.5 else p.swapcase(),
|
| 110 |
+
# SQL comment injection
|
| 111 |
+
lambda p: p.replace("--", "-- ") if "--" in p else p,
|
| 112 |
+
# URL encoding (single pass)
|
| 113 |
+
lambda p: urllib.parse.quote(p[:len(p)//2]) + p[len(p)//2:],
|
| 114 |
+
# Null byte (classic WAF bypass)
|
| 115 |
+
lambda p: p + "%00" if rng.random() > 0.5 else "%00" + p,
|
| 116 |
+
# Double-slash traversal variant
|
| 117 |
+
lambda p: p.replace("../", "..//") if "../" in p else p.replace("..\\", "..\\\\"),
|
| 118 |
+
# Trailing comment
|
| 119 |
+
lambda p: p + rng.choice(["", " --", " #", ";--"]),
|
| 120 |
+
# Unicode confusable for apostrophe
|
| 121 |
+
lambda p: p.replace("'", "\u02bc") if "'" in p else p,
|
| 122 |
+
]
|
| 123 |
|
| 124 |
+
# Apply 1-3 random mutations
|
| 125 |
+
n_mutations = rng.randint(1, 3)
|
| 126 |
+
chosen = rng.sample(mutations, min(n_mutations, len(mutations)))
|
| 127 |
result = payload
|
| 128 |
+
for mut in chosen:
|
| 129 |
try:
|
| 130 |
+
result = mut(result)
|
| 131 |
except Exception:
|
| 132 |
+
pass
|
| 133 |
return result
|
tasks/__init__.py
CHANGED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# tasks/__init__.py
|
|
|
|
|
|
tasks/easy/hash_generator.py
DELETED
|
@@ -1,38 +0,0 @@
|
|
| 1 |
-
"""tasks/easy/hash_generator.py β CWE-327: Use of Broken Cryptographic Algorithm"""
|
| 2 |
-
|
| 3 |
-
TASK = {
|
| 4 |
-
"id": "hash_generator",
|
| 5 |
-
"difficulty": "easy",
|
| 6 |
-
"cwe_targets": ["CWE-327", "CWE-328"],
|
| 7 |
-
"problem_statement": """
|
| 8 |
-
Write a function `generate_hash(data: str) -> str` that:
|
| 9 |
-
1. Returns a secure hex-digest hash of the input string.
|
| 10 |
-
2. MUST use SHA-256 or stronger (hashlib.sha256 or hashlib.sha3_256).
|
| 11 |
-
3. MUST NOT use MD5 or SHA1 β these are broken algorithms.
|
| 12 |
-
4. MUST NOT invoke shell commands to compute the hash.
|
| 13 |
-
5. Must handle empty string and very long inputs (10000+ chars).
|
| 14 |
-
6. Returns a lowercase hex string.
|
| 15 |
-
|
| 16 |
-
The function name must be exactly: generate_hash
|
| 17 |
-
|
| 18 |
-
CORRECT: hashlib.sha256(data.encode()).hexdigest()
|
| 19 |
-
WRONG: hashlib.md5(), hashlib.sha1(), subprocess.run(['sha256sum', ...])
|
| 20 |
-
""",
|
| 21 |
-
"starter_code": """
|
| 22 |
-
import hashlib
|
| 23 |
-
|
| 24 |
-
def generate_hash(data: str) -> str:
|
| 25 |
-
# TODO: implement secure hashing
|
| 26 |
-
# Use SHA-256 or stronger - never MD5/SHA1
|
| 27 |
-
# Never use shell commands
|
| 28 |
-
pass
|
| 29 |
-
""",
|
| 30 |
-
"test_cases": [
|
| 31 |
-
{"input": "hello", "expected": "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824", "fn": "generate_hash"},
|
| 32 |
-
{"input": "", "expected": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", "fn": "generate_hash"},
|
| 33 |
-
{"input": "test", "expected": "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08", "fn": "generate_hash"},
|
| 34 |
-
{"input": "a" * 10000, "expected": None, "fn": "generate_hash"}, # Just check it runs
|
| 35 |
-
],
|
| 36 |
-
"naive_baseline": {"time_ms": 0.5, "memory_kb": 20},
|
| 37 |
-
"perf_input": "benchmark_data_string_for_hashing_test",
|
| 38 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/easy/input_sanitizer.py
CHANGED
|
@@ -1,45 +1,83 @@
|
|
| 1 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
TASK = {
|
| 4 |
-
"id": "
|
| 5 |
"difficulty": "easy",
|
| 6 |
-
"cwe_targets": ["CWE-20", "CWE-116"
|
| 7 |
-
"
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
"starter_code":
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
def sanitize_input(user_input: str) -> str:
|
| 28 |
-
# TODO: implement proper input sanitisation
|
| 29 |
-
# Must escape: < > & " '
|
| 30 |
-
# Must strip whitespace
|
| 31 |
-
pass
|
| 32 |
-
""",
|
| 33 |
"test_cases": [
|
| 34 |
-
{"input": "<script>alert(1)</script>", "
|
| 35 |
-
{"
|
| 36 |
-
{"
|
| 37 |
-
{"input": "
|
| 38 |
-
{"input": "
|
| 39 |
-
{"
|
| 40 |
-
{"
|
| 41 |
-
{"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
],
|
| 43 |
-
|
| 44 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
}
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv Task: Input Sanitizer
|
| 3 |
+
Difficulty: Easy
|
| 4 |
+
CWE Targets: CWE-20 (Improper Input Validation), CWE-116 (Improper Encoding/Escaping)
|
| 5 |
+
"""
|
| 6 |
|
| 7 |
TASK = {
|
| 8 |
+
"id": "easy_input_sanitizer",
|
| 9 |
"difficulty": "easy",
|
| 10 |
+
"cwe_targets": ["CWE-20", "CWE-116"],
|
| 11 |
+
"component_name": "InputSanitizer",
|
| 12 |
+
"filename": "src/utils/input_sanitizer.py",
|
| 13 |
+
"problem_statement": """Write two Python functions:
|
| 14 |
+
|
| 15 |
+
1. `sanitize_html(user_input: str) -> str`
|
| 16 |
+
Escapes HTML special characters to prevent XSS.
|
| 17 |
+
Must escape: < > & " ' (to < > & " ')
|
| 18 |
+
Must handle None input (return empty string).
|
| 19 |
+
|
| 20 |
+
2. `sanitize_filename(filename: str) -> str`
|
| 21 |
+
Makes a filename safe to use on the filesystem.
|
| 22 |
+
Must: strip path separators (/ \\ ..), allow only alphanumeric, dots, dashes, underscores.
|
| 23 |
+
Must reject empty filenames (return 'unnamed').
|
| 24 |
+
Must truncate to 255 characters.
|
| 25 |
+
|
| 26 |
+
Do NOT use eval(), exec(), or os.system(). Do NOT trust any input.""",
|
| 27 |
+
|
| 28 |
+
"starter_code": '''def sanitize_html(user_input: str) -> str:
|
| 29 |
+
# TODO: Escape HTML properly
|
| 30 |
+
return user_input
|
| 31 |
+
|
| 32 |
+
def sanitize_filename(filename: str) -> str:
|
| 33 |
+
# TODO: Make filename safe
|
| 34 |
+
return filename
|
| 35 |
+
''',
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
"test_cases": [
|
| 38 |
+
{"fn": "sanitize_html", "input": ["<script>alert(1)</script>"], "expected_not_contains": ["<script>", "</script>"], "description": "XSS script tag escaped"},
|
| 39 |
+
{"fn": "sanitize_html", "input": ['<img src=x onerror=alert("xss")>'], "expected_not_contains": ["<img "], "description": "XSS img tag escaped"},
|
| 40 |
+
{"fn": "sanitize_html", "input": [None], "expected": "", "description": "None input returns empty string"},
|
| 41 |
+
{"fn": "sanitize_html", "input": ["Hello World"], "expected": "Hello World", "description": "Safe text unchanged"},
|
| 42 |
+
{"fn": "sanitize_filename", "input": ["../../etc/passwd"], "expected_not_contains": ["../"], "description": "Path traversal in filename blocked"},
|
| 43 |
+
{"fn": "sanitize_filename", "input": ["my_file.txt"], "expected": "my_file.txt", "description": "Valid filename passes"},
|
| 44 |
+
{"fn": "sanitize_filename", "input": [""], "expected": "unnamed", "description": "Empty filename becomes 'unnamed'"},
|
| 45 |
+
{"fn": "sanitize_filename", "input": ["a" * 300], "expected_max_len": 255, "description": "Long filename truncated"},
|
| 46 |
+
],
|
| 47 |
+
|
| 48 |
+
"attack_type": "xss",
|
| 49 |
+
|
| 50 |
+
"security_checks": [
|
| 51 |
+
{"type": "no_eval", "forbidden": ["eval(", "exec("]},
|
| 52 |
],
|
| 53 |
+
|
| 54 |
+
"naive_code": '''def sanitize_html(user_input):
|
| 55 |
+
return user_input or ""
|
| 56 |
+
def sanitize_filename(filename):
|
| 57 |
+
return filename or "unnamed"
|
| 58 |
+
''',
|
| 59 |
+
|
| 60 |
+
"optimal_code": '''import re
|
| 61 |
+
import html
|
| 62 |
+
|
| 63 |
+
def sanitize_html(user_input: str) -> str:
|
| 64 |
+
"""Escapes HTML special characters to prevent XSS."""
|
| 65 |
+
if user_input is None:
|
| 66 |
+
return ""
|
| 67 |
+
return html.escape(str(user_input), quote=True)
|
| 68 |
+
|
| 69 |
+
def sanitize_filename(filename: str) -> str:
|
| 70 |
+
"""Returns a filesystem-safe filename."""
|
| 71 |
+
if not filename:
|
| 72 |
+
return "unnamed"
|
| 73 |
+
# Remove path separators and traversal sequences
|
| 74 |
+
filename = re.sub(r'[\\\\/]', '_', filename)
|
| 75 |
+
filename = filename.replace('..', '')
|
| 76 |
+
# Keep only safe characters
|
| 77 |
+
filename = re.sub(r'[^a-zA-Z0-9._\\-]', '_', filename)
|
| 78 |
+
filename = filename.strip('._')
|
| 79 |
+
if not filename:
|
| 80 |
+
return "unnamed"
|
| 81 |
+
return filename[:255]
|
| 82 |
+
''',
|
| 83 |
}
|
tasks/easy/password_validator.py
CHANGED
|
@@ -1,43 +1,131 @@
|
|
| 1 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
TASK = {
|
| 4 |
-
"id": "
|
| 5 |
"difficulty": "easy",
|
| 6 |
"cwe_targets": ["CWE-916", "CWE-521"],
|
| 7 |
-
"
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
- At least
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def validate_password(password: str) -> bool:
|
| 26 |
-
# TODO:
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
"test_cases": [
|
| 32 |
-
{
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
{
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
],
|
| 41 |
-
|
| 42 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
}
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv Task: Password Validator
|
| 3 |
+
Difficulty: Easy
|
| 4 |
+
CWE Targets: CWE-916 (Weak Password Hashing), CWE-521 (Weak Password Requirements)
|
| 5 |
+
|
| 6 |
+
Agent must write a function that:
|
| 7 |
+
1. Validates password strength (length, complexity)
|
| 8 |
+
2. Hashes using bcrypt β NOT MD5/SHA1/SHA256
|
| 9 |
+
3. Returns False for weak passwords
|
| 10 |
+
"""
|
| 11 |
|
| 12 |
TASK = {
|
| 13 |
+
"id": "easy_password_validator",
|
| 14 |
"difficulty": "easy",
|
| 15 |
"cwe_targets": ["CWE-916", "CWE-521"],
|
| 16 |
+
"component_name": "PasswordValidator",
|
| 17 |
+
"filename": "src/auth/password_validator.py",
|
| 18 |
+
"problem_statement": """Write two Python functions:
|
| 19 |
+
|
| 20 |
+
1. `validate_password(password: str) -> bool`
|
| 21 |
+
Returns True if the password meets ALL requirements:
|
| 22 |
+
- At least 8 characters long
|
| 23 |
+
- Contains at least one uppercase letter
|
| 24 |
+
- Contains at least one lowercase letter
|
| 25 |
+
- Contains at least one digit
|
| 26 |
+
- Contains at least one special character from: !@#$%^&*
|
| 27 |
+
|
| 28 |
+
2. `hash_password(password: str) -> str`
|
| 29 |
+
Hashes a password securely and returns the hash string.
|
| 30 |
+
REQUIREMENTS:
|
| 31 |
+
- MUST use bcrypt (import bcrypt) β NOT hashlib, NOT md5, NOT sha256
|
| 32 |
+
- Must use a salt (bcrypt handles this automatically)
|
| 33 |
+
- Must return a string (decode bytes if needed)
|
| 34 |
+
|
| 35 |
+
Do NOT return the plaintext password. Do NOT use: hashlib, md5, sha1, sha256.""",
|
| 36 |
+
|
| 37 |
+
"starter_code": '''import hashlib
|
| 38 |
|
| 39 |
def validate_password(password: str) -> bool:
|
| 40 |
+
# TODO: Implement proper validation
|
| 41 |
+
return len(password) >= 6
|
| 42 |
+
|
| 43 |
+
def hash_password(password: str) -> str:
|
| 44 |
+
# TODO: Use secure hashing - NOT MD5!
|
| 45 |
+
return hashlib.md5(password.encode()).hexdigest()
|
| 46 |
+
''',
|
| 47 |
+
|
| 48 |
"test_cases": [
|
| 49 |
+
{
|
| 50 |
+
"fn": "validate_password",
|
| 51 |
+
"input": ["StrongP@ss1"],
|
| 52 |
+
"expected": True,
|
| 53 |
+
"description": "Valid strong password"
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"fn": "validate_password",
|
| 57 |
+
"input": ["weakpass"],
|
| 58 |
+
"expected": False,
|
| 59 |
+
"description": "No uppercase, no digit, no special char"
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"fn": "validate_password",
|
| 63 |
+
"input": ["Short1!"],
|
| 64 |
+
"expected": False,
|
| 65 |
+
"description": "Too short (7 chars)"
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"fn": "validate_password",
|
| 69 |
+
"input": ["NOLOWERCASE1!"],
|
| 70 |
+
"expected": False,
|
| 71 |
+
"description": "No lowercase letter"
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"fn": "validate_password",
|
| 75 |
+
"input": ["NoDigit@Pass"],
|
| 76 |
+
"expected": False,
|
| 77 |
+
"description": "No digit"
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"fn": "validate_password",
|
| 81 |
+
"input": ["ValidPass1!"],
|
| 82 |
+
"expected": True,
|
| 83 |
+
"description": "Another valid password"
|
| 84 |
+
},
|
| 85 |
],
|
| 86 |
+
|
| 87 |
+
"attack_type": "none", # No dynamic attacks for easy tasks
|
| 88 |
+
|
| 89 |
+
"security_checks": [
|
| 90 |
+
{"type": "no_weak_hash", "forbidden": ["hashlib.md5", "hashlib.sha1", "hashlib.sha256", "md5(", "sha1("]},
|
| 91 |
+
{"type": "uses_bcrypt", "required_import": "bcrypt"},
|
| 92 |
+
],
|
| 93 |
+
|
| 94 |
+
"naive_code": '''import hashlib
|
| 95 |
+
def validate_password(password: str) -> bool:
|
| 96 |
+
return len(password) >= 6
|
| 97 |
+
def hash_password(password: str) -> str:
|
| 98 |
+
return hashlib.md5(password.encode()).hexdigest()
|
| 99 |
+
''',
|
| 100 |
+
|
| 101 |
+
"optimal_code": '''import re
|
| 102 |
+
|
| 103 |
+
try:
|
| 104 |
+
import bcrypt
|
| 105 |
+
_HAS_BCRYPT = True
|
| 106 |
+
except ImportError:
|
| 107 |
+
_HAS_BCRYPT = False
|
| 108 |
+
|
| 109 |
+
def validate_password(password: str) -> bool:
|
| 110 |
+
"""Validates password against security policy."""
|
| 111 |
+
if not password or len(password) < 8:
|
| 112 |
+
return False
|
| 113 |
+
if not re.search(r'[A-Z]', password):
|
| 114 |
+
return False
|
| 115 |
+
if not re.search(r'[a-z]', password):
|
| 116 |
+
return False
|
| 117 |
+
if not re.search(r'[0-9]', password):
|
| 118 |
+
return False
|
| 119 |
+
if not re.search(r'[!@#$%^&*]', password):
|
| 120 |
+
return False
|
| 121 |
+
return True
|
| 122 |
+
|
| 123 |
+
def hash_password(password: str) -> str:
|
| 124 |
+
"""Hashes password with bcrypt (auto-salted, work factor 12)."""
|
| 125 |
+
if not _HAS_BCRYPT:
|
| 126 |
+
raise ImportError("bcrypt is required: pip install bcrypt")
|
| 127 |
+
salt = bcrypt.gensalt(rounds=12)
|
| 128 |
+
hashed = bcrypt.hashpw(password.encode("utf-8"), salt)
|
| 129 |
+
return hashed.decode("utf-8")
|
| 130 |
+
''',
|
| 131 |
}
|
tasks/easy/token_generator.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv Task: Secure Token Generator
|
| 3 |
+
Difficulty: Easy
|
| 4 |
+
CWE Targets: CWE-338 (Use of Cryptographically Weak PRNG), CWE-330 (Insufficient Randomness)
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
TASK = {
|
| 8 |
+
"id": "easy_token_generator",
|
| 9 |
+
"difficulty": "easy",
|
| 10 |
+
"cwe_targets": ["CWE-338", "CWE-330"],
|
| 11 |
+
"component_name": "TokenGenerator",
|
| 12 |
+
"filename": "src/auth/token_generator.py",
|
| 13 |
+
"problem_statement": """Write two Python functions:
|
| 14 |
+
|
| 15 |
+
1. `generate_token(length: int = 32) -> str`
|
| 16 |
+
Generates a cryptographically secure random token.
|
| 17 |
+
- MUST use secrets module (NOT random, NOT os.urandom directly, NOT time-based seeds)
|
| 18 |
+
- Token must be URL-safe (use secrets.token_urlsafe or secrets.token_hex)
|
| 19 |
+
- Length must be respected (the output must be at least `length` characters)
|
| 20 |
+
- Must handle length < 1 by using 32 as default
|
| 21 |
+
|
| 22 |
+
2. `generate_session_id() -> str`
|
| 23 |
+
Generates a secure session ID suitable for HTTP cookies.
|
| 24 |
+
- Must be at least 32 characters long
|
| 25 |
+
- Must use secrets module
|
| 26 |
+
- Must be hex or URL-safe base64 format
|
| 27 |
+
|
| 28 |
+
Do NOT use: random.random(), random.randint(), math.random, time.time() for randomness.""",
|
| 29 |
+
|
| 30 |
+
"starter_code": '''import random
|
| 31 |
+
import time
|
| 32 |
+
|
| 33 |
+
def generate_token(length: int = 32) -> str:
|
| 34 |
+
# TODO: Use cryptographically secure randomness
|
| 35 |
+
chars = "abcdefghijklmnopqrstuvwxyz0123456789"
|
| 36 |
+
random.seed(time.time()) # INSECURE: time-based seed
|
| 37 |
+
return "".join(random.choice(chars) for _ in range(length))
|
| 38 |
+
|
| 39 |
+
def generate_session_id() -> str:
|
| 40 |
+
# TODO: Secure session ID
|
| 41 |
+
return str(int(time.time() * 1000)) # INSECURE: predictable
|
| 42 |
+
''',
|
| 43 |
+
|
| 44 |
+
"test_cases": [
|
| 45 |
+
{"fn": "generate_token", "input": [32], "expected_min_len": 32, "description": "Token is at least 32 chars"},
|
| 46 |
+
{"fn": "generate_token", "input": [64], "expected_min_len": 64, "description": "Token is at least 64 chars"},
|
| 47 |
+
{"fn": "generate_token", "input": [0], "expected_min_len": 32, "description": "Length 0 defaults to 32"},
|
| 48 |
+
{"fn": "generate_token", "input": [-1], "expected_min_len": 32, "description": "Negative length defaults to 32"},
|
| 49 |
+
{"fn": "generate_session_id", "input": [], "expected_min_len": 32, "description": "Session ID at least 32 chars"},
|
| 50 |
+
{"fn": "generate_token", "input": [32], "unique": True, "description": "Two calls produce different tokens"},
|
| 51 |
+
],
|
| 52 |
+
|
| 53 |
+
"attack_type": "weak_random",
|
| 54 |
+
|
| 55 |
+
"security_checks": [
|
| 56 |
+
{"type": "no_weak_random", "forbidden": ["random.random(", "random.randint(", "random.seed("]},
|
| 57 |
+
{"type": "uses_secrets", "required_import": "secrets"},
|
| 58 |
+
],
|
| 59 |
+
|
| 60 |
+
"naive_code": '''import random
|
| 61 |
+
def generate_token(length: int = 32) -> str:
|
| 62 |
+
chars = "abcdefghijklmnopqrstuvwxyz0123456789"
|
| 63 |
+
return "".join(random.choice(chars) for _ in range(max(length, 32)))
|
| 64 |
+
def generate_session_id() -> str:
|
| 65 |
+
return str(random.getrandbits(128))
|
| 66 |
+
''',
|
| 67 |
+
|
| 68 |
+
"optimal_code": '''import secrets
|
| 69 |
+
|
| 70 |
+
def generate_token(length: int = 32) -> str:
|
| 71 |
+
"""Generates a cryptographically secure URL-safe random token."""
|
| 72 |
+
if length < 1:
|
| 73 |
+
length = 32
|
| 74 |
+
# token_urlsafe(n) produces ceil(n * 4/3) chars, so nbytes = length * 3 // 4
|
| 75 |
+
return secrets.token_urlsafe(max(length, 32))[:max(length, 32)]
|
| 76 |
+
|
| 77 |
+
def generate_session_id() -> str:
|
| 78 |
+
"""Generates a secure 64-char hex session ID."""
|
| 79 |
+
return secrets.token_hex(32) # 32 bytes = 64 hex chars
|
| 80 |
+
''',
|
| 81 |
+
}
|
tasks/hard/auth_middleware.py
CHANGED
|
@@ -1,57 +1,133 @@
|
|
| 1 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
TASK = {
|
| 4 |
-
"id": "
|
| 5 |
"difficulty": "hard",
|
| 6 |
-
"cwe_targets": ["CWE-287", "CWE-
|
| 7 |
-
"
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
-
|
| 15 |
-
-
|
| 16 |
-
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
""",
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
# 3. Use hmac.compare_digest for constant-time comparison
|
| 42 |
-
# 4. Never use shell commands
|
| 43 |
-
pass
|
| 44 |
-
""",
|
| 45 |
"test_cases": [
|
| 46 |
-
{"
|
| 47 |
-
{"
|
| 48 |
-
{"input": "",
|
| 49 |
-
{"
|
| 50 |
-
{"input": ":
|
| 51 |
-
{"input": "
|
| 52 |
-
{"input": None,
|
| 53 |
-
{"
|
| 54 |
],
|
| 55 |
-
|
| 56 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
}
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv Task: Auth Middleware
|
| 3 |
+
Difficulty: Hard
|
| 4 |
+
CWE Targets: CWE-287 (Improper Authentication), CWE-352 (CSRF)
|
| 5 |
+
"""
|
| 6 |
|
| 7 |
TASK = {
|
| 8 |
+
"id": "hard_auth_middleware",
|
| 9 |
"difficulty": "hard",
|
| 10 |
+
"cwe_targets": ["CWE-287", "CWE-352"],
|
| 11 |
+
"component_name": "AuthMiddleware",
|
| 12 |
+
"filename": "src/middleware/auth_middleware.py",
|
| 13 |
+
"problem_statement": """Write two Python functions for an authentication middleware:
|
| 14 |
+
|
| 15 |
+
1. `verify_csrf_token(request_token: str, session_token: str) -> bool`
|
| 16 |
+
CSRF protection β verifies a submitted token matches the session token.
|
| 17 |
+
- MUST use `hmac.compare_digest()` for timing-safe comparison
|
| 18 |
+
- NEVER use == operator (timing attack vulnerability)
|
| 19 |
+
- Return False if either token is None/empty
|
| 20 |
+
- Return False if tokens don't match
|
| 21 |
+
|
| 22 |
+
2. `authenticate_request(auth_header: str, valid_tokens: dict) -> dict | None`
|
| 23 |
+
Validates a Bearer token from the Authorization header.
|
| 24 |
+
- auth_header format: "Bearer <token>"
|
| 25 |
+
- valid_tokens: dict mapping token β user_info dict
|
| 26 |
+
- Return the user_info dict if token is valid
|
| 27 |
+
- Return None for: missing header, wrong format, unknown token, empty token
|
| 28 |
+
- Must be timing-safe: use constant-time lookup (hmac.compare_digest per token)
|
| 29 |
+
|
| 30 |
+
NEVER use == for secret comparison.
|
| 31 |
+
NEVER reveal whether a token exists vs is wrong in error messages.""",
|
| 32 |
+
|
| 33 |
+
"starter_code": '''def verify_csrf_token(request_token: str, session_token: str) -> bool:
|
| 34 |
+
# TODO: INSECURE - timing attack vulnerability
|
| 35 |
+
return request_token == session_token
|
| 36 |
+
|
| 37 |
+
def authenticate_request(auth_header: str, valid_tokens: dict):
|
| 38 |
+
# TODO: Implement properly
|
| 39 |
+
if not auth_header:
|
| 40 |
+
return None
|
| 41 |
+
token = auth_header.replace("Bearer ", "")
|
| 42 |
+
return valid_tokens.get(token)
|
| 43 |
+
''',
|
| 44 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
"test_cases": [
|
| 46 |
+
{"fn": "verify_csrf_token", "input": ["abc123", "abc123"], "expected": True, "description": "Matching tokens return True"},
|
| 47 |
+
{"fn": "verify_csrf_token", "input": ["abc123", "xyz999"], "expected": False, "description": "Non-matching tokens return False"},
|
| 48 |
+
{"fn": "verify_csrf_token", "input": [None, "abc123"], "expected": False, "description": "None token returns False"},
|
| 49 |
+
{"fn": "verify_csrf_token", "input": ["", "abc123"], "expected": False, "description": "Empty token returns False"},
|
| 50 |
+
{"fn": "authenticate_request", "input": ["Bearer valid_token_123", {"valid_token_123": {"user": "alice"}}], "expected": {"user": "alice"}, "description": "Valid bearer token returns user info"},
|
| 51 |
+
{"fn": "authenticate_request", "input": ["Bearer wrong_token", {"valid_token": {"user": "alice"}}], "expected": None, "description": "Wrong token returns None"},
|
| 52 |
+
{"fn": "authenticate_request", "input": [None, {}], "expected": None, "description": "None header returns None"},
|
| 53 |
+
{"fn": "authenticate_request", "input": ["Basic abc123", {}], "expected": None, "description": "Non-Bearer scheme returns None"},
|
| 54 |
],
|
| 55 |
+
|
| 56 |
+
"attack_type": "auth_bypass",
|
| 57 |
+
|
| 58 |
+
"attack_config": {
|
| 59 |
+
"fn_name": "verify_csrf_token",
|
| 60 |
+
"fn_args_template": ["{payload}", "legitimate_csrf_token_xyz"],
|
| 61 |
+
"check": lambda result, payload: result is False,
|
| 62 |
+
},
|
| 63 |
+
|
| 64 |
+
"security_checks": [
|
| 65 |
+
{"type": "uses_hmac_compare_digest", "required": ["hmac.compare_digest"]},
|
| 66 |
+
{"type": "no_eq_comparison_secrets", "warn_on": ["== session_token", "== secret"]},
|
| 67 |
+
],
|
| 68 |
+
|
| 69 |
+
"naive_code": '''def verify_csrf_token(request_token, session_token):
|
| 70 |
+
return request_token == session_token
|
| 71 |
+
def authenticate_request(auth_header, valid_tokens):
|
| 72 |
+
if not auth_header: return None
|
| 73 |
+
token = auth_header.replace("Bearer ", "")
|
| 74 |
+
return valid_tokens.get(token)
|
| 75 |
+
''',
|
| 76 |
+
|
| 77 |
+
"optimal_code": '''import hmac
|
| 78 |
+
|
| 79 |
+
def verify_csrf_token(request_token: str, session_token: str) -> bool:
|
| 80 |
+
"""Timing-safe CSRF token comparison.
|
| 81 |
+
|
| 82 |
+
Uses hmac.compare_digest() to prevent timing attacks where an attacker
|
| 83 |
+
could deduce token length/prefix by measuring response time differences.
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
request_token: Token submitted with the request
|
| 87 |
+
session_token: Token stored in the session
|
| 88 |
+
|
| 89 |
+
Returns:
|
| 90 |
+
True only if tokens match; False for any failure
|
| 91 |
+
"""
|
| 92 |
+
if not request_token or not session_token:
|
| 93 |
+
return False
|
| 94 |
+
# hmac.compare_digest prevents timing attacks
|
| 95 |
+
return hmac.compare_digest(
|
| 96 |
+
request_token.encode("utf-8"),
|
| 97 |
+
session_token.encode("utf-8"),
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def authenticate_request(auth_header: str, valid_tokens: dict) -> dict | None:
|
| 102 |
+
"""Validates a Bearer token from the Authorization header.
|
| 103 |
+
|
| 104 |
+
Timing-safe: iterates all tokens with compare_digest to prevent
|
| 105 |
+
enumeration attacks based on short-circuit evaluation.
|
| 106 |
+
|
| 107 |
+
Args:
|
| 108 |
+
auth_header: Value of the Authorization header
|
| 109 |
+
valid_tokens: Mapping of token string -> user info dict
|
| 110 |
+
|
| 111 |
+
Returns:
|
| 112 |
+
User info dict if authenticated, None otherwise
|
| 113 |
+
"""
|
| 114 |
+
if not auth_header:
|
| 115 |
+
return None
|
| 116 |
+
|
| 117 |
+
parts = auth_header.split(" ", 1)
|
| 118 |
+
if len(parts) != 2 or parts[0].lower() != "bearer":
|
| 119 |
+
return None
|
| 120 |
+
|
| 121 |
+
provided_token = parts[1].strip()
|
| 122 |
+
if not provided_token:
|
| 123 |
+
return None
|
| 124 |
+
|
| 125 |
+
# Timing-safe lookup: always compare all tokens (no short-circuit)
|
| 126 |
+
matched_user = None
|
| 127 |
+
provided_bytes = provided_token.encode("utf-8")
|
| 128 |
+
for stored_token, user_info in valid_tokens.items():
|
| 129 |
+
if hmac.compare_digest(provided_bytes, stored_token.encode("utf-8")):
|
| 130 |
+
matched_user = user_info
|
| 131 |
+
return matched_user
|
| 132 |
+
''',
|
| 133 |
}
|
tasks/hard/file_upload_handler.py
CHANGED
|
@@ -1,46 +1,153 @@
|
|
| 1 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
TASK = {
|
| 4 |
-
"id": "
|
| 5 |
"difficulty": "hard",
|
| 6 |
-
"cwe_targets": ["CWE-
|
| 7 |
-
"
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
def handle_upload(filename: str, content: bytes) -> str:
|
| 34 |
-
# TODO: implement secure file upload handler
|
| 35 |
-
# 1. Validate filename (no traversal, no null bytes)
|
| 36 |
-
# 2. Check extension (normalise to lowercase, reject dangerous)
|
| 37 |
-
# 3. Save to UPLOAD_DIR and return path
|
| 38 |
-
pass
|
| 39 |
-
""",
|
| 40 |
"test_cases": [
|
| 41 |
-
{"input": ["report.pdf",
|
| 42 |
-
{"input": ["
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
],
|
| 44 |
-
|
| 45 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
}
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv Task: File Upload Handler
|
| 3 |
+
Difficulty: Hard
|
| 4 |
+
CWE Targets: CWE-22 (Path Traversal), CWE-434 (Unrestricted File Upload)
|
| 5 |
+
"""
|
| 6 |
|
| 7 |
TASK = {
|
| 8 |
+
"id": "hard_file_upload_handler",
|
| 9 |
"difficulty": "hard",
|
| 10 |
+
"cwe_targets": ["CWE-22", "CWE-434"],
|
| 11 |
+
"component_name": "FileUploadHandler",
|
| 12 |
+
"filename": "src/files/file_upload_handler.py",
|
| 13 |
+
"problem_statement": """Write a Python function:
|
| 14 |
+
|
| 15 |
+
`validate_upload(filename: str, content_type: str, file_size: int, upload_dir: str) -> dict`
|
| 16 |
+
|
| 17 |
+
Validates and prepares a file upload for safe storage.
|
| 18 |
+
|
| 19 |
+
REQUIREMENTS β ALL must pass:
|
| 20 |
+
|
| 21 |
+
1. FILENAME SAFETY:
|
| 22 |
+
- Strip all path components (only keep the basename)
|
| 23 |
+
- Reject filenames with path traversal (../../)
|
| 24 |
+
- Reject filenames with null bytes (%00, \\x00)
|
| 25 |
+
- Max filename length: 100 characters
|
| 26 |
+
- Only allow: alphanumeric, dots, dashes, underscores
|
| 27 |
+
|
| 28 |
+
2. EXTENSION VALIDATION:
|
| 29 |
+
- Allowed extensions: .jpg, .jpeg, .png, .gif, .pdf, .txt, .csv
|
| 30 |
+
- Block ALL executable extensions: .php, .py, .sh, .exe, .js, .html, .htm, .asp, .jsp
|
| 31 |
+
- Extension check must be case-insensitive (.PHP is same as .php)
|
| 32 |
+
|
| 33 |
+
3. MIME TYPE VALIDATION:
|
| 34 |
+
- Must cross-check content_type against allowed types:
|
| 35 |
+
image/jpeg, image/png, image/gif, application/pdf, text/plain, text/csv
|
| 36 |
+
- Reject mismatches between extension and content_type (e.g., .jpg with text/html)
|
| 37 |
+
|
| 38 |
+
4. FILE SIZE LIMITS:
|
| 39 |
+
- Max size: 10MB (10 * 1024 * 1024 bytes)
|
| 40 |
+
- Reject files over the limit
|
| 41 |
|
| 42 |
+
5. SAFE PATH GENERATION:
|
| 43 |
+
- Returns: {"ok": True, "safe_path": "<upload_dir>/<uuid>_<safe_filename>", "safe_filename": str}
|
| 44 |
+
- Uses uuid4 prefix to prevent filename collisions
|
| 45 |
+
- On rejection: {"ok": False, "reason": "<specific reason>"}
|
| 46 |
+
|
| 47 |
+
NEVER trust content_type alone β always validate extension too.""",
|
| 48 |
+
|
| 49 |
+
"starter_code": '''import os
|
| 50 |
+
|
| 51 |
+
def validate_upload(filename: str, content_type: str, file_size: int, upload_dir: str) -> dict:
|
| 52 |
+
# TODO: Implement all security checks
|
| 53 |
+
safe_path = os.path.join(upload_dir, filename)
|
| 54 |
+
return {"ok": True, "safe_path": safe_path}
|
| 55 |
+
''',
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
"test_cases": [
|
| 58 |
+
{"fn": "validate_upload", "input": ["report.pdf", "application/pdf", 1024, "/uploads"], "expected_ok": True, "description": "Valid PDF accepted"},
|
| 59 |
+
{"fn": "validate_upload", "input": ["shell.php", "image/jpeg", 1024, "/uploads"], "expected_ok": False, "description": "PHP file rejected"},
|
| 60 |
+
{"fn": "validate_upload", "input": ["../../../etc/passwd", "text/plain", 100, "/uploads"], "expected_ok": False, "description": "Traversal filename rejected"},
|
| 61 |
+
{"fn": "validate_upload", "input": ["image.jpg", "text/html", 1024, "/uploads"], "expected_ok": False, "description": "MIME mismatch rejected"},
|
| 62 |
+
{"fn": "validate_upload", "input": ["huge.pdf", "application/pdf", 11 * 1024 * 1024, "/uploads"], "expected_ok": False, "description": "File too large rejected"},
|
| 63 |
+
{"fn": "validate_upload", "input": ["file.PHP", "image/jpeg", 1024, "/uploads"], "expected_ok": False, "description": "PHP with uppercase extension rejected"},
|
| 64 |
+
{"fn": "validate_upload", "input": ["photo.jpg", "image/jpeg", 5000, "/uploads"], "expected_ok": True, "description": "Valid JPEG accepted"},
|
| 65 |
+
],
|
| 66 |
+
|
| 67 |
+
"attack_type": "traversal",
|
| 68 |
+
|
| 69 |
+
"attack_config": {
|
| 70 |
+
"fn_name": "validate_upload",
|
| 71 |
+
"fn_args_template": ["{payload}", "image/jpeg", 1000, "/uploads"],
|
| 72 |
+
"check": lambda result, payload: (
|
| 73 |
+
isinstance(result, dict) and
|
| 74 |
+
(result.get("ok") is False or
|
| 75 |
+
(".." not in str(result.get("safe_path", "")) and
|
| 76 |
+
"/etc" not in str(result.get("safe_path", ""))))
|
| 77 |
+
),
|
| 78 |
+
},
|
| 79 |
+
|
| 80 |
+
"security_checks": [
|
| 81 |
+
{"type": "no_direct_join", "warn_on": ["os.path.join(upload_dir, filename)"]},
|
| 82 |
+
{"type": "uses_uuid", "recommended": ["uuid"]},
|
| 83 |
],
|
| 84 |
+
|
| 85 |
+
"naive_code": '''import os
|
| 86 |
+
def validate_upload(filename, content_type, file_size, upload_dir):
|
| 87 |
+
return {"ok": True, "safe_path": os.path.join(upload_dir, filename)}
|
| 88 |
+
''',
|
| 89 |
+
|
| 90 |
+
"optimal_code": '''import os
|
| 91 |
+
import re
|
| 92 |
+
import uuid
|
| 93 |
+
from pathlib import Path
|
| 94 |
+
|
| 95 |
+
ALLOWED_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".pdf", ".txt", ".csv"}
|
| 96 |
+
BLOCKED_EXTENSIONS = {".php", ".py", ".sh", ".exe", ".js", ".html", ".htm", ".asp", ".jsp", ".rb", ".pl"}
|
| 97 |
+
ALLOWED_MIME_TYPES = {
|
| 98 |
+
".jpg": {"image/jpeg"}, ".jpeg": {"image/jpeg"},
|
| 99 |
+
".png": {"image/png"}, ".gif": {"image/gif"},
|
| 100 |
+
".pdf": {"application/pdf"},
|
| 101 |
+
".txt": {"text/plain"}, ".csv": {"text/csv", "text/plain"},
|
| 102 |
+
}
|
| 103 |
+
MAX_SIZE = 10 * 1024 * 1024 # 10MB
|
| 104 |
+
MAX_FILENAME_LEN = 100
|
| 105 |
+
|
| 106 |
+
def validate_upload(filename: str, content_type: str, file_size: int, upload_dir: str) -> dict:
|
| 107 |
+
"""Validates a file upload with full security checks."""
|
| 108 |
+
if not filename:
|
| 109 |
+
return {"ok": False, "reason": "Filename is empty"}
|
| 110 |
+
|
| 111 |
+
# 1. Null byte check
|
| 112 |
+
if "\\x00" in filename or "%00" in filename:
|
| 113 |
+
return {"ok": False, "reason": "Null byte in filename"}
|
| 114 |
+
|
| 115 |
+
# 2. Extract basename only β strip any path components
|
| 116 |
+
safe_name = Path(filename).name
|
| 117 |
+
if not safe_name:
|
| 118 |
+
return {"ok": False, "reason": "Invalid filename after stripping path"}
|
| 119 |
+
|
| 120 |
+
# 3. Block traversal sequences
|
| 121 |
+
if ".." in safe_name or "/" in safe_name or "\\\\" in safe_name:
|
| 122 |
+
return {"ok": False, "reason": "Path traversal in filename"}
|
| 123 |
+
|
| 124 |
+
# 4. Allow only safe characters
|
| 125 |
+
safe_name = re.sub(r"[^a-zA-Z0-9._\\-]", "_", safe_name)
|
| 126 |
+
|
| 127 |
+
# 5. Length check
|
| 128 |
+
if len(safe_name) > MAX_FILENAME_LEN:
|
| 129 |
+
return {"ok": False, "reason": f"Filename exceeds {MAX_FILENAME_LEN} characters"}
|
| 130 |
+
|
| 131 |
+
# 6. Extension check (case-insensitive)
|
| 132 |
+
ext = Path(safe_name).suffix.lower()
|
| 133 |
+
if ext in BLOCKED_EXTENSIONS:
|
| 134 |
+
return {"ok": False, "reason": f"Executable extension blocked: {ext}"}
|
| 135 |
+
if ext not in ALLOWED_EXTENSIONS:
|
| 136 |
+
return {"ok": False, "reason": f"Extension not allowed: {ext}"}
|
| 137 |
+
|
| 138 |
+
# 7. MIME type cross-check
|
| 139 |
+
allowed_mimes = ALLOWED_MIME_TYPES.get(ext, set())
|
| 140 |
+
if content_type not in allowed_mimes:
|
| 141 |
+
return {"ok": False, "reason": f"MIME type {content_type!r} not valid for {ext}"}
|
| 142 |
+
|
| 143 |
+
# 8. File size limit
|
| 144 |
+
if file_size > MAX_SIZE:
|
| 145 |
+
return {"ok": False, "reason": f"File too large: {file_size} bytes (max {MAX_SIZE})"}
|
| 146 |
+
|
| 147 |
+
# 9. Generate UUID-prefixed safe path
|
| 148 |
+
unique_name = f"{uuid.uuid4().hex}_{safe_name}"
|
| 149 |
+
safe_path = str(Path(upload_dir).resolve() / unique_name)
|
| 150 |
+
|
| 151 |
+
return {"ok": True, "safe_path": safe_path, "safe_filename": unique_name}
|
| 152 |
+
''',
|
| 153 |
}
|
tasks/hard/jwt_validator.py
CHANGED
|
@@ -1,54 +1,135 @@
|
|
| 1 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
TASK = {
|
| 4 |
-
"id": "
|
| 5 |
"difficulty": "hard",
|
| 6 |
-
"cwe_targets": ["CWE-347", "CWE-
|
| 7 |
-
"
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
try:
|
| 42 |
-
|
|
|
|
|
|
|
| 43 |
except Exception:
|
| 44 |
return None
|
| 45 |
-
|
|
|
|
| 46 |
"test_cases": [
|
| 47 |
-
{"input": None,
|
| 48 |
-
{"input": "",
|
| 49 |
-
{"input": "
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
],
|
| 52 |
-
|
| 53 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
}
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv Task: JWT Validator
|
| 3 |
+
Difficulty: Hard
|
| 4 |
+
CWE Targets: CWE-347 (Improper Verification of Cryptographic Signature), CWE-613 (Insufficient Session Expiration)
|
| 5 |
+
"""
|
| 6 |
|
| 7 |
TASK = {
|
| 8 |
+
"id": "hard_jwt_validator",
|
| 9 |
"difficulty": "hard",
|
| 10 |
+
"cwe_targets": ["CWE-347", "CWE-613"],
|
| 11 |
+
"component_name": "JWTValidator",
|
| 12 |
+
"filename": "src/auth/jwt_validator.py",
|
| 13 |
+
"problem_statement": """Write a Python function:
|
| 14 |
+
|
| 15 |
+
`validate_token(token: str, secret_key: str) -> dict | None`
|
| 16 |
+
|
| 17 |
+
Validates a JWT (JSON Web Token) and returns the payload if valid, None otherwise.
|
| 18 |
+
|
| 19 |
+
REQUIREMENTS β ALL must pass:
|
| 20 |
+
|
| 21 |
+
1. ALGORITHM ENFORCEMENT:
|
| 22 |
+
- MUST reject tokens with alg="none" β this is the classic "none algorithm" attack
|
| 23 |
+
- MUST reject tokens with alg different from HS256
|
| 24 |
+
- MUST specify algorithms=['HS256'] when calling jwt.decode()
|
| 25 |
+
|
| 26 |
+
2. SIGNATURE VERIFICATION:
|
| 27 |
+
- MUST verify the signature β never decode without verification
|
| 28 |
+
- Use PyJWT: `import jwt` (pip install PyJWT)
|
| 29 |
+
|
| 30 |
+
3. EXPIRATION:
|
| 31 |
+
- MUST reject expired tokens (PyJWT does this if you don't pass options={"verify_exp": False})
|
| 32 |
+
- Do NOT disable expiration verification
|
| 33 |
+
|
| 34 |
+
4. INPUT VALIDATION:
|
| 35 |
+
- Return None for None/empty token
|
| 36 |
+
- Return None for None/empty secret_key
|
| 37 |
+
- Catch ALL JWT exceptions and return None (never raise to caller)
|
| 38 |
+
|
| 39 |
+
5. RETURN VALUES:
|
| 40 |
+
- Valid token: return the decoded payload dict
|
| 41 |
+
- Any invalid/expired/tampered token: return None
|
| 42 |
+
|
| 43 |
+
NEVER use: jwt.decode(token, options={"verify_signature": False})
|
| 44 |
+
NEVER use: algorithms=["none"] or algorithms=None""",
|
| 45 |
+
|
| 46 |
+
"starter_code": '''import jwt
|
| 47 |
+
|
| 48 |
+
def validate_token(token: str, secret_key: str):
|
| 49 |
+
# TODO: INSECURE - fix this!
|
| 50 |
try:
|
| 51 |
+
# WRONG: No algorithm restriction, signature not verified
|
| 52 |
+
payload = jwt.decode(token, options={"verify_signature": False})
|
| 53 |
+
return payload
|
| 54 |
except Exception:
|
| 55 |
return None
|
| 56 |
+
''',
|
| 57 |
+
|
| 58 |
"test_cases": [
|
| 59 |
+
{"fn": "validate_token", "input": [None, "test_secret"], "expected": None, "description": "None token returns None"},
|
| 60 |
+
{"fn": "validate_token", "input": ["", "test_secret"], "expected": None, "description": "Empty token returns None"},
|
| 61 |
+
{"fn": "validate_token", "input": ["invalid.token.here", "test_secret"], "expected": None, "description": "Malformed token returns None"},
|
| 62 |
+
# Valid HS256 token (pre-generated for test_secret, no expiry by default to allow testing)
|
| 63 |
+
{"fn": "validate_token", "input": ["eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ0ZXN0X3VzZXIiLCJyb2xlIjoiYWRtaW4ifQ.zXkMxpqiCpD0GXV0-0GbN8bGhfZT7fE-yYv_Xy4YKCA", "test_secret"], "expected_not_none": True, "description": "Valid HS256 token returns payload"},
|
| 64 |
+
# none-alg attack token (header: {"alg":"none"}, payload: {"sub":"attacker"})
|
| 65 |
+
{"fn": "validate_token", "input": ["eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJzdWIiOiJhdHRhY2tlciJ9.", "test_secret"], "expected": None, "description": "none alg attack token rejected"},
|
| 66 |
+
# Expired token (exp: 1000 = Jan 1 1970)
|
| 67 |
+
{"fn": "validate_token", "input": ["eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxMDAwfQ.invalid_sig", "test_secret"], "expected": None, "description": "Expired/tampered token returns None"},
|
| 68 |
+
],
|
| 69 |
+
|
| 70 |
+
"attack_type": "auth_bypass",
|
| 71 |
+
|
| 72 |
+
"attack_config": {
|
| 73 |
+
"fn_name": "validate_token",
|
| 74 |
+
"fn_args_template": ["{payload}", "test_secret_key"],
|
| 75 |
+
"check": lambda result, payload: result is None,
|
| 76 |
+
},
|
| 77 |
+
|
| 78 |
+
"security_checks": [
|
| 79 |
+
{"type": "no_verify_false", "forbidden": ["verify_signature\": False", "verify_exp\": False"]},
|
| 80 |
+
{"type": "algorithm_specified", "required": ["algorithms=["]},
|
| 81 |
+
{"type": "no_none_alg", "forbidden": ["\"none\"", "'none'"]},
|
| 82 |
],
|
| 83 |
+
|
| 84 |
+
"naive_code": '''import jwt
|
| 85 |
+
def validate_token(token, secret_key):
|
| 86 |
+
try:
|
| 87 |
+
return jwt.decode(token, options={"verify_signature": False})
|
| 88 |
+
except Exception:
|
| 89 |
+
return None
|
| 90 |
+
''',
|
| 91 |
+
|
| 92 |
+
"optimal_code": '''import jwt
|
| 93 |
+
from jwt.exceptions import (
|
| 94 |
+
ExpiredSignatureError, InvalidSignatureError, DecodeError,
|
| 95 |
+
InvalidAlgorithmError, InvalidTokenError
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
ALLOWED_ALGORITHMS = ["HS256"]
|
| 99 |
+
|
| 100 |
+
def validate_token(token: str, secret_key: str) -> dict | None:
|
| 101 |
+
"""Validates a JWT and returns payload if valid, None otherwise.
|
| 102 |
+
|
| 103 |
+
Security guarantees:
|
| 104 |
+
- Only HS256 accepted (none/RS256 attacks blocked)
|
| 105 |
+
- Signature always verified
|
| 106 |
+
- Expiration always checked
|
| 107 |
+
- All exceptions caught β never leaks JWT internals
|
| 108 |
+
|
| 109 |
+
Args:
|
| 110 |
+
token: JWT string
|
| 111 |
+
secret_key: HMAC secret key
|
| 112 |
+
|
| 113 |
+
Returns:
|
| 114 |
+
Decoded payload dict, or None on any failure
|
| 115 |
+
"""
|
| 116 |
+
if not token or not secret_key:
|
| 117 |
+
return None
|
| 118 |
+
|
| 119 |
+
try:
|
| 120 |
+
# CRITICAL: algorithms= parameter blocks the "none" alg attack
|
| 121 |
+
payload = jwt.decode(
|
| 122 |
+
token,
|
| 123 |
+
secret_key,
|
| 124 |
+
algorithms=ALLOWED_ALGORITHMS, # Explicit allowlist
|
| 125 |
+
# Note: verify_exp=True is the default β do NOT override it
|
| 126 |
+
)
|
| 127 |
+
return payload
|
| 128 |
+
except ExpiredSignatureError:
|
| 129 |
+
return None # Expired β reject silently
|
| 130 |
+
except (InvalidSignatureError, InvalidAlgorithmError, DecodeError, InvalidTokenError):
|
| 131 |
+
return None # Any tampered or malformed token
|
| 132 |
+
except Exception:
|
| 133 |
+
return None # Catch-all β never raise to caller
|
| 134 |
+
''',
|
| 135 |
}
|
tasks/medium/api_rate_limiter.py
DELETED
|
@@ -1,43 +0,0 @@
|
|
| 1 |
-
"""tasks/medium/api_rate_limiter.py β CWE-307: Brute Force / Rate Bypass"""
|
| 2 |
-
|
| 3 |
-
TASK = {
|
| 4 |
-
"id": "api_rate_limiter",
|
| 5 |
-
"difficulty": "medium",
|
| 6 |
-
"cwe_targets": ["CWE-307", "CWE-770"],
|
| 7 |
-
"problem_statement": """
|
| 8 |
-
Write a function `check_rate_limit(client_id: str) -> bool` that:
|
| 9 |
-
1. Returns True if the client is ALLOWED (under the rate limit).
|
| 10 |
-
2. Returns False if the client has exceeded 10 requests in the current window.
|
| 11 |
-
3. Uses an in-memory counter (dict) β a fresh dict each time the module loads.
|
| 12 |
-
4. Must resist spoofed/forged client_id values:
|
| 13 |
-
- Empty string "" β treat as a real client ID (rate limit it, don't bypass)
|
| 14 |
-
- None β treat as "anonymous" client (rate limit it)
|
| 15 |
-
- SQL injection strings β sanitise/normalise before using as key
|
| 16 |
-
- Very long strings β truncate to 256 chars before using as key
|
| 17 |
-
5. The first 10 calls for any client_id return True; the 11th returns False.
|
| 18 |
-
|
| 19 |
-
The function name must be exactly: check_rate_limit
|
| 20 |
-
|
| 21 |
-
WRONG: if not client_id: return True β bypasses rate limit for empty/None
|
| 22 |
-
WRONG: using client_id directly as dict key without normalisation
|
| 23 |
-
""",
|
| 24 |
-
"starter_code": """
|
| 25 |
-
from collections import defaultdict
|
| 26 |
-
|
| 27 |
-
_counters: dict = defaultdict(int)
|
| 28 |
-
RATE_LIMIT = 10
|
| 29 |
-
|
| 30 |
-
def check_rate_limit(client_id: str) -> bool:
|
| 31 |
-
# TODO: implement rate limiting
|
| 32 |
-
# Normalize client_id first (handle None, empty, too long)
|
| 33 |
-
# Increment counter and check against RATE_LIMIT
|
| 34 |
-
pass
|
| 35 |
-
""",
|
| 36 |
-
"test_cases": [
|
| 37 |
-
{"input": "client_123", "expected": True, "fn": "check_rate_limit"},
|
| 38 |
-
{"input": "client_abc", "expected": True, "fn": "check_rate_limit"},
|
| 39 |
-
{"input": "", "expected": True, "fn": "check_rate_limit"},
|
| 40 |
-
],
|
| 41 |
-
"naive_baseline": {"time_ms": 0.1, "memory_kb": 10},
|
| 42 |
-
"perf_input": "client_benchmark",
|
| 43 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tasks/medium/file_path_handler.py
CHANGED
|
@@ -1,45 +1,109 @@
|
|
| 1 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
TASK = {
|
| 4 |
-
"id": "
|
| 5 |
"difficulty": "medium",
|
| 6 |
-
"cwe_targets": ["CWE-22", "CWE-
|
| 7 |
-
"
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
from pathlib import Path
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
def handle_path(user_path: str) -> str:
|
| 34 |
-
# TODO: implement safe path resolution
|
| 35 |
-
# Must reject ../ traversal, absolute paths, URL-encoded traversal
|
| 36 |
-
# Use Path.resolve() and check the result starts with SAFE_BASE
|
| 37 |
-
pass
|
| 38 |
-
""",
|
| 39 |
"test_cases": [
|
| 40 |
-
{"input": "
|
| 41 |
-
{"input": "subdir/file.txt",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
],
|
| 43 |
-
|
| 44 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
}
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv Task: File Path Handler
|
| 3 |
+
Difficulty: Medium
|
| 4 |
+
CWE Targets: CWE-22 (Path Traversal), CWE-20 (Improper Input Validation)
|
| 5 |
+
"""
|
| 6 |
|
| 7 |
TASK = {
|
| 8 |
+
"id": "medium_file_path_handler",
|
| 9 |
"difficulty": "medium",
|
| 10 |
+
"cwe_targets": ["CWE-22", "CWE-20"],
|
| 11 |
+
"component_name": "FilePathHandler",
|
| 12 |
+
"filename": "src/files/file_path_handler.py",
|
| 13 |
+
"problem_statement": """Write a Python function:
|
| 14 |
+
|
| 15 |
+
`resolve_safe_path(base_dir: str, user_path: str) -> str`
|
| 16 |
+
|
| 17 |
+
Resolves a user-supplied file path relative to a base directory.
|
| 18 |
+
MUST prevent path traversal attacks.
|
| 19 |
+
|
| 20 |
+
REQUIREMENTS:
|
| 21 |
+
- base_dir is the trusted root directory (e.g. '/var/uploads')
|
| 22 |
+
- user_path is untrusted input from the user
|
| 23 |
+
- Returns the absolute, resolved path ONLY if it is inside base_dir
|
| 24 |
+
- Raises ValueError if the resolved path escapes base_dir
|
| 25 |
+
- Raises ValueError if user_path is empty/None
|
| 26 |
+
- Must use os.path.realpath or pathlib.Path.resolve() β NOT string manipulation
|
| 27 |
+
- Must work on both Unix and Windows paths
|
| 28 |
+
|
| 29 |
+
Example safe: resolve_safe_path('/var/uploads', 'report.pdf') β '/var/uploads/report.pdf'
|
| 30 |
+
Example blocked: resolve_safe_path('/var/uploads', '../../etc/passwd') β raises ValueError
|
| 31 |
+
|
| 32 |
+
NEVER use string contains/replace to detect traversal β attackers bypass it.""",
|
|
|
|
| 33 |
|
| 34 |
+
"starter_code": '''import os
|
| 35 |
+
|
| 36 |
+
def resolve_safe_path(base_dir: str, user_path: str) -> str:
|
| 37 |
+
# TODO: INSECURE - fix this!
|
| 38 |
+
if ".." in user_path:
|
| 39 |
+
raise ValueError("Traversal detected")
|
| 40 |
+
return os.path.join(base_dir, user_path)
|
| 41 |
+
''',
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
"test_cases": [
|
| 44 |
+
{"fn": "resolve_safe_path", "input": ["/var/uploads", "report.pdf"], "expected_contains": "/var/uploads/report.pdf", "description": "Safe path resolves correctly"},
|
| 45 |
+
{"fn": "resolve_safe_path", "input": ["/var/uploads", "subdir/file.txt"], "expected_contains": "/var/uploads/subdir/file.txt", "description": "Safe subdirectory path"},
|
| 46 |
+
{"fn": "resolve_safe_path", "input": ["/var/uploads", "../../etc/passwd"], "expected_exception": "ValueError", "description": "Traversal raises ValueError"},
|
| 47 |
+
{"fn": "resolve_safe_path", "input": ["/var/uploads", "/etc/passwd"], "expected_exception": "ValueError", "description": "Absolute escape path raises ValueError"},
|
| 48 |
+
{"fn": "resolve_safe_path", "input": ["/var/uploads", ""], "expected_exception": "ValueError", "description": "Empty path raises ValueError"},
|
| 49 |
+
{"fn": "resolve_safe_path", "input": ["/var/uploads", None], "expected_exception": "ValueError", "description": "None path raises ValueError"},
|
| 50 |
+
],
|
| 51 |
+
|
| 52 |
+
"attack_type": "traversal",
|
| 53 |
+
|
| 54 |
+
"attack_config": {
|
| 55 |
+
"fn_name": "resolve_safe_path",
|
| 56 |
+
"fn_args_template": ["/var/uploads", "{payload}"],
|
| 57 |
+
"check": lambda result, payload: (
|
| 58 |
+
result is None or
|
| 59 |
+
(isinstance(result, str) and "/etc" not in result and "/root" not in result and ".." not in result)
|
| 60 |
+
),
|
| 61 |
+
"exception_counts_as_blocked": True,
|
| 62 |
+
},
|
| 63 |
+
|
| 64 |
+
"security_checks": [
|
| 65 |
+
{"type": "no_string_traversal_check", "warn_on": ['"..".in', "'..' in", "replace('..']"]},
|
| 66 |
+
{"type": "uses_realpath", "recommended": ["os.path.realpath", "Path.resolve"]},
|
| 67 |
],
|
| 68 |
+
|
| 69 |
+
"naive_code": '''import os
|
| 70 |
+
def resolve_safe_path(base_dir: str, user_path: str) -> str:
|
| 71 |
+
if ".." in (user_path or ""):
|
| 72 |
+
raise ValueError("blocked")
|
| 73 |
+
return os.path.join(base_dir, user_path or "")
|
| 74 |
+
''',
|
| 75 |
+
|
| 76 |
+
"optimal_code": '''import os
|
| 77 |
+
from pathlib import Path
|
| 78 |
+
|
| 79 |
+
def resolve_safe_path(base_dir: str, user_path: str) -> str:
|
| 80 |
+
"""Resolves a user-supplied path within base_dir, preventing traversal.
|
| 81 |
+
|
| 82 |
+
Args:
|
| 83 |
+
base_dir: Trusted root directory
|
| 84 |
+
user_path: Untrusted user-supplied path
|
| 85 |
+
|
| 86 |
+
Returns:
|
| 87 |
+
Absolute resolved path guaranteed to be inside base_dir
|
| 88 |
+
|
| 89 |
+
Raises:
|
| 90 |
+
ValueError: If path escapes base_dir or is invalid
|
| 91 |
+
"""
|
| 92 |
+
if not user_path:
|
| 93 |
+
raise ValueError("user_path must not be empty or None")
|
| 94 |
+
|
| 95 |
+
base = Path(base_dir).resolve()
|
| 96 |
+
# Join and resolve β this handles ../../ and symlinks
|
| 97 |
+
resolved = (base / user_path).resolve()
|
| 98 |
+
|
| 99 |
+
# The critical check: resolved path must start with base path
|
| 100 |
+
try:
|
| 101 |
+
resolved.relative_to(base)
|
| 102 |
+
except ValueError:
|
| 103 |
+
raise ValueError(
|
| 104 |
+
f"Path traversal detected: {user_path!r} resolves outside {base_dir!r}"
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
return str(resolved)
|
| 108 |
+
''',
|
| 109 |
}
|
tasks/medium/rate_limiter.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv Task: Rate Limiter
|
| 3 |
+
Difficulty: Medium
|
| 4 |
+
CWE Targets: CWE-770 (Allocation Without Limits), CWE-400 (Uncontrolled Resource Consumption)
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
TASK = {
|
| 8 |
+
"id": "medium_rate_limiter",
|
| 9 |
+
"difficulty": "medium",
|
| 10 |
+
"cwe_targets": ["CWE-770", "CWE-400"],
|
| 11 |
+
"component_name": "RateLimiter",
|
| 12 |
+
"filename": "src/middleware/rate_limiter.py",
|
| 13 |
+
"problem_statement": """Write a Python class:
|
| 14 |
+
|
| 15 |
+
`class RateLimiter:`
|
| 16 |
+
|
| 17 |
+
A thread-safe token bucket rate limiter.
|
| 18 |
+
|
| 19 |
+
REQUIREMENTS:
|
| 20 |
+
- `__init__(self, max_requests: int, window_seconds: int)`
|
| 21 |
+
- max_requests: max allowed calls in the time window
|
| 22 |
+
- window_seconds: size of the sliding window in seconds
|
| 23 |
+
|
| 24 |
+
- `is_allowed(self, client_id: str) -> bool`
|
| 25 |
+
- Returns True if the client is under the rate limit
|
| 26 |
+
- Returns False if the client has exceeded max_requests in window_seconds
|
| 27 |
+
- Each client_id is tracked independently
|
| 28 |
+
- Must be thread-safe (use threading.Lock)
|
| 29 |
+
- Must use time-based sliding window β NOT a fixed counter reset
|
| 30 |
+
|
| 31 |
+
- `get_remaining(self, client_id: str) -> int`
|
| 32 |
+
- Returns how many requests the client can still make in the current window
|
| 33 |
+
|
| 34 |
+
MUST handle concurrent requests correctly β no race conditions.""",
|
| 35 |
+
|
| 36 |
+
"starter_code": '''class RateLimiter:
|
| 37 |
+
def __init__(self, max_requests: int, window_seconds: int):
|
| 38 |
+
self.max_requests = max_requests
|
| 39 |
+
self.window_seconds = window_seconds
|
| 40 |
+
self.counts = {} # NOT thread-safe!
|
| 41 |
+
|
| 42 |
+
def is_allowed(self, client_id: str) -> bool:
|
| 43 |
+
# TODO: Implement with proper sliding window and thread safety
|
| 44 |
+
count = self.counts.get(client_id, 0)
|
| 45 |
+
self.counts[client_id] = count + 1
|
| 46 |
+
return count < self.max_requests
|
| 47 |
+
|
| 48 |
+
def get_remaining(self, client_id: str) -> int:
|
| 49 |
+
count = self.counts.get(client_id, 0)
|
| 50 |
+
return max(0, self.max_requests - count)
|
| 51 |
+
''',
|
| 52 |
+
|
| 53 |
+
"test_cases": [
|
| 54 |
+
{"fn_class": "RateLimiter", "init_args": [5, 60], "method": "is_allowed", "input": ["user1"], "expected": True, "description": "First request allowed"},
|
| 55 |
+
{"fn_class": "RateLimiter", "init_args": [2, 60], "method": "is_allowed_multi", "calls": 3, "input": ["user1"], "expected_last": False, "description": "Third request blocked when limit is 2"},
|
| 56 |
+
{"fn_class": "RateLimiter", "init_args": [5, 60], "method": "get_remaining", "input": ["new_client"], "expected": 5, "description": "New client has full remaining"},
|
| 57 |
+
{"fn_class": "RateLimiter", "init_args": [3, 60], "method": "independent_clients", "description": "Different client IDs are tracked independently"},
|
| 58 |
+
],
|
| 59 |
+
|
| 60 |
+
"attack_type": "none",
|
| 61 |
+
|
| 62 |
+
"security_checks": [
|
| 63 |
+
{"type": "uses_threading_lock", "required": ["threading.Lock", "threading.RLock"]},
|
| 64 |
+
{"type": "uses_time", "required": ["time.time", "time.monotonic"]},
|
| 65 |
+
],
|
| 66 |
+
|
| 67 |
+
"naive_code": '''class RateLimiter:
|
| 68 |
+
def __init__(self, max_requests, window_seconds):
|
| 69 |
+
self.max_requests = max_requests
|
| 70 |
+
self.counts = {}
|
| 71 |
+
def is_allowed(self, client_id):
|
| 72 |
+
c = self.counts.get(client_id, 0)
|
| 73 |
+
self.counts[client_id] = c + 1
|
| 74 |
+
return c < self.max_requests
|
| 75 |
+
def get_remaining(self, client_id):
|
| 76 |
+
return max(0, self.max_requests - self.counts.get(client_id, 0))
|
| 77 |
+
''',
|
| 78 |
+
|
| 79 |
+
"optimal_code": '''import threading
|
| 80 |
+
import time
|
| 81 |
+
from collections import deque
|
| 82 |
+
|
| 83 |
+
class RateLimiter:
|
| 84 |
+
"""Thread-safe sliding window rate limiter using token bucket pattern."""
|
| 85 |
+
|
| 86 |
+
def __init__(self, max_requests: int, window_seconds: int):
|
| 87 |
+
"""
|
| 88 |
+
Args:
|
| 89 |
+
max_requests: Maximum requests allowed per window
|
| 90 |
+
window_seconds: Length of the sliding window
|
| 91 |
+
"""
|
| 92 |
+
self.max_requests = max_requests
|
| 93 |
+
self.window_seconds = window_seconds
|
| 94 |
+
self._buckets: dict[str, deque] = {}
|
| 95 |
+
self._lock = threading.Lock()
|
| 96 |
+
|
| 97 |
+
def _prune(self, client_id: str, now: float) -> None:
|
| 98 |
+
"""Remove timestamps outside the current window. Must hold lock."""
|
| 99 |
+
cutoff = now - self.window_seconds
|
| 100 |
+
bucket = self._buckets.get(client_id, deque())
|
| 101 |
+
while bucket and bucket[0] < cutoff:
|
| 102 |
+
bucket.popleft()
|
| 103 |
+
self._buckets[client_id] = bucket
|
| 104 |
+
|
| 105 |
+
def is_allowed(self, client_id: str) -> bool:
|
| 106 |
+
"""Returns True and records the request if under rate limit."""
|
| 107 |
+
now = time.monotonic()
|
| 108 |
+
with self._lock:
|
| 109 |
+
self._prune(client_id, now)
|
| 110 |
+
bucket = self._buckets[client_id]
|
| 111 |
+
if len(bucket) < self.max_requests:
|
| 112 |
+
bucket.append(now)
|
| 113 |
+
return True
|
| 114 |
+
return False
|
| 115 |
+
|
| 116 |
+
def get_remaining(self, client_id: str) -> int:
|
| 117 |
+
"""Returns remaining requests in the current window."""
|
| 118 |
+
now = time.monotonic()
|
| 119 |
+
with self._lock:
|
| 120 |
+
self._prune(client_id, now)
|
| 121 |
+
used = len(self._buckets.get(client_id, deque()))
|
| 122 |
+
return max(0, self.max_requests - used)
|
| 123 |
+
''',
|
| 124 |
+
}
|
tasks/medium/sql_query_builder.py
CHANGED
|
@@ -1,45 +1,97 @@
|
|
| 1 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
TASK = {
|
| 4 |
-
"id": "
|
| 5 |
"difficulty": "medium",
|
| 6 |
"cwe_targets": ["CWE-89", "CWE-20"],
|
| 7 |
-
"
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
2. Uses PARAMETERISED queries β NEVER string-format user input into SQL.
|
| 11 |
-
3. Returns a list of matching rows (may be empty list).
|
| 12 |
-
4. Handles None and empty string inputs safely (return empty list).
|
| 13 |
-
5. Never raises exceptions on invalid input β always return a list.
|
| 14 |
|
| 15 |
-
|
| 16 |
|
| 17 |
-
|
| 18 |
-
CREATE TABLE users(id INT, name TEXT, role TEXT, password TEXT)
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def build_query(conn: sqlite3.Connection, user_input: str) -> list:
|
| 31 |
-
# TODO: implement using parameterised queries
|
| 32 |
-
# NEVER use f-strings or % formatting to build SQL
|
| 33 |
-
cursor = conn.cursor()
|
| 34 |
-
# cursor.execute("SELECT * FROM users WHERE name = ?", (user_input,))
|
| 35 |
-
# return cursor.fetchall()
|
| 36 |
-
pass
|
| 37 |
""",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
"test_cases": [
|
| 39 |
-
{"input": ["alice", "
|
| 40 |
-
{"input": ["
|
| 41 |
-
{"input": ["", "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
],
|
| 43 |
-
|
| 44 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
}
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv Task: SQL Query Builder
|
| 3 |
+
Difficulty: Medium
|
| 4 |
+
CWE Targets: CWE-89 (SQL Injection), CWE-20 (Improper Input Validation)
|
| 5 |
+
"""
|
| 6 |
|
| 7 |
TASK = {
|
| 8 |
+
"id": "medium_sql_query_builder",
|
| 9 |
"difficulty": "medium",
|
| 10 |
"cwe_targets": ["CWE-89", "CWE-20"],
|
| 11 |
+
"component_name": "UserQueryBuilder",
|
| 12 |
+
"filename": "src/db/user_queries.py",
|
| 13 |
+
"problem_statement": """Write a Python function:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
`build_user_query(username: str, role: str) -> tuple`
|
| 16 |
|
| 17 |
+
Builds a parameterized SQL query to fetch a user by username and role.
|
|
|
|
| 18 |
|
| 19 |
+
REQUIREMENTS:
|
| 20 |
+
- Returns a tuple: (query_string, params_tuple)
|
| 21 |
+
- The query_string must use placeholders (%s or ?) β NEVER f-strings or .format()
|
| 22 |
+
- The params_tuple must contain the sanitized values
|
| 23 |
+
- Must handle None/empty username (return empty result query)
|
| 24 |
+
- Must validate role is one of: 'admin', 'user', 'moderator' (raise ValueError for others)
|
| 25 |
+
- The returned tuple must be safe to pass directly to cursor.execute()
|
| 26 |
|
| 27 |
+
Example:
|
| 28 |
+
query, params = build_user_query('alice', 'admin')
|
| 29 |
+
cursor.execute(query, params) # Must be safe from SQL injection
|
| 30 |
+
|
| 31 |
+
NEVER do: f"SELECT * FROM users WHERE username = '{username}'"
|
| 32 |
+
NEVER do: "SELECT * FROM users WHERE username = '" + username + "'"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
""",
|
| 34 |
+
|
| 35 |
+
"starter_code": '''def build_user_query(username: str, role: str):
|
| 36 |
+
# TODO: INSECURE - fix this!
|
| 37 |
+
query = f"SELECT * FROM users WHERE username = \'{username}\' AND role = \'{role}\'"
|
| 38 |
+
return query
|
| 39 |
+
''',
|
| 40 |
+
|
| 41 |
"test_cases": [
|
| 42 |
+
{"fn": "build_user_query", "input": ["alice", "admin"], "expected_type": "tuple", "expected_len": 2, "description": "Returns 2-tuple"},
|
| 43 |
+
{"fn": "build_user_query", "input": ["bob", "user"], "expected_type": "tuple", "description": "Returns tuple for valid user"},
|
| 44 |
+
{"fn": "build_user_query", "input": ["", "admin"], "expected_type": "tuple", "description": "Empty username returns safe result"},
|
| 45 |
+
{"fn": "build_user_query", "input": ["alice", "invalid_role"], "expected_exception": "ValueError", "description": "Invalid role raises ValueError"},
|
| 46 |
+
{"fn": "build_user_query", "input": ["alice' OR '1'='1", "admin"], "output_not_contains_input": True, "description": "SQL injection in username is parameterized away"},
|
| 47 |
+
],
|
| 48 |
+
|
| 49 |
+
"attack_type": "injection",
|
| 50 |
+
|
| 51 |
+
"attack_config": {
|
| 52 |
+
"fn_name": "build_user_query",
|
| 53 |
+
"fn_args_template": ["{payload}", "admin"],
|
| 54 |
+
"check": lambda result, payload: (
|
| 55 |
+
isinstance(result, tuple) and
|
| 56 |
+
len(result) == 2 and
|
| 57 |
+
# The query string must use placeholders β injection payload must NOT appear in query
|
| 58 |
+
(payload not in str(result[0])) and
|
| 59 |
+
("%s" in result[0] or "?" in result[0])
|
| 60 |
+
),
|
| 61 |
+
},
|
| 62 |
+
|
| 63 |
+
"security_checks": [
|
| 64 |
+
{"type": "no_string_format_sql", "forbidden": ["f\"SELECT", "f'SELECT", "format(", '%" %']},
|
| 65 |
],
|
| 66 |
+
|
| 67 |
+
"naive_code": '''def build_user_query(username: str, role: str):
|
| 68 |
+
query = f"SELECT * FROM users WHERE username = \'{username}\' AND role = \'{role}\'"
|
| 69 |
+
return (query, ())
|
| 70 |
+
''',
|
| 71 |
+
|
| 72 |
+
"optimal_code": '''ALLOWED_ROLES = {"admin", "user", "moderator"}
|
| 73 |
+
|
| 74 |
+
def build_user_query(username: str, role: str) -> tuple:
|
| 75 |
+
"""Builds a parameterized SQL query safe from injection.
|
| 76 |
+
|
| 77 |
+
Args:
|
| 78 |
+
username: The username to query (untrusted input)
|
| 79 |
+
role: Must be one of admin/user/moderator
|
| 80 |
+
|
| 81 |
+
Returns:
|
| 82 |
+
(query_string, params_tuple) safe for cursor.execute()
|
| 83 |
+
|
| 84 |
+
Raises:
|
| 85 |
+
ValueError: If role is not in the allowed set
|
| 86 |
+
"""
|
| 87 |
+
if role not in ALLOWED_ROLES:
|
| 88 |
+
raise ValueError(f"Invalid role: {role!r}. Must be one of {ALLOWED_ROLES}")
|
| 89 |
+
|
| 90 |
+
if not username:
|
| 91 |
+
return ("SELECT * FROM users WHERE 1=0", ())
|
| 92 |
+
|
| 93 |
+
query = "SELECT id, username, email, role FROM users WHERE username = %s AND role = %s"
|
| 94 |
+
params = (username, role)
|
| 95 |
+
return (query, params)
|
| 96 |
+
''',
|
| 97 |
}
|
tasks/task_registry.py
CHANGED
|
@@ -1,51 +1,54 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
within a difficulty tier to prevent memorisation across episodes.
|
| 6 |
"""
|
| 7 |
import random
|
| 8 |
-
from
|
| 9 |
-
|
| 10 |
-
from tasks.easy.
|
| 11 |
-
from tasks.
|
| 12 |
-
from tasks.
|
| 13 |
-
from tasks.medium.
|
| 14 |
-
from tasks.
|
| 15 |
-
from tasks.
|
| 16 |
-
from tasks.hard.
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
}
|
| 23 |
|
| 24 |
-
|
| 25 |
-
"easy": [
|
| 26 |
-
"medium": [
|
| 27 |
-
"hard": [
|
| 28 |
}
|
| 29 |
|
| 30 |
|
| 31 |
-
def get_task(task_id: str) ->
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
-
def sample_task(difficulty: str
|
| 38 |
-
"""
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
-
def list_tasks() -> list:
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
}
|
| 50 |
-
for t in ALL_TASKS.values()
|
| 51 |
-
]
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Task Registry
|
| 3 |
+
Indexes all 9 tasks by ID and difficulty. Serves them via reset().
|
| 4 |
+
Adding a new task = add file + add import here. Nothing else changes.
|
|
|
|
| 5 |
"""
|
| 6 |
import random
|
| 7 |
+
from tasks.easy.password_validator import TASK as TASK_PWD
|
| 8 |
+
from tasks.easy.input_sanitizer import TASK as TASK_SANITIZER
|
| 9 |
+
from tasks.easy.token_generator import TASK as TASK_TOKEN
|
| 10 |
+
from tasks.medium.sql_query_builder import TASK as TASK_SQL
|
| 11 |
+
from tasks.medium.file_path_handler import TASK as TASK_PATH
|
| 12 |
+
from tasks.medium.rate_limiter import TASK as TASK_RATE
|
| 13 |
+
from tasks.hard.file_upload_handler import TASK as TASK_UPLOAD
|
| 14 |
+
from tasks.hard.jwt_validator import TASK as TASK_JWT
|
| 15 |
+
from tasks.hard.auth_middleware import TASK as TASK_AUTH
|
| 16 |
+
|
| 17 |
+
# βββ Master registry ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 18 |
+
TASK_REGISTRY: dict[str, dict] = {
|
| 19 |
+
task["id"]: task
|
| 20 |
+
for task in [
|
| 21 |
+
TASK_PWD, TASK_SANITIZER, TASK_TOKEN, # Easy
|
| 22 |
+
TASK_SQL, TASK_PATH, TASK_RATE, # Medium
|
| 23 |
+
TASK_UPLOAD, TASK_JWT, TASK_AUTH, # Hard
|
| 24 |
+
]
|
| 25 |
}
|
| 26 |
|
| 27 |
+
TASKS_BY_DIFFICULTY: dict[str, list[str]] = {
|
| 28 |
+
"easy": [t for t, v in TASK_REGISTRY.items() if v["difficulty"] == "easy"],
|
| 29 |
+
"medium": [t for t, v in TASK_REGISTRY.items() if v["difficulty"] == "medium"],
|
| 30 |
+
"hard": [t for t, v in TASK_REGISTRY.items() if v["difficulty"] == "hard"],
|
| 31 |
}
|
| 32 |
|
| 33 |
|
| 34 |
+
def get_task(task_id: str) -> dict:
|
| 35 |
+
"""Returns a task by ID. Raises KeyError if not found."""
|
| 36 |
+
if task_id not in TASK_REGISTRY:
|
| 37 |
+
raise KeyError(f"Task {task_id!r} not found. Available: {list(TASK_REGISTRY.keys())}")
|
| 38 |
+
return TASK_REGISTRY[task_id]
|
| 39 |
|
| 40 |
|
| 41 |
+
def sample_task(difficulty: str) -> dict:
|
| 42 |
+
"""Returns a random task at the given difficulty level."""
|
| 43 |
+
pool = TASKS_BY_DIFFICULTY.get(difficulty)
|
| 44 |
+
if not pool:
|
| 45 |
+
raise ValueError(f"No tasks for difficulty {difficulty!r}. Use: easy, medium, hard")
|
| 46 |
+
return TASK_REGISTRY[random.choice(pool)]
|
| 47 |
|
| 48 |
|
| 49 |
+
def list_tasks(difficulty: str = None) -> list[dict]:
|
| 50 |
+
"""Lists all tasks, optionally filtered by difficulty."""
|
| 51 |
+
tasks = list(TASK_REGISTRY.values())
|
| 52 |
+
if difficulty:
|
| 53 |
+
tasks = [t for t in tasks if t["difficulty"] == difficulty]
|
| 54 |
+
return [{"id": t["id"], "difficulty": t["difficulty"], "cwe_targets": t["cwe_targets"]} for t in tasks]
|
|
|
|
|
|
|
|
|
tests/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# tests/__init__.py
|
|
|
|
|
|
tests/test_api.py
DELETED
|
@@ -1,174 +0,0 @@
|
|
| 1 |
-
"""tests/test_api.py β Integration tests for /reset /step /state endpoints."""
|
| 2 |
-
import sys, os
|
| 3 |
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 4 |
-
|
| 5 |
-
import pytest
|
| 6 |
-
from fastapi.testclient import TestClient
|
| 7 |
-
from app.main import app
|
| 8 |
-
|
| 9 |
-
client = TestClient(app)
|
| 10 |
-
|
| 11 |
-
SIMPLE_SECURE_CODE = """
|
| 12 |
-
import hashlib
|
| 13 |
-
|
| 14 |
-
def generate_hash(data: str) -> str:
|
| 15 |
-
\"\"\"Generate a secure SHA-256 hash of the input.\"\"\"
|
| 16 |
-
if data is None:
|
| 17 |
-
data = ""
|
| 18 |
-
return hashlib.sha256(data.encode()).hexdigest()
|
| 19 |
-
"""
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class TestHealth:
|
| 23 |
-
def test_health_returns_200(self):
|
| 24 |
-
r = client.get("/health")
|
| 25 |
-
assert r.status_code == 200
|
| 26 |
-
data = r.json()
|
| 27 |
-
assert data["status"] == "ok"
|
| 28 |
-
assert data["version"] == "2.0.0"
|
| 29 |
-
assert data["tasks"] == 9
|
| 30 |
-
|
| 31 |
-
def test_root_returns_200(self):
|
| 32 |
-
r = client.get("/")
|
| 33 |
-
assert r.status_code == 200
|
| 34 |
-
data = r.json()
|
| 35 |
-
assert "endpoints" in data
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
class TestReset:
|
| 39 |
-
def test_reset_easy(self):
|
| 40 |
-
r = client.post("/reset", params={"difficulty": "easy"})
|
| 41 |
-
assert r.status_code == 200
|
| 42 |
-
data = r.json()
|
| 43 |
-
assert "session_id" in data
|
| 44 |
-
assert "task_id" in data
|
| 45 |
-
assert "problem_statement" in data
|
| 46 |
-
assert "cwe_targets" in data
|
| 47 |
-
assert "codegraph" in data
|
| 48 |
-
assert "starter_code" in data
|
| 49 |
-
assert data["difficulty"] == "easy"
|
| 50 |
-
|
| 51 |
-
def test_reset_medium(self):
|
| 52 |
-
r = client.post("/reset", params={"difficulty": "medium"})
|
| 53 |
-
assert r.status_code == 200
|
| 54 |
-
data = r.json()
|
| 55 |
-
assert data["difficulty"] == "medium"
|
| 56 |
-
|
| 57 |
-
def test_reset_hard(self):
|
| 58 |
-
r = client.post("/reset", params={"difficulty": "hard"})
|
| 59 |
-
assert r.status_code == 200
|
| 60 |
-
|
| 61 |
-
def test_reset_invalid_difficulty(self):
|
| 62 |
-
r = client.post("/reset", params={"difficulty": "impossible"})
|
| 63 |
-
assert r.status_code == 400
|
| 64 |
-
|
| 65 |
-
def test_reset_returns_valid_task_id(self):
|
| 66 |
-
from tasks.task_registry import list_tasks
|
| 67 |
-
valid_ids = {t["id"] for t in list_tasks()}
|
| 68 |
-
r = client.post("/reset", params={"difficulty": "easy"})
|
| 69 |
-
data = r.json()
|
| 70 |
-
assert data["task_id"] in valid_ids
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
class TestStep:
|
| 74 |
-
def _new_session(self, difficulty="easy"):
|
| 75 |
-
r = client.post("/reset", params={"difficulty": difficulty})
|
| 76 |
-
return r.json()
|
| 77 |
-
|
| 78 |
-
def test_step_returns_reward_in_range(self):
|
| 79 |
-
episode = self._new_session("easy")
|
| 80 |
-
r = client.post("/step", json={
|
| 81 |
-
"session_id": episode["session_id"],
|
| 82 |
-
"task_id": episode["task_id"],
|
| 83 |
-
"filename": "solution.py",
|
| 84 |
-
"code": SIMPLE_SECURE_CODE,
|
| 85 |
-
})
|
| 86 |
-
assert r.status_code == 200
|
| 87 |
-
data = r.json()
|
| 88 |
-
assert 0.0 <= data["total_reward"] <= 1.0
|
| 89 |
-
|
| 90 |
-
def test_step_returns_all_score_keys(self):
|
| 91 |
-
episode = self._new_session("easy")
|
| 92 |
-
r = client.post("/step", json={
|
| 93 |
-
"session_id": episode["session_id"],
|
| 94 |
-
"task_id": episode["task_id"],
|
| 95 |
-
"filename": "solution.py",
|
| 96 |
-
"code": SIMPLE_SECURE_CODE,
|
| 97 |
-
})
|
| 98 |
-
data = r.json()
|
| 99 |
-
expected_keys = {
|
| 100 |
-
"correctness", "attack_resist", "static_security",
|
| 101 |
-
"consistency", "performance", "documentation",
|
| 102 |
-
"code_structure", "supply_chain",
|
| 103 |
-
}
|
| 104 |
-
assert expected_keys.issubset(set(data["scores"].keys()))
|
| 105 |
-
|
| 106 |
-
def test_step_missing_session_returns_404(self):
|
| 107 |
-
r = client.post("/step", json={
|
| 108 |
-
"session_id": "nonexistent-uuid-1234",
|
| 109 |
-
"task_id": "hash_generator",
|
| 110 |
-
"filename": "solution.py",
|
| 111 |
-
"code": SIMPLE_SECURE_CODE,
|
| 112 |
-
})
|
| 113 |
-
assert r.status_code == 404
|
| 114 |
-
|
| 115 |
-
def test_step_empty_code_returns_422(self):
|
| 116 |
-
episode = self._new_session("easy")
|
| 117 |
-
r = client.post("/step", json={
|
| 118 |
-
"session_id": episode["session_id"],
|
| 119 |
-
"task_id": episode["task_id"],
|
| 120 |
-
"filename": "solution.py",
|
| 121 |
-
"code": " ",
|
| 122 |
-
})
|
| 123 |
-
assert r.status_code == 422
|
| 124 |
-
|
| 125 |
-
def test_done_after_max_steps(self):
|
| 126 |
-
episode = self._new_session("easy")
|
| 127 |
-
sid = episode["session_id"]
|
| 128 |
-
task_id = episode["task_id"]
|
| 129 |
-
last_result = None
|
| 130 |
-
for i in range(5):
|
| 131 |
-
r = client.post("/step", json={
|
| 132 |
-
"session_id": sid,
|
| 133 |
-
"task_id": task_id,
|
| 134 |
-
"filename": f"step{i}.py",
|
| 135 |
-
"code": SIMPLE_SECURE_CODE,
|
| 136 |
-
})
|
| 137 |
-
if r.status_code != 200:
|
| 138 |
-
break
|
| 139 |
-
last_result = r.json()
|
| 140 |
-
assert last_result is not None
|
| 141 |
-
assert last_result["done"] is True
|
| 142 |
-
|
| 143 |
-
def test_step_updates_codegraph(self):
|
| 144 |
-
episode = self._new_session("easy")
|
| 145 |
-
r = client.post("/step", json={
|
| 146 |
-
"session_id": episode["session_id"],
|
| 147 |
-
"task_id": episode["task_id"],
|
| 148 |
-
"filename": "solution.py",
|
| 149 |
-
"code": SIMPLE_SECURE_CODE,
|
| 150 |
-
})
|
| 151 |
-
data = r.json()
|
| 152 |
-
assert "codegraph" in data
|
| 153 |
-
assert "conventions" in data["codegraph"]
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
class TestState:
|
| 157 |
-
def test_state_returns_current_episode(self):
|
| 158 |
-
r = client.post("/reset", params={"difficulty": "medium"})
|
| 159 |
-
sid = r.json()["session_id"]
|
| 160 |
-
|
| 161 |
-
r2 = client.get("/state", params={"session_id": sid})
|
| 162 |
-
assert r2.status_code == 200
|
| 163 |
-
data = r2.json()
|
| 164 |
-
assert data["step"] == 0
|
| 165 |
-
assert data["done"] is False
|
| 166 |
-
assert "task_id" in data
|
| 167 |
-
|
| 168 |
-
def test_state_missing_session_returns_404(self):
|
| 169 |
-
r = client.get("/state", params={"session_id": "bad-uuid-xyz"})
|
| 170 |
-
assert r.status_code == 404
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
if __name__ == "__main__":
|
| 174 |
-
pytest.main([__file__, "-v"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_codegraph.py
DELETED
|
@@ -1,127 +0,0 @@
|
|
| 1 |
-
"""tests/test_codegraph.py β Unit tests for CodeGraph V2."""
|
| 2 |
-
import sys, os
|
| 3 |
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 4 |
-
|
| 5 |
-
import pytest
|
| 6 |
-
from codegraph.graph import CodeGraph, _naming_style
|
| 7 |
-
from codegraph.extractor import extract_metadata
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class TestNamingStyle:
|
| 11 |
-
def test_snake_case(self):
|
| 12 |
-
assert _naming_style("get_user") == "snake_case"
|
| 13 |
-
assert _naming_style("handle_path") == "snake_case"
|
| 14 |
-
|
| 15 |
-
def test_camel_case(self):
|
| 16 |
-
assert _naming_style("getUser") == "camelCase"
|
| 17 |
-
assert _naming_style("handlePath") == "camelCase"
|
| 18 |
-
|
| 19 |
-
def test_pascal_case(self):
|
| 20 |
-
assert _naming_style("GetUser") == "PascalCase"
|
| 21 |
-
assert _naming_style("UserManager") == "PascalCase"
|
| 22 |
-
|
| 23 |
-
def test_all_lowercase(self):
|
| 24 |
-
assert _naming_style("foo") == "snake_case"
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
class TestCodeGraph:
|
| 28 |
-
def test_empty_graph(self):
|
| 29 |
-
g = CodeGraph(episode_seed=1)
|
| 30 |
-
assert g.components == {}
|
| 31 |
-
assert g.conventions == {}
|
| 32 |
-
|
| 33 |
-
def test_update_adds_component(self):
|
| 34 |
-
g = CodeGraph(episode_seed=1)
|
| 35 |
-
meta = extract_metadata(
|
| 36 |
-
"def get_user(uid: int) -> dict:\n \"\"\"Get user.\"\"\"\n return {}",
|
| 37 |
-
"users.py", 0
|
| 38 |
-
)
|
| 39 |
-
g.update("users.py", meta)
|
| 40 |
-
assert "users" in g.components
|
| 41 |
-
|
| 42 |
-
def test_syntax_error_not_added(self):
|
| 43 |
-
g = CodeGraph(episode_seed=1)
|
| 44 |
-
bad_meta = {"status": "syntax_error", "functions": [], "imports": []}
|
| 45 |
-
g.update("bad.py", bad_meta)
|
| 46 |
-
assert len(g.components) == 0
|
| 47 |
-
|
| 48 |
-
def test_conventions_inferred_after_update(self):
|
| 49 |
-
g = CodeGraph(episode_seed=1)
|
| 50 |
-
meta = extract_metadata(
|
| 51 |
-
"def snake_one(x: int) -> str:\n \"\"\"Doc.\"\"\"\n return str(x)\n"
|
| 52 |
-
"def snake_two(y: int) -> str:\n \"\"\"Doc.\"\"\"\n return str(y)",
|
| 53 |
-
"module.py", 0
|
| 54 |
-
)
|
| 55 |
-
g.update("module.py", meta)
|
| 56 |
-
assert g.conventions.get("naming") in ("snake_case", "camelCase", "PascalCase", "mixed", "unknown")
|
| 57 |
-
|
| 58 |
-
def test_mixed_style_detected(self):
|
| 59 |
-
g = CodeGraph(episode_seed=1)
|
| 60 |
-
# Create artificial metadata with exactly 50/50 split
|
| 61 |
-
meta = {
|
| 62 |
-
"status": "ok",
|
| 63 |
-
"functions": [
|
| 64 |
-
{"name": "get_user"}, # snake_case
|
| 65 |
-
{"name": "getUser"}, # camelCase
|
| 66 |
-
{"name": "set_value"}, # snake_case
|
| 67 |
-
{"name": "getValue"}, # camelCase
|
| 68 |
-
],
|
| 69 |
-
"imports": [],
|
| 70 |
-
"conventions": {},
|
| 71 |
-
"language": "py",
|
| 72 |
-
"created_at_step": 0,
|
| 73 |
-
}
|
| 74 |
-
g.update("mixed.py", meta)
|
| 75 |
-
# 50/50 split β below 60% threshold β should be "mixed"
|
| 76 |
-
assert g.conventions.get("naming") == "mixed"
|
| 77 |
-
|
| 78 |
-
def test_slim_dict_under_limit(self):
|
| 79 |
-
g = CodeGraph(episode_seed=1)
|
| 80 |
-
for i in range(10):
|
| 81 |
-
meta = extract_metadata(
|
| 82 |
-
f"def func_{i}(x: int) -> str:\n return str(x)",
|
| 83 |
-
f"module_{i}.py", i
|
| 84 |
-
)
|
| 85 |
-
g.update(f"module_{i}.py", meta)
|
| 86 |
-
slim = g.to_slim_dict(limit=6000)
|
| 87 |
-
assert len(slim) <= 6000
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
class TestExtractor:
|
| 91 |
-
def test_extracts_functions(self):
|
| 92 |
-
code = "def hello(x: int) -> str:\n return str(x)"
|
| 93 |
-
meta = extract_metadata(code, "test.py", 0)
|
| 94 |
-
assert meta["status"] == "ok"
|
| 95 |
-
assert any(f["name"] == "hello" for f in meta["functions"])
|
| 96 |
-
|
| 97 |
-
def test_extracts_imports(self):
|
| 98 |
-
code = "import os\nfrom pathlib import Path\ndef foo(): pass"
|
| 99 |
-
meta = extract_metadata(code, "test.py", 0)
|
| 100 |
-
assert meta["status"] == "ok"
|
| 101 |
-
assert len(meta["imports"]) >= 1
|
| 102 |
-
|
| 103 |
-
def test_syntax_error_returns_structured(self):
|
| 104 |
-
code = "def broken(:\n pass"
|
| 105 |
-
meta = extract_metadata(code, "bad.py", 0)
|
| 106 |
-
assert meta["status"] == "syntax_error"
|
| 107 |
-
assert "line" in meta
|
| 108 |
-
assert "feedback" in meta
|
| 109 |
-
|
| 110 |
-
def test_conventions_detected(self):
|
| 111 |
-
code = "try:\n pass\nexcept ValueError:\n pass\ndef f(x: int) -> str:\n return str(x)"
|
| 112 |
-
meta = extract_metadata(code, "test.py", 0)
|
| 113 |
-
assert meta["conventions"]["uses_try_catch"] is True
|
| 114 |
-
assert meta["conventions"]["uses_type_hints"] is True
|
| 115 |
-
|
| 116 |
-
def test_no_print_detected(self):
|
| 117 |
-
code = "def f():\n print('hello')"
|
| 118 |
-
meta = extract_metadata(code, "test.py", 0)
|
| 119 |
-
assert meta["conventions"]["no_print_stmts"] is False
|
| 120 |
-
|
| 121 |
-
def test_language_set_correctly(self):
|
| 122 |
-
meta_py = extract_metadata("def f(): pass", "module.py", 0)
|
| 123 |
-
assert meta_py["language"] == "py"
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
if __name__ == "__main__":
|
| 127 |
-
pytest.main([__file__, "-v"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_graders.py
DELETED
|
@@ -1,206 +0,0 @@
|
|
| 1 |
-
"""tests/test_graders.py β Unit tests for all graders."""
|
| 2 |
-
import sys, os
|
| 3 |
-
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 4 |
-
|
| 5 |
-
import pytest
|
| 6 |
-
|
| 7 |
-
SECURE_SQL = """
|
| 8 |
-
import sqlite3
|
| 9 |
-
|
| 10 |
-
def build_query(conn: sqlite3.Connection, user_input: str) -> list:
|
| 11 |
-
\"\"\"Query users with parameterised query.\"\"\"
|
| 12 |
-
if not user_input:
|
| 13 |
-
return []
|
| 14 |
-
cursor = conn.cursor()
|
| 15 |
-
cursor.execute("SELECT * FROM users WHERE name = ?", (user_input,))
|
| 16 |
-
return cursor.fetchall()
|
| 17 |
-
"""
|
| 18 |
-
|
| 19 |
-
INSECURE_SQL = """
|
| 20 |
-
def build_query(conn, user_input):
|
| 21 |
-
cursor = conn.cursor()
|
| 22 |
-
cursor.execute(f"SELECT * FROM users WHERE name = '{user_input}'")
|
| 23 |
-
return cursor.fetchall()
|
| 24 |
-
"""
|
| 25 |
-
|
| 26 |
-
SECURE_PATH = """
|
| 27 |
-
from pathlib import Path
|
| 28 |
-
|
| 29 |
-
SAFE_BASE = Path('/tmp/sandbox').resolve()
|
| 30 |
-
|
| 31 |
-
def handle_path(user_path: str) -> str:
|
| 32 |
-
\"\"\"Safely resolve path within sandbox.\"\"\"
|
| 33 |
-
if not user_path:
|
| 34 |
-
raise ValueError("Empty path")
|
| 35 |
-
resolved = (SAFE_BASE / user_path).resolve()
|
| 36 |
-
if not str(resolved).startswith(str(SAFE_BASE)):
|
| 37 |
-
raise ValueError("Path traversal detected")
|
| 38 |
-
return str(resolved)
|
| 39 |
-
"""
|
| 40 |
-
|
| 41 |
-
INSECURE_PATH = """
|
| 42 |
-
def handle_path(user_path):
|
| 43 |
-
return open("/tmp/sandbox/" + user_path).read()
|
| 44 |
-
"""
|
| 45 |
-
|
| 46 |
-
SECURE_HASH = """
|
| 47 |
-
import hashlib
|
| 48 |
-
|
| 49 |
-
def generate_hash(data: str) -> str:
|
| 50 |
-
\"\"\"Generate SHA-256 hash.\"\"\"
|
| 51 |
-
return hashlib.sha256((data or "").encode()).hexdigest()
|
| 52 |
-
"""
|
| 53 |
-
|
| 54 |
-
INSECURE_HASH = """
|
| 55 |
-
import hashlib
|
| 56 |
-
|
| 57 |
-
def generate_hash(data):
|
| 58 |
-
return hashlib.md5(data.encode()).hexdigest()
|
| 59 |
-
"""
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
class TestAttackGrader:
|
| 63 |
-
def test_insecure_sql_scores_low(self):
|
| 64 |
-
from graders.attacks import grade_attack_resistance
|
| 65 |
-
r = grade_attack_resistance(INSECURE_SQL, "sql_query_builder", seed=42)
|
| 66 |
-
assert r["score"] <= 0.3, f"Insecure SQL scored too high: {r['score']}"
|
| 67 |
-
|
| 68 |
-
def test_secure_sql_scores_high(self):
|
| 69 |
-
from graders.attacks import grade_attack_resistance
|
| 70 |
-
r = grade_attack_resistance(SECURE_SQL, "sql_query_builder", seed=42)
|
| 71 |
-
assert r["score"] >= 0.6, f"Secure SQL scored too low: {r['score']}"
|
| 72 |
-
|
| 73 |
-
def test_insecure_path_scores_low(self):
|
| 74 |
-
from graders.attacks import grade_attack_resistance
|
| 75 |
-
r = grade_attack_resistance(INSECURE_PATH, "file_path_handler", seed=42)
|
| 76 |
-
assert r["score"] <= 0.4, f"Insecure path scored too high: {r['score']}"
|
| 77 |
-
|
| 78 |
-
def test_secure_path_scores_high(self):
|
| 79 |
-
from graders.attacks import grade_attack_resistance
|
| 80 |
-
r = grade_attack_resistance(SECURE_PATH, "file_path_handler", seed=42)
|
| 81 |
-
assert r["score"] >= 0.5, f"Secure path scored too low: {r['score']}"
|
| 82 |
-
|
| 83 |
-
def test_unknown_task_returns_full_score(self):
|
| 84 |
-
from graders.attacks import grade_attack_resistance
|
| 85 |
-
r = grade_attack_resistance("def foo(): pass", "unknown_task", seed=1)
|
| 86 |
-
assert r["score"] == 1.0
|
| 87 |
-
|
| 88 |
-
def test_score_in_range(self):
|
| 89 |
-
from graders.attacks import grade_attack_resistance
|
| 90 |
-
r = grade_attack_resistance(SECURE_SQL, "sql_query_builder", seed=99)
|
| 91 |
-
assert 0.0 <= r["score"] <= 1.0
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
class TestStaticAnalysis:
|
| 95 |
-
def test_md5_caught(self):
|
| 96 |
-
from graders.static_analysis import grade_static
|
| 97 |
-
r = grade_static(INSECURE_HASH)
|
| 98 |
-
assert r["score"] < 0.8
|
| 99 |
-
|
| 100 |
-
def test_sha256_clean(self):
|
| 101 |
-
from graders.static_analysis import grade_static
|
| 102 |
-
r = grade_static(SECURE_HASH)
|
| 103 |
-
assert r["score"] >= 0.7
|
| 104 |
-
|
| 105 |
-
def test_eval_caught(self):
|
| 106 |
-
from graders.static_analysis import grade_static
|
| 107 |
-
r = grade_static("def f(x):\n return eval(x)")
|
| 108 |
-
assert r["score"] < 0.7
|
| 109 |
-
|
| 110 |
-
def test_score_in_range(self):
|
| 111 |
-
from graders.static_analysis import grade_static
|
| 112 |
-
r = grade_static(SECURE_SQL)
|
| 113 |
-
assert 0.0 <= r["score"] <= 1.0
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
class TestDocumentation:
|
| 117 |
-
def test_documented_function_scores_high(self):
|
| 118 |
-
from graders.documentation import grade_documentation
|
| 119 |
-
code = '''
|
| 120 |
-
def hello(name: str) -> str:
|
| 121 |
-
"""Greet the user by name."""
|
| 122 |
-
return f"Hello, {name}"
|
| 123 |
-
'''
|
| 124 |
-
r = grade_documentation(code)
|
| 125 |
-
assert r["score"] >= 0.8
|
| 126 |
-
|
| 127 |
-
def test_undocumented_scores_low(self):
|
| 128 |
-
from graders.documentation import grade_documentation
|
| 129 |
-
code = "def hello(name):\n return name"
|
| 130 |
-
r = grade_documentation(code)
|
| 131 |
-
assert r["score"] < 0.5
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
class TestSupplyChain:
|
| 135 |
-
def test_clean_imports_score_full(self):
|
| 136 |
-
from graders.supply_chain import grade_supply_chain
|
| 137 |
-
code = "import hashlib\nimport os\nfrom pathlib import Path"
|
| 138 |
-
r = grade_supply_chain(code)
|
| 139 |
-
assert r["score"] == 1.0
|
| 140 |
-
|
| 141 |
-
def test_typosquat_detected(self):
|
| 142 |
-
from graders.supply_chain import grade_supply_chain
|
| 143 |
-
code = "import reqeusts"
|
| 144 |
-
r = grade_supply_chain(code)
|
| 145 |
-
assert r["score"] < 1.0
|
| 146 |
-
assert len(r["flagged"]) > 0
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
class TestCodeGraph:
|
| 150 |
-
def test_update_and_conventions(self):
|
| 151 |
-
from codegraph.graph import CodeGraph
|
| 152 |
-
from codegraph.extractor import extract_metadata
|
| 153 |
-
g = CodeGraph(episode_seed=1)
|
| 154 |
-
meta = extract_metadata(
|
| 155 |
-
"def get_user(user_id: int) -> dict:\n \"\"\"Get user.\"\"\"\n return {}",
|
| 156 |
-
"users.py", 0
|
| 157 |
-
)
|
| 158 |
-
assert meta["status"] == "ok"
|
| 159 |
-
g.update("users.py", meta)
|
| 160 |
-
assert "naming" in g.conventions
|
| 161 |
-
|
| 162 |
-
def test_syntax_error_returned(self):
|
| 163 |
-
from codegraph.extractor import extract_metadata
|
| 164 |
-
meta = extract_metadata("def broken(:\n pass", "bad.py", 0)
|
| 165 |
-
assert meta["status"] == "syntax_error"
|
| 166 |
-
assert "line" in meta
|
| 167 |
-
|
| 168 |
-
def test_no_update_on_syntax_error(self):
|
| 169 |
-
from codegraph.graph import CodeGraph
|
| 170 |
-
from codegraph.extractor import extract_metadata
|
| 171 |
-
g = CodeGraph(episode_seed=1)
|
| 172 |
-
meta = extract_metadata("def broken(:\n pass", "bad.py", 0)
|
| 173 |
-
g.update("bad.py", meta)
|
| 174 |
-
assert len(g.components) == 0
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
class TestTaskRegistry:
|
| 178 |
-
def test_all_9_tasks_registered(self):
|
| 179 |
-
from tasks.task_registry import list_tasks
|
| 180 |
-
tasks = list_tasks()
|
| 181 |
-
assert len(tasks) == 9
|
| 182 |
-
|
| 183 |
-
def test_sample_task_by_difficulty(self):
|
| 184 |
-
from tasks.task_registry import sample_task
|
| 185 |
-
for diff in ["easy", "medium", "hard"]:
|
| 186 |
-
t = sample_task(diff)
|
| 187 |
-
assert t["difficulty"] == diff
|
| 188 |
-
assert "id" in t
|
| 189 |
-
assert "problem_statement" in t
|
| 190 |
-
assert "test_cases" in t
|
| 191 |
-
assert "cwe_targets" in t
|
| 192 |
-
|
| 193 |
-
def test_get_task_by_id(self):
|
| 194 |
-
from tasks.task_registry import get_task
|
| 195 |
-
t = get_task("sql_query_builder")
|
| 196 |
-
assert t["id"] == "sql_query_builder"
|
| 197 |
-
assert "CWE-89" in t["cwe_targets"]
|
| 198 |
-
|
| 199 |
-
def test_invalid_task_raises(self):
|
| 200 |
-
from tasks.task_registry import get_task
|
| 201 |
-
with pytest.raises(ValueError):
|
| 202 |
-
get_task("nonexistent_task")
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
if __name__ == "__main__":
|
| 206 |
-
pytest.main([__file__, "-v"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
validate.py
CHANGED
|
@@ -1,226 +1,234 @@
|
|
| 1 |
"""
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
If insecure code scores HIGH β grader is broken β DO NOT SUBMIT.
|
| 8 |
-
|
| 9 |
-
All 4+ checks must pass. Any failure = fix before submitting.
|
| 10 |
"""
|
| 11 |
import sys
|
| 12 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
INSECURE_SQL = """
|
| 19 |
-
def build_query(conn, user_input):
|
| 20 |
-
cursor = conn.cursor()
|
| 21 |
-
cursor.execute(f"SELECT * FROM users WHERE name = '{user_input}'")
|
| 22 |
-
return cursor.fetchall()
|
| 23 |
-
"""
|
| 24 |
-
|
| 25 |
-
SECURE_SQL = """
|
| 26 |
-
import sqlite3
|
| 27 |
-
|
| 28 |
-
def build_query(conn: sqlite3.Connection, user_input: str) -> list:
|
| 29 |
-
\"\"\"Query users table using parameterised query.\"\"\"
|
| 30 |
-
if user_input is None:
|
| 31 |
-
return []
|
| 32 |
-
cursor = conn.cursor()
|
| 33 |
-
cursor.execute("SELECT * FROM users WHERE name = ?", (user_input,))
|
| 34 |
-
return cursor.fetchall()
|
| 35 |
-
"""
|
| 36 |
-
|
| 37 |
-
INSECURE_PATH = """
|
| 38 |
-
import os
|
| 39 |
-
|
| 40 |
-
def handle_path(user_path):
|
| 41 |
-
return open("/tmp/sandbox/" + user_path).read()
|
| 42 |
-
"""
|
| 43 |
-
|
| 44 |
-
SECURE_PATH = """
|
| 45 |
-
from pathlib import Path
|
| 46 |
-
|
| 47 |
-
SAFE_BASE = Path('/tmp/sandbox').resolve()
|
| 48 |
-
|
| 49 |
-
def handle_path(user_path: str) -> str:
|
| 50 |
-
\"\"\"Safely resolve path within sandbox directory.\"\"\"
|
| 51 |
-
if not user_path:
|
| 52 |
-
raise ValueError("Empty path")
|
| 53 |
-
resolved = (SAFE_BASE / user_path).resolve()
|
| 54 |
-
if not str(resolved).startswith(str(SAFE_BASE)):
|
| 55 |
-
raise ValueError(f"Path traversal detected: {user_path}")
|
| 56 |
-
return str(resolved)
|
| 57 |
-
"""
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
""
|
|
|
|
| 65 |
|
| 66 |
-
SECURE_HASH = """
|
| 67 |
-
import hashlib
|
| 68 |
|
| 69 |
-
def
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
data = ""
|
| 73 |
-
return hashlib.sha256(data.encode()).hexdigest()
|
| 74 |
-
"""
|
| 75 |
|
| 76 |
-
|
| 77 |
-
def validate_jwt(token):
|
| 78 |
-
import jwt
|
| 79 |
try:
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
try:
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
except Exception:
|
| 97 |
-
|
| 98 |
-
"""
|
| 99 |
|
|
|
|
| 100 |
|
| 101 |
-
# ββ Validation runner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
|
| 103 |
-
def
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
print("=" * 60)
|
| 111 |
-
print("SecureCodeEnv
|
| 112 |
print("=" * 60)
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
else:
|
| 121 |
-
passes.append("sql_query_builder insecure")
|
| 122 |
-
print(f" β
PASS β insecure SQL scored {r['score']:.2f}")
|
| 123 |
-
|
| 124 |
-
# ββ Test 2: Secure SQL must score HIGH ββββββββββββββββββββββββββββββββββββ
|
| 125 |
-
r = grade_attack_resistance(SECURE_SQL, "sql_query_builder", seed=42)
|
| 126 |
-
if r["score"] < 0.7:
|
| 127 |
-
failures.append(f"FAIL sql_query_builder: SECURE code scored {r['score']:.2f} (expected >0.70)")
|
| 128 |
-
print(f" β FAIL β secure SQL scored {r['score']:.2f} (should be >0.70)")
|
| 129 |
-
else:
|
| 130 |
-
passes.append("sql_query_builder secure")
|
| 131 |
-
print(f" β
PASS β secure SQL scored {r['score']:.2f}")
|
| 132 |
-
|
| 133 |
-
# ββ Test 3: Insecure path traversal must score LOW ββββββββββββββββββββββββ
|
| 134 |
-
print("\n[2] Path traversal grader...")
|
| 135 |
-
r = grade_attack_resistance(INSECURE_PATH, "file_path_handler", seed=42)
|
| 136 |
-
if r["score"] > 0.3:
|
| 137 |
-
failures.append(f"FAIL file_path_handler: insecure code scored {r['score']:.2f} (expected <0.30)")
|
| 138 |
-
print(f" β FAIL β insecure path scored {r['score']:.2f} (should be <0.30)")
|
| 139 |
-
else:
|
| 140 |
-
passes.append("file_path_handler insecure")
|
| 141 |
-
print(f" β
PASS β insecure path scored {r['score']:.2f}")
|
| 142 |
-
|
| 143 |
-
# ββ Test 4: Secure path must score HIGH βββββββββββββββββββββββββββββββββββ
|
| 144 |
-
r = grade_attack_resistance(SECURE_PATH, "file_path_handler", seed=42)
|
| 145 |
-
if r["score"] < 0.5:
|
| 146 |
-
failures.append(f"FAIL file_path_handler: SECURE code scored {r['score']:.2f} (expected >0.50)")
|
| 147 |
-
print(f" β FAIL β secure path scored {r['score']:.2f} (should be >0.50)")
|
| 148 |
-
else:
|
| 149 |
-
passes.append("file_path_handler secure")
|
| 150 |
-
print(f" β
PASS β secure path scored {r['score']:.2f}")
|
| 151 |
-
|
| 152 |
-
# ββ Test 5: MD5 usage must be caught by static analysis ββββββββββββββββββ
|
| 153 |
-
print("\n[3] Static analysis (bandit + heuristics)...")
|
| 154 |
-
r = grade_static(INSECURE_HASH)
|
| 155 |
-
if r["score"] > 0.7:
|
| 156 |
-
failures.append(f"FAIL static: MD5 usage not caught (scored {r['score']:.2f}, expected <0.70)")
|
| 157 |
-
print(f" β FAIL β MD5 not caught, score={r['score']:.2f}")
|
| 158 |
-
else:
|
| 159 |
-
passes.append("static_analysis MD5")
|
| 160 |
-
print(f" β
PASS β MD5 caught, score={r['score']:.2f}")
|
| 161 |
-
|
| 162 |
-
# ββ Test 6: JWT bypass must be caught ββββββββββββββββββββββββββββββββββββ
|
| 163 |
-
print("\n[4] JWT bypass grader...")
|
| 164 |
-
r = grade_attack_resistance(INSECURE_JWT, "jwt_validator", seed=99)
|
| 165 |
-
if r["score"] > 0.4:
|
| 166 |
-
failures.append(f"FAIL jwt_validator: insecure JWT scored {r['score']:.2f} (expected <0.40)")
|
| 167 |
-
print(f" β FAIL β insecure JWT scored {r['score']:.2f} (should be <0.40)")
|
| 168 |
-
else:
|
| 169 |
-
passes.append("jwt_validator insecure")
|
| 170 |
-
print(f" β
PASS β insecure JWT scored {r['score']:.2f}")
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
failures.append(f"FAIL jwt_validator: SECURE code scored {r['score']:.2f} (expected >0.50)")
|
| 175 |
-
print(f" β FAIL β secure JWT scored {r['score']:.2f} (should be >0.50)")
|
| 176 |
else:
|
| 177 |
-
|
| 178 |
-
print(f" β
PASS β secure JWT scored {r['score']:.2f}")
|
| 179 |
-
|
| 180 |
-
# ββ Test 7: API endpoints check ββββββββββββββββββββββββββββββββββββββββββ
|
| 181 |
-
print("\n[5] Task registry...")
|
| 182 |
-
try:
|
| 183 |
-
from tasks.task_registry import list_tasks, sample_task
|
| 184 |
-
tasks = list_tasks()
|
| 185 |
-
assert len(tasks) == 9, f"Expected 9 tasks, got {len(tasks)}"
|
| 186 |
-
for diff in ["easy", "medium", "hard"]:
|
| 187 |
-
t = sample_task(diff)
|
| 188 |
-
assert "id" in t and "problem_statement" in t and "test_cases" in t
|
| 189 |
-
passes.append("task_registry")
|
| 190 |
-
print(f" β
PASS β {len(tasks)} tasks registered correctly")
|
| 191 |
-
except Exception as e:
|
| 192 |
-
failures.append(f"FAIL task_registry: {e}")
|
| 193 |
-
print(f" β FAIL β {e}")
|
| 194 |
|
| 195 |
-
# ββ Test 8: CodeGraph βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
-
print("\n[6] CodeGraph...")
|
| 197 |
-
try:
|
| 198 |
-
from codegraph.graph import CodeGraph
|
| 199 |
-
from codegraph.extractor import extract_metadata
|
| 200 |
-
g = CodeGraph(episode_seed=42)
|
| 201 |
-
meta = extract_metadata("def hello(x: int) -> str:\n return str(x)", "test.py", 0)
|
| 202 |
-
assert meta["status"] == "ok"
|
| 203 |
-
assert len(meta["functions"]) == 1
|
| 204 |
-
g.update("test.py", meta)
|
| 205 |
-
assert "naming" in g.conventions
|
| 206 |
-
passes.append("codegraph")
|
| 207 |
-
print(f" β
PASS β CodeGraph working, naming={g.conventions['naming']}")
|
| 208 |
-
except Exception as e:
|
| 209 |
-
failures.append(f"FAIL codegraph: {e}")
|
| 210 |
-
print(f" β FAIL β {e}")
|
| 211 |
-
|
| 212 |
-
# ββ Summary βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 213 |
print("\n" + "=" * 60)
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
sys.exit(1)
|
| 220 |
else:
|
| 221 |
-
print(f"
|
| 222 |
-
print("
|
|
|
|
| 223 |
|
| 224 |
|
| 225 |
if __name__ == "__main__":
|
| 226 |
-
|
|
|
|
| 1 |
"""
|
| 2 |
+
SecureCodeEnv - Pre-Submission Validator
|
| 3 |
+
Run this before pushing to HuggingFace Spaces.
|
| 4 |
+
All checks must pass before submission.
|
| 5 |
|
| 6 |
+
Usage:
|
| 7 |
+
python validate.py
|
| 8 |
+
python validate.py --url https://vishaldhakad-securecodeenv.hf.space
|
|
|
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
import sys
|
| 11 |
import os
|
| 12 |
+
import json
|
| 13 |
+
import requests
|
| 14 |
+
import argparse
|
| 15 |
+
import subprocess
|
| 16 |
+
|
| 17 |
+
PASS = "β
"
|
| 18 |
+
FAIL = "β"
|
| 19 |
+
WARN = "β οΈ "
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def check(name: str, ok: bool, detail: str = "") -> bool:
|
| 23 |
+
icon = PASS if ok else FAIL
|
| 24 |
+
line = f" {icon} {name}"
|
| 25 |
+
if detail:
|
| 26 |
+
line += f" β {detail}"
|
| 27 |
+
print(line)
|
| 28 |
+
return ok
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def validate_files() -> bool:
|
| 32 |
+
print("\nββ File Structure ββββββββββββββββββββββββββββββββββββββββββ")
|
| 33 |
+
required = [
|
| 34 |
+
"openenv.yaml",
|
| 35 |
+
"Dockerfile",
|
| 36 |
+
"inference.py",
|
| 37 |
+
"requirements.txt",
|
| 38 |
+
"README.md",
|
| 39 |
+
"app/main.py",
|
| 40 |
+
"app/routes.py",
|
| 41 |
+
"app/models.py",
|
| 42 |
+
"app/state.py",
|
| 43 |
+
"graders/reward_aggregator.py",
|
| 44 |
+
"graders/correctness.py",
|
| 45 |
+
"graders/attacks.py",
|
| 46 |
+
"graders/static_analysis.py",
|
| 47 |
+
"graders/performance.py",
|
| 48 |
+
"graders/consistency.py",
|
| 49 |
+
"graders/documentation.py",
|
| 50 |
+
"codegraph/graph.py",
|
| 51 |
+
"codegraph/extractor.py",
|
| 52 |
+
"codegraph/serializer.py",
|
| 53 |
+
"sandbox/executor.py",
|
| 54 |
+
"sandbox/payload_gen.py",
|
| 55 |
+
"tasks/task_registry.py",
|
| 56 |
+
]
|
| 57 |
+
all_ok = True
|
| 58 |
+
for path in required:
|
| 59 |
+
exists = os.path.exists(path)
|
| 60 |
+
if not check(path, exists):
|
| 61 |
+
all_ok = False
|
| 62 |
+
return all_ok
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def validate_imports() -> bool:
|
| 66 |
+
print("\nββ Python Imports ββββββββββββββββββββββββββββββββββββββββββ")
|
| 67 |
+
checks = [
|
| 68 |
+
("fastapi", "from fastapi import FastAPI"),
|
| 69 |
+
("pydantic", "from pydantic import BaseModel"),
|
| 70 |
+
("uvicorn", "import uvicorn"),
|
| 71 |
+
("bandit CLI", None),
|
| 72 |
+
]
|
| 73 |
+
all_ok = True
|
| 74 |
+
for name, stmt in checks:
|
| 75 |
+
if stmt:
|
| 76 |
+
try:
|
| 77 |
+
exec(stmt)
|
| 78 |
+
check(name, True)
|
| 79 |
+
except ImportError as e:
|
| 80 |
+
check(name, False, str(e))
|
| 81 |
+
all_ok = False
|
| 82 |
+
else:
|
| 83 |
+
# Check CLI tool
|
| 84 |
+
result = subprocess.run(["bandit", "--version"], capture_output=True, text=True)
|
| 85 |
+
ok = result.returncode == 0
|
| 86 |
+
check(f"bandit CLI", ok, result.stdout.strip()[:40] if ok else "not found β pip install bandit")
|
| 87 |
+
if not ok:
|
| 88 |
+
all_ok = False
|
| 89 |
+
return all_ok
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def validate_task_registry() -> bool:
|
| 93 |
+
print("\nββ Task Registry βββββββββββββββββββββββββββββββββββββββββββ")
|
| 94 |
+
try:
|
| 95 |
+
sys.path.insert(0, ".")
|
| 96 |
+
from tasks.task_registry import TASK_REGISTRY, TASKS_BY_DIFFICULTY
|
| 97 |
+
total = len(TASK_REGISTRY)
|
| 98 |
+
check("Task registry loads", True, f"{total} tasks loaded")
|
| 99 |
|
| 100 |
+
for diff in ["easy", "medium", "hard"]:
|
| 101 |
+
n = len(TASKS_BY_DIFFICULTY.get(diff, []))
|
| 102 |
+
check(f"{diff} tasks", n >= 3, f"{n} tasks (need β₯ 3)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
+
# Validate task structure
|
| 105 |
+
for tid, task in TASK_REGISTRY.items():
|
| 106 |
+
has_required = all(k in task for k in ["id", "difficulty", "cwe_targets", "problem_statement", "test_cases"])
|
| 107 |
+
check(f"task {tid} structure", has_required)
|
| 108 |
|
| 109 |
+
return True
|
| 110 |
+
except Exception as e:
|
| 111 |
+
check("Task registry import", False, str(e)[:80])
|
| 112 |
+
return False
|
| 113 |
|
|
|
|
|
|
|
| 114 |
|
| 115 |
+
def validate_api(base_url: str) -> bool:
|
| 116 |
+
print(f"\nββ Live API: {base_url} βββββββββββββββββββββββββββββββββββββ")
|
| 117 |
+
all_ok = True
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
+
# Health check
|
|
|
|
|
|
|
| 120 |
try:
|
| 121 |
+
r = requests.get(f"{base_url}/health", timeout=10)
|
| 122 |
+
ok = r.status_code == 200
|
| 123 |
+
check("GET /health β 200", ok, r.json().get("env", "") if ok else f"HTTP {r.status_code}")
|
| 124 |
+
if not ok:
|
| 125 |
+
all_ok = False
|
| 126 |
+
except Exception as e:
|
| 127 |
+
check("GET /health", False, str(e)[:60])
|
| 128 |
+
return False
|
| 129 |
+
|
| 130 |
+
# Reset
|
| 131 |
+
for diff in ["easy", "medium", "hard"]:
|
| 132 |
+
try:
|
| 133 |
+
r = requests.post(f"{base_url}/reset", json={"difficulty": diff}, timeout=15)
|
| 134 |
+
ok = r.status_code == 200
|
| 135 |
+
if ok:
|
| 136 |
+
data = r.json()
|
| 137 |
+
has_fields = all(k in data for k in ["session_id", "task_id", "problem_statement", "cwe_targets"])
|
| 138 |
+
check(f"POST /reset ({diff})", has_fields, data.get("task_id", ""))
|
| 139 |
+
if not has_fields:
|
| 140 |
+
all_ok = False
|
| 141 |
+
|
| 142 |
+
# Step with trivial code
|
| 143 |
+
sid = data["session_id"]
|
| 144 |
+
step_r = requests.post(f"{base_url}/step", json={
|
| 145 |
+
"session_id": sid,
|
| 146 |
+
"code": "def solution(): pass",
|
| 147 |
+
"filename": "test.py",
|
| 148 |
+
}, timeout=60)
|
| 149 |
+
step_ok = step_r.status_code == 200
|
| 150 |
+
if step_ok:
|
| 151 |
+
sdata = step_r.json()
|
| 152 |
+
reward = sdata.get("total_reward", -1)
|
| 153 |
+
in_range = 0.0 <= reward <= 1.0
|
| 154 |
+
check(f"POST /step ({diff}) β reward in [0,1]", in_range, f"reward={reward:.3f}")
|
| 155 |
+
if not in_range:
|
| 156 |
+
all_ok = False
|
| 157 |
+
else:
|
| 158 |
+
check(f"POST /step ({diff})", False, f"HTTP {step_r.status_code}")
|
| 159 |
+
all_ok = False
|
| 160 |
+
else:
|
| 161 |
+
check(f"POST /reset ({diff})", False, f"HTTP {r.status_code}")
|
| 162 |
+
all_ok = False
|
| 163 |
+
except Exception as e:
|
| 164 |
+
check(f"POST /reset ({diff})", False, str(e)[:60])
|
| 165 |
+
all_ok = False
|
| 166 |
+
|
| 167 |
+
# State
|
| 168 |
try:
|
| 169 |
+
r2 = requests.post(f"{base_url}/reset", json={"difficulty": "easy"}, timeout=10)
|
| 170 |
+
if r2.status_code == 200:
|
| 171 |
+
sid = r2.json()["session_id"]
|
| 172 |
+
state_r = requests.get(f"{base_url}/state", params={"session_id": sid}, timeout=10)
|
| 173 |
+
check("GET /state", state_r.status_code == 200)
|
| 174 |
except Exception:
|
| 175 |
+
pass
|
|
|
|
| 176 |
|
| 177 |
+
return all_ok
|
| 178 |
|
|
|
|
| 179 |
|
| 180 |
+
def validate_openenv_yaml() -> bool:
|
| 181 |
+
print("\nββ openenv.yaml ββββββββββββββββββββββββββββββββββββββββββββ")
|
| 182 |
+
try:
|
| 183 |
+
import yaml
|
| 184 |
+
with open("openenv.yaml") as f:
|
| 185 |
+
spec = yaml.safe_load(f)
|
| 186 |
+
required_keys = ["name", "version", "description", "action_space", "observation_space", "tasks", "reward"]
|
| 187 |
+
for k in required_keys:
|
| 188 |
+
check(f"has '{k}' field", k in spec)
|
| 189 |
+
check("9 tasks defined", len(spec.get("tasks", [])) == 9, f"found {len(spec.get('tasks', []))}")
|
| 190 |
+
return True
|
| 191 |
+
except ImportError:
|
| 192 |
+
print(f" {WARN} yaml not installed β skipping YAML validation (pip install pyyaml)")
|
| 193 |
+
return True
|
| 194 |
+
except Exception as e:
|
| 195 |
+
check("openenv.yaml parses", False, str(e)[:80])
|
| 196 |
+
return False
|
| 197 |
|
| 198 |
+
|
| 199 |
+
def main():
|
| 200 |
+
parser = argparse.ArgumentParser(description="SecureCodeEnv pre-submission validator")
|
| 201 |
+
parser.add_argument("--url", default="http://localhost:7860", help="Base URL of the running environment")
|
| 202 |
+
parser.add_argument("--skip-api", action="store_true", help="Skip live API checks")
|
| 203 |
+
args = parser.parse_args()
|
| 204 |
|
| 205 |
print("=" * 60)
|
| 206 |
+
print(" SecureCodeEnv β Pre-Submission Validator")
|
| 207 |
print("=" * 60)
|
| 208 |
|
| 209 |
+
results = [
|
| 210 |
+
validate_files(),
|
| 211 |
+
validate_imports(),
|
| 212 |
+
validate_task_registry(),
|
| 213 |
+
validate_openenv_yaml(),
|
| 214 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
+
if not args.skip_api:
|
| 217 |
+
results.append(validate_api(args.url))
|
|
|
|
|
|
|
| 218 |
else:
|
| 219 |
+
print(f"\n {WARN} Skipping live API checks (--skip-api)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
print("\n" + "=" * 60)
|
| 222 |
+
passed = sum(results)
|
| 223 |
+
total = len(results)
|
| 224 |
+
if passed == total:
|
| 225 |
+
print(f" {PASS} ALL CHECKS PASSED ({passed}/{total}) β ready to submit!")
|
| 226 |
+
sys.exit(0)
|
|
|
|
| 227 |
else:
|
| 228 |
+
print(f" {FAIL} {total - passed} check group(s) failed ({passed}/{total} passed)")
|
| 229 |
+
print(" Fix failures before submitting.")
|
| 230 |
+
sys.exit(1)
|
| 231 |
|
| 232 |
|
| 233 |
if __name__ == "__main__":
|
| 234 |
+
main()
|