Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +81 -0
- README.md +269 -5
- __init__.py +17 -0
- client.py +94 -0
- inference.py +264 -0
- models.py +304 -0
- openenv.yaml +6 -0
- openenv_python_env.egg-info/PKG-INFO +11 -0
- openenv_python_env.egg-info/SOURCES.txt +27 -0
- openenv_python_env.egg-info/dependency_links.txt +1 -0
- openenv_python_env.egg-info/entry_points.txt +2 -0
- openenv_python_env.egg-info/requires.txt +7 -0
- openenv_python_env.egg-info/top_level.txt +1 -0
- pyproject.toml +50 -0
- rollout.py +71 -0
- server/__init__.py +11 -0
- server/app.py +148 -0
- server/data/snippets_easy.json +238 -0
- server/data/snippets_hard.json +214 -0
- server/data/snippets_medium.json +191 -0
- server/grading.py +465 -0
- server/python_env_environment.py +500 -0
- server/requirements.txt +5 -0
- server/review_runtime.py +418 -0
- server/task_bank.py +83 -0
- tests/test_env.py +157 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
# - In-repo environments (with local OpenEnv sources)
# - Standalone environments (with openenv from PyPI/Git)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# git is required when dependencies are pulled from VCS URLs.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Distinguish standalone vs in-repo builds (set by the build script).
ARG BUILD_MODE=in-repo
ARG ENV_NAME=python_env

# The environment code always sits at the root of the build context.
COPY . /app/env

# In-repo builds already vendor openenv in the context; standalone builds
# install it through pyproject.toml.
WORKDIR /app/env

# Some base images ship without uv; bootstrap it when missing.
RUN if ! command -v uv >/dev/null 2>&1; then \
    curl -LsSf https://astral.sh/uv/install.sh | sh && \
    mv /root/.local/bin/uv /usr/local/bin/uv && \
    mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Resolve dependencies with uv, honouring the lockfile when present.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
    uv sync --frozen --no-install-project --no-editable; \
    else \
    uv sync --no-install-project --no-editable; \
    fi

RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
    uv sync --frozen --no-editable; \
    else \
    uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Bring over the prepared virtual environment and the environment sources.
COPY --from=builder /app/env/.venv /app/.venv
COPY --from=builder /app/env /app/env

# Use the virtual environment's interpreter and make the env importable.
ENV PATH="/app/.venv/bin:$PATH"
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# Container health probe against the FastAPI /health endpoint.
# NOTE(review): assumes curl exists in the runtime image — confirm for the chosen base.
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Launch the FastAPI server; cwd must be /app/env so `server.app` resolves.
ENV ENABLE_WEB_INTERFACE=true
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,274 @@
|
|
| 1 |
---
|
| 2 |
-
title: Python Env
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Python Env Environment Server
|
| 3 |
+
emoji: 🎶
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: red
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# Python Code Review Environment
|
| 15 |
+
|
| 16 |
+
This repository now hosts a deterministic OpenEnv benchmark for Python code review. Agents review snippets one step at a time and receive dense rewards, `done` flags, precision/recall/F1 metrics, and task-specific grading suitable for RL training or evaluation.
|
| 17 |
+
|
| 18 |
+
Active task families:
|
| 19 |
+
|
| 20 |
+
- `task_easy`: style and convention review
|
| 21 |
+
- `task_medium`: logic bug detection
|
| 22 |
+
- `task_hard`: security vulnerability audit
|
| 23 |
+
|
| 24 |
+
Use the action schema in `models.py`, the runtime in `server/python_env_environment.py`, and the rollout loop in `inference.py` / `rollout.py` as the current source of truth. The remaining template sections below are legacy scaffolding and may lag behind the benchmark implementation.
|
| 25 |
+
|
| 26 |
+
## Quick Start
|
| 27 |
+
|
| 28 |
+
The simplest way to use the Python Env environment is through the `PythonEnv` class:
|
| 29 |
+
|
| 30 |
+
```python
|
| 31 |
+
from python_env import PythonAction, PythonEnv
|
| 32 |
+
|
| 33 |
+
try:
|
| 34 |
+
# Create environment from Docker image
|
| 35 |
+
python_envenv = PythonEnv.from_docker_image("python_env-env:latest")
|
| 36 |
+
|
| 37 |
+
# Reset
|
| 38 |
+
result = python_envenv.reset()
|
| 39 |
+
print(f"Reset: {result.observation.echoed_message}")
|
| 40 |
+
|
| 41 |
+
# Send multiple messages
|
| 42 |
+
messages = ["Hello, World!", "Testing echo", "Final message"]
|
| 43 |
+
|
| 44 |
+
for msg in messages:
|
| 45 |
+
result = python_envenv.step(PythonAction(message=msg))
|
| 46 |
+
print(f"Sent: '{msg}'")
|
| 47 |
+
print(f" → Echoed: '{result.observation.echoed_message}'")
|
| 48 |
+
print(f" → Length: {result.observation.message_length}")
|
| 49 |
+
print(f" → Reward: {result.reward}")
|
| 50 |
+
|
| 51 |
+
finally:
|
| 52 |
+
# Always clean up
|
| 53 |
+
python_envenv.close()
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
That's it! The `PythonEnv.from_docker_image()` method handles:
|
| 57 |
+
- Starting the Docker container
|
| 58 |
+
- Waiting for the server to be ready
|
| 59 |
+
- Connecting to the environment
|
| 60 |
+
- Container cleanup when you call `close()`
|
| 61 |
+
|
| 62 |
+
## Building the Docker Image
|
| 63 |
+
|
| 64 |
+
Before using the environment, you need to build the Docker image:
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
# From project root
|
| 68 |
+
docker build -t python_env-env:latest -f server/Dockerfile .
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
## Deploying to Hugging Face Spaces
|
| 72 |
+
|
| 73 |
+
You can easily deploy your OpenEnv environment to Hugging Face Spaces using the `openenv push` command:
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
# From the environment directory (where openenv.yaml is located)
|
| 77 |
+
openenv push
|
| 78 |
+
|
| 79 |
+
# Or specify options
|
| 80 |
+
openenv push --namespace my-org --private
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
The `openenv push` command will:
|
| 84 |
+
1. Validate that the directory is an OpenEnv environment (checks for `openenv.yaml`)
|
| 85 |
+
2. Prepare a custom build for Hugging Face Docker space (enables web interface)
|
| 86 |
+
3. Upload to Hugging Face (ensuring you're logged in)
|
| 87 |
+
|
| 88 |
+
### Prerequisites
|
| 89 |
+
|
| 90 |
+
- Authenticate with Hugging Face: The command will prompt for login if not already authenticated
|
| 91 |
+
|
| 92 |
+
### Options
|
| 93 |
+
|
| 94 |
+
- `--directory`, `-d`: Directory containing the OpenEnv environment (defaults to current directory)
|
| 95 |
+
- `--repo-id`, `-r`: Repository ID in format 'username/repo-name' (defaults to 'username/env-name' from openenv.yaml)
|
| 96 |
+
- `--base-image`, `-b`: Base Docker image to use (overrides Dockerfile FROM)
|
| 97 |
+
- `--private`: Deploy the space as private (default: public)
|
| 98 |
+
|
| 99 |
+
### Examples
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
# Push to your personal namespace (defaults to username/env-name from openenv.yaml)
|
| 103 |
+
openenv push
|
| 104 |
+
|
| 105 |
+
# Push to a specific repository
|
| 106 |
+
openenv push --repo-id my-org/my-env
|
| 107 |
+
|
| 108 |
+
# Push with a custom base image
|
| 109 |
+
openenv push --base-image ghcr.io/meta-pytorch/openenv-base:latest
|
| 110 |
+
|
| 111 |
+
# Push as a private space
|
| 112 |
+
openenv push --private
|
| 113 |
+
|
| 114 |
+
# Combine options
|
| 115 |
+
openenv push --repo-id my-org/my-env --base-image custom-base:latest --private
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
After deployment, your space will be available at:
|
| 119 |
+
`https://huggingface.co/spaces/<repo-id>`
|
| 120 |
+
|
| 121 |
+
The deployed space includes:
|
| 122 |
+
- **Web Interface** at `/web` - Interactive UI for exploring the environment
|
| 123 |
+
- **API Documentation** at `/docs` - Full OpenAPI/Swagger interface
|
| 124 |
+
- **Health Check** at `/health` - Container health monitoring
|
| 125 |
+
- **WebSocket** at `/ws` - Persistent session endpoint for low-latency interactions
|
| 126 |
+
|
| 127 |
+
## Environment Details
|
| 128 |
+
|
| 129 |
+
### Action
|
| 130 |
+
**PythonAction**: Contains a single field
|
| 131 |
+
- `message` (str) - The message to echo back
|
| 132 |
+
|
| 133 |
+
### Observation
|
| 134 |
+
**PythonObservation**: Contains the echo response and metadata
|
| 135 |
+
- `echoed_message` (str) - The message echoed back
|
| 136 |
+
- `message_length` (int) - Length of the message
|
| 137 |
+
- `reward` (float) - Reward based on message length (length × 0.1)
|
| 138 |
+
- `done` (bool) - Always False for echo environment
|
| 139 |
+
- `metadata` (dict) - Additional info like step count
|
| 140 |
+
|
| 141 |
+
### Reward
|
| 142 |
+
The reward is calculated as: `message_length × 0.1`
|
| 143 |
+
- "Hi" → reward: 0.2
|
| 144 |
+
- "Hello, World!" → reward: 1.3
|
| 145 |
+
- Empty message → reward: 0.0
|
| 146 |
+
|
| 147 |
+
## Advanced Usage
|
| 148 |
+
|
| 149 |
+
### Connecting to an Existing Server
|
| 150 |
+
|
| 151 |
+
If you already have a Python Env environment server running, you can connect directly:
|
| 152 |
+
|
| 153 |
+
```python
|
| 154 |
+
from python_env import PythonEnv
|
| 155 |
+
|
| 156 |
+
# Connect to existing server
|
| 157 |
+
python_envenv = PythonEnv(base_url="<ENV_HTTP_URL_HERE>")
|
| 158 |
+
|
| 159 |
+
# Use as normal
|
| 160 |
+
result = python_envenv.reset()
|
| 161 |
+
result = python_envenv.step(PythonAction(message="Hello!"))
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
Note: When connecting to an existing server, `python_envenv.close()` will NOT stop the server.
|
| 165 |
+
|
| 166 |
+
### Using the Context Manager
|
| 167 |
+
|
| 168 |
+
The client supports context manager usage for automatic connection management:
|
| 169 |
+
|
| 170 |
+
```python
|
| 171 |
+
from python_env import PythonAction, PythonEnv
|
| 172 |
+
|
| 173 |
+
# Connect with context manager (auto-connects and closes)
|
| 174 |
+
with PythonEnv(base_url="http://localhost:8000") as env:
|
| 175 |
+
result = env.reset()
|
| 176 |
+
print(f"Reset: {result.observation.echoed_message}")
|
| 177 |
+
# Multiple steps with low latency
|
| 178 |
+
for msg in ["Hello", "World", "!"]:
|
| 179 |
+
result = env.step(PythonAction(message=msg))
|
| 180 |
+
print(f"Echoed: {result.observation.echoed_message}")
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
The client uses WebSocket connections for:
|
| 184 |
+
- **Lower latency**: No HTTP connection overhead per request
|
| 185 |
+
- **Persistent session**: Server maintains your environment state
|
| 186 |
+
- **Efficient for episodes**: Better for many sequential steps
|
| 187 |
+
|
| 188 |
+
### Concurrent WebSocket Sessions
|
| 189 |
+
|
| 190 |
+
The server supports multiple concurrent WebSocket connections. To enable this,
|
| 191 |
+
modify `server/app.py` to use factory mode:
|
| 192 |
+
|
| 193 |
+
```python
|
| 194 |
+
# In server/app.py - use factory mode for concurrent sessions
|
| 195 |
+
app = create_app(
|
| 196 |
+
PythonEnvironment, # Pass class, not instance
|
| 197 |
+
PythonAction,
|
| 198 |
+
PythonObservation,
|
| 199 |
+
max_concurrent_envs=4, # Allow 4 concurrent sessions
|
| 200 |
+
)
|
| 201 |
+
```
|
| 202 |
+
|
| 203 |
+
Then multiple clients can connect simultaneously:
|
| 204 |
+
|
| 205 |
+
```python
|
| 206 |
+
from python_env import PythonAction, PythonEnv
|
| 207 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 208 |
+
|
| 209 |
+
def run_episode(client_id: int):
|
| 210 |
+
with PythonEnv(base_url="http://localhost:8000") as env:
|
| 211 |
+
result = env.reset()
|
| 212 |
+
for i in range(10):
|
| 213 |
+
result = env.step(PythonAction(message=f"Client {client_id}, step {i}"))
|
| 214 |
+
return client_id, result.observation.message_length
|
| 215 |
+
|
| 216 |
+
# Run 4 episodes concurrently
|
| 217 |
+
with ThreadPoolExecutor(max_workers=4) as executor:
|
| 218 |
+
results = list(executor.map(run_episode, range(4)))
|
| 219 |
+
```
|
| 220 |
+
|
| 221 |
+
## Development & Testing
|
| 222 |
+
|
| 223 |
+
### Direct Environment Testing
|
| 224 |
+
|
| 225 |
+
Test the environment logic directly without starting the HTTP server:
|
| 226 |
+
|
| 227 |
+
```bash
|
| 228 |
+
# From the server directory
|
| 229 |
+
python3 server/python_env_environment.py
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
This verifies that:
|
| 233 |
+
- Environment resets correctly
|
| 234 |
+
- Step executes actions properly
|
| 235 |
+
- State tracking works
|
| 236 |
+
- Rewards are calculated correctly
|
| 237 |
+
|
| 238 |
+
### Running Locally
|
| 239 |
+
|
| 240 |
+
Run the server locally for development:
|
| 241 |
+
|
| 242 |
+
```bash
|
| 243 |
+
uvicorn server.app:app --reload
|
| 244 |
+
```
|
| 245 |
+
|
| 246 |
+
## Project Structure
|
| 247 |
+
|
| 248 |
+
```
|
| 249 |
+
python_env/
|
| 250 |
+
├── .dockerignore # Docker build exclusions
|
| 251 |
+
├── __init__.py # Module exports
|
| 252 |
+
├── README.md # This file
|
| 253 |
+
├── openenv.yaml # OpenEnv manifest
|
| 254 |
+
├── pyproject.toml # Project metadata and dependencies
|
| 255 |
+
├── uv.lock # Locked dependencies (generated)
|
| 256 |
+
├── client.py # PythonEnv client
|
| 257 |
+
├── models.py # Action and Observation models
|
| 258 |
+
└── server/
|
| 259 |
+
├── __init__.py # Server module exports
|
| 260 |
+
├── python_env_environment.py # Core environment logic
|
| 261 |
+
├── app.py # FastAPI application (HTTP + WebSocket endpoints)
|
| 262 |
+
└── Dockerfile # Container image definition
|
| 263 |
+
```
|
| 264 |
+
---------------------------------------
|
| 265 |
+
|
| 266 |
+
cd /path/to/python_env
|
| 267 |
+
# Edit your environment implementation in server/python_env_environment.py
|
| 268 |
+
# Edit your models in models.py
|
| 269 |
+
# Install dependencies: uv sync
|
| 270 |
+
|
| 271 |
+
# To integrate into OpenEnv repo:
|
| 272 |
+
# 1. Copy this directory to <repo_root>/envs/python_env_env
|
| 273 |
+
# 2. Build from repo root: docker build -t python_env_env:latest -f envs/python_env_env/server/Dockerfile .
|
| 274 |
+
# 3. Run your image: docker run -p 8000:8000 python_env_env:latest
|
__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Python code-review benchmark environment."""
|
| 8 |
+
|
| 9 |
+
from .client import PythonEnv
|
| 10 |
+
from .models import PythonAction, PythonObservation, PythonState
|
| 11 |
+
|
| 12 |
+
__all__ = [
|
| 13 |
+
"PythonAction",
|
| 14 |
+
"PythonObservation",
|
| 15 |
+
"PythonState",
|
| 16 |
+
"PythonEnv",
|
| 17 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Python Env Environment Client."""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from typing import Any, Dict
|
| 12 |
+
from urllib.parse import urlparse
|
| 13 |
+
|
| 14 |
+
import httpx
|
| 15 |
+
|
| 16 |
+
from openenv.core import EnvClient
|
| 17 |
+
from openenv.core.client_types import StepResult
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from .models import (
|
| 21 |
+
HealthResponse,
|
| 22 |
+
MetricsResponse,
|
| 23 |
+
PythonAction,
|
| 24 |
+
PythonObservation,
|
| 25 |
+
PythonState,
|
| 26 |
+
TaskListResponse,
|
| 27 |
+
)
|
| 28 |
+
except ImportError:
|
| 29 |
+
from models import ( # type: ignore
|
| 30 |
+
HealthResponse,
|
| 31 |
+
MetricsResponse,
|
| 32 |
+
PythonAction,
|
| 33 |
+
PythonObservation,
|
| 34 |
+
PythonState,
|
| 35 |
+
TaskListResponse,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _to_http_base_url(base_url: str) -> str:
|
| 40 |
+
parsed = urlparse(base_url)
|
| 41 |
+
scheme = "https" if parsed.scheme == "wss" else "http"
|
| 42 |
+
if parsed.scheme in {"http", "https"}:
|
| 43 |
+
scheme = parsed.scheme
|
| 44 |
+
return f"{scheme}://{parsed.netloc}{parsed.path}".rstrip("/")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class PythonEnv(EnvClient[PythonAction, PythonObservation, PythonState]):
    """Typed client for the Python code-review environment."""

    def __init__(self, base_url: str, **kwargs: Any):
        super().__init__(base_url=base_url, **kwargs)
        # Auxiliary REST endpoints (tasks/metrics/health) are served over
        # plain HTTP even when the main session uses a websocket URL.
        self._http_base_url = _to_http_base_url(base_url)

    def _step_payload(self, action: PythonAction) -> Dict[str, Any]:
        """Serialize a validated action into the JSON body the server expects."""
        return action.model_dump(exclude_none=True)

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[PythonObservation]:
        """Turn a raw server response into a typed step result."""
        raw_obs = dict(payload.get("observation", {}))
        # Mirror the top-level done/reward into the observation when absent,
        # so the observation model is self-contained.
        raw_obs.setdefault("done", payload.get("done", False))
        raw_obs.setdefault("reward", payload.get("reward"))
        return StepResult(
            observation=PythonObservation.model_validate(raw_obs),
            reward=payload.get("reward"),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: Dict[str, Any]) -> PythonState:
        """Validate the server state payload into the shared state model."""
        return PythonState.model_validate(payload)

    async def _fetch_json(self, endpoint: str) -> Any:
        """GET an auxiliary endpoint and return its decoded JSON body."""
        async with httpx.AsyncClient() as http:
            response = await http.get(f"{self._http_base_url}{endpoint}")
            response.raise_for_status()
            return response.json()

    async def get_tasks(self) -> TaskListResponse:
        """Fetch the task catalog exposed by the server."""
        return TaskListResponse.model_validate(await self._fetch_json("/tasks"))

    async def get_metrics(self) -> MetricsResponse:
        """Fetch the server's current grading metrics."""
        return MetricsResponse.model_validate(await self._fetch_json("/metrics"))

    async def get_health(self) -> HealthResponse:
        """Fetch the server health status."""
        return HealthResponse.model_validate(await self._fetch_json("/health"))
|
inference.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Baseline inference script for the Python code-review environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import asyncio
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any, Dict, List, Optional
|
| 11 |
+
|
| 12 |
+
from openai import OpenAI
|
| 13 |
+
|
| 14 |
+
from client import PythonEnv
|
| 15 |
+
from models import ActionType, PythonReviewAction
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Read all runtime configuration from environment variables so the script can
|
| 19 |
+
# be reused unchanged across local runs, CI, and HF Spaces validation.
|
| 20 |
+
API_BASE_URL = os.environ["API_BASE_URL"]
|
| 21 |
+
MODEL_NAME = os.environ["MODEL_NAME"]
|
| 22 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
|
| 23 |
+
ENV_BASE_URL = os.getenv("ENV_BASE_URL")
|
| 24 |
+
DOCKER_IMAGE = os.getenv("PYTHON_ENV_IMAGE", "python_env-env:latest")
|
| 25 |
+
MAX_STEPS = int(os.getenv("MAX_STEPS", "25"))
|
| 26 |
+
REPORT_PATH = Path(os.getenv("INFERENCE_REPORT_PATH", "inference_results.json"))
|
| 27 |
+
TEMPERATURE = float(os.getenv("TEMPERATURE", "0"))
|
| 28 |
+
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "900"))
|
| 29 |
+
TASK_IDS = ["task_easy", "task_medium", "task_hard"]
|
| 30 |
+
|
| 31 |
+
SYSTEM_PROMPT = """You are a precise senior Python code reviewer.
|
| 32 |
+
Return strict JSON using this schema:
|
| 33 |
+
{
|
| 34 |
+
"action_type": "ADD_COMMENT|APPROVE|REQUEST_CHANGES|ASK_CONTEXT|SKIP_LINE",
|
| 35 |
+
"line_number": 1,
|
| 36 |
+
"issue_type": "STYLE|LOGIC|SECURITY|PERFORMANCE|DOCS",
|
| 37 |
+
"severity": "LOW|MEDIUM|HIGH|CRITICAL",
|
| 38 |
+
"comment": "why this matters",
|
| 39 |
+
"suggestion": "optional fix suggestion",
|
| 40 |
+
"question": "optional context question"
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
Rules:
|
| 44 |
+
- Output JSON only. No markdown fences.
|
| 45 |
+
- Only report issues supported by the visible code.
|
| 46 |
+
- Use one action per step.
|
| 47 |
+
- Prefer high precision over quantity.
|
| 48 |
+
- Use REQUEST_CHANGES once you believe the code should be rejected.
|
| 49 |
+
- Use APPROVE only when the snippet is genuinely clean.
|
| 50 |
+
"""
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _build_prompt(observation, step: int, history: List[str]) -> str:
|
| 54 |
+
"""Build the task prompt sent to the model for one step."""
|
| 55 |
+
|
| 56 |
+
numbered_lines = "\n".join(
|
| 57 |
+
f"{index + 1:>3}: {line}" for index, line in enumerate(observation.lines)
|
| 58 |
+
)
|
| 59 |
+
history_text = "\n".join(history[-4:]) if history else "No previous attempts."
|
| 60 |
+
return (
|
| 61 |
+
f"Task ID: {observation.task_id}\n"
|
| 62 |
+
f"Step: {step}\n"
|
| 63 |
+
f"Current score: {observation.metrics.current_score:.2f}\n"
|
| 64 |
+
f"Last reward: {observation.reward_summary.step_reward:.2f}\n"
|
| 65 |
+
f"Cumulative reward: {observation.reward_summary.cumulative_reward:.2f}\n"
|
| 66 |
+
f"Latest feedback: {observation.feedback or 'None'}\n"
|
| 67 |
+
f"Attempt history:\n{history_text}\n\n"
|
| 68 |
+
f"Filename: {observation.filename}\n"
|
| 69 |
+
f"Context: {observation.context or 'None'}\n"
|
| 70 |
+
"Code to review:\n"
|
| 71 |
+
f"{numbered_lines}"
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _extract_text_content(message_content: Any) -> str:
|
| 76 |
+
"""Normalize OpenAI response content into one text string."""
|
| 77 |
+
|
| 78 |
+
if isinstance(message_content, str):
|
| 79 |
+
return message_content
|
| 80 |
+
if isinstance(message_content, list):
|
| 81 |
+
parts: List[str] = []
|
| 82 |
+
for item in message_content:
|
| 83 |
+
if isinstance(item, dict):
|
| 84 |
+
text = item.get("text")
|
| 85 |
+
if isinstance(text, str):
|
| 86 |
+
parts.append(text)
|
| 87 |
+
return "\n".join(parts)
|
| 88 |
+
return ""
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _extract_json_blob(content: str) -> str:
|
| 92 |
+
"""Extract a JSON object from plain or fenced model output."""
|
| 93 |
+
|
| 94 |
+
fenced_match = re.search(r"```(?:json)?\s*(\{.*\})\s*```", content, re.DOTALL)
|
| 95 |
+
if fenced_match:
|
| 96 |
+
return fenced_match.group(1)
|
| 97 |
+
|
| 98 |
+
start = content.find("{")
|
| 99 |
+
end = content.rfind("}")
|
| 100 |
+
if start != -1 and end != -1 and end > start:
|
| 101 |
+
return content[start : end + 1]
|
| 102 |
+
return content
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _parse_response(content: str) -> Dict[str, Any]:
    """Decode model output into a dict, flagging unparseable text.

    When JSON decoding fails, the raw extracted blob is returned under the
    ``_parse_error`` key so callers can fall back gracefully.
    """
    blob = _extract_json_blob(content)
    try:
        return json.loads(blob)
    except json.JSONDecodeError:
        return {"_parse_error": blob}
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _completion(client: OpenAI, prompt: str) -> Dict[str, Any]:
    """Query the configured chat model once and parse its JSON reply."""
    reply = client.chat.completions.create(
        model=MODEL_NAME,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
    )
    # Empty content falls back to "{}" so the parser returns a dict.
    text = _extract_text_content(reply.choices[0].message.content)
    return _parse_response(text or "{}")
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _build_fallback_action(observation, note: str) -> PythonReviewAction:
    """Produce a safe action when the model reply could not be used.

    On the final step we must commit to a verdict (REQUEST_CHANGES);
    before that, spend the step asking for context instead.
    """
    is_last_step = observation.current_step + 1 >= observation.max_steps
    if is_last_step:
        return PythonReviewAction(action_type=ActionType.REQUEST_CHANGES, question=None)
    return PythonReviewAction(action_type=ActionType.ASK_CONTEXT, question=note)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def _to_action(
    payload: Dict[str, Any],
    observation,
) -> PythonReviewAction:
    """Validate the parsed model payload into an environment action.

    Any validation failure (including prior parse errors) degrades to the
    safe fallback action rather than aborting the episode.
    """
    try:
        return PythonReviewAction.model_validate(payload)
    except Exception:
        note = "Model returned no valid action."
        if payload.get("_parse_error"):
            note = f"{note} Raw response could not be parsed as JSON."
        return _build_fallback_action(observation, note)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def _make_env():
    """Return a sync environment handle: remote URL if configured, else local Docker."""
    if ENV_BASE_URL:
        env = PythonEnv(base_url=ENV_BASE_URL)
    else:
        # from_docker_image is async; run it to completion before use.
        env = asyncio.run(PythonEnv.from_docker_image(DOCKER_IMAGE))
    return env.sync()
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _task_result_dict(observation, step_logs: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 167 |
+
"""Build the report payload for one completed task run."""
|
| 168 |
+
|
| 169 |
+
return {
|
| 170 |
+
"task_id": observation.task_id,
|
| 171 |
+
"snippet_id": observation.snippet_id,
|
| 172 |
+
"score": observation.metrics.current_score,
|
| 173 |
+
"precision": observation.metrics.precision,
|
| 174 |
+
"recall": observation.metrics.recall,
|
| 175 |
+
"f1": observation.metrics.f1,
|
| 176 |
+
"true_positives": observation.metrics.true_positives,
|
| 177 |
+
"false_positives": observation.metrics.false_positives,
|
| 178 |
+
"missed_issues": observation.metrics.missed_issues,
|
| 179 |
+
"cumulative_reward": observation.metrics.cumulative_reward,
|
| 180 |
+
"steps": step_logs,
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def main() -> None:
    """Run the configured model against the benchmark task set.

    For each task in TASK_IDS: reset the environment, query the model once
    per step, apply the resulting action, and record per-step logs.  A JSON
    summary (mean score plus per-task details) is written to REPORT_PATH
    and echoed to stdout.

    Raises:
        RuntimeError: if no API key is configured in the environment.
    """

    if not API_KEY:
        raise RuntimeError("Set HF_TOKEN or OPENAI_API_KEY before running inference.py")

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    env = _make_env()
    episode_results: List[Dict[str, Any]] = []

    try:
        for index, task_id in enumerate(TASK_IDS, start=1):
            result = env.reset(task_id=task_id)
            observation = result.observation
            # Compact per-step summaries, fed back into subsequent prompts.
            history: List[str] = []
            step_logs: List[Dict[str, Any]] = []

            print(f"Task {index}: {task_id} ({observation.snippet_id})")

            for step in range(1, MAX_STEPS + 1):
                prompt = _build_prompt(observation, step, history)
                try:
                    payload = _completion(client, prompt)
                except Exception as exc:
                    # Transport/model failure: record it; _to_action will fall back.
                    payload = {"_error": str(exc)}

                action = _to_action(payload=payload, observation=observation)

                result = env.step(action)
                observation = result.observation

                step_log = {
                    "step": step,
                    "action_type": action.action_type.value,
                    "line_number": action.line_number,
                    "reward": result.reward or 0.0,
                    "score": observation.metrics.current_score,
                    "done": result.done,
                    "feedback": observation.feedback,
                }
                if payload.get("_error"):
                    step_log["model_error"] = payload["_error"]
                if payload.get("_parse_error"):
                    step_log["parse_error"] = True
                step_logs.append(step_log)

                history.append(
                    f"step={step} action={action.action_type.value} "
                    f"line={action.line_number} score={observation.metrics.current_score:.2f} "
                    f"reward={(result.reward or 0.0):.2f} feedback={observation.feedback}"
                )

                print(
                    f" step={step} action={action.action_type.value} "
                    f"score={observation.metrics.current_score:.2f} reward={(result.reward or 0.0):.2f} "
                    f"done={result.done}"
                )

                if result.done:
                    break

            episode_results.append(_task_result_dict(observation, step_logs))
    finally:
        # Always release the environment (and any Docker container it started).
        env.close()

    # Aggregate and persist the run summary.
    mean_score = sum(item["score"] for item in episode_results) / len(episode_results) if episode_results else 0.0
    summary = {
        "model_name": MODEL_NAME,
        "api_base_url": API_BASE_URL,
        "task_count": len(episode_results),
        "mean_score": mean_score,
        "results": episode_results,
    }

    REPORT_PATH.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print(json.dumps(summary, indent=2))
    print(f"\nSaved report to {REPORT_PATH}")
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# Script entry point: run the full benchmark sweep.
if __name__ == "__main__":
    main()
|
models.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared models for the Python code-review OpenEnv benchmark."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from enum import Enum
|
| 6 |
+
from typing import Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from pydantic import BaseModel, Field, model_validator
|
| 9 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Difficulty(str, Enum):
    """Task-family difficulty tier used to select a snippet bank."""

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"


class ActionType(str, Enum):
    """Review actions an agent may take on each step.

    Field requirements per action are enforced by
    ``PythonReviewAction.validate_action_shape``.
    """

    ADD_COMMENT = "ADD_COMMENT"          # flag an issue on a specific line
    APPROVE = "APPROVE"                  # accept the snippet as-is
    REQUEST_CHANGES = "REQUEST_CHANGES"  # reject the snippet
    ASK_CONTEXT = "ASK_CONTEXT"          # ask a question for more context
    SKIP_LINE = "SKIP_LINE"              # declare a specific line issue-free


class IssueType(str, Enum):
    """Category labels shared by review comments and gold annotations."""

    STYLE = "STYLE"
    LOGIC = "LOGIC"
    SECURITY = "SECURITY"
    PERFORMANCE = "PERFORMANCE"
    DOCS = "DOCS"


class Severity(str, Enum):
    """Severity ladder for reported issues, lowest to highest."""

    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    CRITICAL = "CRITICAL"
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class GoldIssue(BaseModel):
    """Hidden benchmark annotation for one issue in a snippet."""

    issue_id: str
    line: int = Field(..., ge=1)  # 1-indexed line on which the issue occurs
    issue_type: IssueType
    severity: Severity
    description: str
    required: bool = True  # False presumably marks a bonus finding — see grading
    # Keyword lists used when matching agent comments against this issue
    # (exact matching semantics live in the server-side grading code).
    explanation_keywords: List[str] = Field(default_factory=list)
    fix_keywords: List[str] = Field(default_factory=list)
    owasp_category: Optional[str] = None  # NOTE(review): likely SECURITY-only — confirm in data
    owasp_keywords: List[str] = Field(default_factory=list)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class ReviewComment(BaseModel):
    """Stored review action visible to the agent in `review_history`."""

    step_index: int = Field(..., ge=1)  # 1-based step at which the action was taken
    action_type: ActionType
    line_number: Optional[int] = Field(default=None, ge=1)
    issue_type: Optional[IssueType] = None
    severity: Optional[Severity] = None
    comment: Optional[str] = None
    suggestion: Optional[str] = None
    question: Optional[str] = None
    matched_issue_ids: List[str] = Field(default_factory=list)  # gold issues this action matched
    reward_delta: float = 0.0  # reward contributed by this single action
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class CodeReviewSnippet(BaseModel):
    """Benchmark sample loaded from JSON.

    `gold_issues` is hidden from the agent; `must_approve`/`must_reject`
    encode the expected final verdict for the snippet.
    """

    snippet_id: str
    filename: str
    code: str
    context: Optional[str] = None  # optional background text shown to the reviewer
    diff: Optional[str] = None     # optional diff view of the change
    gold_issues: List[GoldIssue]
    must_approve: bool = False
    must_reject: bool = True
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class TaskMetadata(BaseModel):
    """Visible task-family metadata served by the `/tasks` endpoint."""

    task_id: str
    name: str
    difficulty: Difficulty
    description: str
    snippet_count: int = Field(..., ge=0)  # number of snippets in this family
    max_steps: int = Field(..., ge=1)      # per-episode step budget
    # Score range; scores elsewhere are normalized to [0, 1].
    min_score: float = Field(default=0.0, ge=0.0, le=1.0)
    max_score: float = Field(default=1.0, ge=0.0, le=1.0)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class ReviewFinding(BaseModel):
    """Compatibility shim for earlier template-derived environment code.

    Kept so old clients sending `findings` payloads still validate.
    """

    title: str = ""
    line: Optional[int] = Field(default=None, ge=1)
    category: str = "bug"
    severity: str = "warning"
    rationale: str = ""
    recommendation: Optional[str] = None
    rule_id: Optional[str] = None


class TaskDescriptor(BaseModel):
    """Compatibility shim for earlier template-derived environment code."""

    task_id: str
    difficulty: str
    title: str
    objective: str
    code: str
    max_steps: int = Field(..., ge=1)
    success_threshold: float = Field(default=0.0, ge=0.0, le=1.0)


class TaskEvaluation(BaseModel):
    """Compatibility shim for earlier template-derived environment code."""

    matched_reference_ids: List[str] = Field(default_factory=list)
    matched_findings: int = 0
    total_findings: int = 0
    false_positives: int = 0
    duplicate_findings: int = 0
    weighted_recall: float = 0.0
    patch_score: float = 0.0
    score: float = 0.0
    passed: bool = False
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class PythonEnvConfig(BaseModel):
    """Environment configuration used by the benchmark runtime."""

    # Order in which task families are served across episodes.
    task_order: List[str] = Field(
        default_factory=lambda: ["task_easy", "task_medium", "task_hard"]
    )
    max_steps_per_task: int = Field(default=25, ge=1, le=100)
    max_history_entries: int = Field(default=200, ge=1, le=1000)
    rotate_tasks: bool = True  # cycle through task_order between episodes (presumably)


class EpisodeMetrics(BaseModel):
    """Current episode metrics for UI, evaluation, and RL logging."""

    precision: float = Field(default=0.0, ge=0.0, le=1.0)
    recall: float = Field(default=0.0, ge=0.0, le=1.0)
    f1: float = Field(default=0.0, ge=0.0, le=1.0)
    true_positives: int = Field(default=0, ge=0)   # comments matching gold issues
    false_positives: int = Field(default=0, ge=0)  # comments matching nothing
    missed_issues: int = Field(default=0, ge=0)
    required_found: int = Field(default=0, ge=0)
    required_total: int = Field(default=0, ge=0)
    bonus_found: int = Field(default=0, ge=0)
    duplicate_comments: int = Field(default=0, ge=0)
    context_requests: int = Field(default=0, ge=0)
    skipped_clean_lines: int = Field(default=0, ge=0)
    skipped_issue_lines: int = Field(default=0, ge=0)
    current_score: float = Field(default=0.0, ge=0.0, le=1.0)  # normalized episode score
    cumulative_reward: float = 0.0
    breakdown: Dict[str, float] = Field(default_factory=dict)  # per-component score parts


class RewardSummary(BaseModel):
    """Reward details from the most recent step."""

    step_reward: float = 0.0
    cumulative_reward: float = 0.0
    breakdown: Dict[str, float] = Field(default_factory=dict)
    false_positives: int = Field(default=0, ge=0)
    true_positives: int = Field(default=0, ge=0)
    missed_issues: int = Field(default=0, ge=0)
|
| 177 |
+
|
| 178 |
+
class PythonReviewAction(Action):
    """Structured review action emitted by a model or trainer.

    The validator below enforces one shape per action type: ADD_COMMENT
    requires line/issue_type/severity/comment; SKIP_LINE requires a line;
    ASK_CONTEXT requires a question (or comment); APPROVE and
    REQUEST_CHANGES must carry no extra detail fields.
    """

    action_type: ActionType
    line_number: Optional[int] = Field(default=None, ge=1)
    issue_type: Optional[IssueType] = None
    severity: Optional[Severity] = None
    comment: Optional[str] = None
    suggestion: Optional[str] = None
    question: Optional[str] = None

    # Template compatibility
    operation: str = "submit_findings"
    findings: List[ReviewFinding] = Field(default_factory=list)
    patched_code: Optional[str] = None

    @model_validator(mode="after")
    def validate_action_shape(self) -> "PythonReviewAction":
        """Require the right fields for each action type."""

        if self.action_type == ActionType.ADD_COMMENT:
            # Collect every missing field so one error names them all.
            missing = []
            if self.line_number is None:
                missing.append("line_number")
            if self.issue_type is None:
                missing.append("issue_type")
            if self.severity is None:
                missing.append("severity")
            if not (self.comment or "").strip():
                missing.append("comment")
            if missing:
                raise ValueError("ADD_COMMENT requires: " + ", ".join(missing))
        elif self.action_type == ActionType.SKIP_LINE:
            if self.line_number is None:
                raise ValueError("SKIP_LINE requires line_number")
        elif self.action_type == ActionType.ASK_CONTEXT:
            if not (self.question or self.comment or "").strip():
                raise ValueError("ASK_CONTEXT requires question or comment")
        elif self.action_type in {ActionType.APPROVE, ActionType.REQUEST_CHANGES}:
            # Verdict actions must be "clean": any populated detail field is rejected.
            noisy_fields = {
                "line_number": self.line_number,
                "issue_type": self.issue_type,
                "severity": self.severity,
                "comment": self.comment,
                "suggestion": self.suggestion,
                "question": self.question,
            }
            populated = [
                name for name, value in noisy_fields.items() if value not in (None, "")
            ]
            if populated:
                raise ValueError(
                    f"{self.action_type.value} does not accept extra fields: {', '.join(populated)}"
                )
        return self
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
class PythonReviewObservation(Observation):
    """Observation returned by reset/step, including trainer-visible metrics."""

    snippet_id: str
    code: str
    filename: str
    language: str = "python"
    context: Optional[str] = None  # optional background text for the snippet
    diff: Optional[str] = None
    line_count: int = Field(..., ge=0)
    current_step: int = Field(..., ge=0)  # steps already taken this episode
    max_steps: int = Field(..., ge=1)
    task_id: str
    review_history: List[ReviewComment] = Field(default_factory=list)  # actions taken so far
    lines: List[str] = Field(default_factory=list)  # code split per line for convenience
    reward_summary: RewardSummary = Field(default_factory=RewardSummary)  # last-step reward
    metrics: EpisodeMetrics = Field(default_factory=EpisodeMetrics)
    feedback: str = ""  # human-readable result of the last action

    # Template compatibility
    task: Optional[TaskDescriptor] = None
    instructions: str = ""
    submitted_findings: List[ReviewFinding] = Field(default_factory=list)
    hints_used: int = 0
    attempts_remaining: int = 0
    evaluation: Optional[TaskEvaluation] = None
    score: float = 0.0
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
class PythonReviewState(State):
    """Full server-side state exposed by `/state`."""

    task_id: Optional[str] = None
    difficulty: Optional[Difficulty] = None
    snippet_id: Optional[str] = None
    current_step: int = Field(default=0, ge=0)
    max_steps: int = Field(default=0, ge=0)
    done: bool = False  # True once the episode has terminated
    filename: Optional[str] = None
    review_history: List[ReviewComment] = Field(default_factory=list)
    metrics: EpisodeMetrics = Field(default_factory=EpisodeMetrics)
    last_feedback: str = ""  # feedback string from the most recent step
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
class TaskListResponse(BaseModel):
    """Response body for the `/tasks` endpoint."""

    tasks: List[TaskMetadata] = Field(default_factory=list)


class MetricsResponse(BaseModel):
    """Response body for the `/metrics` endpoint."""

    task_id: Optional[str] = None
    snippet_id: Optional[str] = None
    done: bool = False
    metrics: EpisodeMetrics = Field(default_factory=EpisodeMetrics)


class HealthResponse(BaseModel):
    """Response body for the `/health` endpoint."""

    status: str = "ok"
    environment: str = "python_code_review_env"
    task_count: int = Field(default=0, ge=0)
    active_task_id: Optional[str] = None
    active_snippet_id: Optional[str] = None
    active_episode_id: Optional[str] = None
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# Backwards-compatible aliases so older template-derived imports keep working.
PythonAction = PythonReviewAction
PythonObservation = PythonReviewObservation
PythonState = PythonReviewState
CodeReviewAction = PythonReviewAction
CodeReviewObservation = PythonReviewObservation
CodeReviewConfig = PythonEnvConfig
|
openenv.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: python-code-review
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
openenv_python_env.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metadata-Version: 2.4
|
| 2 |
+
Name: openenv-python_env
|
| 3 |
+
Version: 0.1.0
|
| 4 |
+
Summary: Python Env environment for OpenEnv
|
| 5 |
+
Requires-Python: >=3.10
|
| 6 |
+
Requires-Dist: openenv-core[core]>=0.2.2
|
| 7 |
+
Requires-Dist: httpx>=0.28.1
|
| 8 |
+
Requires-Dist: pydantic>=2.12.5
|
| 9 |
+
Provides-Extra: dev
|
| 10 |
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
| 11 |
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
openenv_python_env.egg-info/SOURCES.txt
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
README.md
|
| 2 |
+
__init__.py
|
| 3 |
+
client.py
|
| 4 |
+
inference.py
|
| 5 |
+
models.py
|
| 6 |
+
pyproject.toml
|
| 7 |
+
./__init__.py
|
| 8 |
+
./client.py
|
| 9 |
+
./inference.py
|
| 10 |
+
./models.py
|
| 11 |
+
./rollout.py
|
| 12 |
+
openenv_python_env.egg-info/PKG-INFO
|
| 13 |
+
openenv_python_env.egg-info/SOURCES.txt
|
| 14 |
+
openenv_python_env.egg-info/dependency_links.txt
|
| 15 |
+
openenv_python_env.egg-info/entry_points.txt
|
| 16 |
+
openenv_python_env.egg-info/requires.txt
|
| 17 |
+
openenv_python_env.egg-info/top_level.txt
|
| 18 |
+
server/__init__.py
|
| 19 |
+
server/app.py
|
| 20 |
+
server/grading.py
|
| 21 |
+
server/python_env_environment.py
|
| 22 |
+
server/review_runtime.py
|
| 23 |
+
server/task_bank.py
|
| 24 |
+
server/data/snippets_easy.json
|
| 25 |
+
server/data/snippets_hard.json
|
| 26 |
+
server/data/snippets_medium.json
|
| 27 |
+
tests/test_env.py
|
openenv_python_env.egg-info/dependency_links.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
openenv_python_env.egg-info/entry_points.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[console_scripts]
|
| 2 |
+
server = python_env.server.app:main
|
openenv_python_env.egg-info/requires.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
httpx>=0.28.1
|
| 3 |
+
pydantic>=2.12.5
|
| 4 |
+
|
| 5 |
+
[dev]
|
| 6 |
+
pytest>=8.0.0
|
| 7 |
+
pytest-cov>=4.0.0
|
openenv_python_env.egg-info/top_level.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
python_env
|
pyproject.toml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-python_env"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Python Env environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# install from github
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.2",
|
| 21 |
+
# Environment-specific dependencies
|
| 22 |
+
# Add all dependencies needed for your environment here
|
| 23 |
+
# Examples:
|
| 24 |
+
# "numpy>=1.19.0",
|
| 25 |
+
# "torch>=2.0.0",
|
| 26 |
+
# "gymnasium>=0.29.0",
|
| 27 |
+
# "openspiel>=1.0.0",
|
| 28 |
+
# "smolagents>=1.22.0,<2",
|
| 29 |
+
"httpx>=0.28.1",
|
| 30 |
+
"pydantic>=2.12.5",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
[project.optional-dependencies]
|
| 34 |
+
dev = [
|
| 35 |
+
"pytest>=8.0.0",
|
| 36 |
+
"pytest-cov>=4.0.0",
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
[project.scripts]
|
| 40 |
+
# Server entry point - enables running via: uv run --project . server
|
| 41 |
+
# or: python -m python_env.server.app
|
| 42 |
+
server = "python_env.server.app:main"
|
| 43 |
+
|
| 44 |
+
[tool.setuptools]
|
| 45 |
+
include-package-data = true
|
| 46 |
+
packages = ["python_env", "python_env.server"]
|
| 47 |
+
package-dir = { "python_env" = ".", "python_env.server" = "server" }
|
| 48 |
+
|
| 49 |
+
[tool.setuptools.package-data]
|
| 50 |
+
"python_env.server" = ["data/*.json"]
|
rollout.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Trajectory collection helpers for RL-style training loops."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from dataclasses import dataclass, asdict
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Callable, Dict, List, Optional
|
| 9 |
+
|
| 10 |
+
from client import PythonEnv
|
| 11 |
+
from models import PythonReviewAction
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class TrajectoryStep:
    """One recorded (observation, action, reward, done) transition."""

    observation: Dict[str, object]  # observation.model_dump() taken before the action
    action: Dict[str, object]       # action.model_dump(exclude_none=True)
    reward: float
    done: bool


@dataclass
class TrajectoryEpisode:
    """A full episode: identifying info, final metrics, and every step."""

    task_id: str
    snippet_id: str
    final_score: float        # metrics.current_score at episode end
    cumulative_reward: float  # metrics.cumulative_reward at episode end
    steps: List[TrajectoryStep]


# A policy maps the latest observation to the next review action.
PolicyFn = Callable[[object], PythonReviewAction]
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def collect_episode(env, task_id: str, policy: PolicyFn, max_steps: Optional[int] = None) -> TrajectoryEpisode:
    """Roll out one episode with ``policy`` and record every transition.

    Args:
        env: Synchronous environment client exposing reset/step.
        task_id: Benchmark task to run.
        policy: Callable mapping an observation to the next action.
        max_steps: Optional step cap; falls back to the environment's limit.

    Returns:
        A TrajectoryEpisode holding per-step records and final metrics.
    """
    result = env.reset(task_id=task_id)
    observation = result.observation
    budget = max_steps or observation.max_steps
    recorded: List[TrajectoryStep] = []

    for _ in range(budget):
        chosen = policy(observation)
        result = env.step(chosen)
        recorded.append(
            TrajectoryStep(
                observation=observation.model_dump(),
                action=chosen.model_dump(exclude_none=True),
                reward=float(result.reward or 0.0),
                done=bool(result.done),
            )
        )
        observation = result.observation
        if result.done:
            break

    return TrajectoryEpisode(
        task_id=observation.task_id,
        snippet_id=observation.snippet_id,
        final_score=observation.metrics.current_score,
        cumulative_reward=observation.metrics.cumulative_reward,
        steps=recorded,
    )
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def write_jsonl(episodes: List[TrajectoryEpisode], output_path: str | Path) -> None:
    """Persist collected trajectories in a trainer-friendly JSONL format.

    Args:
        episodes: Completed episodes to serialize, one JSON object per line.
        output_path: Destination file path; overwritten if it already exists.
    """
    path = Path(output_path)
    # Terminate every record with a newline: the previous "\n".join left the
    # last record unterminated, which is not valid JSONL for many consumers
    # and corrupts the file if more records are ever appended.
    lines = [json.dumps(asdict(episode)) + "\n" for episode in episodes]
    path.write_text("".join(lines), encoding="utf-8")
|
server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Python Env environment server components."""
|
| 8 |
+
|
| 9 |
+
from .python_env_environment import PythonEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["PythonEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
FastAPI application for the Python Env Environment.
|
| 9 |
+
|
| 10 |
+
This module creates an HTTP server that exposes the PythonEnvironment
|
| 11 |
+
over HTTP and WebSocket endpoints, compatible with EnvClient.
|
| 12 |
+
|
| 13 |
+
Endpoints:
|
| 14 |
+
- POST /reset: Reset the environment
|
| 15 |
+
- POST /step: Execute an action
|
| 16 |
+
- GET /state: Get current environment state
|
| 17 |
+
- GET /schema: Get action/observation schemas
|
| 18 |
+
- WS /ws: WebSocket endpoint for persistent sessions
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
# Development (with auto-reload):
|
| 22 |
+
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 23 |
+
|
| 24 |
+
# Production:
|
| 25 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
|
| 26 |
+
|
| 27 |
+
# Or run directly:
|
| 28 |
+
python -m server.app
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from fastapi.routing import APIRoute
|
| 32 |
+
|
| 33 |
+
try:
|
| 34 |
+
from openenv.core.env_server.http_server import create_app
|
| 35 |
+
except Exception as e: # pragma: no cover
|
| 36 |
+
raise ImportError(
|
| 37 |
+
"openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
|
| 38 |
+
) from e
|
| 39 |
+
|
| 40 |
+
try:
|
| 41 |
+
from ..models import (
|
| 42 |
+
HealthResponse,
|
| 43 |
+
MetricsResponse,
|
| 44 |
+
PythonAction,
|
| 45 |
+
PythonObservation,
|
| 46 |
+
PythonState,
|
| 47 |
+
TaskListResponse,
|
| 48 |
+
)
|
| 49 |
+
from .python_env_environment import (
|
| 50 |
+
PythonEnvironment,
|
| 51 |
+
get_current_state,
|
| 52 |
+
get_health_response,
|
| 53 |
+
get_metrics_response,
|
| 54 |
+
get_tasks_response,
|
| 55 |
+
)
|
| 56 |
+
except ImportError:
|
| 57 |
+
from models import ( # type: ignore
|
| 58 |
+
HealthResponse,
|
| 59 |
+
MetricsResponse,
|
| 60 |
+
PythonAction,
|
| 61 |
+
PythonObservation,
|
| 62 |
+
PythonState,
|
| 63 |
+
TaskListResponse,
|
| 64 |
+
)
|
| 65 |
+
from server.python_env_environment import ( # type: ignore
|
| 66 |
+
PythonEnvironment,
|
| 67 |
+
get_current_state,
|
| 68 |
+
get_health_response,
|
| 69 |
+
get_metrics_response,
|
| 70 |
+
get_tasks_response,
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Create the app with web interface and README integration.
# create_app wires the environment class plus action/observation models into
# the standard OpenEnv FastAPI surface (/reset, /step, /state, /schema, /ws).
app = create_app(
    PythonEnvironment,
    PythonAction,
    PythonObservation,
    env_name="python_env",
    max_concurrent_envs=1,  # increase this number to allow more concurrent WebSocket sessions
)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _remove_get_route(path: str) -> None:
    """Drop the framework-provided GET route at ``path`` so it can be replaced."""
    kept = []
    for route in app.router.routes:
        is_target = (
            isinstance(route, APIRoute)
            and route.path == path
            and "GET" in route.methods
        )
        if not is_target:
            kept.append(route)
    app.router.routes = kept


# Replace the stock /health and /state handlers with the typed versions below.
_remove_get_route("/health")
_remove_get_route("/state")
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
@app.get("/health", response_model=HealthResponse, tags=["Health"])
|
| 101 |
+
async def health() -> HealthResponse:
|
| 102 |
+
return get_health_response()
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@app.get("/state", response_model=PythonState, tags=["State Management"])
|
| 106 |
+
async def state() -> PythonState:
|
| 107 |
+
return get_current_state()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
@app.get("/tasks", response_model=TaskListResponse, tags=["Environment Info"])
|
| 111 |
+
async def tasks() -> TaskListResponse:
|
| 112 |
+
return get_tasks_response()
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@app.get("/metrics", response_model=MetricsResponse, tags=["Environment Info"])
|
| 116 |
+
async def metrics() -> MetricsResponse:
|
| 117 |
+
return get_metrics_response()
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
    """
    Entry point for direct execution via uv run or python -m.

    This function enables running the server without Docker:
        uv run --project . server
        uv run --project . server --port 8001
        python -m python_env.server.app

    Args:
        host: Host address to bind to (default: "0.0.0.0")
        port: Port number to listen on (default: 8000)

    For production deployments, consider using uvicorn directly with
    multiple workers:
        uvicorn python_env.server.app:app --workers 4
    """
    # Imported lazily so importing this module never requires uvicorn.
    import uvicorn

    uvicorn.run(app, host=host, port=port)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
if __name__ == "__main__":
|
| 143 |
+
import argparse
|
| 144 |
+
|
| 145 |
+
parser = argparse.ArgumentParser()
|
| 146 |
+
parser.add_argument("--port", type=int, default=8000)
|
| 147 |
+
args = parser.parse_args()
|
| 148 |
+
main(port=args.port)
|
server/data/snippets_easy.json
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"snippet_id": "easy_001",
|
| 4 |
+
"filename": "utils.py",
|
| 5 |
+
"code": "import math\n\ndef build_label(value):\n l = str(value)\n return l.upper()",
|
| 6 |
+
"context": "Utility helper used by several modules.",
|
| 7 |
+
"gold_issues": [
|
| 8 |
+
{
|
| 9 |
+
"issue_id": "easy_001_docs",
|
| 10 |
+
"line": 3,
|
| 11 |
+
"issue_type": "DOCS",
|
| 12 |
+
"severity": "LOW",
|
| 13 |
+
"description": "Missing docstring on public function.",
|
| 14 |
+
"required": false,
|
| 15 |
+
"explanation_keywords": ["docstring", "documentation", "public"]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"issue_id": "easy_001_name",
|
| 19 |
+
"line": 4,
|
| 20 |
+
"issue_type": "STYLE",
|
| 21 |
+
"severity": "LOW",
|
| 22 |
+
"description": "Variable name 'l' is ambiguous (PEP8 E741).",
|
| 23 |
+
"required": true,
|
| 24 |
+
"explanation_keywords": ["ambiguous", "pep8", "e741", "variable", "name"]
|
| 25 |
+
}
|
| 26 |
+
],
|
| 27 |
+
"must_approve": false,
|
| 28 |
+
"must_reject": true
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"snippet_id": "easy_002",
|
| 32 |
+
"filename": "cleanup.py",
|
| 33 |
+
"code": "def normalize_total(total, fee):\n result=total+fee\n return result",
|
| 34 |
+
"gold_issues": [
|
| 35 |
+
{
|
| 36 |
+
"issue_id": "easy_002_spacing",
|
| 37 |
+
"line": 2,
|
| 38 |
+
"issue_type": "STYLE",
|
| 39 |
+
"severity": "LOW",
|
| 40 |
+
"description": "Missing whitespace around operators.",
|
| 41 |
+
"required": true,
|
| 42 |
+
"explanation_keywords": ["whitespace", "spacing", "operator", "pep8"]
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
"must_approve": false,
|
| 46 |
+
"must_reject": true
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"snippet_id": "easy_003",
|
| 50 |
+
"filename": "deploy.py",
|
| 51 |
+
"code": "def deploy_service(name):\n print(\"deploying\", name)\n return name.lower()",
|
| 52 |
+
"context": "Runs during an automated deployment pipeline.",
|
| 53 |
+
"gold_issues": [
|
| 54 |
+
{
|
| 55 |
+
"issue_id": "easy_003_print",
|
| 56 |
+
"line": 2,
|
| 57 |
+
"issue_type": "STYLE",
|
| 58 |
+
"severity": "LOW",
|
| 59 |
+
"description": "Leftover print statement in production code.",
|
| 60 |
+
"required": true,
|
| 61 |
+
"explanation_keywords": ["print", "debug", "production", "logging"]
|
| 62 |
+
}
|
| 63 |
+
],
|
| 64 |
+
"must_approve": false,
|
| 65 |
+
"must_reject": true
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"snippet_id": "easy_004",
|
| 69 |
+
"filename": "imports.py",
|
| 70 |
+
"code": "import os\n\ndef slugify(name):\n return name.strip().lower().replace(\" \", \"-\")",
|
| 71 |
+
"gold_issues": [
|
| 72 |
+
{
|
| 73 |
+
"issue_id": "easy_004_unused_import",
|
| 74 |
+
"line": 1,
|
| 75 |
+
"issue_type": "STYLE",
|
| 76 |
+
"severity": "LOW",
|
| 77 |
+
"description": "Unused import `os`.",
|
| 78 |
+
"required": true,
|
| 79 |
+
"explanation_keywords": ["unused", "import", "os"]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"issue_id": "easy_004_docs",
|
| 83 |
+
"line": 3,
|
| 84 |
+
"issue_type": "DOCS",
|
| 85 |
+
"severity": "LOW",
|
| 86 |
+
"description": "Missing docstring on public function.",
|
| 87 |
+
"required": false,
|
| 88 |
+
"explanation_keywords": ["docstring", "documentation", "public"]
|
| 89 |
+
}
|
| 90 |
+
],
|
| 91 |
+
"must_approve": false,
|
| 92 |
+
"must_reject": true
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"snippet_id": "easy_005",
|
| 96 |
+
"filename": "pricing.py",
|
| 97 |
+
"code": "def render_banner(product_name):\n return f\"Product {product_name} ships worldwide with next business day handling and standard insured delivery included.\"",
|
| 98 |
+
"gold_issues": [
|
| 99 |
+
{
|
| 100 |
+
"issue_id": "easy_005_long_line",
|
| 101 |
+
"line": 2,
|
| 102 |
+
"issue_type": "STYLE",
|
| 103 |
+
"severity": "LOW",
|
| 104 |
+
"description": "Line exceeds 79 characters.",
|
| 105 |
+
"required": true,
|
| 106 |
+
"explanation_keywords": ["line", "79", "too long", "length"]
|
| 107 |
+
}
|
| 108 |
+
],
|
| 109 |
+
"must_approve": false,
|
| 110 |
+
"must_reject": true
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"snippet_id": "easy_006",
|
| 114 |
+
"filename": "stats.py",
|
| 115 |
+
"code": "def summarize(items):\n total = len(items)\n O = total / 2\n return total, O",
|
| 116 |
+
"gold_issues": [
|
| 117 |
+
{
|
| 118 |
+
"issue_id": "easy_006_name",
|
| 119 |
+
"line": 3,
|
| 120 |
+
"issue_type": "STYLE",
|
| 121 |
+
"severity": "LOW",
|
| 122 |
+
"description": "Variable name 'O' is ambiguous (PEP8 E741).",
|
| 123 |
+
"required": true,
|
| 124 |
+
"explanation_keywords": ["ambiguous", "pep8", "e741", "variable", "name"]
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"issue_id": "easy_006_docs",
|
| 128 |
+
"line": 1,
|
| 129 |
+
"issue_type": "DOCS",
|
| 130 |
+
"severity": "LOW",
|
| 131 |
+
"description": "Missing docstring on public function.",
|
| 132 |
+
"required": false,
|
| 133 |
+
"explanation_keywords": ["docstring", "documentation", "public"]
|
| 134 |
+
}
|
| 135 |
+
],
|
| 136 |
+
"must_approve": false,
|
| 137 |
+
"must_reject": true
|
| 138 |
+
},
|
| 139 |
+
{
|
| 140 |
+
"snippet_id": "easy_007",
|
| 141 |
+
"filename": "parser.py",
|
| 142 |
+
"code": "import json\n\ndef parse_flag(flag):\n I = flag.strip()\n return I.lower() == \"yes\"",
|
| 143 |
+
"gold_issues": [
|
| 144 |
+
{
|
| 145 |
+
"issue_id": "easy_007_unused_import",
|
| 146 |
+
"line": 1,
|
| 147 |
+
"issue_type": "STYLE",
|
| 148 |
+
"severity": "LOW",
|
| 149 |
+
"description": "Unused import `json`.",
|
| 150 |
+
"required": true,
|
| 151 |
+
"explanation_keywords": ["unused", "import", "json"]
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"issue_id": "easy_007_name",
|
| 155 |
+
"line": 4,
|
| 156 |
+
"issue_type": "STYLE",
|
| 157 |
+
"severity": "LOW",
|
| 158 |
+
"description": "Variable name 'I' is ambiguous (PEP8 E741).",
|
| 159 |
+
"required": true,
|
| 160 |
+
"explanation_keywords": ["ambiguous", "pep8", "e741", "variable", "name"]
|
| 161 |
+
}
|
| 162 |
+
],
|
| 163 |
+
"must_approve": false,
|
| 164 |
+
"must_reject": true
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"snippet_id": "easy_008",
|
| 168 |
+
"filename": "notifier.py",
|
| 169 |
+
"code": "def notify(user, count):\n message = user + \":\" + str(count)\n print(message)\n return message",
|
| 170 |
+
"gold_issues": [
|
| 171 |
+
{
|
| 172 |
+
"issue_id": "easy_008_print",
|
| 173 |
+
"line": 3,
|
| 174 |
+
"issue_type": "STYLE",
|
| 175 |
+
"severity": "LOW",
|
| 176 |
+
"description": "Leftover print statement in production code.",
|
| 177 |
+
"required": true,
|
| 178 |
+
"explanation_keywords": ["print", "debug", "production", "logging"]
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"issue_id": "easy_008_docs",
|
| 182 |
+
"line": 1,
|
| 183 |
+
"issue_type": "DOCS",
|
| 184 |
+
"severity": "LOW",
|
| 185 |
+
"description": "Missing docstring on public function.",
|
| 186 |
+
"required": false,
|
| 187 |
+
"explanation_keywords": ["docstring", "documentation", "public"]
|
| 188 |
+
}
|
| 189 |
+
],
|
| 190 |
+
"must_approve": false,
|
| 191 |
+
"must_reject": true
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"snippet_id": "easy_009",
|
| 195 |
+
"filename": "math_helpers.py",
|
| 196 |
+
"code": "def add_fee(total, fee):\n amount = total+fee\n return amount",
|
| 197 |
+
"gold_issues": [
|
| 198 |
+
{
|
| 199 |
+
"issue_id": "easy_009_spacing",
|
| 200 |
+
"line": 2,
|
| 201 |
+
"issue_type": "STYLE",
|
| 202 |
+
"severity": "LOW",
|
| 203 |
+
"description": "Missing whitespace around operators.",
|
| 204 |
+
"required": true,
|
| 205 |
+
"explanation_keywords": ["whitespace", "spacing", "operator", "pep8"]
|
| 206 |
+
}
|
| 207 |
+
],
|
| 208 |
+
"must_approve": false,
|
| 209 |
+
"must_reject": true
|
| 210 |
+
},
|
| 211 |
+
{
|
| 212 |
+
"snippet_id": "easy_010",
|
| 213 |
+
"filename": "views.py",
|
| 214 |
+
"code": "import datetime\n\ndef build_title(user_name):\n return f\"Welcome {user_name}, thanks for joining the quarterly partner enablement kickoff meeting today.\"",
|
| 215 |
+
"gold_issues": [
|
| 216 |
+
{
|
| 217 |
+
"issue_id": "easy_010_unused_import",
|
| 218 |
+
"line": 1,
|
| 219 |
+
"issue_type": "STYLE",
|
| 220 |
+
"severity": "LOW",
|
| 221 |
+
"description": "Unused import `datetime`.",
|
| 222 |
+
"required": true,
|
| 223 |
+
"explanation_keywords": ["unused", "import", "datetime"]
|
| 224 |
+
},
|
| 225 |
+
{
|
| 226 |
+
"issue_id": "easy_010_long_line",
|
| 227 |
+
"line": 4,
|
| 228 |
+
"issue_type": "STYLE",
|
| 229 |
+
"severity": "LOW",
|
| 230 |
+
"description": "Line exceeds 79 characters.",
|
| 231 |
+
"required": true,
|
| 232 |
+
"explanation_keywords": ["line", "79", "too long", "length"]
|
| 233 |
+
}
|
| 234 |
+
],
|
| 235 |
+
"must_approve": false,
|
| 236 |
+
"must_reject": true
|
| 237 |
+
}
|
| 238 |
+
]
|
server/data/snippets_hard.json
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"snippet_id": "hard_001",
|
| 4 |
+
"filename": "db.py",
|
| 5 |
+
"code": "def load_user(cursor, user_id):\n query = f\"SELECT * FROM users WHERE id = {user_id}\"\n return cursor.execute(query).fetchone()",
|
| 6 |
+
"context": "Used by an internal admin dashboard.",
|
| 7 |
+
"gold_issues": [
|
| 8 |
+
{
|
| 9 |
+
"issue_id": "hard_001_sqli",
|
| 10 |
+
"line": 2,
|
| 11 |
+
"issue_type": "SECURITY",
|
| 12 |
+
"severity": "CRITICAL",
|
| 13 |
+
"description": "SQL query interpolates user input directly and is vulnerable to SQL injection.",
|
| 14 |
+
"required": true,
|
| 15 |
+
"explanation_keywords": ["sql injection", "parameterized", "query", "interpolate", "user input"],
|
| 16 |
+
"fix_keywords": ["parameterized", "placeholder", "bind", "params"],
|
| 17 |
+
"owasp_category": "A03:2021-Injection",
|
| 18 |
+
"owasp_keywords": ["owasp", "injection", "a03"]
|
| 19 |
+
}
|
| 20 |
+
],
|
| 21 |
+
"must_approve": false,
|
| 22 |
+
"must_reject": true
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"snippet_id": "hard_002",
|
| 26 |
+
"filename": "settings.py",
|
| 27 |
+
"code": "API_KEY = \"sk-live-hardcoded-secret\"\n\ndef get_auth_header():\n return {\"Authorization\": f\"Bearer {API_KEY}\"}",
|
| 28 |
+
"gold_issues": [
|
| 29 |
+
{
|
| 30 |
+
"issue_id": "hard_002_secret",
|
| 31 |
+
"line": 1,
|
| 32 |
+
"issue_type": "SECURITY",
|
| 33 |
+
"severity": "HIGH",
|
| 34 |
+
"description": "Hardcoded API secret in source code.",
|
| 35 |
+
"required": true,
|
| 36 |
+
"explanation_keywords": ["hardcoded", "secret", "api key", "credential"],
|
| 37 |
+
"fix_keywords": ["environment", "vault", "secret manager", "config"],
|
| 38 |
+
"owasp_category": "A07:2021-Identification and Authentication Failures",
|
| 39 |
+
"owasp_keywords": ["secret", "credential", "authentication", "owasp"]
|
| 40 |
+
}
|
| 41 |
+
],
|
| 42 |
+
"must_approve": false,
|
| 43 |
+
"must_reject": true
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"snippet_id": "hard_003",
|
| 47 |
+
"filename": "storage.py",
|
| 48 |
+
"code": "from pathlib import Path\n\ndef read_report(base_dir, user_filename):\n path = Path(base_dir) / user_filename\n return path.read_text()",
|
| 49 |
+
"gold_issues": [
|
| 50 |
+
{
|
| 51 |
+
"issue_id": "hard_003_path_traversal",
|
| 52 |
+
"line": 4,
|
| 53 |
+
"issue_type": "SECURITY",
|
| 54 |
+
"severity": "HIGH",
|
| 55 |
+
"description": "User-controlled file path can escape the intended directory.",
|
| 56 |
+
"required": true,
|
| 57 |
+
"explanation_keywords": ["path traversal", "directory", "user filename", "path"],
|
| 58 |
+
"fix_keywords": ["resolve", "validate", "allowlist", "basename"],
|
| 59 |
+
"owasp_category": "A01:2021-Broken Access Control",
|
| 60 |
+
"owasp_keywords": ["broken access control", "path traversal", "owasp", "a01"]
|
| 61 |
+
}
|
| 62 |
+
],
|
| 63 |
+
"must_approve": false,
|
| 64 |
+
"must_reject": true
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"snippet_id": "hard_004",
|
| 68 |
+
"filename": "net.py",
|
| 69 |
+
"code": "import os\n\ndef ping(hostname):\n return os.system(f\"ping -c 1 {hostname}\")",
|
| 70 |
+
"gold_issues": [
|
| 71 |
+
{
|
| 72 |
+
"issue_id": "hard_004_command_injection",
|
| 73 |
+
"line": 4,
|
| 74 |
+
"issue_type": "SECURITY",
|
| 75 |
+
"severity": "CRITICAL",
|
| 76 |
+
"description": "Shell command is built from user input, allowing command injection.",
|
| 77 |
+
"required": true,
|
| 78 |
+
"explanation_keywords": ["command injection", "shell", "os.system", "user input"],
|
| 79 |
+
"fix_keywords": ["subprocess", "argument list", "shell false", "validate"],
|
| 80 |
+
"owasp_category": "A03:2021-Injection",
|
| 81 |
+
"owasp_keywords": ["owasp", "injection", "a03", "command injection"]
|
| 82 |
+
}
|
| 83 |
+
],
|
| 84 |
+
"must_approve": false,
|
| 85 |
+
"must_reject": true
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"snippet_id": "hard_005",
|
| 89 |
+
"filename": "serializer.py",
|
| 90 |
+
"code": "import pickle\n\ndef load_session(user_data):\n return pickle.loads(user_data)",
|
| 91 |
+
"gold_issues": [
|
| 92 |
+
{
|
| 93 |
+
"issue_id": "hard_005_pickle",
|
| 94 |
+
"line": 4,
|
| 95 |
+
"issue_type": "SECURITY",
|
| 96 |
+
"severity": "CRITICAL",
|
| 97 |
+
"description": "Untrusted pickle deserialization can lead to arbitrary code execution.",
|
| 98 |
+
"required": true,
|
| 99 |
+
"explanation_keywords": ["pickle", "deserialization", "arbitrary code", "untrusted"],
|
| 100 |
+
"fix_keywords": ["json", "safe format", "validate", "trusted"],
|
| 101 |
+
"owasp_category": "A08:2021-Software and Data Integrity Failures",
|
| 102 |
+
"owasp_keywords": ["integrity", "deserialization", "owasp", "a08"]
|
| 103 |
+
}
|
| 104 |
+
],
|
| 105 |
+
"must_approve": false,
|
| 106 |
+
"must_reject": true
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"snippet_id": "hard_006",
|
| 110 |
+
"filename": "crypto.py",
|
| 111 |
+
"code": "import hashlib\n\ndef hash_password(password):\n return hashlib.md5(password.encode()).hexdigest()",
|
| 112 |
+
"gold_issues": [
|
| 113 |
+
{
|
| 114 |
+
"issue_id": "hard_006_weak_crypto",
|
| 115 |
+
"line": 4,
|
| 116 |
+
"issue_type": "SECURITY",
|
| 117 |
+
"severity": "HIGH",
|
| 118 |
+
"description": "Uses MD5 for password hashing, which is cryptographically weak.",
|
| 119 |
+
"required": true,
|
| 120 |
+
"explanation_keywords": ["md5", "weak", "hash", "password", "crypto"],
|
| 121 |
+
"fix_keywords": ["bcrypt", "argon2", "scrypt", "salt"],
|
| 122 |
+
"owasp_category": "A02:2021-Cryptographic Failures",
|
| 123 |
+
"owasp_keywords": ["cryptographic", "owasp", "a02", "weak crypto"]
|
| 124 |
+
}
|
| 125 |
+
],
|
| 126 |
+
"must_approve": false,
|
| 127 |
+
"must_reject": true
|
| 128 |
+
},
|
| 129 |
+
{
|
| 130 |
+
"snippet_id": "hard_007",
|
| 131 |
+
"filename": "shell.py",
|
| 132 |
+
"code": "import subprocess\n\ndef run(cmd):\n return subprocess.run(cmd, shell=True, check=True)",
|
| 133 |
+
"gold_issues": [
|
| 134 |
+
{
|
| 135 |
+
"issue_id": "hard_007_shell_true",
|
| 136 |
+
"line": 4,
|
| 137 |
+
"issue_type": "SECURITY",
|
| 138 |
+
"severity": "CRITICAL",
|
| 139 |
+
"description": "Runs shell commands with shell=True on untrusted input.",
|
| 140 |
+
"required": true,
|
| 141 |
+
"explanation_keywords": ["shell=true", "subprocess", "command injection", "shell"],
|
| 142 |
+
"fix_keywords": ["shell false", "argument list", "validate", "subprocess"],
|
| 143 |
+
"owasp_category": "A03:2021-Injection",
|
| 144 |
+
"owasp_keywords": ["owasp", "injection", "a03", "shell"]
|
| 145 |
+
}
|
| 146 |
+
],
|
| 147 |
+
"must_approve": false,
|
| 148 |
+
"must_reject": true
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"snippet_id": "hard_008",
|
| 152 |
+
"filename": "redirects.py",
|
| 153 |
+
"code": "def login_redirect(next_url):\n return {\"location\": next_url, \"status\": 302}",
|
| 154 |
+
"gold_issues": [
|
| 155 |
+
{
|
| 156 |
+
"issue_id": "hard_008_open_redirect",
|
| 157 |
+
"line": 2,
|
| 158 |
+
"issue_type": "SECURITY",
|
| 159 |
+
"severity": "HIGH",
|
| 160 |
+
"description": "Redirect target is fully user-controlled, creating an open redirect.",
|
| 161 |
+
"required": true,
|
| 162 |
+
"explanation_keywords": ["open redirect", "redirect", "next_url", "user controlled"],
|
| 163 |
+
"fix_keywords": ["allowlist", "relative path", "validate", "trusted host"],
|
| 164 |
+
"owasp_category": "A01:2021-Broken Access Control",
|
| 165 |
+
"owasp_keywords": ["owasp", "broken access control", "open redirect", "a01"]
|
| 166 |
+
}
|
| 167 |
+
],
|
| 168 |
+
"must_approve": false,
|
| 169 |
+
"must_reject": true
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
"snippet_id": "hard_009",
|
| 173 |
+
"filename": "orders.py",
|
| 174 |
+
"code": "def view_order(request, db):\n order_id = request.query_params[\"order_id\"]\n return db.get_order(order_id)",
|
| 175 |
+
"context": "Customer dashboard handler.",
|
| 176 |
+
"gold_issues": [
|
| 177 |
+
{
|
| 178 |
+
"issue_id": "hard_009_idor",
|
| 179 |
+
"line": 3,
|
| 180 |
+
"issue_type": "SECURITY",
|
| 181 |
+
"severity": "HIGH",
|
| 182 |
+
"description": "Looks up an order by user-supplied id without an ownership check, enabling IDOR.",
|
| 183 |
+
"required": true,
|
| 184 |
+
"explanation_keywords": ["idor", "ownership", "authorization", "access control", "order id"],
|
| 185 |
+
"fix_keywords": ["authorize", "ownership", "current user", "scoped query"],
|
| 186 |
+
"owasp_category": "A01:2021-Broken Access Control",
|
| 187 |
+
"owasp_keywords": ["owasp", "broken access control", "idor", "a01"]
|
| 188 |
+
}
|
| 189 |
+
],
|
| 190 |
+
"must_approve": false,
|
| 191 |
+
"must_reject": true
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"snippet_id": "hard_010",
|
| 195 |
+
"filename": "yaml_loader.py",
|
| 196 |
+
"code": "import yaml\n\ndef parse_config(data):\n return yaml.load(data, Loader=yaml.Loader)",
|
| 197 |
+
"gold_issues": [
|
| 198 |
+
{
|
| 199 |
+
"issue_id": "hard_010_yaml_load",
|
| 200 |
+
"line": 4,
|
| 201 |
+
"issue_type": "SECURITY",
|
| 202 |
+
"severity": "HIGH",
|
| 203 |
+
"description": "Unsafe YAML loader can construct arbitrary Python objects from untrusted input.",
|
| 204 |
+
"required": true,
|
| 205 |
+
"explanation_keywords": ["yaml.load", "unsafe", "loader", "object", "untrusted"],
|
| 206 |
+
"fix_keywords": ["safe_load", "safe loader", "validate", "trusted"],
|
| 207 |
+
"owasp_category": "A08:2021-Software and Data Integrity Failures",
|
| 208 |
+
"owasp_keywords": ["owasp", "integrity", "yaml", "a08"]
|
| 209 |
+
}
|
| 210 |
+
],
|
| 211 |
+
"must_approve": false,
|
| 212 |
+
"must_reject": true
|
| 213 |
+
}
|
| 214 |
+
]
|
server/data/snippets_medium.json
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"snippet_id": "medium_001",
|
| 4 |
+
"filename": "cart.py",
|
| 5 |
+
"code": "def collect_names(items):\n names = []\n for i in range(len(items) - 1):\n names.append(items[i].name)\n return names",
|
| 6 |
+
"gold_issues": [
|
| 7 |
+
{
|
| 8 |
+
"issue_id": "medium_001_off_by_one",
|
| 9 |
+
"line": 3,
|
| 10 |
+
"issue_type": "LOGIC",
|
| 11 |
+
"severity": "MEDIUM",
|
| 12 |
+
"description": "Loop skips the last item because of an off-by-one range.",
|
| 13 |
+
"required": true,
|
| 14 |
+
"explanation_keywords": ["off-by-one", "last item", "range", "skip", "len"]
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"must_approve": false,
|
| 18 |
+
"must_reject": true
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"snippet_id": "medium_002",
|
| 22 |
+
"filename": "lists.py",
|
| 23 |
+
"code": "def add_item(item, bucket=[]):\n bucket.append(item)\n return bucket",
|
| 24 |
+
"gold_issues": [
|
| 25 |
+
{
|
| 26 |
+
"issue_id": "medium_002_mutable_default",
|
| 27 |
+
"line": 1,
|
| 28 |
+
"issue_type": "LOGIC",
|
| 29 |
+
"severity": "HIGH",
|
| 30 |
+
"description": "Mutable default argument is shared between calls.",
|
| 31 |
+
"required": true,
|
| 32 |
+
"explanation_keywords": ["mutable", "default", "shared", "calls", "list"]
|
| 33 |
+
}
|
| 34 |
+
],
|
| 35 |
+
"must_approve": false,
|
| 36 |
+
"must_reject": true
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"snippet_id": "medium_003",
|
| 40 |
+
"filename": "loader.py",
|
| 41 |
+
"code": "def load_payload(reader):\n try:\n return reader.read()\n except Exception:\n pass\n return None",
|
| 42 |
+
"gold_issues": [
|
| 43 |
+
{
|
| 44 |
+
"issue_id": "medium_003_swallow",
|
| 45 |
+
"line": 4,
|
| 46 |
+
"issue_type": "LOGIC",
|
| 47 |
+
"severity": "HIGH",
|
| 48 |
+
"description": "Broad exception is swallowed, hiding errors from callers.",
|
| 49 |
+
"required": true,
|
| 50 |
+
"explanation_keywords": ["exception", "swallow", "pass", "hide", "error"]
|
| 51 |
+
}
|
| 52 |
+
],
|
| 53 |
+
"must_approve": false,
|
| 54 |
+
"must_reject": true
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"snippet_id": "medium_004",
|
| 58 |
+
"filename": "billing.py",
|
| 59 |
+
"code": "def total_price(prices):\n total = 0\n for price in prices:\n total = total + str(price)\n return total",
|
| 60 |
+
"gold_issues": [
|
| 61 |
+
{
|
| 62 |
+
"issue_id": "medium_004_type_bug",
|
| 63 |
+
"line": 4,
|
| 64 |
+
"issue_type": "LOGIC",
|
| 65 |
+
"severity": "MEDIUM",
|
| 66 |
+
"description": "Converts price to string and concatenates instead of adding numerically.",
|
| 67 |
+
"required": true,
|
| 68 |
+
"explanation_keywords": ["string", "concatenate", "numeric", "add", "type"]
|
| 69 |
+
}
|
| 70 |
+
],
|
| 71 |
+
"must_approve": false,
|
| 72 |
+
"must_reject": true
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"snippet_id": "medium_005",
|
| 76 |
+
"filename": "flags.py",
|
| 77 |
+
"code": "def should_run(user_input):\n if user_input == True:\n return True\n return False",
|
| 78 |
+
"gold_issues": [
|
| 79 |
+
{
|
| 80 |
+
"issue_id": "medium_005_bool_compare",
|
| 81 |
+
"line": 2,
|
| 82 |
+
"issue_type": "LOGIC",
|
| 83 |
+
"severity": "LOW",
|
| 84 |
+
"description": "Explicit comparison to True is brittle and can mis-handle truthy values.",
|
| 85 |
+
"required": true,
|
| 86 |
+
"explanation_keywords": ["true", "truthy", "boolean", "comparison", "if"]
|
| 87 |
+
}
|
| 88 |
+
],
|
| 89 |
+
"must_approve": false,
|
| 90 |
+
"must_reject": true
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"snippet_id": "medium_006",
|
| 94 |
+
"filename": "ranges.py",
|
| 95 |
+
"code": "def between(value, start, end):\n if value >= start or value <= end:\n return True\n return False",
|
| 96 |
+
"gold_issues": [
|
| 97 |
+
{
|
| 98 |
+
"issue_id": "medium_006_boolean_logic",
|
| 99 |
+
"line": 2,
|
| 100 |
+
"issue_type": "LOGIC",
|
| 101 |
+
"severity": "HIGH",
|
| 102 |
+
"description": "Uses `or` instead of `and`, so the range check almost always passes.",
|
| 103 |
+
"required": true,
|
| 104 |
+
"explanation_keywords": ["or", "and", "range", "always", "boolean"]
|
| 105 |
+
}
|
| 106 |
+
],
|
| 107 |
+
"must_approve": false,
|
| 108 |
+
"must_reject": true
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"snippet_id": "medium_007",
|
| 112 |
+
"filename": "cleanup.py",
|
| 113 |
+
"code": "def remove_empty(values):\n for value in values:\n if not value:\n values.remove(value)\n return values",
|
| 114 |
+
"gold_issues": [
|
| 115 |
+
{
|
| 116 |
+
"issue_id": "medium_007_mutation_during_iteration",
|
| 117 |
+
"line": 4,
|
| 118 |
+
"issue_type": "LOGIC",
|
| 119 |
+
"severity": "HIGH",
|
| 120 |
+
"description": "Mutates the list while iterating, causing elements to be skipped.",
|
| 121 |
+
"required": true,
|
| 122 |
+
"explanation_keywords": ["mutate", "iteration", "remove", "skip", "list"]
|
| 123 |
+
}
|
| 124 |
+
],
|
| 125 |
+
"must_approve": false,
|
| 126 |
+
"must_reject": true
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"snippet_id": "medium_008",
|
| 130 |
+
"filename": "averages.py",
|
| 131 |
+
"code": "def average(numbers):\n if not numbers:\n return 0\n return sum(numbers) / (len(numbers) - 1)",
|
| 132 |
+
"gold_issues": [
|
| 133 |
+
{
|
| 134 |
+
"issue_id": "medium_008_divisor",
|
| 135 |
+
"line": 4,
|
| 136 |
+
"issue_type": "LOGIC",
|
| 137 |
+
"severity": "HIGH",
|
| 138 |
+
"description": "Divides by len(numbers) - 1, producing the wrong average and crashing for one item.",
|
| 139 |
+
"required": true,
|
| 140 |
+
"explanation_keywords": ["average", "divide", "len", "minus 1", "wrong"]
|
| 141 |
+
}
|
| 142 |
+
],
|
| 143 |
+
"must_approve": false,
|
| 144 |
+
"must_reject": true
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"snippet_id": "medium_009",
|
| 148 |
+
"filename": "retry.py",
|
| 149 |
+
"code": "def fetch_name(client):\n try:\n return client.name()\n except ValueError:\n return \"\"\n return None",
|
| 150 |
+
"gold_issues": [
|
| 151 |
+
{
|
| 152 |
+
"issue_id": "medium_009_unreachable_fallback",
|
| 153 |
+
"line": 6,
|
| 154 |
+
"issue_type": "LOGIC",
|
| 155 |
+
"severity": "LOW",
|
| 156 |
+
"description": "The final return is unreachable and suggests the error path was designed incorrectly.",
|
| 157 |
+
"required": false,
|
| 158 |
+
"explanation_keywords": ["unreachable", "return", "fallback", "dead code"]
|
| 159 |
+
},
|
| 160 |
+
{
|
| 161 |
+
"issue_id": "medium_009_swallow",
|
| 162 |
+
"line": 5,
|
| 163 |
+
"issue_type": "LOGIC",
|
| 164 |
+
"severity": "MEDIUM",
|
| 165 |
+
"description": "Returns an empty string on ValueError, masking the failure as a valid result.",
|
| 166 |
+
"required": true,
|
| 167 |
+
"explanation_keywords": ["empty string", "mask", "failure", "valid result", "error"]
|
| 168 |
+
}
|
| 169 |
+
],
|
| 170 |
+
"must_approve": false,
|
| 171 |
+
"must_reject": true
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"snippet_id": "medium_010",
|
| 175 |
+
"filename": "tokens.py",
|
| 176 |
+
"code": "def normalize_token(token):\n if token is \"\":\n return None\n return token.strip()",
|
| 177 |
+
"gold_issues": [
|
| 178 |
+
{
|
| 179 |
+
"issue_id": "medium_010_is_compare",
|
| 180 |
+
"line": 2,
|
| 181 |
+
"issue_type": "LOGIC",
|
| 182 |
+
"severity": "MEDIUM",
|
| 183 |
+
"description": "Uses `is` for string comparison instead of equality.",
|
| 184 |
+
"required": true,
|
| 185 |
+
"explanation_keywords": ["is", "string", "comparison", "equality", "identity"]
|
| 186 |
+
}
|
| 187 |
+
],
|
| 188 |
+
"must_approve": false,
|
| 189 |
+
"must_reject": true
|
| 190 |
+
}
|
| 191 |
+
]
|
server/grading.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic task graders for the code-review benchmark."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from typing import Dict, Iterable, List, Optional
|
| 7 |
+
|
| 8 |
+
try:
|
| 9 |
+
from ..models import (
|
| 10 |
+
ActionType,
|
| 11 |
+
CodeReviewSnippet,
|
| 12 |
+
GoldIssue,
|
| 13 |
+
IssueType,
|
| 14 |
+
ReviewComment,
|
| 15 |
+
Severity,
|
| 16 |
+
)
|
| 17 |
+
except ImportError:
|
| 18 |
+
from models import ( # type: ignore
|
| 19 |
+
ActionType,
|
| 20 |
+
CodeReviewSnippet,
|
| 21 |
+
GoldIssue,
|
| 22 |
+
IssueType,
|
| 23 |
+
ReviewComment,
|
| 24 |
+
Severity,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _normalize_text(value: Optional[str]) -> str:
|
| 29 |
+
return " ".join((value or "").lower().split())
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _keyword_match(text: str, keywords: Iterable[str]) -> bool:
|
| 33 |
+
normalized = _normalize_text(text)
|
| 34 |
+
return any(_normalize_text(keyword) in normalized for keyword in keywords if keyword)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _keyword_match_score(text: str, keywords: Iterable[str]) -> float:
|
| 38 |
+
"""
|
| 39 |
+
FIX: Returns partial score 0.0-1.0 based on how many keywords matched.
|
| 40 |
+
Old code: binary match (any keyword → True/False).
|
| 41 |
+
New code: count matches → partial credit even with 1 keyword hit.
|
| 42 |
+
"""
|
| 43 |
+
normalized = _normalize_text(text)
|
| 44 |
+
kw_list = [k for k in keywords if k]
|
| 45 |
+
if not kw_list:
|
| 46 |
+
return 0.0
|
| 47 |
+
hits = sum(1 for kw in kw_list if _normalize_text(kw) in normalized)
|
| 48 |
+
return hits / len(kw_list)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _terminal_action(history: List[ReviewComment]) -> Optional[ActionType]:
    """Return the most recent terminal action (approve / request changes).

    Scans the history from newest to oldest and returns the first action that
    ends a review; ``None`` when the review has not been concluded.
    """
    terminal_kinds = {ActionType.APPROVE, ActionType.REQUEST_CHANGES}
    for entry in reversed(history):
        if entry.action_type in terminal_kinds:
            return entry.action_type
    return None
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass
class GradeResult:
    """Aggregate outcome of grading one review episode.

    Produced by ``_build_result``; ``score`` is already clamped to [0, 1] and
    the precision/recall/f1 fields are derived from the match counts.
    """

    score: float  # final clamped score in [0.0, 1.0]
    precision: float  # true_positives / (true_positives + false_positives)
    recall: float  # required_found / required_total
    f1: float  # harmonic mean of precision and recall (0.0 when both are 0)
    true_positives: int  # distinct gold issues matched by at least one comment
    false_positives: int  # comments that matched no gold issue at all
    missed_issues: int  # required issues left with zero credit
    required_found: int  # required issues that earned non-zero credit
    required_total: int  # total number of required gold issues
    bonus_found: int  # optional (non-required) issues that earned credit
    matched_issue_ids: List[str]  # sorted ids of all matched gold issues
    breakdown: Dict[str, float]  # per-component score contributions and metrics
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def grade_review(
    task_id: str,
    snippet: CodeReviewSnippet,
    history: List[ReviewComment],
    duplicate_comments: int,
) -> GradeResult:
    """Grade a completed or in-progress review deterministically.

    Filters the history down to ADD_COMMENT actions and dispatches to the
    difficulty-specific grader; any unrecognized task id falls through to the
    hard grader.
    """
    comments: List[ReviewComment] = []
    for entry in history:
        if entry.action_type == ActionType.ADD_COMMENT:
            comments.append(entry)

    graders = {
        "task_easy": _grade_easy,
        "task_medium": _grade_medium,
    }
    grader = graders.get(task_id, _grade_hard)
    return grader(snippet, comments, history, duplicate_comments)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _grade_easy(
    snippet: CodeReviewSnippet,
    comments: List[ReviewComment],
    history: List[ReviewComment],
    duplicate_comments: int,
) -> GradeResult:
    """Grade the easy task: distance-tiered credit per gold issue.

    Each comment is compared against every gold issue; the best credit per
    issue is kept, so duplicate comments cannot stack credit. A comment that
    earns no credit against any issue counts as a false positive.
    """
    required_issues = [issue for issue in snippet.gold_issues if issue.required]
    # Avoid division by zero when a snippet has no required issues.
    required_denominator = max(len(required_issues), 1)

    # FIX: Start credit at 0 for every issue
    best_credit: Dict[str, float] = {issue.issue_id: 0.0 for issue in snippet.gold_issues}
    matched_ids: set[str] = set()
    false_positives = 0

    for comment in comments:
        positive = False
        comment_text = f"{comment.comment or ''} {comment.suggestion or ''}"

        for issue in snippet.gold_issues:
            # NOTE(review): a comment with no line number is skipped here and
            # therefore always ends up counted as a false positive below.
            if comment.line_number is None:
                continue

            distance = abs(comment.line_number - issue.line)
            credit = 0.0

            if issue.required:
                # FIX: More generous distance tolerance + keyword fallback
                if comment.issue_type == issue.issue_type:
                    if distance <= 1:
                        credit = 0.30 / required_denominator
                    elif distance <= 3:
                        credit = 0.15 / required_denominator  # FIX: was 0.10
                    elif distance <= 5:
                        credit = 0.08 / required_denominator  # FIX: new tier
                elif _keyword_match(comment_text, getattr(issue, "explanation_keywords", [])):
                    # FIX: Wrong issue_type but comment mentions the bug → partial credit
                    if distance <= 3:
                        credit = 0.08 / required_denominator
            else:
                # Bonus issues: flat (non-normalized) credit, tighter window.
                if distance <= 3:
                    if comment.issue_type == issue.issue_type:
                        credit = 0.05
                    elif _keyword_match(comment_text, getattr(issue, "explanation_keywords", [])):
                        credit = 0.02  # FIX: small credit for keyword match

            if credit > 0:
                positive = True
                # Keep only the best single-comment credit per issue.
                best_credit[issue.issue_id] = max(best_credit[issue.issue_id], credit)
                matched_ids.add(issue.issue_id)

        if not positive:
            false_positives += 1

    required_score = sum(best_credit[issue.issue_id] for issue in required_issues)

    # Bonus credit is capped so optional finds cannot dominate the score.
    bonus_score = min(
        sum(
            best_credit[issue.issue_id]
            for issue in snippet.gold_issues
            if not issue.required
        ),
        0.15,
    )

    # FIX: Reduced false positive penalty — was 0.05 per FP, now 0.03
    # Prevents over-penalising agents that flag too many issues
    false_positive_penalty = min(false_positives * 0.03, 0.15)

    # Reward/penalize the final approve vs request-changes decision.
    final_action = _terminal_action(history)
    action_adjustment = 0.0
    if snippet.must_reject and final_action == ActionType.REQUEST_CHANGES:
        action_adjustment = 0.10
    elif snippet.must_reject and final_action == ActionType.APPROVE:
        action_adjustment = -0.10

    raw_score = required_score + bonus_score - false_positive_penalty + action_adjustment

    required_found = sum(1 for issue in required_issues if best_credit[issue.issue_id] > 0)
    bonus_found = sum(
        1
        for issue in snippet.gold_issues
        if not issue.required and best_credit[issue.issue_id] > 0
    )

    return _build_result(
        score=raw_score,
        matched_issue_ids=sorted(matched_ids),
        false_positives=false_positives,
        required_found=required_found,
        required_total=len(required_issues),
        bonus_found=bonus_found,
        duplicate_comments=duplicate_comments,
        breakdown={
            "required_score": required_score,
            "bonus_score": bonus_score,
            "false_positive_penalty": -false_positive_penalty,
            "action_adjustment": action_adjustment,
        },
    )
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def _grade_medium(
    snippet: CodeReviewSnippet,
    comments: List[ReviewComment],
    history: List[ReviewComment],
    duplicate_comments: int,
) -> GradeResult:
    """Grade the medium (logic-bug) task.

    Adds a separate explanation-quality bonus on top of location credit:
    location credit comes from LOGIC-type matches or keyword matches within a
    distance window, while explanation credit scales with the fraction of the
    issue's keywords mentioned in the comment.
    """
    required_issues = [issue for issue in snippet.gold_issues if issue.required]
    # Avoid division by zero when a snippet has no required issues.
    required_denominator = max(len(required_issues), 1)
    best_credit: Dict[str, float] = {issue.issue_id: 0.0 for issue in snippet.gold_issues}
    explanation_credit: Dict[str, float] = {issue.issue_id: 0.0 for issue in snippet.gold_issues}
    matched_ids: set[str] = set()
    false_positives = 0

    for comment in comments:
        positive = False
        comment_text = f"{comment.comment or ''} {comment.suggestion or ''}"

        for issue in snippet.gold_issues:
            if comment.line_number is None:
                continue

            distance = abs(comment.line_number - issue.line)

            # FIX: Relaxed from distance <= 5 to distance <= 8
            if distance > 8:
                continue

            credit = 0.0
            keyword_match = _keyword_match(comment_text, issue.explanation_keywords)

            # FIX: Old code required BOTH issue_type match AND exact/near line.
            # New code: issue_type OR keyword match gives credit, distance tiers.
            if comment.issue_type == IssueType.LOGIC and issue.issue_type == IssueType.LOGIC:
                if distance <= 1:
                    # FIX: was "distance == 0" — now ±1 for full credit
                    credit = 0.25 / required_denominator if issue.required else 0.05
                elif distance <= 3:
                    credit = 0.15 / required_denominator if issue.required else 0.03  # FIX: was 0.10
                elif distance <= 8:
                    credit = 0.08 / required_denominator if issue.required else 0.02  # FIX: new tier
            elif keyword_match:
                # FIX: keyword match alone is worth more — was 0.05, now 0.10
                if distance <= 3:
                    credit = 0.10 / required_denominator if issue.required else 0.03
                elif distance <= 8:
                    credit = 0.05 / required_denominator if issue.required else 0.01

            if credit > 0:
                positive = True
                # Keep only the best single-comment credit per issue.
                best_credit[issue.issue_id] = max(best_credit[issue.issue_id], credit)
                matched_ids.add(issue.issue_id)

            # FIX: Use partial keyword score instead of binary
            kw_score = _keyword_match_score(comment_text, issue.explanation_keywords)
            if kw_score > 0:
                explanation_credit[issue.issue_id] = max(
                    explanation_credit[issue.issue_id],
                    # FIX: Scale explanation bonus by keyword match quality
                    (0.05 * kw_score) / required_denominator if issue.required else (0.02 * kw_score),
                )

        if not positive:
            false_positives += 1

    base_score = sum(best_credit.values()) + sum(explanation_credit.values())

    # FIX: Reduced FP penalty — was 0.08 per FP, now 0.05
    false_positive_penalty = min(false_positives * 0.05, 0.25)

    # Reward/penalize the final approve vs request-changes decision.
    final_action = _terminal_action(history)
    action_adjustment = 0.0
    if snippet.must_reject and final_action == ActionType.REQUEST_CHANGES:
        action_adjustment = 0.10
    elif snippet.must_reject and final_action == ActionType.APPROVE:
        action_adjustment = -0.15

    required_found = sum(1 for issue in required_issues if best_credit[issue.issue_id] > 0)
    bonus_found = sum(
        1
        for issue in snippet.gold_issues
        if not issue.required and best_credit[issue.issue_id] > 0
    )

    return _build_result(
        score=base_score - false_positive_penalty + action_adjustment,
        matched_issue_ids=sorted(matched_ids),
        false_positives=false_positives,
        required_found=required_found,
        required_total=len(required_issues),
        bonus_found=bonus_found,
        duplicate_comments=duplicate_comments,
        breakdown={
            "logic_score": sum(best_credit.values()),
            "explanation_score": sum(explanation_credit.values()),
            "false_positive_penalty": -false_positive_penalty,
            "action_adjustment": action_adjustment,
        },
    )
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def _grade_hard(
    snippet: CodeReviewSnippet,
    comments: List[ReviewComment],
    history: List[ReviewComment],
    duplicate_comments: int,
) -> GradeResult:
    """Grade the hard (security) task.

    Combines four credit channels per issue — location match, OWASP-category
    mention, fix-suggestion mention, and severity agreement — plus penalties
    for false positives, missed critical issues, and a wrong terminal action.
    All per-issue channels use max/min so repeated comments cannot stack.
    """
    required_issues = [issue for issue in snippet.gold_issues if issue.required]
    # Avoid division by zero when a snippet has no required issues.
    required_denominator = max(len(required_issues), 1)
    best_credit: Dict[str, float] = {issue.issue_id: 0.0 for issue in snippet.gold_issues}
    owasp_credit: Dict[str, float] = {issue.issue_id: 0.0 for issue in snippet.gold_issues}
    fix_credit: Dict[str, float] = {issue.issue_id: 0.0 for issue in snippet.gold_issues}
    severity_credit: Dict[str, float] = {issue.issue_id: 0.0 for issue in snippet.gold_issues}
    matched_ids: set[str] = set()
    false_positives = 0

    for comment in comments:
        positive = False
        comment_text = f"{comment.comment or ''} {comment.suggestion or ''}"

        for issue in snippet.gold_issues:
            # FIX: Was exact line match only (distance == 0).
            # Security vulns span multiple lines — now ±2 tolerance.
            if comment.line_number is None:
                continue
            distance = abs(comment.line_number - issue.line)
            if distance > 2:  # FIX: was `!= issue.line` (zero tolerance)
                continue

            credit = 0.0
            if comment.issue_type == IssueType.SECURITY and issue.issue_type == IssueType.SECURITY:
                if distance == 0:
                    credit = 0.20 / required_denominator if issue.required else 0.05
                else:
                    # FIX: ±1-2 lines gets partial credit (was zero)
                    credit = 0.12 / required_denominator if issue.required else 0.03

            # FIX: Even if issue_type is wrong, keyword match on SECURITY issue → small credit
            elif _keyword_match(comment_text, getattr(issue, "owasp_keywords", []) + getattr(issue, "fix_keywords", [])):
                if distance <= 2:
                    credit = 0.06 / required_denominator if issue.required else 0.02

            if credit > 0:
                positive = True
                matched_ids.add(issue.issue_id)
                # Keep only the best single-comment credit per issue.
                best_credit[issue.issue_id] = max(best_credit[issue.issue_id], credit)

            # OWASP channel: naming the category (or its keywords) earns credit
            # even when the location credit above was zero.
            owasp_kw = list(getattr(issue, "owasp_keywords", []))
            owasp_cat = [issue.owasp_category] if getattr(issue, "owasp_category", None) else []
            if _keyword_match(comment_text, owasp_kw + owasp_cat):
                owasp_credit[issue.issue_id] = max(
                    owasp_credit[issue.issue_id],
                    0.10 / required_denominator if issue.required else 0.02,
                )

            # Fix channel: suggesting the expected remediation.
            fix_kw = list(getattr(issue, "fix_keywords", []))
            if _keyword_match(comment_text, fix_kw):
                fix_credit[issue.issue_id] = max(
                    fix_credit[issue.issue_id],
                    0.05 / required_denominator if issue.required else 0.02,
                )

            # Severity channel: reward matching (or safely over-rating) the
            # severity; penalize under-rating a critical issue.
            if comment.severity in {Severity.HIGH, Severity.CRITICAL}:
                if comment.severity == issue.severity or (
                    issue.severity == Severity.HIGH and comment.severity == Severity.CRITICAL
                ):
                    severity_credit[issue.issue_id] = max(
                        severity_credit[issue.issue_id], 0.05 / required_denominator
                    )
            elif issue.severity == Severity.CRITICAL and comment.severity in {
                Severity.LOW,
                Severity.MEDIUM,
            }:
                # FIX: Only penalise if we actually matched (was applying even with no match)
                if best_credit[issue.issue_id] > 0:
                    severity_credit[issue.issue_id] = min(
                        severity_credit[issue.issue_id], -0.05 / required_denominator
                    )

        if not positive:
            false_positives += 1

    # Missing critical penalty
    missing_critical_penalty = 0.0
    for issue in required_issues:
        if issue.severity == Severity.CRITICAL and best_credit[issue.issue_id] == 0:
            missing_critical_penalty += 0.15

    # FIX: Reduced FP penalty for hard task — was 0.10, now 0.07
    # Hard tasks have many lines so innocent FPs should cost less
    false_positive_penalty = min(false_positives * 0.07, 0.35)

    # Reward/penalize the final approve vs request-changes decision.
    final_action = _terminal_action(history)
    action_adjustment = 0.0
    if snippet.must_reject and final_action == ActionType.REQUEST_CHANGES:
        action_adjustment = 0.10
    elif snippet.must_reject and final_action == ActionType.APPROVE:
        action_adjustment = -0.20

    required_found = sum(1 for issue in required_issues if best_credit[issue.issue_id] > 0)
    bonus_found = sum(
        1
        for issue in snippet.gold_issues
        if not issue.required and best_credit[issue.issue_id] > 0
    )

    return _build_result(
        score=(
            sum(best_credit.values())
            + sum(owasp_credit.values())
            + sum(fix_credit.values())
            + sum(severity_credit.values())
            - false_positive_penalty
            - missing_critical_penalty
            + action_adjustment
        ),
        matched_issue_ids=sorted(matched_ids),
        false_positives=false_positives,
        required_found=required_found,
        required_total=len(required_issues),
        bonus_found=bonus_found,
        duplicate_comments=duplicate_comments,
        breakdown={
            "security_score": sum(best_credit.values()),
            "owasp_score": sum(owasp_credit.values()),
            "fix_score": sum(fix_credit.values()),
            "severity_score": sum(severity_credit.values()),
            "false_positive_penalty": -false_positive_penalty,
            "missing_critical_penalty": -missing_critical_penalty,
            "action_adjustment": action_adjustment,
        },
    )
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
def _build_result(
    *,
    score: float,
    matched_issue_ids: List[str],
    false_positives: int,
    required_found: int,
    required_total: int,
    bonus_found: int,
    duplicate_comments: int,
    breakdown: Dict[str, float],
) -> GradeResult:
    """Assemble a GradeResult: clamp the score and derive P/R/F1 metrics.

    The derived metrics are also folded into a copy of ``breakdown`` so the
    full picture is available in one dictionary.
    """
    final_score = min(max(score, 0.0), 1.0)
    tp = len(matched_issue_ids)
    missed = max(required_total - required_found, 0)
    # max(..., 1) guards the zero-comment / zero-required edge cases.
    precision = tp / max(tp + false_positives, 1)
    recall = required_found / max(required_total, 1)
    if precision + recall:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    enriched = dict(breakdown)
    enriched["duplicate_comments"] = float(duplicate_comments)
    enriched["precision"] = precision
    enriched["recall"] = recall
    enriched["f1"] = f1
    enriched["score"] = final_score

    return GradeResult(
        score=final_score,
        precision=precision,
        recall=recall,
        f1=f1,
        true_positives=tp,
        false_positives=false_positives,
        missed_issues=missed,
        required_found=required_found,
        required_total=required_total,
        bonus_found=bonus_found,
        matched_issue_ids=matched_issue_ids,
        breakdown=enriched,
    )
|
server/python_env_environment.py
ADDED
|
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Python code-review environment implementation."""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations

from dataclasses import dataclass, field
from datetime import UTC, datetime
from typing import Dict, Iterable, List, Optional
from uuid import uuid4
|
| 15 |
+
|
| 16 |
+
from openenv.core.env_server.interfaces import Environment
|
| 17 |
+
from openenv.core.env_server.types import State
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from ..models import (
|
| 21 |
+
Difficulty,
|
| 22 |
+
PythonAction,
|
| 23 |
+
PythonEnvConfig,
|
| 24 |
+
PythonObservation,
|
| 25 |
+
PythonState,
|
| 26 |
+
ReviewFinding,
|
| 27 |
+
TaskDescriptor,
|
| 28 |
+
TaskEvaluation,
|
| 29 |
+
TaskMetadata,
|
| 30 |
+
)
|
| 31 |
+
except ImportError:
|
| 32 |
+
from models import ( # type: ignore
|
| 33 |
+
Difficulty,
|
| 34 |
+
PythonAction,
|
| 35 |
+
PythonEnvConfig,
|
| 36 |
+
PythonObservation,
|
| 37 |
+
PythonState,
|
| 38 |
+
ReviewFinding,
|
| 39 |
+
TaskDescriptor,
|
| 40 |
+
TaskEvaluation,
|
| 41 |
+
TaskMetadata,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
@dataclass(frozen=True)
class ReferenceFinding:
    """Hidden finding metadata used for deterministic grading.

    One entry per known defect in a task's code; agents never see these
    directly — they are compared against submitted findings when scoring.
    """

    rule_id: str  # stable identifier for the defect (e.g. "mutable-default")
    title: str  # short human-readable summary of the defect
    line: int  # 1-based line in the task's code where the defect lives
    category: str  # e.g. "bug", "security", "maintainability"
    severity: str  # e.g. "info", "warning", "critical"
    rationale: str  # why this is a problem
    recommendation: str  # expected fix direction
    weight: float  # relative contribution to the task score
    # BUG FIX: the original read ``Field(default_factory=list)`` — a
    # pydantic-style helper that is never imported here, so class creation
    # raised NameError. A stdlib dataclass needs ``dataclasses.field`` to get
    # a fresh (non-shared) list default per instance.
    keywords: List[str] = field(default_factory=list)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@dataclass(frozen=True)
class ReviewTask:
    """A visible task plus its hidden grading references."""

    # Public task description (id, difficulty, code, limits) shown to the agent.
    descriptor: TaskDescriptor
    # Hidden reference findings used to score submitted reviews.
    references: tuple[ReferenceFinding, ...]
    # Hint text returned when the agent performs a "request_hint" action.
    hint: str
    # Optional corrected version of the task's code, when one is defined.
    patched_code: Optional[str] = None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# Static bank of review tasks keyed by task id. Each entry pairs the visible
# descriptor with the hidden reference findings used for grading.
TASK_BANK: Dict[str, ReviewTask] = {
    "py-review-easy": ReviewTask(
        descriptor=TaskDescriptor(
            task_id="py-review-easy",
            difficulty="easy",
            title="Mutable default argument",
            objective="Find the correctness issue and explain a safe fix.",
            code=(
                "def add_tag(tag, tags=[]):\n"
                "    tags.append(tag)\n"
                "    return tags\n"
            ),
            max_steps=4,
            success_threshold=0.7,
        ),
        references=(
            ReferenceFinding(
                rule_id="mutable-default",
                title="Mutable default list is shared across calls",
                line=1,
                category="bug",
                severity="warning",
                rationale="The list persists between calls and leaks state.",
                recommendation="Use None as the default and create a new list inside the function.",
                weight=1.0,
                keywords=["mutable", "default", "list", "shared", "persists", "leaks"],
            ),
        ),
        hint="Look for state that survives between separate function calls.",
        patched_code=(
            "def add_tag(tag, tags=None):\n"
            "    if tags is None:\n"
            "        tags = []\n"
            "    tags.append(tag)\n"
            "    return tags\n"
        ),
    ),
    "py-review-medium": ReviewTask(
        descriptor=TaskDescriptor(
            task_id="py-review-medium",
            difficulty="medium",
            title="Unsafe shell invocation",
            objective="Review the snippet for security-sensitive behavior.",
            code=(
                "import os\n\n"
                "def run_backup(path):\n"
                "    os.system(f\"tar -czf backup.tgz {path}\")\n"
            ),
            max_steps=4,
            success_threshold=0.72,
        ),
        references=(
            ReferenceFinding(
                rule_id="shell-injection",
                title="User input is interpolated into a shell command",
                line=4,
                category="security",
                severity="critical",
                rationale="An attacker can inject shell metacharacters through the path argument.",
                recommendation="Use subprocess with an argument list instead of os.system.",
                weight=1.0,
                # BUG FIX: "unsantized" was a typo; as a grading keyword it
                # could never match a correctly spelled review comment.
                keywords=["shell", "injection", "os.system", "subprocess", "input", "unsanitized", "escaping"],
            ),
        ),
        hint="Check how external commands are invoked and whether user input is escaped.",
        patched_code=(
            "import subprocess\n\n"
            "def run_backup(path):\n"
            "    subprocess.run([\"tar\", \"-czf\", \"backup.tgz\", path], check=True)\n"
        ),
    ),
    "py-review-hard": ReviewTask(
        descriptor=TaskDescriptor(
            task_id="py-review-hard",
            difficulty="hard",
            title="Retry helper hides failures",
            objective="Identify correctness and maintainability issues in the retry logic.",
            code=(
                "import time\n\n"
                "def fetch_with_retry(client, url, retries=3):\n"
                "    last_error = None\n"
                "    for _ in range(retries):\n"
                "        try:\n"
                "            return client.get(url, timeout=1)\n"
                "        except Exception as exc:\n"
                "            last_error = exc\n"
                "        time.sleep(0.1)\n"
                "    return None\n"
            ),
            max_steps=4,
            success_threshold=0.74,
        ),
        references=(
            ReferenceFinding(
                rule_id="swallowed-error",
                title="Function swallows the final exception and returns None",
                line=10,
                category="bug",
                severity="warning",
                rationale="Callers cannot distinguish a failed request from a valid None result.",
                recommendation="Re-raise the last exception after retries are exhausted.",
                weight=0.65,
                keywords=["swallowed", "exception", "return none", "error handling"],
            ),
            ReferenceFinding(
                rule_id="broad-except",
                title="Broad exception handler catches unexpected failures",
                line=7,
                category="maintainability",
                severity="info",
                rationale="Catching Exception masks programming errors and interrupts.",
                recommendation="Catch only the client or network exceptions you expect to retry.",
                weight=0.35,
                keywords=["broad", "except", "catch exception"],
            ),
        ),
        hint="Consider what happens to the final error after the retry loop finishes.",
        patched_code=(
            "import time\n\n"
            "def fetch_with_retry(client, url, retries=3):\n"
            "    last_error = None\n"
            "    for _ in range(retries):\n"
            "        try:\n"
            "            return client.get(url, timeout=1)\n"
            "        except client.retryable_exceptions as exc:\n"
            "            last_error = exc\n"
            "        time.sleep(0.1)\n"
            "    if last_error is not None:\n"
            "        raise last_error\n"
        ),
    ),
}
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def _utc_now() -> str:
|
| 205 |
+
return datetime.now(UTC).isoformat()
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def _normalize_text(value: Optional[str]) -> str:
|
| 209 |
+
return " ".join((value or "").strip().lower().split())
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def _normalize_code(value: Optional[str]) -> str:
|
| 213 |
+
return "\n".join(line.rstrip() for line in (value or "").strip().splitlines())
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
class PythonEnvironment(Environment[PythonAction, PythonObservation, State]):
|
| 217 |
+
"""Deterministic benchmark environment for Python code review tasks."""
|
| 218 |
+
|
| 219 |
+
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 220 |
+
|
| 221 |
+
    def __init__(self, config: Optional[PythonEnvConfig] = None):
        """Initialize with an optional config; a default config is used otherwise.

        The task cursor starts at -1 so the first ``reset`` advances to the
        first entry of ``config.task_order``.
        """
        super().__init__()
        self._config = config or PythonEnvConfig()
        # Fresh episode state; replaced on every reset.
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._task_cursor = -1
        self._current_task: Optional[ReviewTask] = None
        # Findings accumulated across steps within the current episode.
        self._submitted_findings: List[ReviewFinding] = []
        self._hints_used = 0
        self._created_at = _utc_now()
|
| 230 |
+
|
| 231 |
+
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs,
    ) -> PythonObservation:
        """Start the next configured review task.

        Tasks are served round-robin from ``config.task_order``; an unknown
        task id falls back to the easy task. ``seed`` is accepted for
        interface compatibility but ignored (the environment is deterministic).
        """

        del seed, kwargs
        # Advance round-robin through the configured task order.
        self._task_cursor = (self._task_cursor + 1) % len(self._config.task_order)
        task_id = self._config.task_order[self._task_cursor]
        self._current_task = TASK_BANK.get(task_id, TASK_BANK["py-review-easy"])
        self._state = State(
            episode_id=episode_id or str(uuid4()),
            step_count=0,
        )
        # Clear per-episode accumulators.
        self._submitted_findings = []
        self._hints_used = 0
        self._created_at = _utc_now()
        return self._build_observation(
            feedback="New review task loaded. Submit findings or request a hint.",
            reward=0.0,
            done=False,
        )
|
| 255 |
+
|
| 256 |
+
def step(
|
| 257 |
+
self,
|
| 258 |
+
action: PythonAction,
|
| 259 |
+
timeout_s: Optional[float] = None,
|
| 260 |
+
**kwargs,
|
| 261 |
+
) -> PythonObservation:
|
| 262 |
+
"""Process one review action and return updated feedback."""
|
| 263 |
+
|
| 264 |
+
del timeout_s, kwargs
|
| 265 |
+
if self._current_task is None:
|
| 266 |
+
return self.reset()
|
| 267 |
+
|
| 268 |
+
self._state.step_count += 1
|
| 269 |
+
operation = action.operation
|
| 270 |
+
feedback = ""
|
| 271 |
+
reward = 0.0
|
| 272 |
+
done = False
|
| 273 |
+
|
| 274 |
+
if operation == "request_hint":
|
| 275 |
+
self._hints_used += 1
|
| 276 |
+
feedback = self._current_task.hint
|
| 277 |
+
evaluation = self._evaluate(self._submitted_findings, action.patched_code)
|
| 278 |
+
reward = evaluation.score
|
| 279 |
+
else:
|
| 280 |
+
if action.findings:
|
| 281 |
+
self._submitted_findings.extend(action.findings)
|
| 282 |
+
evaluation = self._evaluate(self._submitted_findings, action.patched_code)
|
| 283 |
+
reward = evaluation.score
|
| 284 |
+
if operation == "finalize":
|
| 285 |
+
done = True
|
| 286 |
+
feedback = (
|
| 287 |
+
"Review finalized. "
|
| 288 |
+
f"Matched {evaluation.matched_findings}/{evaluation.total_findings} "
|
| 289 |
+
"reference findings."
|
| 290 |
+
)
|
| 291 |
+
else:
|
| 292 |
+
feedback = (
|
| 293 |
+
f"Progress saved. Matched {evaluation.matched_findings}/"
|
| 294 |
+
f"{evaluation.total_findings} findings with score {evaluation.score:.2f}."
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
if self._state.step_count >= self._max_steps():
|
| 298 |
+
done = True
|
| 299 |
+
if operation != "finalize":
|
| 300 |
+
feedback = (
|
| 301 |
+
f"{feedback} Maximum steps reached."
|
| 302 |
+
if feedback
|
| 303 |
+
else "Maximum steps reached."
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
+
return self._build_observation(
|
| 307 |
+
feedback=feedback,
|
| 308 |
+
reward=reward,
|
| 309 |
+
done=done,
|
| 310 |
+
patched_code=action.patched_code,
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
def _build_observation(
|
| 314 |
+
self,
|
| 315 |
+
*,
|
| 316 |
+
feedback: str,
|
| 317 |
+
reward: float,
|
| 318 |
+
done: bool,
|
| 319 |
+
patched_code: Optional[str] = None,
|
| 320 |
+
) -> PythonObservation:
|
| 321 |
+
assert self._current_task is not None
|
| 322 |
+
evaluation = self._evaluate(self._submitted_findings, patched_code)
|
| 323 |
+
attempts_remaining = max(
|
| 324 |
+
self._max_steps() - self._state.step_count,
|
| 325 |
+
0,
|
| 326 |
+
)
|
| 327 |
+
return PythonObservation(
|
| 328 |
+
task=self._current_task.descriptor,
|
| 329 |
+
feedback=feedback,
|
| 330 |
+
submitted_findings=list(self._submitted_findings),
|
| 331 |
+
hints_used=self._hints_used,
|
| 332 |
+
attempts_remaining=attempts_remaining,
|
| 333 |
+
evaluation=evaluation,
|
| 334 |
+
score=evaluation.score,
|
| 335 |
+
review_time_ms=float(self._state.step_count * 125),
|
| 336 |
+
done=done,
|
| 337 |
+
reward=reward,
|
| 338 |
+
metadata={
|
| 339 |
+
"episode_id": self._state.episode_id,
|
| 340 |
+
"created_at": self._created_at,
|
| 341 |
+
"updated_at": _utc_now(),
|
| 342 |
+
},
|
| 343 |
+
)
|
| 344 |
+
|
| 345 |
+
def _evaluate(
|
| 346 |
+
self,
|
| 347 |
+
findings: Iterable[ReviewFinding],
|
| 348 |
+
patched_code: Optional[str],
|
| 349 |
+
) -> TaskEvaluation:
|
| 350 |
+
assert self._current_task is not None
|
| 351 |
+
|
| 352 |
+
references = self._current_task.references
|
| 353 |
+
matched_reference_ids: List[str] = []
|
| 354 |
+
matched_weight = 0.0
|
| 355 |
+
false_positives = 0
|
| 356 |
+
duplicate_findings = 0
|
| 357 |
+
|
| 358 |
+
seen_ids = set()
|
| 359 |
+
for finding in findings:
|
| 360 |
+
ref_id = self._match_reference(finding, references)
|
| 361 |
+
if ref_id is None:
|
| 362 |
+
false_positives += 1
|
| 363 |
+
continue
|
| 364 |
+
if ref_id in seen_ids:
|
| 365 |
+
duplicate_findings += 1
|
| 366 |
+
continue
|
| 367 |
+
seen_ids.add(ref_id)
|
| 368 |
+
matched_reference_ids.append(ref_id)
|
| 369 |
+
matched_weight += next(ref.weight for ref in references if ref.rule_id == ref_id)
|
| 370 |
+
|
| 371 |
+
total_weight = sum(ref.weight for ref in references) or 1.0
|
| 372 |
+
weighted_recall = min(matched_weight / total_weight, 1.0)
|
| 373 |
+
|
| 374 |
+
patch_score = 0.0
|
| 375 |
+
if self._current_task.patched_code and patched_code:
|
| 376 |
+
patch_score = float(
|
| 377 |
+
_normalize_code(patched_code) == _normalize_code(self._current_task.patched_code)
|
| 378 |
+
)
|
| 379 |
+
|
| 380 |
+
raw_score = (
|
| 381 |
+
weighted_recall
|
| 382 |
+
+ (self._config.patch_bonus_multiplier * patch_score)
|
| 383 |
+
- (self._config.false_positive_penalty * false_positives)
|
| 384 |
+
- (self._config.duplicate_penalty * duplicate_findings)
|
| 385 |
+
- (self._config.hint_penalty * self._hints_used)
|
| 386 |
+
)
|
| 387 |
+
score = max(0.0, min(raw_score, 1.0))
|
| 388 |
+
|
| 389 |
+
return TaskEvaluation(
|
| 390 |
+
matched_reference_ids=matched_reference_ids,
|
| 391 |
+
matched_findings=len(matched_reference_ids),
|
| 392 |
+
total_findings=len(references),
|
| 393 |
+
false_positives=false_positives,
|
| 394 |
+
duplicate_findings=duplicate_findings,
|
| 395 |
+
weighted_recall=weighted_recall,
|
| 396 |
+
patch_score=patch_score,
|
| 397 |
+
score=score,
|
| 398 |
+
passed=score >= self._current_task.descriptor.success_threshold,
|
| 399 |
+
)
|
| 400 |
+
|
| 401 |
+
def _match_reference(
|
| 402 |
+
self,
|
| 403 |
+
finding: ReviewFinding,
|
| 404 |
+
references: Iterable[ReferenceFinding],
|
| 405 |
+
) -> Optional[str]:
|
| 406 |
+
finding_rule = _normalize_text(finding.rule_id)
|
| 407 |
+
finding_title = _normalize_text(finding.title)
|
| 408 |
+
for reference in references:
|
| 409 |
+
if finding_rule and finding_rule == _normalize_text(reference.rule_id):
|
| 410 |
+
return reference.rule_id
|
| 411 |
+
line_matches = finding.line is not None and finding.line == reference.line
|
| 412 |
+
category_matches = finding.category == reference.category
|
| 413 |
+
title_matches = finding_title and (
|
| 414 |
+
finding_title in _normalize_text(reference.title)
|
| 415 |
+
or _normalize_text(reference.title) in finding_title
|
| 416 |
+
)
|
| 417 |
+
|
| 418 |
+
# Keyword match: check if any reference keywords are in the finding text
|
| 419 |
+
keyword_match = any(
|
| 420 |
+
_normalize_text(kw) in finding_title
|
| 421 |
+
for kw in getattr(reference, "keywords", [])
|
| 422 |
+
) if finding_title else False
|
| 423 |
+
|
| 424 |
+
# Relaxed matching: allow matching if the title or keywords match even if the line is missing
|
| 425 |
+
if (line_matches and (category_matches or title_matches)) or title_matches or keyword_match:
|
| 426 |
+
return reference.rule_id
|
| 427 |
+
return None
|
| 428 |
+
|
| 429 |
+
def _max_steps(self) -> int:
|
| 430 |
+
assert self._current_task is not None
|
| 431 |
+
return min(
|
| 432 |
+
self._current_task.descriptor.max_steps,
|
| 433 |
+
self._config.max_steps_per_task,
|
| 434 |
+
)
|
| 435 |
+
|
| 436 |
+
@property
|
| 437 |
+
def state(self) -> State:
|
| 438 |
+
"""Return the current environment state."""
|
| 439 |
+
|
| 440 |
+
return self._state
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
# try:
|
| 444 |
+
# from .review_runtime import ( # type: ignore
|
| 445 |
+
...
|
| 446 |
+
# )
|
| 447 |
+
|
| 448 |
+
# --- App Interface Shims ---
|
| 449 |
+
|
| 450 |
+
_GLOBAL_ENV: Optional[PythonEnvironment] = None


def _get_env() -> PythonEnvironment:
    """Return the process-wide environment singleton, creating it lazily."""
    global _GLOBAL_ENV
    env = _GLOBAL_ENV
    if env is None:
        env = PythonEnvironment()
        _GLOBAL_ENV = env
    return env
|
| 457 |
+
|
| 458 |
+
def get_current_state() -> PythonState:
    """Return a lightweight state snapshot of the global environment.

    Bug fix: the previous version always called ``_build_observation``,
    which asserts that a task is loaded — so a state request before the
    first ``reset()`` crashed with AssertionError. When no task is loaded
    yet we now report a bare, task-less state instead.
    """
    env = _get_env()
    if env._current_task is None:
        # No episode has started; an observation cannot be built yet.
        return PythonState(
            episode_id=env.state.episode_id,
            current_step=env.state.step_count,
            task_id=None,
            difficulty=None,
            done=False,
            last_feedback="",
        )
    obs = env._build_observation(feedback="State request", reward=0.0, done=False)
    # Convert PythonObservation to PythonState if needed
    return PythonState(
        episode_id=env.state.episode_id,
        current_step=env.state.step_count,
        task_id=obs.task.task_id if obs.task else None,
        difficulty=Difficulty(obs.task.difficulty) if obs.task else None,
        done=False,
        last_feedback=obs.feedback,
    )
|
| 470 |
+
|
| 471 |
+
def get_health_response() -> HealthResponse:
    """Report service liveness plus the number of bundled tasks."""
    total_tasks = len(TASK_BANK)
    return HealthResponse(
        status="ok",
        environment="python_env",
        task_count=total_tasks,
    )
|
| 477 |
+
|
| 478 |
+
def get_metrics_response() -> MetricsResponse:
    """Return an empty metrics payload (this shim tracks no metrics)."""
    response = MetricsResponse()
    return response
|
| 480 |
+
|
| 481 |
+
def get_tasks_response() -> TaskListResponse:
    """List available tasks, preferring the on-disk catalog when loadable."""
    from .task_bank import load_task_catalog
    try:
        tasks = load_task_catalog()
    except Exception:
        # Catalog unavailable; fall back below.
        tasks = []
    # If using local TASK_BANK, convert them
    if not tasks:
        tasks = []
        for tid, task in TASK_BANK.items():
            tasks.append(
                TaskMetadata(
                    task_id=tid,
                    name=task.descriptor.title,
                    difficulty=Difficulty(task.descriptor.difficulty),
                    description=task.descriptor.objective,
                    snippet_count=1,
                    max_steps=task.descriptor.max_steps,
                )
            )
    return TaskListResponse(tasks=tasks)
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv[core]>=0.2.0
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
pydantic>=2.12.5
|
| 5 |
+
httpx>=0.28.1
|
server/review_runtime.py
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Benchmark runtime for the Python code-review environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import random
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from datetime import UTC, datetime
|
| 8 |
+
from typing import Dict, List, Optional
|
| 9 |
+
from uuid import uuid4
|
| 10 |
+
|
| 11 |
+
from openenv.core.env_server.interfaces import Environment
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
from ..models import (
|
| 15 |
+
ActionType,
|
| 16 |
+
CodeReviewSnippet,
|
| 17 |
+
EpisodeMetrics,
|
| 18 |
+
HealthResponse,
|
| 19 |
+
IssueType,
|
| 20 |
+
MetricsResponse,
|
| 21 |
+
PythonAction,
|
| 22 |
+
PythonEnvConfig,
|
| 23 |
+
PythonObservation,
|
| 24 |
+
PythonState,
|
| 25 |
+
ReviewComment,
|
| 26 |
+
RewardSummary,
|
| 27 |
+
TaskListResponse,
|
| 28 |
+
)
|
| 29 |
+
from .grading import GradeResult, grade_review
|
| 30 |
+
from .task_bank import get_task_metadata, load_task_bank, load_task_catalog
|
| 31 |
+
except ImportError:
|
| 32 |
+
from models import ( # type: ignore
|
| 33 |
+
ActionType,
|
| 34 |
+
CodeReviewSnippet,
|
| 35 |
+
EpisodeMetrics,
|
| 36 |
+
HealthResponse,
|
| 37 |
+
IssueType,
|
| 38 |
+
MetricsResponse,
|
| 39 |
+
PythonAction,
|
| 40 |
+
PythonEnvConfig,
|
| 41 |
+
PythonObservation,
|
| 42 |
+
PythonState,
|
| 43 |
+
ReviewComment,
|
| 44 |
+
RewardSummary,
|
| 45 |
+
TaskListResponse,
|
| 46 |
+
)
|
| 47 |
+
from server.grading import GradeResult, grade_review # type: ignore
|
| 48 |
+
from server.task_bank import get_task_metadata, load_task_bank, load_task_catalog # type: ignore
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _utc_now() -> str:
|
| 52 |
+
return datetime.now(UTC).isoformat()
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _severity_reward(issue_severity: str, bonus_issue: bool) -> float:
|
| 56 |
+
if bonus_issue:
|
| 57 |
+
return 0.03
|
| 58 |
+
if issue_severity in {"CRITICAL", "HIGH"}:
|
| 59 |
+
return 0.15
|
| 60 |
+
if issue_severity == "MEDIUM":
|
| 61 |
+
return 0.10
|
| 62 |
+
return 0.05
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _false_positive_penalty(action_severity: Optional[str]) -> float:
|
| 66 |
+
if action_severity == "CRITICAL":
|
| 67 |
+
return -0.12
|
| 68 |
+
if action_severity == "HIGH":
|
| 69 |
+
return -0.08
|
| 70 |
+
return -0.04
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _line_window_for_task(task_id: str) -> int:
|
| 74 |
+
if task_id == "task_easy":
|
| 75 |
+
return 3
|
| 76 |
+
if task_id == "task_medium":
|
| 77 |
+
return 5
|
| 78 |
+
return 0
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
@dataclass
class EpisodeRuntime:
    """Mutable per-episode bookkeeping for one review session."""

    episode_id: str
    task_id: str
    snippet: CodeReviewSnippet
    current_step: int
    max_steps: int
    created_at: str
    # Ordered log of every action taken this episode.
    review_history: List[ReviewComment] = field(default_factory=list)
    cumulative_reward: float = 0.0
    done: bool = False
    last_feedback: str = ""
    # Gold issue ids already credited (prevents double-counting a hit).
    found_issue_ids: set[str] = field(default_factory=set)
    duplicate_comments: int = 0
    context_requests: int = 0
    skipped_clean_lines: int = 0
    skipped_issue_lines: int = 0
    # Line numbers that already received a comment (drives duplicate penalty).
    commented_lines: set[int] = field(default_factory=set)
    # Zeroed grade so observations are valid before the first grading pass.
    grade: GradeResult = field(
        default_factory=lambda: GradeResult(
            score=0.0,
            precision=0.0,
            recall=0.0,
            f1=0.0,
            true_positives=0,
            false_positives=0,
            missed_issues=0,
            required_found=0,
            required_total=0,
            bonus_found=0,
            matched_issue_ids=[],
            breakdown={},
        )
    )
    reward_summary: RewardSummary = field(default_factory=RewardSummary)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# Module-level episode shared across handler calls so the HTTP shims
# (metrics/health/state) can inspect the active episode.
_ACTIVE_EPISODE: Optional[EpisodeRuntime] = None
# Round-robin cursor over the configured task order.
_TASK_CURSOR = -1
# Per-task round-robin cursor into each task's snippet list.
_SNIPPET_CURSORS: Dict[str, int] = {task.task_id: -1 for task in load_task_catalog()}
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def _set_active_episode(episode: Optional[EpisodeRuntime]) -> None:
|
| 124 |
+
global _ACTIVE_EPISODE
|
| 125 |
+
_ACTIVE_EPISODE = episode
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _current_episode() -> Optional[EpisodeRuntime]:
    """Return the most recently published episode, or None when idle."""
    episode = _ACTIVE_EPISODE
    return episode
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _match_issue_for_action(
    task_id: str,
    snippet: CodeReviewSnippet,
    action: PythonAction,
    found_issue_ids: set[str],
) -> Optional[str]:
    """Return the id of the closest unclaimed gold issue matched by *action*.

    Only ADD_COMMENT actions carrying both a line number and an issue type
    can match. Candidates must share the action's issue type, be unclaimed,
    and lie within the task's line-distance window. The nearest wins; ties
    go to the earlier entry in ``snippet.gold_issues``.
    """
    is_comment = action.action_type == ActionType.ADD_COMMENT
    if not is_comment or action.line_number is None or action.issue_type is None:
        return None

    window = _line_window_for_task(task_id)
    best: Optional[str] = None
    best_gap = window + 1
    for candidate in snippet.gold_issues:
        if candidate.issue_id in found_issue_ids:
            continue
        if candidate.issue_type != action.issue_type:
            continue
        gap = abs(action.line_number - candidate.line)
        if gap <= window and gap < best_gap:
            best, best_gap = candidate.issue_id, gap
    return best
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def build_metrics(episode: EpisodeRuntime) -> EpisodeMetrics:
    """Flatten grading results and bookkeeping counters into EpisodeMetrics."""
    grade = episode.grade
    return EpisodeMetrics(
        precision=grade.precision,
        recall=grade.recall,
        f1=grade.f1,
        true_positives=grade.true_positives,
        false_positives=grade.false_positives,
        missed_issues=grade.missed_issues,
        required_found=grade.required_found,
        required_total=grade.required_total,
        bonus_found=grade.bonus_found,
        duplicate_comments=episode.duplicate_comments,
        context_requests=episode.context_requests,
        skipped_clean_lines=episode.skipped_clean_lines,
        skipped_issue_lines=episode.skipped_issue_lines,
        current_score=grade.score,
        cumulative_reward=episode.cumulative_reward,
        breakdown=grade.breakdown,
    )
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def build_state(episode: EpisodeRuntime) -> PythonState:
    """Project an EpisodeRuntime into the serializable PythonState model."""
    snippet = episode.snippet
    # NOTE(review): both step_count and current_step receive the same value —
    # presumably PythonState exposes both field names; confirm in models.py.
    return PythonState(
        episode_id=episode.episode_id,
        step_count=episode.current_step,
        task_id=episode.task_id,
        difficulty=get_task_metadata(episode.task_id).difficulty,
        snippet_id=snippet.snippet_id,
        current_step=episode.current_step,
        max_steps=episode.max_steps,
        done=episode.done,
        filename=snippet.filename,
        review_history=list(episode.review_history),
        metrics=build_metrics(episode),
        last_feedback=episode.last_feedback,
    )
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def get_tasks_response() -> TaskListResponse:
    """Expose the task catalog for the /tasks endpoint."""
    catalog = load_task_catalog()
    return TaskListResponse(tasks=catalog)
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def get_metrics_response() -> MetricsResponse:
    """Return live metrics for the active episode (empty payload when idle)."""
    episode = _current_episode()
    if episode is None:
        return MetricsResponse()
    return MetricsResponse(
        task_id=episode.task_id,
        snippet_id=episode.snippet.snippet_id,
        done=episode.done,
        metrics=build_metrics(episode),
    )
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def get_health_response() -> HealthResponse:
    """Liveness payload with total snippet count and active-episode ids."""
    episode = _current_episode()
    has_episode = episode is not None
    snippet_total = sum(len(items) for items in load_task_bank().values())
    return HealthResponse(
        status="ok",
        environment="python_code_review_env",
        task_count=snippet_total,
        active_task_id=episode.task_id if has_episode else None,
        active_snippet_id=episode.snippet.snippet_id if has_episode else None,
        active_episode_id=episode.episode_id if has_episode else None,
    )
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
def get_current_state() -> PythonState:
    """Snapshot the active episode as a PythonState (fresh default when idle)."""
    episode = _current_episode()
    if episode is None:
        return PythonState()
    return build_state(episode)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
class PythonReviewRuntime(Environment[PythonAction, PythonObservation, PythonState]):
    """Deterministic code-review benchmark environment with dense rewards."""

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self, config: Optional[PythonEnvConfig] = None):
        """Create the runtime; the episode is lazily created/restored."""
        super().__init__()
        self._config = config or PythonEnvConfig()
        self._episode: Optional[EpisodeRuntime] = None

    def _restore_episode(self) -> Optional[EpisodeRuntime]:
        """Return this instance's episode, falling back to the module-level
        active episode published by another instance (if any)."""
        if self._episode is not None:
            return self._episode
        self._episode = _current_episode()
        return self._episode

    def _select_task_id(self, seed: Optional[int]) -> str:
        """Pick a task id: seeded random choice, first task when rotation is
        off, otherwise module-wide round-robin."""
        task_order = list(self._config.task_order)
        if seed is not None:
            return random.Random(seed).choice(task_order)
        if not self._config.rotate_tasks:
            return task_order[0]
        global _TASK_CURSOR
        _TASK_CURSOR = (_TASK_CURSOR + 1) % len(task_order)
        return task_order[_TASK_CURSOR]

    def _select_snippet(self, task_id: str, seed: Optional[int]) -> CodeReviewSnippet:
        """Pick a snippet for *task_id*: seeded random or per-task round-robin."""
        snippets = load_task_bank()[task_id]
        if seed is not None:
            return random.Random(seed).choice(snippets)
        _SNIPPET_CURSORS[task_id] = (_SNIPPET_CURSORS[task_id] + 1) % len(snippets)
        return snippets[_SNIPPET_CURSORS[task_id]]

    def _terminal_reward(self, episode: EpisodeRuntime, action_type: ActionType) -> float:
        """End-of-episode bonus/penalty.

        Rewards completeness (all required issues found, zero false
        positives), a correct final verdict, and finishing early; wrongly
        approving a must-reject snippet is penalized.
        """
        reward = 0.0
        if episode.grade.required_found == episode.grade.required_total and episode.grade.required_total:
            reward += 0.20
        if episode.grade.false_positives == 0:
            reward += 0.10
        if action_type == ActionType.REQUEST_CHANGES and episode.snippet.must_reject:
            reward += 0.10
        if action_type == ActionType.APPROVE and episode.snippet.must_approve:
            reward += 0.15
        if action_type == ActionType.APPROVE and episode.snippet.must_reject:
            reward -= 0.25
        # Efficiency bonus: scales with unused steps.
        reward += 0.05 * (1 - (episode.current_step / max(episode.max_steps, 1)))
        return reward

    def reset(self, seed: Optional[int] = None, episode_id: Optional[str] = None, task_id: Optional[str] = None, **kwargs) -> PythonObservation:
        """Start a new episode on the given (or auto-selected) task/snippet."""
        del kwargs
        selected_task_id = task_id or self._select_task_id(seed)
        snippet = self._select_snippet(selected_task_id, seed)
        metadata = get_task_metadata(selected_task_id)
        episode = EpisodeRuntime(
            episode_id=episode_id or str(uuid4()),
            task_id=selected_task_id,
            snippet=snippet,
            current_step=0,
            max_steps=min(metadata.max_steps, self._config.max_steps_per_task),
            created_at=_utc_now(),
        )
        # Initial grade over an empty review history (zeroed metrics).
        episode.grade = grade_review(selected_task_id, snippet, episode.review_history, episode.duplicate_comments)
        episode.last_feedback = f"Loaded {metadata.name}. Review the code and submit comments line by line."
        self._episode = episode
        _set_active_episode(episode)
        return self._build_observation(episode, 0.0)

    def step(self, action: PythonAction, timeout_s: Optional[float] = None, **kwargs) -> PythonObservation:
        """Apply one review action, update grading, and return the observation.

        Handles ADD_COMMENT (issue hit / false positive / duplicate),
        SKIP_LINE, ASK_CONTEXT, and the terminal APPROVE / REQUEST_CHANGES
        decisions. Hitting the step cap also terminates the episode.
        """
        del timeout_s, kwargs
        episode = self._restore_episode()
        if episode is None:
            # step() before reset(): start a fresh episode instead of failing.
            return self.reset()
        if episode.done:
            # Finished episodes are inert; re-emit the last observation.
            return self._build_observation(episode, 0.0)

        episode.current_step += 1
        step_reward = 0.0
        breakdown: Dict[str, float] = {}
        feedback = ""
        matched_issue_ids: List[str] = []

        if action.action_type == ActionType.ADD_COMMENT:
            if action.line_number in episode.commented_lines:
                # Re-commenting a line is penalized but still graded below.
                episode.duplicate_comments += 1
                step_reward -= 0.08
                breakdown["duplicate_comment_penalty"] = -0.08
            issue_id = _match_issue_for_action(episode.task_id, episode.snippet, action, episode.found_issue_ids)
            if issue_id is not None:
                issue = next(item for item in episode.snippet.gold_issues if item.issue_id == issue_id)
                hit_reward = _severity_reward(issue.severity.value, not issue.required)
                step_reward += hit_reward
                breakdown["issue_hit"] = hit_reward
                episode.found_issue_ids.add(issue_id)
                matched_issue_ids = [issue_id]
                feedback = f"Recorded issue on line {action.line_number}."
            else:
                penalty = _false_positive_penalty(action.severity.value if action.severity else None)
                step_reward += penalty
                breakdown["false_positive_penalty"] = penalty
                feedback = "Comment did not match a benchmark issue."
            if action.line_number is not None:
                episode.commented_lines.add(action.line_number)

        elif action.action_type == ActionType.SKIP_LINE:
            assert action.line_number is not None
            required_issue_on_line = any(
                issue.required and issue.line == action.line_number
                for issue in episode.snippet.gold_issues
            )
            if required_issue_on_line:
                step_reward -= 0.10
                episode.skipped_issue_lines += 1
                breakdown["skip_issue_penalty"] = -0.10
                feedback = "Skipped a line with a required issue."
            else:
                # Small reward for correctly declaring a clean line.
                step_reward += 0.02
                episode.skipped_clean_lines += 1
                breakdown["skip_clean_reward"] = 0.02
                feedback = "Marked the line as clean."

        elif action.action_type == ActionType.ASK_CONTEXT:
            # Context is available but costs a small penalty each request.
            episode.context_requests += 1
            step_reward -= 0.03
            breakdown["ask_context_penalty"] = -0.03
            feedback = episode.snippet.context or episode.snippet.diff or "No additional context available."

        elif action.action_type in {ActionType.APPROVE, ActionType.REQUEST_CHANGES}:
            feedback = "Final review decision recorded."

        episode.review_history.append(
            ReviewComment(
                step_index=episode.current_step,
                action_type=action.action_type,
                line_number=action.line_number,
                issue_type=action.issue_type,
                severity=action.severity,
                comment=action.comment,
                suggestion=action.suggestion,
                question=action.question,
                matched_issue_ids=matched_issue_ids,
                reward_delta=step_reward,
            )
        )
        # Bound history growth; keeps only the most recent entries.
        if len(episode.review_history) > self._config.max_history_entries:
            episode.review_history = episode.review_history[-self._config.max_history_entries :]

        done = action.action_type in {ActionType.APPROVE, ActionType.REQUEST_CHANGES}
        if episode.current_step >= episode.max_steps:
            done = True
            feedback = f"{feedback} Maximum steps reached.".strip()

        # Regrade after every step so metrics/observations stay current.
        episode.grade = grade_review(episode.task_id, episode.snippet, episode.review_history, episode.duplicate_comments)
        if done:
            terminal_bonus = self._terminal_reward(episode, action.action_type)
            step_reward += terminal_bonus
            breakdown["terminal_bonus"] = terminal_bonus
            episode.done = True
            feedback = f"{feedback} Final score {episode.grade.score:.2f}.".strip()

        episode.cumulative_reward += step_reward
        episode.reward_summary = RewardSummary(
            step_reward=step_reward,
            cumulative_reward=episode.cumulative_reward,
            breakdown=breakdown,
            false_positives=episode.grade.false_positives,
            true_positives=episode.grade.true_positives,
            missed_issues=episode.grade.missed_issues,
        )
        episode.last_feedback = feedback or "Step complete."
        self._episode = episode
        _set_active_episode(episode)
        return self._build_observation(episode, step_reward)

    def _build_observation(self, episode: EpisodeRuntime, reward: float) -> PythonObservation:
        """Assemble the full observation for the current episode state."""
        lines = episode.snippet.code.splitlines()
        return PythonObservation(
            snippet_id=episode.snippet.snippet_id,
            code=episode.snippet.code,
            filename=episode.snippet.filename,
            language="python",
            context=episode.snippet.context,
            diff=episode.snippet.diff,
            line_count=len(lines),
            current_step=episode.current_step,
            max_steps=episode.max_steps,
            task_id=episode.task_id,
            review_history=list(episode.review_history),
            lines=lines,
            reward_summary=episode.reward_summary,
            metrics=build_metrics(episode),
            feedback=episode.last_feedback,
            done=episode.done,
            reward=reward,
            metadata={
                "episode_id": episode.episode_id,
                "created_at": episode.created_at,
                "updated_at": _utc_now(),
                "task_name": get_task_metadata(episode.task_id).name,
            },
        )

    @property
    def state(self) -> PythonState:
        """Current state snapshot; a default PythonState when no episode exists."""
        episode = self._restore_episode()
        return PythonState() if episode is None else build_state(episode)
|
server/task_bank.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dataset-backed task catalog for the Python code-review benchmark."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from functools import lru_cache
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, List
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from ..models import CodeReviewSnippet, Difficulty, TaskMetadata
|
| 12 |
+
except ImportError:
|
| 13 |
+
from models import CodeReviewSnippet, Difficulty, TaskMetadata # type: ignore
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Directory holding the snippet JSON files, located next to this module.
DATA_DIR = Path(__file__).with_name("data")


# Static catalog of the three task families. Each entry names its snippet
# file under DATA_DIR plus the metadata surfaced via /tasks.
TASK_DEFINITIONS: Dict[str, dict[str, object]] = {
    "task_easy": {
        "name": "Style & Convention Review",
        "difficulty": Difficulty.EASY,
        "description": "Find style, naming, formatting, and documentation issues.",
        "filename": "snippets_easy.json",
        "max_steps": 25,
    },
    "task_medium": {
        "name": "Logic Bug Detection",
        "difficulty": Difficulty.MEDIUM,
        "description": "Identify correctness issues in ordinary Python code.",
        "filename": "snippets_medium.json",
        "max_steps": 25,
    },
    "task_hard": {
        "name": "Security Vulnerability Audit",
        "difficulty": Difficulty.HARD,
        "description": "Review web and data-processing code for security flaws.",
        "filename": "snippets_hard.json",
        "max_steps": 25,
    },
}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@lru_cache(maxsize=1)
def load_task_bank() -> Dict[str, List[CodeReviewSnippet]]:
    """Read every snippet JSON file and validate it into model objects.

    Cached with ``lru_cache``: the JSON files are read from disk only once
    per process; subsequent calls return the same dict object.
    """

    def _read_family(spec: dict[str, object]) -> List[CodeReviewSnippet]:
        # Each family's snippets live in a JSON array under DATA_DIR.
        path = DATA_DIR / str(spec["filename"])
        payload = json.loads(path.read_text(encoding="utf-8"))
        return [CodeReviewSnippet.model_validate(entry) for entry in payload]

    return {family_id: _read_family(spec) for family_id, spec in TASK_DEFINITIONS.items()}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
@lru_cache(maxsize=1)
def load_task_catalog() -> List[TaskMetadata]:
    """Return visible task metadata for `/tasks` and environment resets.

    One ``TaskMetadata`` per family in ``TASK_DEFINITIONS``, in definition
    order.  Cached, like the task bank it is derived from.
    """

    bank = load_task_bank()
    return [
        TaskMetadata(
            task_id=family_id,
            name=str(spec["name"]),
            difficulty=spec["difficulty"],  # type: ignore[arg-type]
            description=str(spec["description"]),
            snippet_count=len(bank[family_id]),
            max_steps=int(spec["max_steps"]),
            min_score=0.0,
            max_score=1.0,
        )
        for family_id, spec in TASK_DEFINITIONS.items()
    ]
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def get_task_metadata(task_id: str) -> TaskMetadata:
    """Look up the metadata record for a single task family.

    Raises:
        KeyError: if *task_id* is not a known family.
    """

    matches = (entry for entry in load_task_catalog() if entry.task_id == task_id)
    try:
        return next(matches)
    except StopIteration:
        # Preserve the original contract: unknown ids surface as KeyError.
        raise KeyError(f"Unknown task_id: {task_id}") from None
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from fastapi.testclient import TestClient
|
| 4 |
+
import pytest
|
| 5 |
+
|
| 6 |
+
from models import (
|
| 7 |
+
ActionType,
|
| 8 |
+
IssueType,
|
| 9 |
+
PythonReviewAction,
|
| 10 |
+
Severity,
|
| 11 |
+
)
|
| 12 |
+
from server.app import app
|
| 13 |
+
from server.grading import grade_review
|
| 14 |
+
from server.python_env_environment import PythonEnvironment
|
| 15 |
+
from server.task_bank import load_task_bank
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _snippet_by_id(task_id: str, snippet_id: str):
    """Fetch one snippet from the loaded task bank by its id."""
    for candidate in load_task_bank()[task_id]:
        if candidate.snippet_id == snippet_id:
            return candidate
    # Mirror next()'s behavior on an exhausted iterator so callers observe
    # the same failure mode as the original genexpr-based helper.
    raise StopIteration
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def test_add_comment_requires_fields() -> None:
    """An ADD_COMMENT action without its payload fields must fail validation."""
    # Callable form of pytest.raises: asserts the constructor itself raises.
    pytest.raises(Exception, PythonReviewAction, action_type=ActionType.ADD_COMMENT)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_approve_rejects_extra_fields() -> None:
    """APPROVE must reject payload fields that only ADD_COMMENT accepts."""
    pytest.raises(
        Exception,
        PythonReviewAction,
        action_type=ActionType.APPROVE,
        comment="looks good",
    )
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_easy_grader_rewards_required_issue_and_request_changes() -> None:
    """A correct comment on the required issue plus REQUEST_CHANGES earns credit.

    FIX: the original imported ReviewComment mid-function (after the code that
    needed it) and built plain dicts in a loop only to re-validate them with
    ``model_validate``; the models are now constructed directly.
    """
    from models import ReviewComment  # local import mirrors the module's fallback style

    snippet = load_task_bank()["task_easy"][0]
    history = [
        PythonReviewAction(
            action_type=ActionType.ADD_COMMENT,
            line_number=4,
            issue_type=IssueType.STYLE,
            severity=Severity.LOW,
            comment="Ambiguous variable name l violates PEP8 E741.",
        ),
        PythonReviewAction(action_type=ActionType.REQUEST_CHANGES),
    ]
    comments = [
        ReviewComment(
            step_index=step,
            action_type=action.action_type,
            line_number=action.line_number,
            issue_type=action.issue_type,
            severity=action.severity,
            comment=action.comment,
        )
        for step, action in enumerate(history, start=1)
    ]

    result = grade_review(
        "task_easy",
        snippet,
        comments,
        duplicate_comments=0,
    )
    assert result.score > 0.35
    assert result.required_found >= 1
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def test_hard_grader_rewards_security_metadata() -> None:
    """A precise, well-justified security finding scores as a true positive."""
    from models import ReviewComment

    snippet = load_task_bank()["task_hard"][0]
    finding = ReviewComment(
        step_index=1,
        action_type=ActionType.ADD_COMMENT,
        line_number=2,
        issue_type=IssueType.SECURITY,
        severity=Severity.CRITICAL,
        comment="SQL injection risk. This is an OWASP injection issue because the query interpolates user input.",
        suggestion="Use a parameterized query with placeholders instead of string interpolation.",
    )
    result = grade_review("task_hard", snippet, [finding], duplicate_comments=0)
    assert result.score > 0.30
    assert result.true_positives == 1
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def test_environment_step_updates_metrics() -> None:
    """Commenting on a known required issue must register a true positive.

    FIX: the original called ``.model_copy()`` on the reset observation for no
    reason — the observation is only read, so the copy was dead work.
    """
    env = PythonEnvironment()
    observation = env.reset(task_id="task_easy")
    snippet = _snippet_by_id("task_easy", observation.snippet_id)
    # Pick any gold issue the grader marks as required for this snippet.
    issue = next(item for item in snippet.gold_issues if item.required)

    next_observation = env.step(
        PythonReviewAction(
            action_type=ActionType.ADD_COMMENT,
            line_number=issue.line,
            issue_type=issue.issue_type,
            severity=issue.severity,
            comment=issue.description,
        )
    )

    assert next_observation.reward is not None
    assert next_observation.metrics.true_positives >= 1
    assert next_observation.review_history[-1].matched_issue_ids
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def test_environment_terminal_action_sets_done() -> None:
    """A terminal REQUEST_CHANGES action must end the episode.

    FIX: the original bound the reset observation to an unused local;
    ``reset`` is called only for its side effect here.
    """
    env = PythonEnvironment()
    env.reset(task_id="task_easy")
    result = env.step(PythonReviewAction(action_type=ActionType.REQUEST_CHANGES))
    assert result.done is True
    assert result.metrics.current_score >= 0.0
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def test_api_smoke_endpoints() -> None:
    """End-to-end smoke test over every public HTTP endpoint."""
    client = TestClient(app)

    # Reset an episode and confirm the observation names the requested task.
    reset_response = client.post("/reset", json={"task_id": "task_easy"})
    assert reset_response.status_code == 200
    observation = reset_response.json()["observation"]
    assert observation["task_id"] == "task_easy"

    # Locate a required gold issue for the snippet the server chose.
    snippet = _snippet_by_id("task_easy", observation["snippet_id"])
    issue = next(item for item in snippet.gold_issues if item.required)

    # Submit a matching comment and confirm the grader registered a hit.
    action_payload = {
        "action_type": "ADD_COMMENT",
        "line_number": issue.line,
        "issue_type": issue.issue_type.value,
        "severity": issue.severity.value,
        "comment": issue.description,
    }
    step_response = client.post("/step", json={"action": action_payload})
    assert step_response.status_code == 200
    step_metrics = step_response.json()["observation"]["metrics"]
    assert step_metrics["true_positives"] >= 1

    tasks_response = client.get("/tasks")
    assert tasks_response.status_code == 200
    assert len(tasks_response.json()["tasks"]) == 3

    metrics_response = client.get("/metrics")
    assert metrics_response.status_code == 200
    assert "metrics" in metrics_response.json()

    health_response = client.get("/health")
    assert health_response.status_code == 200
    assert health_response.json()["status"] == "ok"

    schema_response = client.get("/schema")
    assert schema_response.status_code == 200
    assert "action" in schema_response.json()
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|