Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- Dockerfile +16 -0
- README.md +214 -5
- __init__.py +4 -0
- client.py +79 -0
- inference.py +313 -0
- models.py +96 -0
- openenv.yaml +5 -0
- pyproject.toml +22 -0
- requirements.txt +6 -0
- server/__init__.py +3 -0
- server/api_specs.py +639 -0
- server/app.py +39 -0
- server/environment.py +456 -0
- server/error_injectors.py +388 -0
- server/validators.py +150 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y --no-install-recommends gcc \
|
| 6 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 7 |
+
|
| 8 |
+
COPY requirements.txt .
|
| 9 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 10 |
+
|
| 11 |
+
COPY . .
|
| 12 |
+
|
| 13 |
+
EXPOSE 7860
|
| 14 |
+
|
| 15 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 16 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,219 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: API Debug Environment
|
| 3 |
+
emoji: "🔧"
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
base_path: /web
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# API Debug Environment
|
| 13 |
+
|
| 14 |
+
An OpenEnv reinforcement learning environment where LLM agents learn to debug malformed API requests. The agent receives a broken request and its API specification, then must diagnose the error, fix the request, and explain the fix.
|
| 15 |
+
|
| 16 |
+
Built for the Meta PyTorch OpenEnv Hackathon x Scaler School of Technology 2026.
|
| 17 |
+
|
| 18 |
+
## Why This Domain
|
| 19 |
+
|
| 20 |
+
Developers spend significant time debugging API contract mismatches. Research from Calendar Gym shows that malformed tool arguments caused more than half of agent failures. This environment trains agents to identify and fix these errors systematically.
|
| 21 |
+
|
| 22 |
+
**Real-world applications:** API gateway validation, automated debugging assistants, developer tooling, CI/CD request validation, LLM tool-use reliability.
|
| 23 |
+
|
| 24 |
+
## How It Works
|
| 25 |
+
|
| 26 |
+
1. On `reset()`, the environment picks a random API spec from 30 templates and injects 1-3 errors
|
| 27 |
+
2. The agent receives the broken request, headers, and the API specification
|
| 28 |
+
3. The agent submits a fix attempt via `step()`
|
| 29 |
+
4. The environment grades the attempt and returns structured feedback
|
| 30 |
+
5. The agent can iterate (multi-turn) using the feedback to improve
|
| 31 |
+
|
| 32 |
+
Each episode allows multiple attempts. Perfect answers on early steps earn full reward. Later steps get decayed reward, encouraging efficient debugging.
|
| 33 |
+
|
| 34 |
+
## Tasks
|
| 35 |
+
|
| 36 |
+
| Task | Objective | Max Steps | Errors | Grading |
|
| 37 |
+
|------|-----------|-----------|--------|---------|
|
| 38 |
+
| easy | Identify error type and affected fields | 3 | 1 | Deterministic: 0.6 x type_match + 0.4 x fields_match |
|
| 39 |
+
| medium | Fix the broken request | 5 | 1 | Deterministic: per-field validation against spec |
|
| 40 |
+
| hard | Fix request + explain for developers | 7 | 2-3 | 70% deterministic fix + 30% LLM-as-judge explanation |
|
| 41 |
+
|
| 42 |
+
## Error Types
|
| 43 |
+
|
| 44 |
+
| Error Type | Description | Example |
|
| 45 |
+
|-----------|-------------|---------|
|
| 46 |
+
| missing_required_field | A required field is removed | `email` missing from Create User |
|
| 47 |
+
| wrong_field_type | Field has wrong type | `amount` sent as `"2500"` instead of `2500` |
|
| 48 |
+
| invalid_email_format | Email field is malformed | `user@` instead of `user@example.com` |
|
| 49 |
+
| missing_auth_header | Authorization header removed | No `Bearer` token |
|
| 50 |
+
| extra_unknown_field | Unknown field added | `debug_mode: true` in production request |
|
| 51 |
+
| null_value_in_required | Required field set to null | `name: null` |
|
| 52 |
+
| wrong_http_method | Wrong HTTP method used | `GET` instead of `POST` |
|
| 53 |
+
| malformed_json_value | Corrupted field value | `{broken` as a value |
|
| 54 |
+
| invalid_enum_value | Value not in allowed list | `currency: "xyz"` |
|
| 55 |
+
| datetime_format_error | Wrong date format | `04/01/2026` instead of ISO 8601 |
|
| 56 |
+
|
| 57 |
+
## API Spec Domains (30 templates)
|
| 58 |
+
|
| 59 |
+
| Domain | Count | Examples |
|
| 60 |
+
|--------|-------|---------|
|
| 61 |
+
| Payment (Stripe-like) | 5 | Create Customer, Create Charge, Process Refund |
|
| 62 |
+
| User Management | 5 | Create User, Update Profile, Reset Password |
|
| 63 |
+
| Content (GitHub-like) | 5 | Create Repository, Create Issue, Merge PR |
|
| 64 |
+
| Messaging (Twilio-like) | 5 | Send SMS, Send Email, Create Webhook |
|
| 65 |
+
| E-Commerce | 5 | Create Order, Process Payment, Create Shipping Label |
|
| 66 |
+
| Calendar and Auth | 5 | Create Event, OAuth Token, Create API Key |
|
| 67 |
+
|
| 68 |
+
## Action Space
|
| 69 |
+
|
| 70 |
+
The agent sends an `APIDebugAction` with these fields (all optional, submit what you have):
|
| 71 |
+
|
| 72 |
+
| Field | Type | Used In | Description |
|
| 73 |
+
|-------|------|---------|-------------|
|
| 74 |
+
| error_type | string | easy, hard | Diagnosed error type |
|
| 75 |
+
| affected_fields | list[string] | easy, hard | Fields affected by the error |
|
| 76 |
+
| fixed_request | string (JSON) | medium, hard | Corrected request body |
|
| 77 |
+
| fixed_headers | dict | medium, hard | Corrected HTTP headers |
|
| 78 |
+
| explanation | string | hard | Developer-facing explanation |
|
| 79 |
+
|
| 80 |
+
## Observation Space
|
| 81 |
+
|
| 82 |
+
The environment returns an `APIDebugObservation` with:
|
| 83 |
+
|
| 84 |
+
| Field | Type | Description |
|
| 85 |
+
|-------|------|-------------|
|
| 86 |
+
| task | string | Current difficulty: easy, medium, hard |
|
| 87 |
+
| api_name | string | Name of the API (e.g. "Create Customer") |
|
| 88 |
+
| http_method | string | HTTP method of the request |
|
| 89 |
+
| endpoint | string | API endpoint path |
|
| 90 |
+
| broken_request | string (JSON) | The malformed request body |
|
| 91 |
+
| broken_headers | dict | HTTP headers sent with the request |
|
| 92 |
+
| api_spec | string (JSON) | API specification with required fields and types |
|
| 93 |
+
| error_count | int | Number of errors injected |
|
| 94 |
+
| step_number | int | Current step in the episode |
|
| 95 |
+
| max_steps | int | Maximum steps allowed |
|
| 96 |
+
| feedback | string | Structured validation feedback from last action |
|
| 97 |
+
| message | string | Human-readable status |
|
| 98 |
+
| done | bool | Whether the episode has ended |
|
| 99 |
+
| reward | float | Reward signal (0.0 to 1.0) |
|
| 100 |
+
|
| 101 |
+
## Reward Design
|
| 102 |
+
|
| 103 |
+
Rewards are shaped per-step with decay to encourage efficient debugging:
|
| 104 |
+
|
| 105 |
+
```
|
| 106 |
+
reward = raw_score x max(1.0 - 0.1 x (step - 1), 0.3)
|
| 107 |
+
```
|
| 108 |
+
|
| 109 |
+
- Step 1: full reward (1.0x multiplier)
|
| 110 |
+
- Step 2: 0.9x multiplier
|
| 111 |
+
- Step 5: 0.6x multiplier
|
| 112 |
+
- Step 7: 0.4x multiplier; the 0.3x floor applies from step 8 onward (agent still gets credit for late fixes)
|
| 113 |
+
|
| 114 |
+
At episode end, the best reward achieved across all steps is returned.
|
| 115 |
+
|
| 116 |
+
## Baseline Scores
|
| 117 |
+
|
| 118 |
+
Scores will be updated after running inference against the live HF Space.
|
| 119 |
+
|
| 120 |
+
| Task | Episodes | Avg Score | Model |
|
| 121 |
+
|------|----------|-----------|-------|
|
| 122 |
+
| easy | 3 | TBD | Qwen/Qwen2.5-72B-Instruct |
|
| 123 |
+
| medium | 3 | TBD | Qwen/Qwen2.5-72B-Instruct |
|
| 124 |
+
| hard | 3 | TBD | Qwen/Qwen2.5-72B-Instruct |
|
| 125 |
+
|
| 126 |
+
## Setup
|
| 127 |
+
|
| 128 |
+
### Prerequisites
|
| 129 |
+
|
| 130 |
+
- Python 3.12+
|
| 131 |
+
- Docker (for container deployment)
|
| 132 |
+
- uv (Python package manager)
|
| 133 |
+
|
| 134 |
+
### Local Development
|
| 135 |
+
|
| 136 |
+
```bash
|
| 137 |
+
git clone https://github.com/Avi-chauhan/api-debug-env.git
|
| 138 |
+
cd api-debug-env
|
| 139 |
+
uv venv --python 3.12
|
| 140 |
+
source .venv/bin/activate
|
| 141 |
+
uv pip install -r requirements.txt
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
### Run Server Locally
|
| 145 |
+
|
| 146 |
+
```bash
|
| 147 |
+
uvicorn server.app:app --host 0.0.0.0 --port 8000 --reload
|
| 148 |
+
```
|
| 149 |
+
|
| 150 |
+
### Docker
|
| 151 |
+
|
| 152 |
+
```bash
|
| 153 |
+
docker build -t api-debug-env:latest .
|
| 154 |
+
docker run -p 7860:7860 api-debug-env:latest
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
### Test
|
| 158 |
+
|
| 159 |
+
```python
|
| 160 |
+
import asyncio
|
| 161 |
+
from client import APIDebugEnv
|
| 162 |
+
from models import APIDebugAction
|
| 163 |
+
|
| 164 |
+
async def test():
|
| 165 |
+
async with APIDebugEnv(base_url="http://localhost:8000") as env:
|
| 166 |
+
result = await env.reset(task="easy")
|
| 167 |
+
print(result.observation.message)
|
| 168 |
+
|
| 169 |
+
action = APIDebugAction(
|
| 170 |
+
error_type="missing_required_field",
|
| 171 |
+
affected_fields=["email"]
|
| 172 |
+
)
|
| 173 |
+
result = await env.step(action)
|
| 174 |
+
print(f"Reward: {result.reward}, Feedback: {result.observation.feedback}")
|
| 175 |
+
|
| 176 |
+
asyncio.run(test())
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
## Project Structure
|
| 180 |
+
|
| 181 |
+
```
|
| 182 |
+
api-debug-env/
|
| 183 |
+
├── Dockerfile # Root level (required for HF Spaces)
|
| 184 |
+
├── requirements.txt
|
| 185 |
+
├── inference.py # Baseline inference script (mandatory)
|
| 186 |
+
├── openenv.yaml # OpenEnv manifest
|
| 187 |
+
├── pyproject.toml
|
| 188 |
+
├── README.md
|
| 189 |
+
├── models.py # APIDebugAction, APIDebugObservation
|
| 190 |
+
├── client.py # APIDebugEnv(EnvClient)
|
| 191 |
+
├── __init__.py
|
| 192 |
+
└── server/
|
| 193 |
+
├── __init__.py
|
| 194 |
+
├── app.py # FastAPI app via create_app()
|
| 195 |
+
├── environment.py # Core logic: reset(), step(), graders
|
| 196 |
+
├── api_specs.py # 30 API spec templates
|
| 197 |
+
├── error_injectors.py # 10 error injection functions
|
| 198 |
+
└── validators.py # Field type validation helpers
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
## Deployment
|
| 202 |
+
|
| 203 |
+
### HuggingFace Spaces
|
| 204 |
+
|
| 205 |
+
```bash
|
| 206 |
+
openenv push --repo-id avichauhan/api-debug-env
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
HF Space URL: https://avichauhan-api-debug-env.hf.space
|
| 210 |
+
|
| 211 |
+
### Validation
|
| 212 |
+
|
| 213 |
+
```bash
|
| 214 |
+
./validate-submission.sh https://avichauhan-api-debug-env.hf.space .
|
| 215 |
+
```
|
| 216 |
+
|
| 217 |
+
## License
|
| 218 |
+
|
| 219 |
+
MIT
|
__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .models import APIDebugAction, APIDebugObservation
|
| 2 |
+
from .client import APIDebugEnv
|
| 3 |
+
|
| 4 |
+
__all__ = ["APIDebugAction", "APIDebugObservation", "APIDebugEnv"]
|
client.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Client SDK for the API Debug Environment.
|
| 3 |
+
|
| 4 |
+
Implements the three required abstract methods from EnvClient:
|
| 5 |
+
- _step_payload: converts APIDebugAction to JSON dict
|
| 6 |
+
- _parse_result: converts server response to StepResult
|
| 7 |
+
- _parse_state: converts server state to State object
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
async with APIDebugEnv(base_url="http://localhost:8000") as env:
|
| 11 |
+
result = await env.reset(task="easy")
|
| 12 |
+
result = await env.step(APIDebugAction(error_type="missing_required_field"))
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from typing import Any, Dict
|
| 16 |
+
|
| 17 |
+
from openenv.core import EnvClient
|
| 18 |
+
from openenv.core.client_types import StepResult
|
| 19 |
+
from openenv.core.env_server.types import State
|
| 20 |
+
|
| 21 |
+
from models import APIDebugAction, APIDebugObservation
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class APIDebugEnv(EnvClient[APIDebugAction, APIDebugObservation, State]):
    """Client for the API Debug Environment server.

    Provides the three conversion hooks required by ``EnvClient``:
    action -> request payload, response payload -> ``StepResult``, and
    state payload -> ``State``.
    """

    def _step_payload(self, action: APIDebugAction) -> Dict[str, Any]:
        """Serialize *action* to a dict, leaving out every field that is ``None``."""
        field_names = (
            "error_type",
            "affected_fields",
            "fixed_request",
            "fixed_headers",
            "explanation",
        )
        candidates = ((name, getattr(action, name)) for name in field_names)
        return {name: value for name, value in candidates if value is not None}

    def _parse_result(self, payload: Dict[str, Any]) -> StepResult[APIDebugObservation]:
        """Build a ``StepResult`` from the server's JSON response.

        Expected response shape::

            {
                "observation": { ...fields except reward/done/metadata... },
                "reward": float,
                "done": bool,
            }

        Any key missing from the response is replaced by the fallback
        listed below.
        """
        obs_data = payload.get("observation", {})
        reward = payload.get("reward", 0.0)
        done = payload.get("done", False)

        # Fallbacks are rebuilt on every call so the mutable `{}` is never shared.
        fallbacks = {
            "task": "easy",
            "api_name": "",
            "http_method": "POST",
            "endpoint": "",
            "broken_request": "",
            "broken_headers": {},
            "api_spec": "",
            "error_count": 1,
            "step_number": 0,
            "max_steps": 3,
            "feedback": "",
            "message": "",
        }
        obs_fields = {
            key: obs_data.get(key, fallback) for key, fallback in fallbacks.items()
        }

        # reward/done live at the top level of the payload, not inside "observation".
        observation = APIDebugObservation(done=done, reward=reward, **obs_fields)
        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: Dict[str, Any]) -> State:
        """Build a ``State`` from the server's state JSON."""
        episode_id = payload.get("episode_id", "")
        step_count = payload.get("step_count", 0)
        return State(episode_id=episode_id, step_count=step_count)
|
inference.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Baseline inference script for the API Debug Environment.
|
| 3 |
+
|
| 4 |
+
MANDATORY:
|
| 5 |
+
- Must be named inference.py and placed in the root directory.
|
| 6 |
+
- Must use OpenAI Client for all LLM calls.
|
| 7 |
+
- Must read env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN.
|
| 8 |
+
- Must emit [START], [STEP], [END] structured logs to stdout.
|
| 9 |
+
|
| 10 |
+
STDOUT FORMAT:
|
| 11 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 12 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 13 |
+
[END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import asyncio
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import re
|
| 20 |
+
import textwrap
|
| 21 |
+
from typing import List, Optional
|
| 22 |
+
|
| 23 |
+
from openai import OpenAI
|
| 24 |
+
|
| 25 |
+
from client import APIDebugEnv
|
| 26 |
+
from models import APIDebugAction
|
| 27 |
+
|
| 28 |
+
# Runtime configuration read from environment variables (mandatory for
# hackathon evaluation). An unset or empty variable falls back to the value
# on the right-hand side of `or`.
API_BASE_URL = os.environ.get("API_BASE_URL") or "https://router.huggingface.co/v1"
# HF_TOKEN takes precedence; API_KEY is the fallback.
API_KEY = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")
MODEL_NAME = os.environ.get("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
ENV_URL = os.environ.get("ENV_URL") or "https://avichauhan-api-debug-env.hf.space"
# Optional: None when unset.
IMAGE_NAME = os.environ.get("IMAGE_NAME")

# Benchmark layout: tasks to run, episodes per task, and per-task step budget.
BENCHMARK_NAME = "api_debug"
EPISODES_PER_TASK = 3
MAX_STEPS = {"easy": 3, "medium": 5, "hard": 7}
TASKS = ["easy", "medium", "hard"]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# =========================================================================
|
| 43 |
+
# Structured logging (exact format required by evaluator)
|
| 44 |
+
# =========================================================================
|
| 45 |
+
|
| 46 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens an episode's structured log."""
    record = "[START] task={} env={} model={}".format(task, env, model)
    print(record, flush=True)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def log_step(
    step: int, action: str, reward: float, done: bool, error: Optional[str]
) -> None:
    """Print one ``[STEP]`` record.

    The reward is rendered with two decimals, ``done`` as lowercase
    ``true``/``false``, and a missing or empty ``error`` as ``null``.
    """
    fields = (
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error or 'null'}",
    )
    print("[STEP] " + " ".join(fields), flush=True)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def log_end(
    success: bool, steps: int, score: float, rewards: List[float]
) -> None:
    """Print the ``[END]`` record that closes an episode's structured log.

    The score is rendered with three decimals; the per-step rewards are
    rendered with two decimals each and joined by commas.
    """
    formatted_rewards = [format(value, ".2f") for value in rewards]
    record = "[END] success={} steps={} score={:.3f} rewards={}".format(
        str(success).lower(),
        steps,
        score,
        ",".join(formatted_rewards),
    )
    print(record, flush=True)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# =========================================================================
|
| 74 |
+
# System prompts per task
|
| 75 |
+
# =========================================================================
|
| 76 |
+
|
| 77 |
+
SYSTEM_PROMPTS = {
|
| 78 |
+
"easy": textwrap.dedent("""
|
| 79 |
+
You are an API debugging expert. You receive a broken API request and its specification.
|
| 80 |
+
Your job: identify the error type and the affected fields.
|
| 81 |
+
|
| 82 |
+
Respond with ONLY a JSON object in this format:
|
| 83 |
+
{"error_type": "<type>", "affected_fields": ["field1", "field2"]}
|
| 84 |
+
|
| 85 |
+
Valid error types:
|
| 86 |
+
missing_required_field, wrong_field_type, invalid_email_format,
|
| 87 |
+
missing_auth_header, extra_unknown_field, null_value_in_required,
|
| 88 |
+
wrong_http_method, malformed_json_value, invalid_enum_value,
|
| 89 |
+
datetime_format_error
|
| 90 |
+
""").strip(),
|
| 91 |
+
|
| 92 |
+
"medium": textwrap.dedent("""
|
| 93 |
+
You are an API debugging expert. You receive a broken API request and its specification.
|
| 94 |
+
Your job: fix the request so it matches the spec.
|
| 95 |
+
|
| 96 |
+
Respond with ONLY a JSON object in this format:
|
| 97 |
+
{"fixed_request": "<valid JSON string matching the spec>", "fixed_headers": {"Header": "value"}}
|
| 98 |
+
|
| 99 |
+
The fixed_request must be a valid JSON string. Include all required fields with correct types.
|
| 100 |
+
""").strip(),
|
| 101 |
+
|
| 102 |
+
"hard": textwrap.dedent("""
|
| 103 |
+
You are an API debugging expert. You receive a broken API request with multiple errors.
|
| 104 |
+
Your job: diagnose the errors, fix the request, and explain the fix for a developer.
|
| 105 |
+
|
| 106 |
+
Respond with ONLY a JSON object in this format:
|
| 107 |
+
{
|
| 108 |
+
"error_type": "<primary error type>",
|
| 109 |
+
"affected_fields": ["field1"],
|
| 110 |
+
"fixed_request": "<valid JSON string>",
|
| 111 |
+
"fixed_headers": {"Header": "value"},
|
| 112 |
+
"explanation": "Clear explanation of what was wrong and how to fix it."
|
| 113 |
+
}
|
| 114 |
+
""").strip(),
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# =========================================================================
|
| 119 |
+
# Prompt building
|
| 120 |
+
# =========================================================================
|
| 121 |
+
|
| 122 |
+
def build_user_prompt(obs, step_num: int) -> str:
    """Render the observation as the user message sent to the model.

    The prompt lists the API under test, the error count, the step counter,
    the broken request body, its headers (as JSON) and the API specification.
    Feedback from the previous attempt is appended only when it is non-empty.
    """
    headers_json = json.dumps(obs.broken_headers)
    sections = [
        "API: {} {} ({})".format(obs.http_method, obs.endpoint, obs.api_name),
        "Error count: {}".format(obs.error_count),
        "Step {}/{}".format(step_num, obs.max_steps),
        # A leading "\n" leaves a blank line before each of the larger sections.
        "\nBroken request body:\n{}".format(obs.broken_request),
        "\nRequest headers: {}".format(headers_json),
        "\nAPI Specification:\n{}".format(obs.api_spec),
    ]
    if obs.feedback:
        sections.append(
            "\nFeedback from previous attempt:\n{}".format(obs.feedback)
        )
    return "\n".join(sections)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# =========================================================================
|
| 138 |
+
# LLM response parsing
|
| 139 |
+
# =========================================================================
|
| 140 |
+
|
| 141 |
+
def _loads_json_object(candidate: str) -> Optional[dict]:
    """Parse *candidate* as JSON; return it only if it is a JSON object."""
    try:
        value = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    # Lists, numbers, strings etc. are valid JSON but not a usable action.
    return value if isinstance(value, dict) else None


def _scan_json_object(text: str) -> Optional[dict]:
    """Return the first JSON object embedded anywhere in *text*, or ``None``."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete value beginning at `start` and
            # ignores whatever follows it, so nested objects and braces
            # inside string values are handled by the real JSON parser.
            value, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
            continue
        return value
    return None


def parse_llm_response(text: str) -> dict:
    """Extract a JSON object from the LLM response.

    Strategies are tried in order:

    1. the whole text is a JSON object;
    2. the content of the first markdown code block is a JSON object;
    3. the first JSON object found anywhere in the text (nested objects
       such as a ``fixed_headers`` dict are returned as part of their
       enclosing object, not on their own).

    Args:
        text: Raw model output; may be empty.

    Returns:
        The decoded object, or ``{}`` when the text is empty or contains no
        JSON object. The result is always a ``dict``; JSON values of any
        other type are ignored.
    """
    if not text:
        return {}

    # 1. Direct parse.
    direct = _loads_json_object(text)
    if direct is not None:
        return direct

    # 2. Markdown code block (``` or ```json fences).
    fenced = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", text, re.DOTALL)
    if fenced:
        from_fence = _loads_json_object(fenced.group(1))
        if from_fence is not None:
            return from_fence

    # 3. First object embedded in surrounding prose.
    embedded = _scan_json_object(text)
    return embedded if embedded is not None else {}
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def build_action(data: dict) -> APIDebugAction:
    """Turn the parsed model reply into an ``APIDebugAction``.

    Keys absent from *data* become ``None``. A ``fixed_request`` supplied as
    a dict is serialized to a JSON string, since the action carries the
    request body as text.
    """
    field_names = (
        "error_type",
        "affected_fields",
        "fixed_request",
        "fixed_headers",
        "explanation",
    )
    values = {name: data.get(name) for name in field_names}

    request_body = values["fixed_request"]
    if isinstance(request_body, dict):
        values["fixed_request"] = json.dumps(request_body)

    return APIDebugAction(**values)
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# =========================================================================
|
| 192 |
+
# Episode runner
|
| 193 |
+
# =========================================================================
|
| 194 |
+
|
| 195 |
+
async def run_episode(
    env: APIDebugEnv,
    llm_client: OpenAI,
    task: str,
) -> float:
    """Play one episode of *task* and return its score.

    Each turn prompts the model with the latest observation, converts the
    reply into an action and steps the environment. The episode stops when
    the environment reports ``done`` or the task's step budget is used up.
    The score is the highest per-step reward, clamped to [0.0, 1.0]; an
    episode counts as a success when the score is at least 0.5.
    """
    log_start(task=task, env=BENCHMARK_NAME, model=MODEL_NAME)

    result = await env.reset(task=task)
    observation = result.observation
    step_limit = MAX_STEPS[task]
    rewards: List[float] = []
    step = 0

    while step < step_limit and not result.done:
        step += 1
        prompt = build_user_prompt(observation, step)

        # Ask the model. Any failure degrades to an empty reply, which
        # parses to an empty action, so the episode keeps going.
        try:
            response = llm_client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPTS[task]},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=500,
                temperature=0.0,
            )
            reply = response.choices[0].message.content or ""
        except Exception as exc:
            print(f"[DEBUG] LLM request failed: {exc}", flush=True)
            reply = ""

        action = build_action(parse_llm_response(reply))

        # Advance the environment; a missing reward is treated as 0.0.
        result = await env.step(action)
        observation = result.observation
        step_reward = result.reward or 0.0
        rewards.append(step_reward)

        log_step(
            step=step,
            action=_action_summary(action, task),
            reward=step_reward,
            done=result.done,
            error=None,
        )

    # Best reward over the episode, clamped into [0.0, 1.0].
    best = max(rewards, default=0.0)
    score = min(max(best, 0.0), 1.0)

    log_end(success=score >= 0.5, steps=step, score=score, rewards=rewards)
    return score
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
def _action_summary(action: APIDebugAction, task: str) -> str:
    """Return a compact, single-token description of *action* for the step log.

    ``easy`` reports the diagnosed error type; ``medium`` reports the length
    of the fixed request; any other task additionally reports the length of
    the explanation. Missing fields count as ``none`` / length 0.
    """
    if task == "easy":
        diagnosis = action.error_type or "none"
        return f"diagnose:{diagnosis}"

    fix_part = "fix:len={}".format(len(action.fixed_request or ""))
    if task == "medium":
        return fix_part

    explain_part = "explain:len={}".format(len(action.explanation or ""))
    return f"{fix_part}+{explain_part}"
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# =========================================================================
|
| 275 |
+
# Main
|
| 276 |
+
# =========================================================================
|
| 277 |
+
|
| 278 |
+
async def main() -> None:
    """Run every task for the configured number of episodes and print the averages.

    The environment client is always closed at the end; an error raised
    while closing is printed as a debug line rather than propagated.
    """
    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # IMAGE_NAME selects a Docker-backed environment; otherwise connect by URL.
    env = (
        await APIDebugEnv.from_docker_image(IMAGE_NAME)
        if IMAGE_NAME
        else APIDebugEnv(base_url=ENV_URL)
    )

    try:
        averages: dict = {}
        for task in TASKS:
            # Episodes run one after another on the same environment client.
            episode_scores = [
                await run_episode(env, llm_client, task)
                for _ in range(EPISODES_PER_TASK)
            ]
            averages[task] = sum(episode_scores) / len(episode_scores)

        # Summary: per-task average, then the mean of those averages.
        print("\n--- Baseline Scores ---", flush=True)
        for task, average in averages.items():
            print(f" {task}: {average:.3f}", flush=True)
        overall = sum(averages.values()) / len(averages)
        print(f" overall: {overall:.3f}", flush=True)
    finally:
        try:
            await env.close()
        except Exception as exc:
            print(f"[DEBUG] env.close() error: {exc}", flush=True)


if __name__ == "__main__":
    asyncio.run(main())
|
models.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pydantic models for the API Debug Environment.
|
| 3 |
+
|
| 4 |
+
APIDebugAction: What the agent sends each step.
|
| 5 |
+
APIDebugObservation: What the environment returns each step.
|
| 6 |
+
|
| 7 |
+
All Action fields are Optional so the agent can submit only what it has.
|
| 8 |
+
For example, on an easy task the agent only needs error_type and affected_fields.
|
| 9 |
+
On medium, it needs fixed_request. On hard, it needs everything plus explanation.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from typing import Dict, List, Optional
|
| 13 |
+
|
| 14 |
+
from openenv.core.env_server.types import Action, Observation
|
| 15 |
+
from pydantic import Field
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class APIDebugAction(Action):
    """Agent's response at each step of the debugging episode."""

    # Every field defaults to None, so an action may carry only the parts
    # the agent has worked out so far.

    # Diagnosis.
    error_type: Optional[str] = Field(
        None, description="Diagnosed error type, e.g. 'missing_required_field'"
    )
    affected_fields: Optional[List[str]] = Field(
        None, description="List of field names affected by the error"
    )

    # Proposed repair.
    fixed_request: Optional[str] = Field(
        None, description="JSON string of the corrected request body"
    )
    fixed_headers: Optional[Dict[str, str]] = Field(
        None, description="Corrected HTTP headers if applicable"
    )

    # Free-text justification.
    explanation: Optional[str] = Field(
        None,
        description="Developer-facing explanation of the fix (hard task only)",
    )
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class APIDebugObservation(Observation):
    """Environment's response at each step.

    Inherits done, reward, and metadata from Observation base class.
    """

    # --- Which task / API this episode is about ---
    task: str = Field(
        "easy", description="Current task difficulty: easy, medium, hard"
    )
    api_name: str = Field("", description="Name of the API being debugged")
    http_method: str = Field(
        "POST", description="HTTP method of the broken request"
    )
    endpoint: str = Field("", description="API endpoint path")

    # --- The broken request shown to the agent ---
    broken_request: str = Field(
        "", description="JSON string of the malformed request body"
    )
    # default_factory gives each observation its own dict instance.
    broken_headers: Dict[str, str] = Field(
        default_factory=dict,
        description="HTTP headers sent with the broken request",
    )
    api_spec: str = Field(
        "", description="JSON string of the API specification"
    )

    # --- Episode bookkeeping ---
    error_count: int = Field(
        1, description="Number of errors injected in this episode"
    )
    step_number: int = Field(0, description="Current step in this episode")
    max_steps: int = Field(
        3, description="Maximum steps allowed for this task"
    )

    # --- Results of the previous action ---
    feedback: str = Field(
        "", description="Structured validation feedback from the last action"
    )
    message: str = Field("", description="Human-readable status message")
|
openenv.yaml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: api-debug-env
|
| 2 |
+
version: 0.1.0
|
| 3 |
+
description: API Contract Validation RL environment for debugging malformed API requests
|
| 4 |
+
sdk: docker
|
| 5 |
+
dockerfile: Dockerfile
|
pyproject.toml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "api-debug-env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
requires-python = ">=3.10"
|
| 9 |
+
dependencies = [
|
| 10 |
+
"openenv-core",
|
| 11 |
+
"fastapi",
|
| 12 |
+
"uvicorn[standard]",
|
| 13 |
+
"pydantic",
|
| 14 |
+
"websockets",
|
| 15 |
+
"openai",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
[project.scripts]
|
| 19 |
+
server = "server.app:main"
|
| 20 |
+
|
| 21 |
+
[tool.hatch.build.targets.wheel]
|
| 22 |
+
packages = ["."]
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core
|
| 2 |
+
fastapi
|
| 3 |
+
uvicorn[standard]
|
| 4 |
+
pydantic
|
| 5 |
+
websockets
|
| 6 |
+
openai
|
server/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .environment import APIDebugEnvironment
|
| 2 |
+
|
| 3 |
+
__all__ = ["APIDebugEnvironment"]
|
server/api_specs.py
ADDED
|
@@ -0,0 +1,639 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
30 API spec templates covering 6 real-world domains.
|
| 3 |
+
|
| 4 |
+
Each spec defines:
|
| 5 |
+
- api_name: Human-readable name
|
| 6 |
+
- http_method: GET, POST, PUT, PATCH, DELETE
|
| 7 |
+
- endpoint: API path
|
| 8 |
+
- required_headers: Headers that must be present
|
| 9 |
+
- required_fields: Fields that must be in the request body
|
| 10 |
+
- optional_fields: Fields that may be in the request body
|
| 11 |
+
- field_types: Expected type for each field (used for validation)
|
| 12 |
+
- valid_example: A correct request body (used to generate broken requests)
|
| 13 |
+
|
| 14 |
+
Supported field types:
|
| 15 |
+
- "string", "integer", "float", "boolean"
|
| 16 |
+
- "email" (validated by regex)
|
| 17 |
+
- "datetime" (ISO 8601 format)
|
| 18 |
+
- "enum:val1,val2,val3" (one of the listed values)
|
| 19 |
+
- "url" (validated by pattern)
|
| 20 |
+
- "phone" (validated by pattern)
|
| 21 |
+
- "object" (nested dict, not deeply validated)
|
| 22 |
+
- "array" (list)
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from typing import Any, Dict, List, Optional
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _spec(
|
| 29 |
+
api_name: str,
|
| 30 |
+
http_method: str,
|
| 31 |
+
endpoint: str,
|
| 32 |
+
required_fields: List[str],
|
| 33 |
+
field_types: Dict[str, str],
|
| 34 |
+
valid_example: Dict[str, Any],
|
| 35 |
+
optional_fields: List[str] = None,
|
| 36 |
+
required_headers: Dict[str, str] = None,
|
| 37 |
+
) -> Dict[str, Any]:
|
| 38 |
+
"""Build a spec dict with sensible defaults."""
|
| 39 |
+
return {
|
| 40 |
+
"api_name": api_name,
|
| 41 |
+
"http_method": http_method,
|
| 42 |
+
"endpoint": endpoint,
|
| 43 |
+
"required_headers": required_headers or {
|
| 44 |
+
"Authorization": "Bearer sk_test_abc123",
|
| 45 |
+
"Content-Type": "application/json",
|
| 46 |
+
},
|
| 47 |
+
"required_fields": required_fields,
|
| 48 |
+
"optional_fields": optional_fields or [],
|
| 49 |
+
"field_types": field_types,
|
| 50 |
+
"valid_example": valid_example,
|
| 51 |
+
}
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# =========================================================================
|
| 55 |
+
# Domain 1: Payment APIs (Stripe-like)
|
| 56 |
+
# =========================================================================
|
| 57 |
+
|
| 58 |
+
# Payment-domain request templates; each entry is built by _spec and so
# receives the default required headers.
PAYMENT_SPECS = [
    _spec(
        "Create Customer", "POST", "/v1/customers",
        required_fields=["email", "name"],
        optional_fields=["phone", "description", "address"],
        field_types={
            "email": "email",
            "name": "string",
            "phone": "phone",
            "description": "string",
            "address": "object",
        },
        valid_example={"email": "alice@example.com", "name": "Alice Johnson"},
    ),
    _spec(
        "Create Charge", "POST", "/v1/charges",
        required_fields=["amount", "currency", "customer_id"],
        optional_fields=["description", "receipt_email"],
        field_types={
            "amount": "integer",
            "currency": "enum:usd,eur,gbp,inr,jpy",
            "customer_id": "string",
            "description": "string",
            "receipt_email": "email",
        },
        valid_example={
            "amount": 2500,
            "currency": "usd",
            "customer_id": "cus_abc123",
        },
    ),
    _spec(
        "Create Subscription", "POST", "/v1/subscriptions",
        required_fields=["customer_id", "plan_id", "start_date"],
        optional_fields=["trial_days", "auto_renew"],
        field_types={
            "customer_id": "string",
            "plan_id": "string",
            "start_date": "datetime",
            "trial_days": "integer",
            "auto_renew": "boolean",
        },
        valid_example={
            "customer_id": "cus_abc123",
            "plan_id": "plan_monthly_pro",
            "start_date": "2026-04-01T00:00:00Z",
        },
    ),
    _spec(
        "Process Refund", "POST", "/v1/refunds",
        required_fields=["charge_id", "amount"],
        optional_fields=["reason"],
        field_types={
            "charge_id": "string",
            "amount": "integer",
            "reason": "enum:duplicate,fraudulent,requested_by_customer",
        },
        valid_example={"charge_id": "ch_abc123", "amount": 1500},
    ),
    _spec(
        "List Transactions", "GET", "/v1/transactions",
        required_fields=["account_id"],
        optional_fields=["start_date", "end_date", "limit"],
        field_types={
            "account_id": "string",
            "start_date": "datetime",
            "end_date": "datetime",
            "limit": "integer",
        },
        valid_example={"account_id": "acc_abc123"},
    ),
]
|
| 148 |
+
|
| 149 |
+
# =========================================================================
|
| 150 |
+
# Domain 2: User Management
|
| 151 |
+
# =========================================================================
|
| 152 |
+
|
| 153 |
+
# User-management request templates (account creation, profile, auth flows).
USER_SPECS = [
    _spec(
        "Create User", "POST", "/api/users",
        required_fields=["email", "username", "password"],
        optional_fields=["full_name", "role"],
        field_types={
            "email": "email",
            "username": "string",
            "password": "string",
            "full_name": "string",
            "role": "enum:admin,editor,viewer",
        },
        valid_example={
            "email": "bob@example.com",
            "username": "bob_smith",
            "password": "SecurePass123!",
        },
    ),
    _spec(
        "Update Profile", "PATCH", "/api/users/{user_id}/profile",
        required_fields=["user_id", "display_name"],
        optional_fields=["bio", "avatar_url", "timezone"],
        field_types={
            "user_id": "string",
            "display_name": "string",
            "bio": "string",
            "avatar_url": "url",
            "timezone": "string",
        },
        valid_example={"user_id": "usr_abc123", "display_name": "Bob Smith"},
    ),
    _spec(
        "Reset Password", "POST", "/api/auth/reset-password",
        required_fields=["email"],
        optional_fields=["redirect_url"],
        field_types={"email": "email", "redirect_url": "url"},
        valid_example={"email": "bob@example.com"},
    ),
    _spec(
        "Verify Email", "POST", "/api/auth/verify-email",
        required_fields=["token", "email"],
        field_types={"token": "string", "email": "email"},
        valid_example={
            "token": "verify_abc123xyz",
            "email": "bob@example.com",
        },
    ),
    _spec(
        "Delete Account", "DELETE", "/api/users/{user_id}",
        required_fields=["user_id", "confirmation"],
        field_types={
            "user_id": "string",
            "confirmation": "enum:DELETE,CONFIRM",
        },
        valid_example={"user_id": "usr_abc123", "confirmation": "DELETE"},
    ),
]
|
| 234 |
+
|
| 235 |
+
# =========================================================================
|
| 236 |
+
# Domain 3: Content APIs (GitHub-like)
|
| 237 |
+
# =========================================================================
|
| 238 |
+
|
| 239 |
+
# Content/repository request templates (repos, issues, comments, PRs, releases).
CONTENT_SPECS = [
    _spec(
        "Create Repository", "POST", "/api/repos",
        required_fields=["name", "visibility"],
        optional_fields=["description", "auto_init", "license"],
        field_types={
            "name": "string",
            "visibility": "enum:public,private,internal",
            "description": "string",
            "auto_init": "boolean",
            "license": "string",
        },
        valid_example={"name": "my-project", "visibility": "public"},
    ),
    _spec(
        "Create Issue", "POST", "/api/repos/{repo_id}/issues",
        required_fields=["title", "repo_id"],
        optional_fields=["body", "assignee", "labels", "priority"],
        field_types={
            "title": "string",
            "repo_id": "string",
            "body": "string",
            "assignee": "string",
            "labels": "array",
            "priority": "enum:low,medium,high,critical",
        },
        valid_example={
            "title": "Fix login page redirect",
            "repo_id": "repo_abc123",
        },
    ),
    _spec(
        "Create Comment", "POST", "/api/issues/{issue_id}/comments",
        required_fields=["issue_id", "body"],
        optional_fields=["mentions"],
        field_types={
            "issue_id": "string",
            "body": "string",
            "mentions": "array",
        },
        valid_example={
            "issue_id": "issue_abc123",
            "body": "This looks like a duplicate of #42.",
        },
    ),
    _spec(
        "Merge Pull Request", "PUT", "/api/repos/{repo_id}/pulls/{pr_id}/merge",
        required_fields=["repo_id", "pr_id", "merge_method"],
        optional_fields=["commit_title", "delete_branch"],
        field_types={
            "repo_id": "string",
            "pr_id": "string",
            "merge_method": "enum:merge,squash,rebase",
            "commit_title": "string",
            "delete_branch": "boolean",
        },
        valid_example={
            "repo_id": "repo_abc123",
            "pr_id": "pr_456",
            "merge_method": "squash",
        },
    ),
    _spec(
        "Create Release", "POST", "/api/repos/{repo_id}/releases",
        required_fields=["repo_id", "tag_name", "name"],
        optional_fields=["body", "draft", "prerelease"],
        field_types={
            "repo_id": "string",
            "tag_name": "string",
            "name": "string",
            "body": "string",
            "draft": "boolean",
            "prerelease": "boolean",
        },
        valid_example={
            "repo_id": "repo_abc123",
            "tag_name": "v1.0.0",
            "name": "Version 1.0.0",
        },
    ),
]
|
| 333 |
+
|
| 334 |
+
# =========================================================================
|
| 335 |
+
# Domain 4: Messaging (Twilio-like)
|
| 336 |
+
# =========================================================================
|
| 337 |
+
|
| 338 |
+
# Messaging request templates (SMS, email, webhooks, templates, verification).
MESSAGING_SPECS = [
    _spec(
        "Send SMS", "POST", "/api/messages/sms",
        required_fields=["to", "from_number", "body"],
        optional_fields=["callback_url"],
        field_types={
            "to": "phone",
            "from_number": "phone",
            "body": "string",
            "callback_url": "url",
        },
        valid_example={
            "to": "+14155551234",
            "from_number": "+14155550000",
            "body": "Your verification code is 123456",
        },
    ),
    _spec(
        "Send Email", "POST", "/api/messages/email",
        required_fields=["to_email", "subject", "body"],
        optional_fields=["cc", "bcc", "reply_to"],
        field_types={
            "to_email": "email",
            "subject": "string",
            "body": "string",
            "cc": "email",
            "bcc": "email",
            "reply_to": "email",
        },
        valid_example={
            "to_email": "customer@example.com",
            "subject": "Order Confirmation",
            "body": "Your order #1234 has been confirmed.",
        },
    ),
    _spec(
        "Create Webhook", "POST", "/api/webhooks",
        required_fields=["url", "events"],
        optional_fields=["secret", "active"],
        field_types={
            "url": "url",
            "events": "array",
            "secret": "string",
            "active": "boolean",
        },
        valid_example={
            "url": "https://myapp.com/webhook",
            "events": ["message.sent", "message.delivered"],
        },
    ),
    _spec(
        "Create Template", "POST", "/api/templates",
        required_fields=["name", "content", "channel"],
        optional_fields=["variables", "language"],
        field_types={
            "name": "string",
            "content": "string",
            "channel": "enum:sms,email,push",
            "variables": "array",
            "language": "string",
        },
        valid_example={
            "name": "welcome_message",
            "content": "Hello {{name}}, welcome to our service!",
            "channel": "email",
        },
    ),
    _spec(
        "Verify Phone", "POST", "/api/verify/phone",
        required_fields=["phone_number", "code"],
        field_types={"phone_number": "phone", "code": "string"},
        valid_example={"phone_number": "+14155551234", "code": "123456"},
    ),
]
|
| 428 |
+
|
| 429 |
+
# =========================================================================
|
| 430 |
+
# Domain 5: E-Commerce
|
| 431 |
+
# =========================================================================
|
| 432 |
+
|
| 433 |
+
# E-commerce request templates (orders, cart, payments, coupons, shipping).
ECOMMERCE_SPECS = [
    _spec(
        "Create Order", "POST", "/api/orders",
        required_fields=["customer_id", "items", "shipping_address"],
        optional_fields=["notes", "coupon_code"],
        field_types={
            "customer_id": "string",
            "items": "array",
            "shipping_address": "object",
            "notes": "string",
            "coupon_code": "string",
        },
        valid_example={
            "customer_id": "cust_abc123",
            "items": [{"product_id": "prod_1", "quantity": 2}],
            "shipping_address": {
                "line1": "123 Main St",
                "city": "Portland",
                "zip": "97201",
            },
        },
    ),
    _spec(
        "Add Cart Item", "POST", "/api/cart/items",
        required_fields=["product_id", "quantity"],
        optional_fields=["variant_id", "notes"],
        field_types={
            "product_id": "string",
            "quantity": "integer",
            "variant_id": "string",
            "notes": "string",
        },
        valid_example={"product_id": "prod_abc123", "quantity": 1},
    ),
    _spec(
        "Process Payment", "POST", "/api/payments",
        required_fields=["order_id", "amount", "currency", "payment_method"],
        optional_fields=["billing_email"],
        field_types={
            "order_id": "string",
            "amount": "float",
            "currency": "enum:usd,eur,gbp,inr",
            "payment_method": "enum:card,bank_transfer,wallet",
            "billing_email": "email",
        },
        valid_example={
            "order_id": "ord_abc123",
            "amount": 49.99,
            "currency": "usd",
            "payment_method": "card",
        },
    ),
    _spec(
        "Apply Coupon", "POST", "/api/cart/coupon",
        required_fields=["coupon_code", "cart_id"],
        field_types={"coupon_code": "string", "cart_id": "string"},
        valid_example={"coupon_code": "SAVE20", "cart_id": "cart_abc123"},
    ),
    _spec(
        "Create Shipping Label", "POST", "/api/shipping/labels",
        required_fields=["order_id", "carrier", "weight"],
        optional_fields=["insurance", "signature_required"],
        field_types={
            "order_id": "string",
            "carrier": "enum:usps,fedex,ups,dhl",
            "weight": "float",
            "insurance": "boolean",
            "signature_required": "boolean",
        },
        valid_example={
            "order_id": "ord_abc123",
            "carrier": "usps",
            "weight": 2.5,
        },
    ),
]
|
| 524 |
+
|
| 525 |
+
# =========================================================================
|
| 526 |
+
# Domain 6: Calendar and Auth
|
| 527 |
+
# =========================================================================
|
| 528 |
+
|
| 529 |
+
# Calendar and auth request templates. "OAuth Token Request" is the only
# entry in this list that overrides the default required headers.
CALENDAR_AUTH_SPECS = [
    _spec(
        "Create Event", "POST", "/api/calendar/events",
        required_fields=["title", "start_time", "end_time"],
        optional_fields=["description", "location", "attendees", "recurrence"],
        field_types={
            "title": "string",
            "start_time": "datetime",
            "end_time": "datetime",
            "description": "string",
            "location": "string",
            "attendees": "array",
            "recurrence": "enum:none,daily,weekly,monthly",
        },
        valid_example={
            "title": "Team Standup",
            "start_time": "2026-04-05T09:00:00Z",
            "end_time": "2026-04-05T09:30:00Z",
        },
    ),
    _spec(
        "OAuth Token Request", "POST", "/oauth/token",
        required_fields=["grant_type", "client_id", "client_secret"],
        optional_fields=["scope", "redirect_uri"],
        field_types={
            "grant_type": "enum:authorization_code,client_credentials,refresh_token",
            "client_id": "string",
            "client_secret": "string",
            "scope": "string",
            "redirect_uri": "url",
        },
        valid_example={
            "grant_type": "client_credentials",
            "client_id": "app_abc123",
            "client_secret": "secret_xyz789",
        },
        required_headers={"Content-Type": "application/json"},
    ),
    _spec(
        "Create API Key", "POST", "/api/keys",
        required_fields=["name", "permissions"],
        optional_fields=["expires_at"],
        field_types={
            "name": "string",
            "permissions": "array",
            "expires_at": "datetime",
        },
        valid_example={
            "name": "production-key",
            "permissions": ["read", "write"],
        },
    ),
    _spec(
        "Invite User", "POST", "/api/teams/{team_id}/invites",
        required_fields=["team_id", "email", "role"],
        optional_fields=["message"],
        field_types={
            "team_id": "string",
            "email": "email",
            "role": "enum:admin,member,viewer",
            "message": "string",
        },
        valid_example={
            "team_id": "team_abc123",
            "email": "newuser@example.com",
            "role": "member",
        },
    ),
    _spec(
        "Update Permissions", "PUT", "/api/users/{user_id}/permissions",
        required_fields=["user_id", "permissions"],
        optional_fields=["effective_from"],
        field_types={
            "user_id": "string",
            "permissions": "array",
            "effective_from": "datetime",
        },
        valid_example={
            "user_id": "usr_abc123",
            "permissions": ["read", "write", "admin"],
        },
    ),
]
|
| 624 |
+
|
| 625 |
+
|
| 626 |
+
# All 30 specs in a single flat list
|
| 627 |
+
# Flattened in domain order: payment, user, content, messaging,
# e-commerce, calendar/auth.
ALL_SPECS = [
    *PAYMENT_SPECS,
    *USER_SPECS,
    *CONTENT_SPECS,
    *MESSAGING_SPECS,
    *ECOMMERCE_SPECS,
    *CALENDAR_AUTH_SPECS,
]
|
| 635 |
+
|
| 636 |
+
|
| 637 |
+
def get_random_spec(rng) -> Dict[str, Any]:
    """Return one entry of ALL_SPECS, selected by ``rng.choice``.

    The caller supplies ``rng``; only its ``choice`` method is used here.
    """
    picked = rng.choice(ALL_SPECS)
    return picked
|
server/app.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application for the API Debug Environment.
|
| 3 |
+
|
| 4 |
+
Uses OpenEnv's create_app() to generate all endpoints:
|
| 5 |
+
POST /reset, POST /step, GET /state, GET /schema, WS /ws, GET /health
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from openenv.core.env_server.http_server import create_app
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from ..models import APIDebugAction, APIDebugObservation
|
| 12 |
+
from .environment import APIDebugEnvironment
|
| 13 |
+
except ImportError:
|
| 14 |
+
from models import APIDebugAction, APIDebugObservation
|
| 15 |
+
from server.environment import APIDebugEnvironment
|
| 16 |
+
|
| 17 |
+
# ASGI application object: create_app receives the environment class and
# its action/observation models, plus the env name and concurrency setting.
app = create_app(
    APIDebugEnvironment, APIDebugAction, APIDebugObservation,
    env_name="api_debug", max_concurrent_envs=10,
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def main():
    """Run the server directly.

    Binds to 0.0.0.0. The port is read from the first command-line
    argument when one is present and is 8000 otherwise.
    """
    import sys
    import uvicorn

    cli_args = sys.argv[1:]
    port = int(cli_args[0]) if cli_args else 8000

    uvicorn.run(app, host="0.0.0.0", port=port)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
if __name__ == "__main__":
|
| 39 |
+
main()
|
server/environment.py
ADDED
|
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Core environment for the API Debug Environment.
|
| 3 |
+
|
| 4 |
+
Implements the OpenEnv Environment interface with:
|
| 5 |
+
- 3 task difficulty levels (easy, medium, hard)
|
| 6 |
+
- Multi-turn episodes with structured feedback
|
| 7 |
+
- Deterministic grading for easy/medium, LLM-as-judge for hard
|
| 8 |
+
- Step reward decay to encourage efficient debugging
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import copy
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import random
|
| 15 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 16 |
+
from uuid import uuid4
|
| 17 |
+
|
| 18 |
+
from openenv.core.env_server.interfaces import Environment
|
| 19 |
+
from openenv.core.env_server.types import State
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
from ..models import APIDebugAction, APIDebugObservation
|
| 23 |
+
except ImportError:
|
| 24 |
+
from models import APIDebugAction, APIDebugObservation
|
| 25 |
+
|
| 26 |
+
from .api_specs import get_random_spec
|
| 27 |
+
from .error_injectors import (
|
| 28 |
+
ERROR_TYPES,
|
| 29 |
+
inject_error,
|
| 30 |
+
inject_multiple_errors,
|
| 31 |
+
)
|
| 32 |
+
from .validators import (
|
| 33 |
+
validate_field_type,
|
| 34 |
+
validate_headers_against_spec,
|
| 35 |
+
validate_request_against_spec,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Task configuration: max steps and error count per difficulty.
#   max_steps             - attempts allowed per episode; reset() copies it
#                           into self.max_steps.
#   error_count           - listed for easy/medium. The reset() in this file
#                           does not read it; those tasks always get a single
#                           error from inject_error().
#   min_errors/max_errors - bounds passed to rng.randint() in reset() to pick
#                           how many errors a "hard" episode receives.
TASK_CONFIG = {
    "easy": {"max_steps": 3, "error_count": 1},
    "medium": {"max_steps": 5, "error_count": 1},
    "hard": {"max_steps": 7, "min_errors": 2, "max_errors": 3},
}
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class APIDebugEnvironment(Environment):
|
| 48 |
+
"""API Contract Validation environment.
|
| 49 |
+
|
| 50 |
+
An LLM agent receives a broken API request and must:
|
| 51 |
+
- Easy: Identify the error type and affected fields
|
| 52 |
+
- Medium: Fix the request to match the API spec
|
| 53 |
+
- Hard: Fix the request and explain the fix for developers
|
| 54 |
+
|
| 55 |
+
Each episode allows multiple attempts. Perfect answers on early
|
| 56 |
+
steps get full reward. Later steps get decayed reward.
|
| 57 |
+
"""
|
| 58 |
+
|
| 59 |
+
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 60 |
+
|
| 61 |
+
def __init__(self):
    """Set up placeholder episode state; reset() fills in the real values."""
    super().__init__()

    # Episode bookkeeping.
    self._state = State(episode_id=str(uuid4()), step_count=0)
    self.current_step = 0
    self.max_steps = 3
    self.episode_done = False
    self.best_reward = 0.0

    # Scenario shown to the agent (empty until reset() picks a spec).
    self.task = "easy"
    self.spec: Dict[str, Any] = {}
    self.broken_request: Dict[str, Any] = {}
    self.broken_headers: Dict[str, str] = {}
    self.ground_truths: List[Dict[str, Any]] = []
    # HTTP method displayed to the agent; differs from the spec's method
    # only when a wrong_http_method error has been injected.
    self.shown_http_method = ""

    self.rng = random.Random()
|
| 76 |
+
|
| 77 |
+
def reset(
    self,
    seed: Optional[int] = None,
    episode_id: Optional[str] = None,
    task: str = "easy",
    **kwargs,
) -> APIDebugObservation:
    """Begin a fresh debugging episode and return its first observation.

    Args:
        seed: Seed for the episode RNG; ``None`` gives an unseeded RNG.
        episode_id: Identifier to record in the state; a new UUID string
            is generated when it is missing or empty.
        task: Difficulty key ("easy", "medium", "hard"). Any value that
            is not a key of ``TASK_CONFIG`` falls back to "easy".
        **kwargs: Accepted and ignored.

    Returns:
        Observation with the broken request/headers, the visible spec,
        the number of injected errors, ``done=False`` and ``reward=0.0``.
    """
    # random.Random(None) seeds from system entropy, exactly like
    # random.Random() with no argument.
    self.rng = random.Random(seed)

    if task in TASK_CONFIG:
        self.task = task
    else:
        self.task = "easy"
    settings = TASK_CONFIG[self.task]

    self.max_steps = settings["max_steps"]
    self.current_step = 0
    self.episode_done = False
    self.best_reward = 0.0

    self._state = State(episode_id=episode_id or str(uuid4()), step_count=0)

    # Work on copies so the shared spec catalogue is never mutated.
    self.spec = copy.deepcopy(get_random_spec(self.rng))
    clean_request = copy.deepcopy(self.spec["valid_example"])
    clean_headers = copy.deepcopy(self.spec["required_headers"])

    if self.task == "hard":
        how_many = self.rng.randint(settings["min_errors"], settings["max_errors"])
        injected = inject_multiple_errors(
            clean_request, clean_headers, self.spec, self.rng, how_many
        )
        self.broken_request, self.broken_headers, self.ground_truths = injected
    else:
        chosen_type = self.rng.choice(ERROR_TYPES)
        self.broken_request, self.broken_headers, truth = inject_error(
            chosen_type, clean_request, clean_headers, self.spec, self.rng
        )
        self.ground_truths = [truth]

    # A wrong_http_method error lives only in the ground truth, so the
    # wrong verb has to be surfaced through the displayed method.
    self.shown_http_method = self.spec["http_method"]
    method_error = next(
        (
            entry
            for entry in self.ground_truths
            if entry["error_type"] == "wrong_http_method"
        ),
        None,
    )
    if method_error is not None:
        self.shown_http_method = method_error.get(
            "wrong_method", self.spec["http_method"]
        )

    error_count = len(self.ground_truths)
    intro = (
        f"Debug this {self.shown_http_method} {self.spec['endpoint']} request. "
        f"It contains {error_count} error(s). "
        f"You have {self.max_steps} steps."
    )
    return APIDebugObservation(
        task=self.task,
        api_name=self.spec["api_name"],
        http_method=self.shown_http_method,
        endpoint=self.spec["endpoint"],
        broken_request=json.dumps(self.broken_request, indent=2),
        broken_headers=self.broken_headers,
        api_spec=self._build_spec_string(),
        error_count=error_count,
        step_number=0,
        max_steps=self.max_steps,
        feedback="",
        message=intro,
        done=False,
        reward=0.0,
    )
|
| 159 |
+
|
| 160 |
+
def step(
    self,
    action: APIDebugAction,
    timeout_s: Optional[float] = None,
    **kwargs,
) -> APIDebugObservation:
    """Process the agent's debugging attempt.

    The agent can submit a partial or complete response; the grader for
    the current task evaluates whatever fields are present.

    Args:
        action: The agent's submission.
        timeout_s: Accepted for interface compatibility; not used here.
        **kwargs: Accepted and ignored.

    Returns:
        Observation carrying the grader feedback, the reward and the
        ``done`` flag. On the terminating step the reward is the best
        decayed reward achieved during the episode. Calls made after the
        episode has ended return ``reward=0.0``/``done=True`` and leave
        all counters unchanged.
    """
    # Check termination before touching any counter: a call on a finished
    # episode must not advance current_step / state.step_count.
    if self.episode_done:
        return self._make_observation(
            feedback="Episode already ended.",
            reward=0.0,
            done=True,
        )

    self.current_step += 1
    self._state.step_count = self.current_step

    # Grade based on task type
    if self.task == "easy":
        raw_score, feedback = self._grade_easy(action)
    elif self.task == "medium":
        raw_score, feedback = self._grade_medium(action)
    else:
        raw_score, feedback = self._grade_hard(action)

    # Apply step decay: step 1 = 1.0x, step 2 = 0.9x, etc. Floor at 0.3x
    step_multiplier = max(1.0 - 0.1 * (self.current_step - 1), 0.3)
    reward = round(raw_score * step_multiplier, 4)

    # Track best reward across all steps
    self.best_reward = max(self.best_reward, reward)

    # Episode ends if the undecayed score is near-perfect or steps ran out
    near_perfect = raw_score >= 0.95
    out_of_steps = self.current_step >= self.max_steps
    done = near_perfect or out_of_steps

    if done:
        self.episode_done = True
        # Return best reward achieved during the episode
        reward = self.best_reward

    return self._make_observation(
        feedback=feedback,
        reward=reward,
        done=done,
    )
|
| 211 |
+
|
| 212 |
+
@property
def state(self) -> State:
    """Return the State object holding this episode's id and step count."""
    return self._state
|
| 215 |
+
|
| 216 |
+
# =====================================================================
|
| 217 |
+
# Grading methods
|
| 218 |
+
# =====================================================================
|
| 219 |
+
|
| 220 |
+
def _grade_easy(self, action: APIDebugAction) -> Tuple[float, str]:
    """Grade error identification. Fully deterministic.

    Scoring: 0.6 for a correct ``error_type`` plus up to 0.4 for
    ``affected_fields``, using Jaccard similarity for partial credit.
    When the ground truth lists no affected fields and the agent names
    none either, the field component is awarded in full (two empty sets
    agree completely).

    Args:
        action: Submission; ``error_type`` and ``affected_fields`` are read.

    Returns:
        ``(score, feedback)`` with the score rounded to 4 decimals and the
        feedback parts joined by "; ".
    """
    score = 0.0
    parts = []

    # Collect all ground truth error types and affected fields
    gt_types = {gt["error_type"] for gt in self.ground_truths}
    gt_fields: set = set()
    for gt in self.ground_truths:
        gt_fields.update(gt.get("affected_fields", []))

    # Check error type (0.6 weight)
    if action.error_type and action.error_type in gt_types:
        score += 0.6
        parts.append("error_type: CORRECT")
    else:
        given = action.error_type or "(none)"
        parts.append(f"error_type: INCORRECT (you said '{given}')")

    # Check affected fields using Jaccard similarity (0.4 weight)
    agent_fields = set(action.affected_fields or [])
    if not gt_fields and not agent_fields:
        # Nothing was affected and nothing was claimed: a correct answer,
        # which must not be reported as "MISSING".
        score += 0.4
        parts.append("affected_fields: CORRECT (none affected)")
    elif gt_fields and agent_fields:
        intersection = gt_fields & agent_fields
        union = gt_fields | agent_fields  # non-empty in this branch
        score += 0.4 * (len(intersection) / len(union))
        parts.append(
            f"affected_fields: {len(intersection)}/{len(gt_fields)} correct, "
            f"{len(agent_fields - gt_fields)} extra"
        )
    elif not agent_fields:
        parts.append("affected_fields: MISSING (none provided)")
    else:
        parts.append("affected_fields: INCORRECT (0 matches)")

    return round(score, 4), "; ".join(parts)
|
| 260 |
+
|
| 261 |
+
def _grade_medium(self, action: APIDebugAction) -> Tuple[float, str]:
    """Score a proposed request fix against the spec.

    The submitted ``fixed_request`` must be a JSON object. Its body score
    comes from ``validate_request_against_spec``. When one of the injected
    errors is ``missing_auth_header``, the result is blended as 80% body
    and 20% headers (the header share is zero if no ``fixed_headers``
    were supplied); otherwise the body score is used alone.

    Returns:
        ``(score, feedback)`` with the score rounded to 4 decimals.
    """
    raw = action.fixed_request
    if not raw:
        return 0.0, "No fixed_request provided."

    try:
        candidate = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return 0.0, "fixed_request is not valid JSON."

    if not isinstance(candidate, dict):
        return 0.0, "fixed_request must be a JSON object."

    body_score, body_feedback = validate_request_against_spec(candidate, self.spec)

    header_fix_needed = any(
        truth["error_type"] == "missing_auth_header" for truth in self.ground_truths
    )
    if not header_fix_needed:
        return round(body_score, 4), body_feedback

    if action.fixed_headers:
        header_score, header_feedback = validate_headers_against_spec(
            action.fixed_headers, self.spec
        )
        blended = 0.8 * body_score + 0.2 * header_score
        notes = body_feedback + "\n" + header_feedback
    else:
        # Header fix was required but nothing was submitted for it.
        blended = 0.8 * body_score
        notes = body_feedback + "\nHeaders: NOT PROVIDED (header fix needed)"

    return round(blended, 4), notes
|
| 303 |
+
|
| 304 |
+
def _grade_hard(self, action: APIDebugAction) -> Tuple[float, str]:
    """Score a fix plus its explanation (70% fix, 30% explanation).

    The fix part reuses ``_grade_medium``. The explanation is scored by
    ``_score_explanation`` only when, after stripping whitespace, it is
    longer than 10 characters; otherwise it contributes zero.

    Returns:
        ``(score, feedback)`` with the score rounded to 4 decimals.
    """
    fix_score, fix_feedback = self._grade_medium(action)

    text = action.explanation
    if text and len(text.strip()) > 10:
        explain_score = self._score_explanation(text)
        explain_feedback = f"Explanation quality: {explain_score:.2f}/1.0"
    else:
        explain_score = 0.0
        explain_feedback = "No explanation provided."

    combined = 0.7 * fix_score + 0.3 * explain_score
    report = "\n".join(
        [
            f"Fix score: {fix_score:.2f} (70% weight)",
            f"{fix_feedback}",
            f"{explain_feedback}",
        ]
    )
    return round(combined, 4), report
|
| 328 |
+
|
| 329 |
+
def _score_explanation(self, explanation: str) -> float:
    """Score an explanation, preferring the LLM judge over the heuristic.

    ``_llm_judge_explanation`` is tried first. If it raises anything or
    returns ``None``, the keyword/length heuristic supplies the score
    instead, so grading never fails because the judge is unavailable.
    """
    try:
        judged = self._llm_judge_explanation(explanation)
    except Exception:
        # Deliberate best-effort: any judge failure means "use fallback".
        judged = None

    if judged is not None:
        return judged
    return self._heuristic_score_explanation(explanation)
|
| 345 |
+
|
| 346 |
+
def _llm_judge_explanation(self, explanation: str) -> Optional[float]:
    """Call an LLM to score the explanation.

    Connection settings come from the ``API_BASE_URL``, ``HF_TOKEN`` and
    ``MODEL_NAME`` environment variables.

    Args:
        explanation: The agent's explanation text.

    Returns:
        A score clamped to ``[0.0, 1.0]``, or ``None`` when any of the
        environment variables is unset/empty or when the reply contains
        no usable finite score.

    Raises:
        Whatever the OpenAI client raises (import, network, API errors);
        the caller treats any exception as "judge unavailable".
    """
    import math
    import re

    api_base = os.getenv("API_BASE_URL")
    api_key = os.getenv("HF_TOKEN")
    model = os.getenv("MODEL_NAME")

    if not all([api_base, api_key, model]):
        return None

    from openai import OpenAI

    # Bound the request: this runs inside step(), and the client's default
    # timeout is long enough to stall an episode on an unresponsive
    # endpoint. On timeout the caller falls back to the heuristic.
    client = OpenAI(base_url=api_base, api_key=api_key, timeout=30.0)

    error_types = [gt["error_type"] for gt in self.ground_truths]
    prompt = (
        "Rate this API debugging explanation on a 0.0 to 1.0 scale.\n\n"
        "Criteria:\n"
        "- Correctly identifies root cause (0 to 0.4)\n"
        "- Provides actionable fix guidance (0 to 0.3)\n"
        "- Includes prevention advice for developers (0 to 0.3)\n\n"
        f"API: {self.spec['api_name']} {self.spec['endpoint']}\n"
        f"Errors present: {json.dumps(error_types)}\n"
        f"Explanation: {explanation}\n\n"
        'Return ONLY a JSON object: {"score": 0.0}'
    )

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=50,
        temperature=0.0,
    )
    text = response.choices[0].message.content or ""

    # Replies are often wrapped in a Markdown fence or a sentence, so pull
    # out the JSON object instead of parsing the whole reply.
    found = re.search(r"\{.*\}", text, re.DOTALL)
    if found is None:
        return None
    try:
        raw_score = float(json.loads(found.group(0))["score"])
    except (ValueError, TypeError, KeyError):
        return None

    # json.loads accepts NaN/Infinity; min(1.0, nan) would evaluate to 1.0
    # and hand out a perfect score, so reject non-finite values.
    if not math.isfinite(raw_score):
        return None
    return max(0.0, min(1.0, raw_score))
|
| 384 |
+
|
| 385 |
+
def _heuristic_score_explanation(self, explanation: str) -> float:
    """Fallback explanation score built from keyword hits and length.

    Half of the score is the share of debugging keywords found as
    substrings of the lower-cased text (six or more hits saturate it);
    the other half depends on the stripped length: 0.1 below 20
    characters, 0.3 below 50, 0.6 up to 500 and 0.5 beyond that.

    Returns:
        The blended score rounded to 2 decimals.
    """
    keywords = (
        "because", "should", "instead", "required", "missing",
        "type", "format", "expected", "invalid", "correct",
        "field", "header", "value", "fix", "error",
    )
    lowered = explanation.lower()
    keyword_hits = sum(word in lowered for word in keywords)
    keyword_score = min(keyword_hits / 6.0, 1.0)

    length = len(explanation.strip())
    if length > 500:
        length_score = 0.5  # very long explanations are slightly penalized
    elif length >= 50:
        length_score = 0.6
    elif length >= 20:
        length_score = 0.3
    else:
        length_score = 0.1

    return round(0.5 * keyword_score + 0.5 * length_score, 2)
|
| 411 |
+
|
| 412 |
+
# =====================================================================
|
| 413 |
+
# Helpers
|
| 414 |
+
# =====================================================================
|
| 415 |
+
|
| 416 |
+
def _build_spec_string(self) -> str:
    """Build a JSON string of the spec info the agent needs to see.

    Only the required/optional field names, the field types and the
    names of the required headers are exposed. Missing keys fall back to
    empty values, so this also works on the empty spec that exists before
    the first reset().

    Returns:
        The visible spec serialized with ``indent=2``.
    """
    spec = self.spec
    visible_spec = {
        "required_fields": spec.get("required_fields", []),
        "optional_fields": spec.get("optional_fields", []),
        "field_types": spec.get("field_types", {}),
        # Only header names are shown, never their values.
        "required_headers": list(spec.get("required_headers", {}).keys()),
    }
    return json.dumps(visible_spec, indent=2)
|
| 425 |
+
|
| 426 |
+
def _make_observation(
    self,
    feedback: str,
    reward: float,
    done: bool,
) -> APIDebugObservation:
    """Assemble an observation from the current episode state.

    The ``message`` field depends on the outcome: the remaining step
    count while the episode is running, the final reward when it is done
    and feedback is present, or a plain completion note when it is done
    with empty feedback.
    """
    if not done:
        steps_left = self.max_steps - self.current_step
        note = f"{steps_left} step(s) remaining. Use the feedback to improve."
    elif feedback:
        note = f"Episode complete. Final reward: {reward:.2f}"
    else:
        note = "Episode complete."

    spec = self.spec
    return APIDebugObservation(
        task=self.task,
        api_name=spec.get("api_name", ""),
        http_method=self.shown_http_method,
        endpoint=spec.get("endpoint", ""),
        broken_request=json.dumps(self.broken_request, indent=2),
        broken_headers=self.broken_headers,
        api_spec=self._build_spec_string(),
        error_count=len(self.ground_truths),
        step_number=self.current_step,
        max_steps=self.max_steps,
        feedback=feedback,
        message=note,
        done=done,
        reward=reward,
    )
|
server/error_injectors.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
10 error injection functions for the API Debug Environment.
|
| 3 |
+
|
| 4 |
+
Each injector takes a valid request + headers + spec + RNG and returns:
|
| 5 |
+
(broken_request, broken_headers, ground_truth)
|
| 6 |
+
|
| 7 |
+
ground_truth contains the error_type, affected_fields, and the original
|
| 8 |
+
valid request/headers so the grader knows the correct answer.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import copy
|
| 12 |
+
import random as random_module
|
| 13 |
+
from typing import Any, Dict, List, Tuple
|
| 14 |
+
|
| 15 |
+
# Record describing one injected error (built by _ground_truth below).
GroundTruth = Dict[str, Any]
# Return shape of every injector: (broken_request, broken_headers, ground_truth).
InjectorResult = Tuple[Dict[str, Any], Dict[str, str], GroundTruth]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _ground_truth(
    error_type: str,
    affected_fields: List[str],
    valid_request: Dict[str, Any],
    valid_headers: Dict[str, str],
) -> GroundTruth:
    """Assemble the ground-truth record for one injected error.

    The arguments are stored as given (no copies are made) under the keys
    ``error_type``, ``affected_fields``, ``valid_request`` and
    ``valid_headers``.
    """
    record = dict(
        error_type=error_type,
        affected_fields=affected_fields,
        valid_request=valid_request,
        valid_headers=valid_headers,
    )
    return record
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# =========================================================================
|
| 35 |
+
# 1. missing_required_field
|
| 36 |
+
# =========================================================================
|
| 37 |
+
|
| 38 |
+
def inject_missing_required_field(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Drop one randomly chosen required field from a copy of the request.

    Only required fields actually present in the request are candidates.
    With no candidate, the copy is returned unchanged and the ground
    truth lists no affected fields. Headers are passed through untouched.
    """
    broken = copy.deepcopy(request)
    present = [name for name in spec["required_fields"] if name in broken]

    removed: List[str] = []
    if present:
        victim = rng.choice(present)
        del broken[victim]
        removed.append(victim)

    truth = _ground_truth("missing_required_field", removed, request, headers)
    return broken, headers, truth
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# =========================================================================
|
| 59 |
+
# 2. wrong_field_type
|
| 60 |
+
# =========================================================================
|
| 61 |
+
|
| 62 |
+
def inject_wrong_field_type(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Change a required field's value to the wrong type.

    One required field present in the request is picked with ``rng`` and
    replaced in a deep copy: booleans become the strings "true"/"false",
    other numbers become their string form, strings become the integer
    12345, and lists/dicts become marker strings. Headers are untouched.

    Returns:
        ``(broken_request, headers, ground_truth)``. With no candidate
        field the copy is returned unchanged and the ground truth lists
        no affected fields.
    """
    broken = copy.deepcopy(request)
    candidates = [f for f in spec["required_fields"] if f in broken]
    if not candidates:
        return broken, headers, _ground_truth(
            "wrong_field_type", [], request, headers
        )
    field = rng.choice(candidates)
    original = broken[field]

    # bool must be tested before int: bool is a subclass of int, so an
    # int-first check would swallow True/False and leave the boolean
    # branch unreachable.
    if isinstance(original, bool):
        broken[field] = "true" if original else "false"
    elif isinstance(original, (int, float)):
        broken[field] = str(original)
    elif isinstance(original, str):
        broken[field] = 12345
    elif isinstance(original, list):
        broken[field] = "should_be_array"
    elif isinstance(original, dict):
        broken[field] = "should_be_object"
    else:
        broken[field] = "wrong_type"

    return broken, headers, _ground_truth(
        "wrong_field_type", [field], request, headers
    )
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# =========================================================================
|
| 100 |
+
# 3. invalid_email_format
|
| 101 |
+
# =========================================================================
|
| 102 |
+
|
| 103 |
+
def inject_invalid_email_format(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Replace an email-typed field with a malformed address.

    Candidate fields are those the spec types as "email" and that are
    present in the request. When there is none, the call is delegated to
    ``inject_missing_required_field`` (so the resulting ground truth then
    describes a missing field, not an email error).
    """
    broken = copy.deepcopy(request)
    email_fields = [
        name
        for name, kind in spec["field_types"].items()
        if kind == "email" and name in broken
    ]
    if not email_fields:
        return inject_missing_required_field(request, headers, spec, rng)

    target = rng.choice(email_fields)
    bad_emails = [
        "not-an-email",
        "user@",
        "@domain.com",
        "user@.com",
        "user space@example.com",
    ]
    broken[target] = rng.choice(bad_emails)

    truth = _ground_truth("invalid_email_format", [target], request, headers)
    return broken, headers, truth
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# =========================================================================
|
| 128 |
+
# 4. missing_auth_header
|
| 129 |
+
# =========================================================================
|
| 130 |
+
|
| 131 |
+
def inject_missing_auth_header(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Remove one required header from a copy of the headers.

    ``Authorization`` is removed when present; failing that,
    ``Content-Type``; failing that, nothing is removed and the ground
    truth lists no affected fields. The error type is
    "missing_auth_header" in all three cases. The request body is
    returned as given.
    """
    broken_headers = copy.deepcopy(headers)

    dropped: List[str] = []
    for header_name in ("Authorization", "Content-Type"):
        if header_name in broken_headers:
            del broken_headers[header_name]
            dropped.append(header_name)
            break  # remove at most one header

    truth = _ground_truth("missing_auth_header", dropped, request, headers)
    return request, broken_headers, truth
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
# =========================================================================
|
| 156 |
+
# 5. extra_unknown_field
|
| 157 |
+
# =========================================================================
|
| 158 |
+
|
| 159 |
+
def inject_extra_unknown_field(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Add one field from a fixed pool of unexpected names to the request.

    The name/value pair is picked with ``rng`` and written into a deep
    copy of the request; the ground truth names the added field. Headers
    are passed through untouched.
    """
    extras = (
        ("unknown_field", "unexpected_value"),
        ("debug_mode", True),
        ("internal_id", 99999),
        ("_private", "should_not_exist"),
        ("extra_data", {"nested": "bad"}),
    )
    broken = copy.deepcopy(request)
    extra_name, extra_value = rng.choice(extras)
    broken[extra_name] = extra_value

    truth = _ground_truth("extra_unknown_field", [extra_name], request, headers)
    return broken, headers, truth
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# =========================================================================
|
| 182 |
+
# 6. null_value_in_required
|
| 183 |
+
# =========================================================================
|
| 184 |
+
|
| 185 |
+
def inject_null_value_in_required(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Set one randomly chosen required field to ``None`` in a request copy.

    Only required fields present in the request are candidates. With no
    candidate, the copy is returned unchanged and the ground truth lists
    no affected fields. Headers are passed through untouched.
    """
    broken = copy.deepcopy(request)
    present = [name for name in spec["required_fields"] if name in broken]

    nulled: List[str] = []
    if present:
        victim = rng.choice(present)
        broken[victim] = None
        nulled.append(victim)

    truth = _ground_truth("null_value_in_required", nulled, request, headers)
    return broken, headers, truth
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
# =========================================================================
|
| 206 |
+
# 7. wrong_http_method
|
| 207 |
+
# =========================================================================
|
| 208 |
+
|
| 209 |
+
def inject_wrong_http_method(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Record that the request was sent with the wrong HTTP method.

    Neither the body nor the headers are altered. The error lives only in the
    returned ground truth, which gains ``wrong_method`` and ``correct_method``
    entries. Per the original note, the observation is expected to display the
    wrong method in place of the real one.

    Args:
        request: Request body, returned as-is.
        headers: Request headers, returned as-is.
        spec: Endpoint spec; ``spec["http_method"]`` is the correct method.
        rng: Random source used to pick the wrong method.

    Returns:
        A ``(request, headers, ground_truth)`` triple.
    """
    expected_method = spec["http_method"]
    alternatives = [
        method
        for method in ("GET", "POST", "PUT", "PATCH", "DELETE")
        if method != expected_method
    ]
    picked = rng.choice(alternatives)

    truth = _ground_truth("wrong_http_method", ["http_method"], request, headers)
    truth["wrong_method"] = picked
    truth["correct_method"] = expected_method
    return request, headers, truth
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
# =========================================================================
|
| 232 |
+
# 8. malformed_json_value
|
| 233 |
+
# =========================================================================
|
| 234 |
+
|
| 235 |
+
def inject_malformed_json_value(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Overwrite one required field with a string resembling broken JSON.

    The request is an already-parsed dict, so real syntax errors cannot be
    represented; a fragment that looks malformed is stored instead.

    Args:
        request: Request body to corrupt; it is deep-copied, never mutated.
        headers: Request headers, returned unchanged.
        spec: Endpoint spec; only ``spec["required_fields"]`` is read.
        rng: Random source; drawn once for the field, then once for the value.

    Returns:
        A ``(broken_request, headers, ground_truth)`` triple. When none of the
        required fields is present, the copy is returned unmodified and the
        ground truth lists no affected fields.
    """
    mutated = copy.deepcopy(request)
    present_required = [
        name for name in spec["required_fields"] if name in mutated
    ]

    affected = []
    if present_required:
        target = rng.choice(present_required)
        fragments = (
            "{broken",
            "[unclosed",
            "value with 'mixed\" quotes",
            "undefined",
            "NaN",
        )
        mutated[target] = rng.choice(fragments)
        affected.append(target)

    return mutated, headers, _ground_truth(
        "malformed_json_value", affected, request, headers
    )
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
# =========================================================================
|
| 267 |
+
# 9. invalid_enum_value
|
| 268 |
+
# =========================================================================
|
| 269 |
+
|
| 270 |
+
def inject_invalid_enum_value(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Set one enum-typed field to a value outside its allowed list.

    Args:
        request: Request body to corrupt; it is deep-copied, never mutated.
        headers: Request headers, returned unchanged.
        spec: Endpoint spec; ``spec["field_types"]`` maps field names to type
            strings, and enum fields are those whose type starts with
            ``"enum:"``.
        rng: Random source used to pick the field.

    Returns:
        A ``(broken_request, headers, ground_truth)`` triple. If the request
        has no enum field, the call is delegated to
        ``inject_wrong_field_type`` and its result is returned instead.
    """
    mutated = copy.deepcopy(request)
    enum_candidates = [
        name
        for name, type_name in spec["field_types"].items()
        if type_name.startswith("enum:") and name in mutated
    ]

    if enum_candidates:
        target = rng.choice(enum_candidates)
        mutated[target] = "INVALID_ENUM_VALUE"
        return mutated, headers, _ground_truth(
            "invalid_enum_value", [target], request, headers
        )

    # Nothing enum-typed to corrupt: fall back to a wrong-type error.
    return inject_wrong_field_type(request, headers, spec, rng)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
# =========================================================================
|
| 294 |
+
# 10. datetime_format_error
|
| 295 |
+
# =========================================================================
|
| 296 |
+
|
| 297 |
+
def inject_datetime_format_error(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Replace one datetime field with a date string in an unexpected format.

    Args:
        request: Request body to corrupt; it is deep-copied, never mutated.
        headers: Request headers, returned unchanged.
        spec: Endpoint spec; fields whose ``spec["field_types"]`` entry equals
            ``"datetime"`` are eligible.
        rng: Random source; drawn once for the field, then once for the value.

    Returns:
        A ``(broken_request, headers, ground_truth)`` triple. If the request
        has no datetime field, the call is delegated to
        ``inject_wrong_field_type`` and its result is returned instead.
    """
    mutated = copy.deepcopy(request)
    datetime_candidates = [
        name
        for name, type_name in spec["field_types"].items()
        if type_name == "datetime" and name in mutated
    ]

    if datetime_candidates:
        target = rng.choice(datetime_candidates)
        wrong_formats = (
            "04/01/2026",
            "2026.04.01",
            "April 1, 2026",
            "1711929600",
            "2026-04-01 09:00",
        )
        mutated[target] = rng.choice(wrong_formats)
        return mutated, headers, _ground_truth(
            "datetime_format_error", [target], request, headers
        )

    # Nothing datetime-typed to corrupt: fall back to a wrong-type error.
    return inject_wrong_field_type(request, headers, spec, rng)
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
# =========================================================================
|
| 328 |
+
# Registry and helpers
|
| 329 |
+
# =========================================================================
|
| 330 |
+
|
| 331 |
+
# Registry: error-type name -> injector function. Every injector shares the
# signature (request, headers, spec, rng) -> InjectorResult.
INJECTOR_MAP = {
    "missing_required_field": inject_missing_required_field,
    "wrong_field_type": inject_wrong_field_type,
    "invalid_email_format": inject_invalid_email_format,
    "missing_auth_header": inject_missing_auth_header,
    "extra_unknown_field": inject_extra_unknown_field,
    "null_value_in_required": inject_null_value_in_required,
    "wrong_http_method": inject_wrong_http_method,
    "malformed_json_value": inject_malformed_json_value,
    "invalid_enum_value": inject_invalid_enum_value,
    "datetime_format_error": inject_datetime_format_error,
}

# Error-type names in registry order. Derived from INJECTOR_MAP so the two
# can never drift apart (dicts preserve insertion order).
ERROR_TYPES = list(INJECTOR_MAP)
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def inject_error(
    error_type: str,
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
) -> InjectorResult:
    """Apply the injector registered under ``error_type`` exactly once.

    Args:
        error_type: Key into ``INJECTOR_MAP``.
        request: Request body handed to the injector.
        headers: Request headers handed to the injector.
        spec: Endpoint spec handed to the injector.
        rng: Random source handed to the injector.

    Returns:
        Whatever the selected injector returns.

    Raises:
        KeyError: If ``error_type`` is not a key of ``INJECTOR_MAP``.
    """
    return INJECTOR_MAP[error_type](request, headers, spec, rng)
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
def inject_multiple_errors(
    request: Dict[str, Any],
    headers: Dict[str, str],
    spec: Dict[str, Any],
    rng: random_module.Random,
    count: int = 2,
) -> Tuple[Dict[str, Any], Dict[str, str], List[GroundTruth]]:
    """Chain several distinct injectors over one request.

    ``count`` error types (capped at the number of registered types) are
    sampled without replacement from ``ERROR_TYPES`` and applied in sampled
    order, each one receiving the output of the previous one.

    Args:
        request: Request body; deep-copied before the first injector runs.
        headers: Request headers; deep-copied before the first injector runs.
        spec: Endpoint spec forwarded to every injector.
        rng: Random source used for sampling and forwarded to every injector.
        count: How many error types to sample.

    Returns:
        ``(broken_request, broken_headers, ground_truths)`` with one ground
        truth per applied injector, in application order.
    """
    mutated_request = copy.deepcopy(request)
    mutated_headers = copy.deepcopy(headers)
    sample_size = min(count, len(ERROR_TYPES))

    truths = []
    for error_type in rng.sample(ERROR_TYPES, sample_size):
        # NOTE(review): from the second injector on, the "request"/"headers"
        # an injector sees (and forwards to its ground truth) are already
        # mutated by earlier injectors -- confirm this is what graders expect.
        mutated_request, mutated_headers, truth = INJECTOR_MAP[error_type](
            mutated_request, mutated_headers, spec, rng
        )
        truths.append(truth)

    return mutated_request, mutated_headers, truths
|
server/validators.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Field validation helpers used by the graders.
|
| 3 |
+
|
| 4 |
+
Each validator returns True if the value matches the expected type,
|
| 5 |
+
False otherwise. These are intentionally simple and deterministic.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
from typing import Any, Dict, List, Tuple
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Patterns are anchored with \Z rather than $: in Python, $ also matches just
# before a trailing newline, so "user@example.com\n" would otherwise pass
# validation when checked with .match().
EMAIL_RE = re.compile(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\Z")
# Optional leading "+", then 7-15 digits with a non-zero first digit.
PHONE_RE = re.compile(r"^\+?[1-9]\d{6,14}\Z")
# http:// or https:// followed by at least one non-whitespace character.
URL_RE = re.compile(r"^https?://[^\s]+\Z")
# YYYY-MM-DDTHH:MM:SS followed by "Z" or a +HH:MM / -HH:MM offset.
ISO_DATETIME_RE = re.compile(
    r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(Z|[+-]\d{2}:\d{2})\Z"
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def validate_email(value: Any) -> bool:
    """Return True when ``value`` is a string matching ``EMAIL_RE``."""
    if not isinstance(value, str):
        return False
    return EMAIL_RE.match(value) is not None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def validate_phone(value: Any) -> bool:
    """Return True when ``value`` is a string matching ``PHONE_RE``.

    Spaces and hyphens are removed before matching, so grouped numbers are
    judged on their digits alone.
    """
    if isinstance(value, str):
        compact = value.replace("-", "").replace(" ", "")
        return PHONE_RE.match(compact) is not None
    return False
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def validate_url(value: Any) -> bool:
    """Return True when ``value`` is a string matching ``URL_RE``."""
    if not isinstance(value, str):
        return False
    return URL_RE.match(value) is not None
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def validate_datetime(value: Any) -> bool:
    """Return True when ``value`` is a string matching ``ISO_DATETIME_RE``.

    Only the textual shape is checked; the pattern does not verify that the
    date or time components are in range.
    """
    if not isinstance(value, str):
        return False
    return ISO_DATETIME_RE.match(value) is not None
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def validate_enum(value: Any, allowed_values: List[str]) -> bool:
    """Return True when ``value`` is a string contained in ``allowed_values``."""
    if not isinstance(value, str):
        return False
    return value in allowed_values
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def validate_field_type(value: Any, expected_type: str) -> bool:
    """Check a value against a type string taken from the spec.

    Supported types: string, integer, float, boolean, email, datetime,
    url, phone, enum:val1,val2, object, array.

    Args:
        value: The value to check. ``None`` never matches any type.
        expected_type: One of the type strings listed above.

    Returns:
        True if the value satisfies the type. An unrecognised type string
        accepts any non-None value.
    """
    if value is None:
        return False

    # Numeric types: bool is a subclass of int, so reject it explicitly.
    if expected_type in ("integer", "float"):
        if isinstance(value, bool):
            return False
        numeric = int if expected_type == "integer" else (int, float)
        return isinstance(value, numeric)

    # Types that map directly onto a Python built-in.
    builtin_types = {"string": str, "boolean": bool, "object": dict, "array": list}
    if expected_type in builtin_types:
        return isinstance(value, builtin_types[expected_type])

    # Format-checked string types.
    if expected_type == "email":
        return validate_email(value)
    if expected_type == "datetime":
        return validate_datetime(value)
    if expected_type == "url":
        return validate_url(value)
    if expected_type == "phone":
        return validate_phone(value)

    if expected_type.startswith("enum:"):
        # Everything after the first ":" is a comma-separated list of members.
        _, _, members = expected_type.partition(":")
        return validate_enum(value, members.split(","))

    # Unknown type, accept anything non-None
    return True
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def validate_request_against_spec(
    request: Dict[str, Any],
    spec: Dict[str, Any],
) -> Tuple[float, str]:
    """Score a request body against its spec.

    Three groups of checks are counted:
      1. every required field is present and non-null;
      2. every present, non-null field listed in ``spec["field_types"]`` has
         the expected type;
      3. every request field that is neither required nor optional counts as
         one failed check.

    Args:
        request: The request body to validate.
        spec: Endpoint spec with ``required_fields``, ``field_types`` and,
            optionally, ``optional_fields``.

    Returns:
        ``(score, feedback)`` where score is passed/total rounded to four
        decimals (0.0 when there are no checks at all) and feedback is a
        summary line followed by one line per check.
    """
    report = []
    passed = 0
    total = 0

    # 1. Required fields must exist and must not be null.
    required = spec["required_fields"]
    for name in required:
        total += 1
        if request.get(name) is None:
            report.append(f" {name}: MISSING")
        else:
            passed += 1
            report.append(f" {name}: PRESENT")

    # 2. Type checks, only for fields that carry a value.
    for name, type_name in spec["field_types"].items():
        current = request.get(name)
        if current is None:
            continue
        total += 1
        if validate_field_type(current, type_name):
            passed += 1
            report.append(f" {name} type: VALID ({type_name})")
        else:
            report.append(f" {name} type: INVALID (expected {type_name})")

    # 3. Each field outside the spec is a failed check.
    known = set(required).union(spec.get("optional_fields", []))
    unknown = [name for name in request if name not in known]
    total += len(unknown)
    report.extend(f" {name}: UNKNOWN FIELD (not in spec)" for name in unknown)

    score = passed / total if total else 0.0
    summary = f"Validation: {passed}/{total} checks passed.\n"
    return round(score, 4), summary + "\n".join(report)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def validate_headers_against_spec(
    headers: Dict[str, str],
    spec: Dict[str, Any],
) -> Tuple[float, str]:
    """Validate request headers against the spec's required_headers.

    Only presence is checked, not header values. Header names are compared
    case-insensitively because HTTP field names are case-insensitive, so
    ``authorization`` satisfies a required ``Authorization``.

    Args:
        headers: Headers supplied with the request. ``None`` is treated as
            no headers.
        spec: Endpoint spec; ``spec.get("required_headers")`` is read and may
            be absent or empty.

    Returns:
        ``(score, feedback)`` where score is the fraction of required headers
        that are present, rounded to four decimals. When the spec requires
        no headers the result is ``(1.0, "No required headers.")``.
    """
    required = spec.get("required_headers", {})
    if not required:
        return 1.0, "No required headers."

    # Normalise once so each lookup below is a case-insensitive set test.
    supplied = {str(name).lower() for name in (headers or {})}

    total = len(required)
    passed = 0
    checks = []

    for header_name in required:
        if str(header_name).lower() in supplied:
            passed += 1
            checks.append(f" {header_name}: PRESENT")
        else:
            checks.append(f" {header_name}: MISSING")

    # total is non-zero here: an empty requirement set returned early above.
    score = passed / total
    feedback = f"Headers: {passed}/{total} present.\n" + "\n".join(checks)
    return round(score, 4), feedback
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|