Spaces:
Sleeping
Sleeping
Commit Β·
8b10144
1
Parent(s): 486044c
chore: remove __pycache__ files
Browse files- .gitignore +8 -0
- LICENSE +24 -0
- README.md +220 -150
- __init__.py +12 -2
- __pycache__/__init__.cpython-313.pyc +0 -0
- __pycache__/client.cpython-313.pyc +0 -0
- __pycache__/models.cpython-313.pyc +0 -0
- __pycache__/scenarios.cpython-313.pyc +0 -0
- inference.py +58 -19
- models.py +19 -2
- scenarios.py +961 -286
- server/__pycache__/__init__.cpython-313.pyc +0 -0
- server/__pycache__/api_debug_env_environment.cpython-313.pyc +0 -0
- server/__pycache__/app.cpython-313.pyc +0 -0
- server/api_debug_env_environment.py +390 -46
- server/app.py +40 -7
- tests/__pycache__/__init__.cpython-313.pyc +0 -0
- tests/__pycache__/test_environment.cpython-313-pytest-8.4.1.pyc +0 -0
- tests/test_environment.py +409 -45
.gitignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hackathonDetails/
|
| 2 |
+
.agents/
|
| 3 |
+
AGENTS.md
|
| 4 |
+
PROGRESS.md
|
| 5 |
+
__pycache__/
|
| 6 |
+
*.pyc
|
| 7 |
+
*.pyo
|
| 8 |
+
.env
|
LICENSE
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
BSD 2-Clause License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2026, Yadnyesh
|
| 4 |
+
|
| 5 |
+
Redistribution and use in source and binary forms, with or without
|
| 6 |
+
modification, are permitted provided that the following conditions are met:
|
| 7 |
+
|
| 8 |
+
1. Redistributions of source code must retain the above copyright notice, this
|
| 9 |
+
list of conditions and the following disclaimer.
|
| 10 |
+
|
| 11 |
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
| 12 |
+
this list of conditions and the following disclaimer in the documentation
|
| 13 |
+
and/or other materials provided with the distribution.
|
| 14 |
+
|
| 15 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| 16 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 17 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
| 18 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
| 19 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
| 20 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 21 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
| 22 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
| 23 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 24 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
README.md
CHANGED
|
@@ -1,224 +1,294 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
app_port: 8000
|
| 9 |
tags:
|
| 10 |
- openenv
|
| 11 |
---
|
| 12 |
-
# API Integration Debugging Environment
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
2. **Inspect service configurations** to find misconfigurations
|
| 22 |
-
3. **Test endpoints** to observe current behavior
|
| 23 |
-
4. **Submit fixes** with corrected configuration payloads
|
| 24 |
|
| 25 |
-
|
| 26 |
-
- **3 difficulty levels** with increasing complexity (2, 3, and 5 issues)
|
| 27 |
-
- **Strict value validation** on fixes (grader checks both key AND value)
|
| 28 |
-
- **Seed-based randomization** for reproducible yet varied episodes
|
| 29 |
-
- **Penalty for repeated inspections** to encourage efficient exploration
|
| 30 |
-
- **Comprehensive test suite** with 30+ unit tests
|
| 31 |
|
| 32 |
-
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
|
| 35 |
-
class ApiDebugAction(Action):
|
| 36 |
-
action_type: str # "inspect_logs" | "inspect_config" | "inspect_endpoint" | "submit_fix"
|
| 37 |
-
target: str # Service name (e.g. "payment_client", "webhook_sender")
|
| 38 |
-
fix_payload: dict # Required when action_type="submit_fix"
|
| 39 |
-
```
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|--------|-------------|--------|
|
| 43 |
-
| `inspect_logs` | Read error logs for a service | +0.15 (finds new issue) / +0.05 (first time, no issue) / 0.0 (repeat) |
|
| 44 |
-
| `inspect_config` | View current config of a service | +0.05 (has issues) / +0.01 (no issues) / 0.0 (repeat) |
|
| 45 |
-
| `inspect_endpoint` | Test-call an endpoint | +0.02 to +0.05 |
|
| 46 |
-
| `submit_fix` | Submit a configuration fix | +0.25 (correct) / -0.1 (wrong) |
|
| 47 |
-
| *step cost* | Applied every step | -0.01 |
|
| 48 |
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
| 65 |
```
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
## Tasks
|
| 68 |
|
| 69 |
-
###
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
- **Services**: `payment_client`, `payment_gateway`
|
| 73 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
|
| 76 |
-
- **Issues**: 3 (rate limit too high, insufficient retries, empty webhook signature)
|
| 77 |
-
- **Max Steps**: 25
|
| 78 |
- **Services**: `webhook_sender`, `webhook_receiver`, `notification_service`
|
| 79 |
-
- **
|
|
|
|
| 80 |
|
| 81 |
-
###
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
| 84 |
- **Services**: `order_service`, `inventory_service`, `shipping_service`, `api_gateway`, `auth_service`
|
| 85 |
-
- **
|
|
|
|
| 86 |
|
| 87 |
-
##
|
| 88 |
|
| 89 |
-
|
| 90 |
-
- **Partial progress**: First useful inspection earns reward (+0.05 to +0.15)
|
| 91 |
-
- **Repeated inspection**: 0 reward (prevents reward farming)
|
| 92 |
-
- **Fix rewards**: +0.25 per correctly fixed issue (strict key+value validation)
|
| 93 |
-
- **Completion bonus**: +0.2 when all issues are resolved
|
| 94 |
-
- **Penalties**: -0.1 for wrong fixes, -0.05 for invalid actions
|
| 95 |
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
```
|
| 99 |
-
Score =
|
| 100 |
-
|
| 101 |
-
exploration_bonus = issues_found / issues_total Γ 0.1
|
| 102 |
```
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
## Baseline Scores (Rule-Based Agent)
|
| 107 |
|
| 108 |
-
| Task | Score |
|
| 109 |
-
|------|-------|-------
|
| 110 |
-
| Easy | ~0.
|
| 111 |
-
| Medium | ~0.
|
| 112 |
-
| Hard | ~0.
|
| 113 |
|
| 114 |
-
|
| 115 |
|
| 116 |
-
##
|
| 117 |
|
| 118 |
-
|
| 119 |
-
[START] task=easy env=api_debug_env model=Qwen/Qwen2.5-72B-Instruct
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
#
|
| 128 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
#
|
| 131 |
-
[STEP] step=4 action=submit_fix(target=payment_client,fix={"headers.Content-Type":"application/json"}) reward=0.44 done=true error=null
|
| 132 |
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
```
|
| 135 |
|
| 136 |
## Setup & Usage
|
| 137 |
|
| 138 |
-
###
|
| 139 |
-
- Python 3.10+
|
| 140 |
-
- Docker (for containerized deployment)
|
| 141 |
-
|
| 142 |
-
### Local Development
|
| 143 |
|
| 144 |
```bash
|
| 145 |
-
cd api_debug_env
|
| 146 |
-
|
| 147 |
-
# Install dependencies
|
| 148 |
uv sync
|
| 149 |
-
|
| 150 |
-
# Run server
|
| 151 |
-
uv run server
|
| 152 |
-
# or
|
| 153 |
-
uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
|
| 154 |
```
|
| 155 |
|
| 156 |
-
###
|
| 157 |
|
| 158 |
```bash
|
| 159 |
-
|
| 160 |
-
docker build -t api_debug_env:latest -f server/Dockerfile .
|
| 161 |
-
docker run -p 8000:8000 api_debug_env:latest
|
| 162 |
```
|
| 163 |
|
| 164 |
-
### Run
|
| 165 |
|
| 166 |
```bash
|
| 167 |
-
|
| 168 |
-
export HF_TOKEN=your-key
|
| 169 |
-
|
| 170 |
-
# Run inference on all tasks
|
| 171 |
-
python inference.py
|
| 172 |
```
|
| 173 |
|
| 174 |
-
###
|
| 175 |
|
| 176 |
```bash
|
| 177 |
-
|
| 178 |
-
|
| 179 |
```
|
| 180 |
|
| 181 |
### API Endpoints
|
| 182 |
|
| 183 |
| Endpoint | Method | Description |
|
| 184 |
|----------|--------|-------------|
|
| 185 |
-
| `/` | GET |
|
| 186 |
-
| `/reset` | POST | Reset environment
|
| 187 |
| `/step` | POST | Execute an action |
|
| 188 |
| `/state` | GET | Get current state |
|
| 189 |
-
| `/tasks` | GET | List all tasks with
|
| 190 |
-
| `/grader` | POST | Get
|
| 191 |
-
| `/baseline` | POST | Run baseline
|
| 192 |
-
| `/
|
| 193 |
-
| `/health` | GET | Health check endpoint |
|
| 194 |
|
| 195 |
-
##
|
| 196 |
|
| 197 |
-
```
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
βββ models.py # Pydantic Action & Observation models
|
| 201 |
-
βββ scenarios.py # 3 task scenarios with randomization support
|
| 202 |
-
βββ client.py # WebSocket client for the environment
|
| 203 |
-
βββ openenv.yaml # OpenEnv metadata (spec v1)
|
| 204 |
-
βββ pyproject.toml # Dependencies & build config
|
| 205 |
-
βββ server/
|
| 206 |
-
β βββ app.py # FastAPI application
|
| 207 |
-
β βββ api_debug_env_environment.py # Core environment logic
|
| 208 |
-
β βββ Dockerfile # Container build
|
| 209 |
-
βββ tests/
|
| 210 |
-
β βββ test_environment.py # 30+ unit & integration tests
|
| 211 |
-
βββ scripts/
|
| 212 |
-
βββ baseline_inference.py # Original baseline agent script
|
| 213 |
```
|
| 214 |
|
| 215 |
-
##
|
| 216 |
|
| 217 |
-
|
| 218 |
-
- Shuffles log entry order so agents can't memorize positions
|
| 219 |
-
- Ensures reproducible episodes for consistent evaluation
|
| 220 |
-
- When `seed=None` (default), returns the canonical scenario for testing
|
| 221 |
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: API Debug Env
|
| 3 |
+
emoji: π§
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
app_port: 8000
|
| 8 |
tags:
|
| 9 |
- openenv
|
| 10 |
---
|
|
|
|
| 11 |
|
| 12 |
+
# π§ API Integration Debugging Environment
|
| 13 |
|
| 14 |
+
> **A real-world environment for training and evaluating AI agents on multi-service API debugging with cascading failures, dynamic state, and multi-dimensional grading.**
|
| 15 |
|
| 16 |
+
[](https://github.com/meta-pytorch/OpenEnv)
|
| 17 |
+
[](https://python.org)
|
| 18 |
+
[](LICENSE)
|
| 19 |
|
| 20 |
+
## Why API Debugging?
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
API integration failures are one of the most common and time-consuming issues in production software. When Service A calls Service B which calls Service C, a single misconfiguration can cascade through the entire system. Debugging requires:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
- **Structured diagnosis**: inspecting logs, configs, and endpoints across services
|
| 25 |
+
- **Dependency awareness**: understanding which service failures affect which downstream services
|
| 26 |
+
- **Strategic reasoning**: fixing upstream issues first to unmask downstream problems
|
| 27 |
|
| 28 |
+
This environment simulates *real-world cascading API failures* β not toy string-matching puzzles.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
## How It Works
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
```
|
| 33 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 34 |
+
β Agent Debugging Loop β
|
| 35 |
+
β β
|
| 36 |
+
β 1. reset() β Initial observation with broken service state β
|
| 37 |
+
β 2. step(inspect_logs) β Error logs from target service β
|
| 38 |
+
β 3. step(inspect_config) β Current (broken) configuration β
|
| 39 |
+
β 4. step(inspect_endpoint) β Live error response simulation β
|
| 40 |
+
β 5. step(submit_fix) β Fix validation + cascade resolution β
|
| 41 |
+
β 6. grade() β Multi-dimensional rubric score β
|
| 42 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
```
|
| 44 |
|
| 45 |
+
### Service Dependency Graphs
|
| 46 |
+
|
| 47 |
+
Each task models a real multi-service system with dependency chains:
|
| 48 |
+
|
| 49 |
+
```mermaid
|
| 50 |
+
graph LR
|
| 51 |
+
A[order_service] --> B[inventory_service]
|
| 52 |
+
B --> C[shipping_service]
|
| 53 |
+
A --> D[api_gateway]
|
| 54 |
+
B --> E[auth_service]
|
| 55 |
+
style A fill:#ff6b6b
|
| 56 |
+
style B fill:#ffd93d
|
| 57 |
+
style C fill:#6bcb77
|
| 58 |
+
style D fill:#6bcb77
|
| 59 |
+
style E fill:#6bcb77
|
| 60 |
```
|
| 61 |
|
| 62 |
+
**Red** = error, **Yellow** = degraded, **Green** = healthy. Fixing upstream issues changes downstream health.
|
| 63 |
+
|
| 64 |
+
## Environment Design
|
| 65 |
+
|
| 66 |
+
### Dynamic State
|
| 67 |
+
|
| 68 |
+
Unlike static environments, our state changes as the agent acts:
|
| 69 |
+
|
| 70 |
+
1. **Service health tracking**: Each service has a status (`healthy`, `degraded`, `error`) that updates when issues are fixed
|
| 71 |
+
2. **Dynamic logs**: After fixing an issue, re-inspecting logs shows *new entries* reflecting the fix
|
| 72 |
+
3. **Cascading effects**: Fixing an upstream issue can change downstream service behavior
|
| 73 |
+
4. **Error trace**: Shows the full error propagation chain, shrinking as issues are fixed
|
| 74 |
+
|
| 75 |
+
### Reward Shaping
|
| 76 |
+
|
| 77 |
+
| Action | Reward | Condition |
|
| 78 |
+
|--------|--------|-----------|
|
| 79 |
+
| `inspect_logs` (new service, finds issues) | +0.15 | New relevant error patterns found |
|
| 80 |
+
| `inspect_logs` (new service, no issues) | +0.05 | Valid inspection but no issues here |
|
| 81 |
+
| `inspect_logs` (repeat, unchanged) | 0.00 | No new information |
|
| 82 |
+
| `inspect_logs` (repeat, dynamic logs) | +0.05 | New logs appeared after a fix |
|
| 83 |
+
| `inspect_config` (service has issues) | +0.05 | Relevant configuration retrieved |
|
| 84 |
+
| `inspect_endpoint` | +0.02 to +0.05 | Endpoint testing |
|
| 85 |
+
| `submit_fix` (correct) | +0.25 | Issue resolved |
|
| 86 |
+
| `submit_fix` (correct + inspected first) | +0.30 | Diagnosis + fix strategy bonus |
|
| 87 |
+
| `submit_fix` (partial β close value) | +0.03 | Right key, close but not exact value |
|
| 88 |
+
| `submit_fix` (wrong) | -0.10 | Incorrect fix |
|
| 89 |
+
| All actions complete | +0.20 | Completion bonus |
|
| 90 |
+
| Every step | -0.01 | Step cost (encourages efficiency) |
|
| 91 |
+
|
| 92 |
## Tasks
|
| 93 |
|
| 94 |
+
### Easy: Payment API Integration (2 issues, 15 steps)
|
| 95 |
+
|
| 96 |
+
Payment client failing to connect to payment gateway. Issues involve authentication and protocol errors.
|
| 97 |
+
|
| 98 |
+
- **Issue pool**: 4 possible issues, 2 selected per episode
|
| 99 |
- **Services**: `payment_client`, `payment_gateway`
|
| 100 |
+
- **Issue types**: Auth header missing, wrong Content-Type, timeout, deprecated endpoint
|
| 101 |
+
|
| 102 |
+
### Medium: Webhook Event Chain (3 issues, 25 steps)
|
| 103 |
+
|
| 104 |
+
Webhook notification system dropping events across a 3-service chain.
|
| 105 |
|
| 106 |
+
- **Issue pool**: 5 possible issues, 3 selected per episode
|
|
|
|
|
|
|
| 107 |
- **Services**: `webhook_sender`, `webhook_receiver`, `notification_service`
|
| 108 |
+
- **Issue types**: Rate limiting, retry misconfiguration, webhook signature, endpoint URL, compression
|
| 109 |
+
- **Dependencies**: Retry issue is masked by rate limit β must fix rate limit first
|
| 110 |
|
| 111 |
+
### Hard: E-Commerce Order Pipeline (5 issues, 40 steps)
|
| 112 |
+
|
| 113 |
+
Complex order processing pipeline with cascading failures across 5 services.
|
| 114 |
+
|
| 115 |
+
- **Issue pool**: 7 possible issues, 5 selected per episode
|
| 116 |
- **Services**: `order_service`, `inventory_service`, `shipping_service`, `api_gateway`, `auth_service`
|
| 117 |
+
- **Issue types**: Deprecated URLs, timeouts, race conditions, expired tokens, missing token refresh, circuit breakers, idempotency
|
| 118 |
+
- **Dependencies**: Timeout masked by wrong URL; token refresh masked by expired token
|
| 119 |
|
| 120 |
+
## Grading Rubric
|
| 121 |
|
| 122 |
+
The grader uses a **multi-dimensional rubric**, not a simple fix ratio:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
+
| Dimension | Weight | Description |
|
| 125 |
+
|-----------|--------|-------------|
|
| 126 |
+
| **Fix Score** | 40% | `issues_fixed / total_issues` |
|
| 127 |
+
| **Diagnosis Score** | 20% | Did the agent inspect the service before fixing it? |
|
| 128 |
+
| **Efficiency Score** | 15% | `remaining_steps / max_steps` β faster is better |
|
| 129 |
+
| **Strategy Score** | 25% | Logical debugging approach: inspect before fix, avoid repeats, follow dependency order, use all action types |
|
| 130 |
|
| 131 |
```
|
| 132 |
+
Final Score = fix Γ 0.40 + diagnosis Γ 0.20 + efficiency Γ 0.15 + strategy Γ 0.25
|
| 133 |
+
Clamped to (0.001, 0.999)
|
|
|
|
| 134 |
```
|
| 135 |
|
| 136 |
+
### Baseline Scores
|
|
|
|
|
|
|
| 137 |
|
| 138 |
+
| Task | Score | Steps | Issues Fixed |
|
| 139 |
+
|------|-------|-------|--------------|
|
| 140 |
+
| Easy | ~0.75 | 7 | 2/2 |
|
| 141 |
+
| Medium | ~0.55 | 10 | 3/3 |
|
| 142 |
+
| Hard | ~0.45 | 15 | 5/5 |
|
| 143 |
|
| 144 |
+
*Baseline uses a rule-based heuristic agent (inspect all β fix all).*
|
| 145 |
|
| 146 |
+
## Action & Observation Spaces
|
| 147 |
|
| 148 |
+
### Action Space
|
|
|
|
| 149 |
|
| 150 |
+
```json
|
| 151 |
+
{
|
| 152 |
+
"action_type": "inspect_logs | inspect_config | inspect_endpoint | submit_fix",
|
| 153 |
+
"target": "service_name",
|
| 154 |
+
"fix_payload": {
|
| 155 |
+
"config_key": "corrected_value"
|
| 156 |
+
}
|
| 157 |
+
}
|
| 158 |
+
```
|
| 159 |
|
| 160 |
+
### Observation Space
|
| 161 |
+
|
| 162 |
+
```json
|
| 163 |
+
{
|
| 164 |
+
"task_id": "easy",
|
| 165 |
+
"task_description": "...",
|
| 166 |
+
"logs": ["[ERROR] ..."],
|
| 167 |
+
"config_snapshot": {"headers": {"Content-Type": "text/plain"}},
|
| 168 |
+
"api_response": {"status": "error", "status_code": 401},
|
| 169 |
+
"service_status": {"payment_client": "error", "payment_gateway": "healthy"},
|
| 170 |
+
"dependency_graph": {"payment_client": ["payment_gateway"]},
|
| 171 |
+
"error_trace": ["[CRITICAL] payment_client: Missing Authorization header"],
|
| 172 |
+
"remaining_steps": 14,
|
| 173 |
+
"issues_found": 1,
|
| 174 |
+
"issues_fixed": 0,
|
| 175 |
+
"issues_total": 2,
|
| 176 |
+
"hints": ["Check headers.Authorization"],
|
| 177 |
+
"available_targets": ["payment_client", "payment_gateway"]
|
| 178 |
+
}
|
| 179 |
+
```
|
| 180 |
|
| 181 |
+
## Example Transcript
|
|
|
|
| 182 |
|
| 183 |
+
```
|
| 184 |
+
>>> reset(task_id="easy")
|
| 185 |
+
task_description: "Payment processing API integration is failing..."
|
| 186 |
+
service_status: {payment_client: "error", payment_gateway: "healthy"}
|
| 187 |
+
error_trace: [
|
| 188 |
+
"[CRITICAL] payment_client: Missing Authorization header",
|
| 189 |
+
" ββ> payment_gateway: All requests rejected with 401",
|
| 190 |
+
"[ERROR] payment_client: Wrong Content-Type (text/plain instead of application/json)",
|
| 191 |
+
" ββ> payment_gateway: Request body parsing fails"
|
| 192 |
+
]
|
| 193 |
+
|
| 194 |
+
>>> step(inspect_logs, target=payment_client)
|
| 195 |
+
logs: ["[ERROR] POST /process -> 401 Unauthorized", ...]
|
| 196 |
+
issues_found: 2, reward: +0.15
|
| 197 |
+
|
| 198 |
+
>>> step(inspect_config, target=payment_client)
|
| 199 |
+
config: {headers: {Content-Type: "text/plain", Accept: "..."}, ...}
|
| 200 |
+
reward: +0.05
|
| 201 |
+
|
| 202 |
+
>>> step(submit_fix, target=payment_client, fix_payload={headers.Authorization: "Bearer sk_key"})
|
| 203 |
+
action_result: "Fix accepted! Fixed 1 issue(s)."
|
| 204 |
+
service_status: {payment_client: "degraded"} # still has content-type issue
|
| 205 |
+
reward: +0.30
|
| 206 |
+
|
| 207 |
+
>>> step(inspect_logs, target=payment_client) # re-inspect shows new logs!
|
| 208 |
+
logs: [...original..., "[INFO] Authorization header set. Retrying request..."]
|
| 209 |
+
reward: +0.05 # reward for checking updated state
|
| 210 |
+
|
| 211 |
+
>>> step(submit_fix, target=payment_client, fix_payload={headers.Content-Type: "application/json"})
|
| 212 |
+
action_result: "Fix accepted! All issues fixed! Episode complete. π"
|
| 213 |
+
service_status: {payment_client: "healthy", payment_gateway: "healthy"}
|
| 214 |
+
error_trace: ["All issues resolved. No error cascades active."]
|
| 215 |
+
reward: +0.50 (fix + completion bonus)
|
| 216 |
+
|
| 217 |
+
>>> grade()
|
| 218 |
+
score: 0.82 (fix=1.0, diagnosis=1.0, efficiency=0.67, strategy=0.8)
|
| 219 |
```
|
| 220 |
|
| 221 |
## Setup & Usage
|
| 222 |
|
| 223 |
+
### Install Dependencies
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
```bash
|
| 226 |
+
cd api_debug_env # or project root
|
|
|
|
|
|
|
| 227 |
uv sync
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
```
|
| 229 |
|
| 230 |
+
### Run Locally
|
| 231 |
|
| 232 |
```bash
|
| 233 |
+
uvicorn server.app:app --reload --port 8000
|
|
|
|
|
|
|
| 234 |
```
|
| 235 |
|
| 236 |
+
### Run Tests
|
| 237 |
|
| 238 |
```bash
|
| 239 |
+
python -m pytest tests/ -v --tb=short
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
```
|
| 241 |
|
| 242 |
+
### Docker
|
| 243 |
|
| 244 |
```bash
|
| 245 |
+
docker build -t api_debug_env -f server/Dockerfile .
|
| 246 |
+
docker run -p 8000:8000 api_debug_env
|
| 247 |
```
|
| 248 |
|
| 249 |
### API Endpoints
|
| 250 |
|
| 251 |
| Endpoint | Method | Description |
|
| 252 |
|----------|--------|-------------|
|
| 253 |
+
| `/` | GET | Environment info + status |
|
| 254 |
+
| `/reset` | POST | Reset environment |
|
| 255 |
| `/step` | POST | Execute an action |
|
| 256 |
| `/state` | GET | Get current state |
|
| 257 |
+
| `/tasks` | GET | List all tasks with schemas |
|
| 258 |
+
| `/grader` | POST | Get grading score |
|
| 259 |
+
| `/baseline` | POST | Run baseline agent |
|
| 260 |
+
| `/health` | GET | Health check |
|
|
|
|
| 261 |
|
| 262 |
+
### Run Inference
|
| 263 |
|
| 264 |
+
```bash
|
| 265 |
+
export HF_TOKEN=your_token_here
|
| 266 |
+
python inference.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
```
|
| 268 |
|
| 269 |
+
## Design Philosophy
|
| 270 |
|
| 271 |
+
This environment is designed to be useful for **RL/agent training**, not just evaluation:
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
+
1. **Dense Rewards**: Every action type can yield positive or negative reward, enabling gradient-based training
|
| 274 |
+
2. **Progressive Difficulty**: EasyβMediumβHard with increasing service count and dependency complexity
|
| 275 |
+
3. **Partial Credit**: Close-but-wrong fixes get feedback instead of binary rejection
|
| 276 |
+
4. **Strategy Incentives**: The multi-dimensional rubric rewards *how* the agent solves, not just *what* it solves
|
| 277 |
+
5. **Stochastic**: Seed-based randomization prevents policy overfitting to memorized scenarios
|
| 278 |
+
6. **Cascading Dynamics**: Upstream fixes change downstream state, requiring multi-step reasoning
|
| 279 |
|
| 280 |
+
## Project Structure
|
| 281 |
+
|
| 282 |
+
```
|
| 283 |
+
βββ models.py # Pydantic Action & Observation definitions
|
| 284 |
+
βββ scenarios.py # Task scenarios with dependency graphs
|
| 285 |
+
βββ inference.py # MANDATORY baseline inference script
|
| 286 |
+
βββ openenv.yaml # OpenEnv metadata
|
| 287 |
+
βββ pyproject.toml # Dependencies
|
| 288 |
+
βββ server/
|
| 289 |
+
β βββ api_debug_env_environment.py # Core environment logic
|
| 290 |
+
β βββ app.py # FastAPI endpoints
|
| 291 |
+
β βββ Dockerfile # HF Spaces deployment
|
| 292 |
+
βββ tests/
|
| 293 |
+
βββ test_environment.py # 48+ unit & integration tests
|
| 294 |
+
```
|
__init__.py
CHANGED
|
@@ -6,8 +6,18 @@
|
|
| 6 |
|
| 7 |
"""Api Debug Env Environment."""
|
| 8 |
|
| 9 |
-
|
| 10 |
-
from .
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
__all__ = [
|
| 13 |
"ApiDebugAction",
|
|
|
|
| 6 |
|
| 7 |
"""Api Debug Env Environment."""
|
| 8 |
|
| 9 |
+
try:
|
| 10 |
+
from .client import ApiDebugEnv
|
| 11 |
+
from .models import ApiDebugAction, ApiDebugObservation
|
| 12 |
+
except ImportError:
|
| 13 |
+
# When running tests or scripts directly from the project root,
|
| 14 |
+
# relative imports won't work. Fall back to absolute imports.
|
| 15 |
+
try:
|
| 16 |
+
from client import ApiDebugEnv
|
| 17 |
+
from models import ApiDebugAction, ApiDebugObservation
|
| 18 |
+
except ImportError:
|
| 19 |
+
ApiDebugEnv = None # type: ignore
|
| 20 |
+
from models import ApiDebugAction, ApiDebugObservation
|
| 21 |
|
| 22 |
__all__ = [
|
| 23 |
"ApiDebugAction",
|
__pycache__/__init__.cpython-313.pyc
DELETED
|
Binary file (367 Bytes)
|
|
|
__pycache__/client.cpython-313.pyc
DELETED
|
Binary file (3.65 kB)
|
|
|
__pycache__/models.cpython-313.pyc
DELETED
|
Binary file (3.49 kB)
|
|
|
__pycache__/scenarios.cpython-313.pyc
DELETED
|
Binary file (13.1 kB)
|
|
|
inference.py
CHANGED
|
@@ -46,31 +46,39 @@ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
|
| 46 |
BENCHMARK = "api_debug_env"
|
| 47 |
MAX_STEPS = 40 # max across all tasks (hard has 40)
|
| 48 |
TEMPERATURE = 0.3
|
| 49 |
-
MAX_TOKENS =
|
| 50 |
SUCCESS_SCORE_THRESHOLD = 0.1
|
| 51 |
|
| 52 |
SYSTEM_PROMPT = textwrap.dedent("""
|
| 53 |
You are an expert API debugging agent. You are tasked with diagnosing and fixing
|
| 54 |
-
broken API integrations
|
| 55 |
|
| 56 |
-
Available
|
| 57 |
{
|
| 58 |
"action_type": "inspect_logs" | "inspect_config" | "inspect_endpoint" | "submit_fix",
|
| 59 |
"target": "<service_name>",
|
| 60 |
"fix_payload": { ... } // required only for submit_fix
|
| 61 |
}
|
| 62 |
|
| 63 |
-
Strategy:
|
| 64 |
-
1.
|
| 65 |
-
2.
|
| 66 |
-
3.
|
| 67 |
-
4. Submit
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
""").strip()
|
| 75 |
|
| 76 |
|
|
@@ -100,26 +108,46 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
|
|
| 100 |
# βββ LLM Interaction ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 101 |
|
| 102 |
def build_user_prompt(obs: ApiDebugObservation, step: int) -> str:
|
| 103 |
-
"""Build a prompt from the current observation."""
|
| 104 |
parts = [
|
| 105 |
-
f"Step
|
| 106 |
f"Task: {obs.task_description}",
|
| 107 |
f"Remaining steps: {obs.remaining_steps}",
|
| 108 |
f"Issues found: {obs.issues_found}/{obs.issues_total}",
|
| 109 |
f"Issues fixed: {obs.issues_fixed}/{obs.issues_total}",
|
| 110 |
f"Last action result: {obs.action_result}",
|
| 111 |
-
f"Available targets: {obs.available_targets}",
|
| 112 |
]
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
if obs.logs:
|
| 115 |
parts.append("Logs:\n" + "\n".join(obs.logs))
|
| 116 |
if obs.config_snapshot:
|
| 117 |
-
parts.append(f"Config:
|
| 118 |
if obs.api_response:
|
| 119 |
-
parts.append(f"API Response:
|
| 120 |
if obs.hints:
|
| 121 |
parts.append(f"Hints: {'; '.join(obs.hints)}")
|
| 122 |
|
|
|
|
| 123 |
return "\n".join(parts)
|
| 124 |
|
| 125 |
|
|
@@ -152,6 +180,14 @@ def get_model_action(
|
|
| 152 |
json_end = text.rfind("}") + 1
|
| 153 |
if json_start >= 0 and json_end > json_start:
|
| 154 |
text = text[json_start:json_end]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
action_json = json.loads(text)
|
| 157 |
messages.append({"role": "assistant", "content": json.dumps(action_json)})
|
|
@@ -164,6 +200,9 @@ def get_model_action(
|
|
| 164 |
except json.JSONDecodeError as exc:
|
| 165 |
print(f"[DEBUG] JSON parse failed (attempt {attempt+1}/{max_retries}): {exc}", flush=True)
|
| 166 |
last_error = exc
|
|
|
|
|
|
|
|
|
|
| 167 |
except Exception as exc:
|
| 168 |
print(f"[DEBUG] API call failed (attempt {attempt+1}/{max_retries}): {exc}", flush=True)
|
| 169 |
last_error = exc
|
|
|
|
| 46 |
BENCHMARK = "api_debug_env"
|
| 47 |
MAX_STEPS = 40 # max across all tasks (hard has 40)
|
| 48 |
TEMPERATURE = 0.3
|
| 49 |
+
MAX_TOKENS = 1024
|
| 50 |
SUCCESS_SCORE_THRESHOLD = 0.1
|
| 51 |
|
| 52 |
SYSTEM_PROMPT = textwrap.dedent("""
|
| 53 |
You are an expert API debugging agent. You are tasked with diagnosing and fixing
|
| 54 |
+
broken API integrations in a multi-service environment.
|
| 55 |
|
| 56 |
+
## Available Actions (respond with JSON only):
|
| 57 |
{
|
| 58 |
"action_type": "inspect_logs" | "inspect_config" | "inspect_endpoint" | "submit_fix",
|
| 59 |
"target": "<service_name>",
|
| 60 |
"fix_payload": { ... } // required only for submit_fix
|
| 61 |
}
|
| 62 |
|
| 63 |
+
## Debugging Strategy (follow this order):
|
| 64 |
+
1. **Inspect logs** on each service to identify error patterns and root causes
|
| 65 |
+
2. **Inspect config** to understand current (broken) settings
|
| 66 |
+
3. **Inspect endpoint** to see actual error responses if needed
|
| 67 |
+
4. **Submit fix** with corrected configuration values
|
| 68 |
+
|
| 69 |
+
## Key Rules:
|
| 70 |
+
- ALWAYS inspect logs and configs BEFORE submitting fixes
|
| 71 |
+
- Pay attention to the service dependency graph β upstream failures cascade downstream
|
| 72 |
+
- Fix upstream issues first (they may mask downstream problems)
|
| 73 |
+
- When submitting a fix, use the exact key format from the config
|
| 74 |
+
- For nested keys: {"headers.Authorization": "Bearer <token>"}
|
| 75 |
+
- For nested objects: {"retry": {"max_retries": 3, "backoff_factor": 2}}
|
| 76 |
+
- Check service_status to see which services are healthy/degraded/error
|
| 77 |
+
- After fixing, re-inspect logs on affected services β new logs appear showing the fix effect
|
| 78 |
+
|
| 79 |
+
## Response Format:
|
| 80 |
+
Respond with ONLY a single JSON object. No text, no explanation, no markdown.
|
| 81 |
+
Example: {"action_type": "inspect_logs", "target": "payment_client"}
|
| 82 |
""").strip()
|
| 83 |
|
| 84 |
|
|
|
|
| 108 |
# βββ LLM Interaction ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 109 |
|
| 110 |
def build_user_prompt(obs: ApiDebugObservation, step: int) -> str:
|
| 111 |
+
"""Build a detailed prompt from the current observation."""
|
| 112 |
parts = [
|
| 113 |
+
f"=== Step {step} ===",
|
| 114 |
f"Task: {obs.task_description}",
|
| 115 |
f"Remaining steps: {obs.remaining_steps}",
|
| 116 |
f"Issues found: {obs.issues_found}/{obs.issues_total}",
|
| 117 |
f"Issues fixed: {obs.issues_fixed}/{obs.issues_total}",
|
| 118 |
f"Last action result: {obs.action_result}",
|
|
|
|
| 119 |
]
|
| 120 |
|
| 121 |
+
# Show service health (dynamic state)
|
| 122 |
+
if obs.service_status:
|
| 123 |
+
status_str = ", ".join(f"{svc}={status}" for svc, status in obs.service_status.items())
|
| 124 |
+
parts.append(f"Service health: {status_str}")
|
| 125 |
+
|
| 126 |
+
# Show dependency graph
|
| 127 |
+
if obs.dependency_graph:
|
| 128 |
+
deps = []
|
| 129 |
+
for svc, dep_list in obs.dependency_graph.items():
|
| 130 |
+
if dep_list:
|
| 131 |
+
deps.append(f" {svc} -> {', '.join(dep_list)}")
|
| 132 |
+
if deps:
|
| 133 |
+
parts.append("Service dependencies:\n" + "\n".join(deps))
|
| 134 |
+
|
| 135 |
+
# Show error cascades
|
| 136 |
+
if obs.error_trace:
|
| 137 |
+
parts.append("Active error cascades:\n" + "\n".join(f" {t}" for t in obs.error_trace[:5]))
|
| 138 |
+
|
| 139 |
+
parts.append(f"Available targets: {obs.available_targets}")
|
| 140 |
+
|
| 141 |
if obs.logs:
|
| 142 |
parts.append("Logs:\n" + "\n".join(obs.logs))
|
| 143 |
if obs.config_snapshot:
|
| 144 |
+
parts.append(f"Config:\n{json.dumps(obs.config_snapshot, indent=2)}")
|
| 145 |
if obs.api_response:
|
| 146 |
+
parts.append(f"API Response:\n{json.dumps(obs.api_response, indent=2)}")
|
| 147 |
if obs.hints:
|
| 148 |
parts.append(f"Hints: {'; '.join(obs.hints)}")
|
| 149 |
|
| 150 |
+
parts.append("\nDecide your next action. Respond with ONLY a JSON object.")
|
| 151 |
return "\n".join(parts)
|
| 152 |
|
| 153 |
|
|
|
|
| 180 |
json_end = text.rfind("}") + 1
|
| 181 |
if json_start >= 0 and json_end > json_start:
|
| 182 |
text = text[json_start:json_end]
|
| 183 |
+
elif text.startswith("{"):
|
| 184 |
+
pass # Already JSON
|
| 185 |
+
else:
|
| 186 |
+
# Try to extract JSON from mixed text
|
| 187 |
+
json_start = text.find("{")
|
| 188 |
+
json_end = text.rfind("}") + 1
|
| 189 |
+
if json_start >= 0 and json_end > json_start:
|
| 190 |
+
text = text[json_start:json_end]
|
| 191 |
|
| 192 |
action_json = json.loads(text)
|
| 193 |
messages.append({"role": "assistant", "content": json.dumps(action_json)})
|
|
|
|
| 200 |
except json.JSONDecodeError as exc:
|
| 201 |
print(f"[DEBUG] JSON parse failed (attempt {attempt+1}/{max_retries}): {exc}", flush=True)
|
| 202 |
last_error = exc
|
| 203 |
+
# Add corrective message
|
| 204 |
+
messages.append({"role": "assistant", "content": text if 'text' in dir() else ""})
|
| 205 |
+
messages.append({"role": "user", "content": "Invalid response. Respond with ONLY a valid JSON object like: {\"action_type\": \"inspect_logs\", \"target\": \"payment_client\"}"})
|
| 206 |
except Exception as exc:
|
| 207 |
print(f"[DEBUG] API call failed (attempt {attempt+1}/{max_retries}): {exc}", flush=True)
|
| 208 |
last_error = exc
|
models.py
CHANGED
|
@@ -9,9 +9,12 @@ Data models for the API Integration Debugging Environment.
|
|
| 9 |
|
| 10 |
An agent must diagnose and fix broken API integrations by reading error logs,
|
| 11 |
inspecting configurations, and writing corrected API calls.
|
|
|
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
-
from typing import Dict, List, Optional
|
| 15 |
|
| 16 |
from openenv.core.env_server.types import Action, Observation
|
| 17 |
from pydantic import Field
|
|
@@ -47,7 +50,7 @@ class ApiDebugObservation(Observation):
|
|
| 47 |
What the agent sees after each action.
|
| 48 |
|
| 49 |
Provides error logs, configuration snapshots, API responses,
|
| 50 |
-
|
| 51 |
"""
|
| 52 |
|
| 53 |
# Environment context
|
|
@@ -60,6 +63,20 @@ class ApiDebugObservation(Observation):
|
|
| 60 |
api_response: Optional[Dict] = Field(default=None, description="Response from testing the current endpoint config")
|
| 61 |
hints: List[str] = Field(default_factory=list, description="Progressive hints based on step count")
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# Progress tracking
|
| 64 |
remaining_steps: int = Field(default=0, description="Steps remaining before episode timeout")
|
| 65 |
issues_found: int = Field(default=0, description="Issues the agent has correctly identified so far")
|
|
|
|
| 9 |
|
| 10 |
An agent must diagnose and fix broken API integrations by reading error logs,
|
| 11 |
inspecting configurations, and writing corrected API calls.
|
| 12 |
+
|
| 13 |
+
The observation space includes dynamic state: service health, dependency graph,
|
| 14 |
+
and error traces that update as the agent fixes issues.
|
| 15 |
"""
|
| 16 |
|
| 17 |
+
from typing import Any, Dict, List, Optional
|
| 18 |
|
| 19 |
from openenv.core.env_server.types import Action, Observation
|
| 20 |
from pydantic import Field
|
|
|
|
| 50 |
What the agent sees after each action.
|
| 51 |
|
| 52 |
Provides error logs, configuration snapshots, API responses,
|
| 53 |
+
service health status, dependency graph, and progress tracking.
|
| 54 |
"""
|
| 55 |
|
| 56 |
# Environment context
|
|
|
|
| 63 |
api_response: Optional[Dict] = Field(default=None, description="Response from testing the current endpoint config")
|
| 64 |
hints: List[str] = Field(default_factory=list, description="Progressive hints based on step count")
|
| 65 |
|
| 66 |
+
# Dynamic state (NEW β makes the environment interactive)
|
| 67 |
+
service_status: Dict[str, str] = Field(
|
| 68 |
+
default_factory=dict,
|
| 69 |
+
description="Current health of each service: 'healthy', 'degraded', 'error', 'unreachable'",
|
| 70 |
+
)
|
| 71 |
+
dependency_graph: Dict[str, List[str]] = Field(
|
| 72 |
+
default_factory=dict,
|
| 73 |
+
description="Service dependency graph: {service: [services it depends on]}",
|
| 74 |
+
)
|
| 75 |
+
error_trace: List[str] = Field(
|
| 76 |
+
default_factory=list,
|
| 77 |
+
description="Error propagation chain showing how failures cascade across services",
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
# Progress tracking
|
| 81 |
remaining_steps: int = Field(default=0, description="Steps remaining before episode timeout")
|
| 82 |
issues_found: int = Field(default=0, description="Issues the agent has correctly identified so far")
|
scenarios.py
CHANGED
|
@@ -7,12 +7,15 @@
|
|
| 7 |
"""
|
| 8 |
Scenario definitions for the API Integration Debugging Environment.
|
| 9 |
|
| 10 |
-
Each scenario
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
from dataclasses import dataclass, field
|
| 15 |
-
from typing import Any, Dict, List, Optional
|
| 16 |
import random
|
| 17 |
|
| 18 |
|
|
@@ -25,11 +28,32 @@ class Issue:
|
|
| 25 |
expected_fix: Dict[str, Any]
|
| 26 |
fix_key: str # The key in the config that needs fixing
|
| 27 |
log_hint: str # Log line that hints at this issue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
@dataclass
|
| 31 |
class Scenario:
|
| 32 |
-
"""A complete API debugging scenario."""
|
| 33 |
task_id: str
|
| 34 |
difficulty: str
|
| 35 |
description: str
|
|
@@ -38,6 +62,15 @@ class Scenario:
|
|
| 38 |
configs: Dict[str, Dict[str, Any]]
|
| 39 |
logs: Dict[str, List[str]]
|
| 40 |
issues: List[Issue]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
|
|
@@ -46,10 +79,9 @@ def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
|
|
| 46 |
|
| 47 |
Args:
|
| 48 |
task_id: One of 'easy', 'medium', 'hard'
|
| 49 |
-
seed: Optional seed for deterministic but varied
|
| 50 |
-
When provided, a random subset of issues
|
| 51 |
-
|
| 52 |
-
is returned (deterministic, for testing).
|
| 53 |
"""
|
| 54 |
scenario_builders = {
|
| 55 |
"easy": _easy_scenario,
|
|
@@ -59,22 +91,7 @@ def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
|
|
| 59 |
if task_id not in scenario_builders:
|
| 60 |
raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(scenario_builders.keys())}")
|
| 61 |
|
| 62 |
-
scenario = scenario_builders[task_id]()
|
| 63 |
-
|
| 64 |
-
# If seed is provided, randomize the scenario
|
| 65 |
-
if seed is not None:
|
| 66 |
-
rng = random.Random(seed)
|
| 67 |
-
# Shuffle log entries for each service (order shouldn't matter)
|
| 68 |
-
for service_logs in scenario.logs.values():
|
| 69 |
-
rng.shuffle(service_logs)
|
| 70 |
-
# Randomize timestamps in log entries
|
| 71 |
-
for service, log_list in scenario.logs.items():
|
| 72 |
-
new_logs = []
|
| 73 |
-
for log_line in log_list:
|
| 74 |
-
# Replace dates with seed-derived dates to vary output
|
| 75 |
-
new_logs.append(log_line)
|
| 76 |
-
scenario.logs[service] = new_logs
|
| 77 |
-
|
| 78 |
return scenario
|
| 79 |
|
| 80 |
|
|
@@ -83,320 +100,978 @@ def get_all_task_ids() -> List[str]:
|
|
| 83 |
return ["easy", "medium", "hard"]
|
| 84 |
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
# βββ Easy Scenario βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 87 |
|
| 88 |
-
def _easy_scenario() -> Scenario:
|
| 89 |
"""
|
| 90 |
-
Easy:
|
| 91 |
-
Agent must
|
|
|
|
|
|
|
| 92 |
"""
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
"
|
| 98 |
-
"
|
| 99 |
-
"
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
"
|
| 106 |
-
"
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
"timeout": 30,
|
| 112 |
-
"retry_count": 3,
|
| 113 |
},
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
},
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
"[ERROR] 2026-03-25T10:15:23Z POST /process -> 401 Unauthorized",
|
| 125 |
"[ERROR] 2026-03-25T10:15:23Z Response: {'error': 'Missing or invalid Authorization header'}",
|
| 126 |
"[WARN] 2026-03-25T10:15:22Z Request headers: Content-Type=text/plain, Accept=application/json",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
"[ERROR] 2026-03-25T10:15:24Z POST /process -> 415 Unsupported Media Type",
|
| 128 |
"[ERROR] 2026-03-25T10:15:24Z Response: {'error': 'Content-Type must be application/json'}",
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
},
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
)
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
# βββ Medium Scenario βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 159 |
|
| 160 |
-
def _medium_scenario() -> Scenario:
|
| 161 |
"""
|
| 162 |
-
Medium: Webhook chain with
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
| 164 |
"""
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
"
|
| 170 |
-
"
|
| 171 |
-
"
|
| 172 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
),
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
"
|
| 178 |
-
|
| 179 |
-
"
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
},
|
| 183 |
-
"rate_limit": {
|
| 184 |
-
"requests_per_second": 100, # BUG: too high, receiver allows 10/s
|
| 185 |
-
"burst_size": 200,
|
| 186 |
-
},
|
| 187 |
-
"retry": {
|
| 188 |
-
"max_retries": 1, # BUG: should be at least 3
|
| 189 |
-
"backoff_factor": 0, # BUG: no backoff
|
| 190 |
-
"retry_on_status": [500], # BUG: should also retry on 429
|
| 191 |
-
},
|
| 192 |
-
"signing_secret": "whsec_abc123secret",
|
| 193 |
},
|
| 194 |
-
"
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
"
|
| 203 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
},
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
},
|
|
|
|
|
|
|
| 210 |
},
|
| 211 |
-
|
| 212 |
-
"
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
"[ERROR] 2026-03-25T11:00:01Z Rate limited. Retry-After: 5s",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
"[WARN] 2026-03-25T11:00:02Z Retry attempt 1/1 failed. No more retries.",
|
| 216 |
"[ERROR] 2026-03-25T11:00:03Z Event evt_12345 dropped after retry exhaustion",
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
"webhook_receiver": [
|
| 221 |
-
"[WARN] 2026-03-25T11:00:01Z Rate limit exceeded: 100 req/s > 10 req/s allowed",
|
| 222 |
"[ERROR] 2026-03-25T11:00:02Z Signature validation FAILED: received empty signature",
|
| 223 |
"[WARN] 2026-03-25T11:00:02Z Dropping event: invalid signature from webhook_sender",
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
"[
|
| 228 |
-
"[
|
| 229 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
},
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
)
|
| 262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
|
| 264 |
# βββ Hard Scenario ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 265 |
|
| 266 |
-
def _hard_scenario() -> Scenario:
|
| 267 |
"""
|
| 268 |
-
Hard:
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
| 271 |
"""
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
"
|
| 277 |
-
"
|
| 278 |
-
"
|
| 279 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
),
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
"
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
},
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
},
|
| 308 |
-
"
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
"
|
| 313 |
-
"
|
| 314 |
-
"status": "healthy",
|
| 315 |
},
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
|
|
|
|
|
|
| 324 |
},
|
| 325 |
-
"
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
},
|
|
|
|
| 330 |
},
|
| 331 |
-
|
| 332 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
"[ERROR] 2026-03-25T12:00:05Z POST inventory.internal/v1/check -> 301 Moved Permanently",
|
| 334 |
"[ERROR] 2026-03-25T12:00:05Z Response: {'error': 'Endpoint deprecated. Use /v2/reserve'}",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
"[ERROR] 2026-03-25T12:00:07Z Timeout after 2s waiting for inventory response",
|
| 336 |
"[ERROR] 2026-03-25T12:00:07Z Order ord_999 failed: inventory check timed out",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
"[WARN] 2026-03-25T12:00:08Z Synchronous mode: blocking on inventory response",
|
| 338 |
"[ERROR] 2026-03-25T12:00:09Z Race condition: order ord_998 processed before ord_997 completed",
|
| 339 |
-
]
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
"[WARN] 2026-03-25T12:00:06Z Processing reservation... avg time: 4s",
|
| 343 |
"[ERROR] 2026-03-25T12:00:10Z POST shipping.internal/v1/create -> 401 Unauthorized",
|
| 344 |
"[ERROR] 2026-03-25T12:00:10Z Auth token expired_token_456 is no longer valid",
|
| 345 |
"[ERROR] 2026-03-25T12:00:10Z Cannot create shipment: authentication failed",
|
| 346 |
-
]
|
| 347 |
-
|
| 348 |
-
"[WARN] 2026-03-25T12:00:10Z Rejected request: token 'expired_token_456' is expired"
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
},
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
Scenario definitions for the API Integration Debugging Environment.
|
| 9 |
|
| 10 |
+
Each scenario models a realistic multi-service API ecosystem with:
|
| 11 |
+
- Service dependency graphs (upstream/downstream relationships)
|
| 12 |
+
- Cascading failures (upstream bugs propagate downstream)
|
| 13 |
+
- Dynamic logs that update when issues are fixed
|
| 14 |
+
- Expanded issue pools for seed-based random subset selection
|
| 15 |
"""
|
| 16 |
|
| 17 |
from dataclasses import dataclass, field
|
| 18 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 19 |
import random
|
| 20 |
|
| 21 |
|
|
|
|
| 28 |
expected_fix: Dict[str, Any]
|
| 29 |
fix_key: str # The key in the config that needs fixing
|
| 30 |
log_hint: str # Log line that hints at this issue
|
| 31 |
+
# --- New fields for cascading failures ---
|
| 32 |
+
depends_on: List[str] = field(default_factory=list)
|
| 33 |
+
# Issues that must be fixed before this one can be diagnosed
|
| 34 |
+
cascade_effects: Dict[str, str] = field(default_factory=dict)
|
| 35 |
+
# service -> error message caused by this issue being unfixed
|
| 36 |
+
category: str = "configuration"
|
| 37 |
+
# Issue category: configuration, authentication, networking, protocol
|
| 38 |
+
severity: str = "error"
|
| 39 |
+
# Severity: error, warning, critical
|
| 40 |
+
root_cause_explanation: str = ""
|
| 41 |
+
# Detailed explanation of why this issue occurs (for grading diagnosis quality)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
|
| 45 |
+
class ServiceNode:
|
| 46 |
+
"""A node in the service dependency graph."""
|
| 47 |
+
name: str
|
| 48 |
+
depends_on: List[str] = field(default_factory=list)
|
| 49 |
+
# Services this one calls (upstream dependencies)
|
| 50 |
+
health_status: str = "degraded"
|
| 51 |
+
# healthy, degraded, error, unreachable
|
| 52 |
|
| 53 |
|
| 54 |
@dataclass
|
| 55 |
class Scenario:
|
| 56 |
+
"""A complete API debugging scenario with dependency graph."""
|
| 57 |
task_id: str
|
| 58 |
difficulty: str
|
| 59 |
description: str
|
|
|
|
| 62 |
configs: Dict[str, Dict[str, Any]]
|
| 63 |
logs: Dict[str, List[str]]
|
| 64 |
issues: List[Issue]
|
| 65 |
+
# --- New fields ---
|
| 66 |
+
service_graph: Dict[str, ServiceNode] = field(default_factory=dict)
|
| 67 |
+
# Service dependency graph
|
| 68 |
+
dynamic_logs: Dict[str, Dict[str, List[str]]] = field(default_factory=dict)
|
| 69 |
+
# service -> {issue_id: [new logs when fixed]}
|
| 70 |
+
optimal_fix_order: List[str] = field(default_factory=list)
|
| 71 |
+
# Optimal order to fix issues (for strategy scoring)
|
| 72 |
+
context: str = ""
|
| 73 |
+
# Additional scenario context for the agent
|
| 74 |
|
| 75 |
|
| 76 |
def get_scenario(task_id: str, seed: Optional[int] = None) -> Scenario:
|
|
|
|
| 79 |
|
| 80 |
Args:
|
| 81 |
task_id: One of 'easy', 'medium', 'hard'
|
| 82 |
+
seed: Optional seed for deterministic but varied scenarios.
|
| 83 |
+
When provided, selects a random subset of issues from the pool
|
| 84 |
+
and randomizes log order. When None, returns the canonical scenario.
|
|
|
|
| 85 |
"""
|
| 86 |
scenario_builders = {
|
| 87 |
"easy": _easy_scenario,
|
|
|
|
| 91 |
if task_id not in scenario_builders:
|
| 92 |
raise ValueError(f"Unknown task_id: {task_id}. Must be one of: {list(scenario_builders.keys())}")
|
| 93 |
|
| 94 |
+
scenario = scenario_builders[task_id](seed=seed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
return scenario
|
| 96 |
|
| 97 |
|
|
|
|
| 100 |
return ["easy", "medium", "hard"]
|
| 101 |
|
| 102 |
|
| 103 |
+
def _select_issues(pool: List[Issue], count: int, rng: random.Random) -> List[Issue]:
|
| 104 |
+
"""Select a random subset of issues from a pool, respecting dependencies."""
|
| 105 |
+
if count >= len(pool):
|
| 106 |
+
selected = list(pool)
|
| 107 |
+
else:
|
| 108 |
+
# Build dependency-aware selection
|
| 109 |
+
available = list(pool)
|
| 110 |
+
selected = []
|
| 111 |
+
while len(selected) < count and available:
|
| 112 |
+
# Pick a random issue
|
| 113 |
+
issue = rng.choice(available)
|
| 114 |
+
available.remove(issue)
|
| 115 |
+
# Add its dependencies too if not already selected
|
| 116 |
+
deps_satisfied = all(
|
| 117 |
+
any(s.issue_id == dep for s in selected)
|
| 118 |
+
for dep in issue.depends_on
|
| 119 |
+
)
|
| 120 |
+
if deps_satisfied or not issue.depends_on:
|
| 121 |
+
selected.append(issue)
|
| 122 |
+
else:
|
| 123 |
+
# Add dependencies first
|
| 124 |
+
for dep_id in issue.depends_on:
|
| 125 |
+
dep_issue = next((i for i in pool if i.issue_id == dep_id), None)
|
| 126 |
+
if dep_issue and dep_issue not in selected:
|
| 127 |
+
selected.append(dep_issue)
|
| 128 |
+
if dep_issue in available:
|
| 129 |
+
available.remove(dep_issue)
|
| 130 |
+
selected.append(issue)
|
| 131 |
+
|
| 132 |
+
# Shuffle log order for selected issues
|
| 133 |
+
rng.shuffle(selected)
|
| 134 |
+
return selected[:count]
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _randomize_scenario(scenario: Scenario, seed: int) -> Scenario:
|
| 138 |
+
"""Apply seed-based randomization to a scenario."""
|
| 139 |
+
rng = random.Random(seed)
|
| 140 |
+
|
| 141 |
+
# Shuffle log entries for each service
|
| 142 |
+
for service_logs in scenario.logs.values():
|
| 143 |
+
rng.shuffle(service_logs)
|
| 144 |
+
|
| 145 |
+
# Vary timestamps in log entries
|
| 146 |
+
base_hour = rng.randint(8, 16)
|
| 147 |
+
base_minute = rng.randint(0, 59)
|
| 148 |
+
for service, log_list in scenario.logs.items():
|
| 149 |
+
new_logs = []
|
| 150 |
+
for i, log_line in enumerate(log_list):
|
| 151 |
+
# Replace the timestamp portion
|
| 152 |
+
minute = (base_minute + i * rng.randint(1, 5)) % 60
|
| 153 |
+
hour = base_hour + (base_minute + i * rng.randint(1, 5)) // 60
|
| 154 |
+
new_log = log_line
|
| 155 |
+
if "2026-" in new_log:
|
| 156 |
+
# Replace date with varied date
|
| 157 |
+
day = rng.randint(20, 28)
|
| 158 |
+
new_log = new_log.replace(
|
| 159 |
+
"2026-03-25",
|
| 160 |
+
f"2026-03-{day:02d}"
|
| 161 |
+
).replace(
|
| 162 |
+
"2026-03-24",
|
| 163 |
+
f"2026-03-{day-1:02d}"
|
| 164 |
+
)
|
| 165 |
+
new_logs.append(new_log)
|
| 166 |
+
scenario.logs[service] = new_logs
|
| 167 |
+
|
| 168 |
+
return scenario
|
| 169 |
+
|
| 170 |
+
|
| 171 |
# βββ Easy Scenario βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 172 |
|
| 173 |
+
def _easy_scenario(seed: Optional[int] = None) -> Scenario:
|
| 174 |
"""
|
| 175 |
+
Easy: Payment API integration failures.
|
| 176 |
+
Agent must diagnose auth + content-type issues with clear log signals.
|
| 177 |
+
|
| 178 |
+
Issue pool has 4 possible issues; canonical scenario uses 2.
|
| 179 |
"""
|
| 180 |
+
# Full issue pool (4 issues, canonical uses 2)
|
| 181 |
+
issue_pool = [
|
| 182 |
+
Issue(
|
| 183 |
+
issue_id="easy_auth",
|
| 184 |
+
service="payment_client",
|
| 185 |
+
description="Missing Authorization header β payment gateway requires Bearer token authentication",
|
| 186 |
+
expected_fix={"headers.Authorization": "Bearer <token>"},
|
| 187 |
+
fix_key="headers.Authorization",
|
| 188 |
+
log_hint="Missing or invalid Authorization header",
|
| 189 |
+
category="authentication",
|
| 190 |
+
severity="critical",
|
| 191 |
+
root_cause_explanation=(
|
| 192 |
+
"The payment_client is missing the Authorization header entirely. "
|
| 193 |
+
"The payment_gateway requires Bearer token auth on all /process requests. "
|
| 194 |
+
"This results in HTTP 401 on every payment attempt."
|
| 195 |
+
),
|
| 196 |
+
cascade_effects={
|
| 197 |
+
"payment_gateway": "All requests from payment_client rejected with 401"
|
|
|
|
|
|
|
| 198 |
},
|
| 199 |
+
),
|
| 200 |
+
Issue(
|
| 201 |
+
issue_id="easy_content_type",
|
| 202 |
+
service="payment_client",
|
| 203 |
+
description="Wrong Content-Type header (text/plain instead of application/json)",
|
| 204 |
+
expected_fix={"headers.Content-Type": "application/json"},
|
| 205 |
+
fix_key="headers.Content-Type",
|
| 206 |
+
log_hint="Content-Type must be application/json",
|
| 207 |
+
category="protocol",
|
| 208 |
+
severity="error",
|
| 209 |
+
root_cause_explanation=(
|
| 210 |
+
"The payment_client sends Content-Type: text/plain, but the gateway "
|
| 211 |
+
"only accepts application/json. This causes HTTP 415 Unsupported Media Type. "
|
| 212 |
+
"The gateway cannot parse the request body."
|
| 213 |
+
),
|
| 214 |
+
cascade_effects={
|
| 215 |
+
"payment_gateway": "Request body parsing fails for payment_client requests"
|
| 216 |
},
|
| 217 |
+
),
|
| 218 |
+
Issue(
|
| 219 |
+
issue_id="easy_timeout",
|
| 220 |
+
service="payment_client",
|
| 221 |
+
description="Timeout set too low (5s) for payment processing that takes 8-12s",
|
| 222 |
+
expected_fix={"timeout": 30},
|
| 223 |
+
fix_key="timeout",
|
| 224 |
+
log_hint="Request timed out after 5s",
|
| 225 |
+
category="networking",
|
| 226 |
+
severity="error",
|
| 227 |
+
root_cause_explanation=(
|
| 228 |
+
"The payment_client has timeout=5s, but payment processing at the gateway "
|
| 229 |
+
"takes 8-12s for fraud checks. Legitimate payments are timing out."
|
| 230 |
+
),
|
| 231 |
+
),
|
| 232 |
+
Issue(
|
| 233 |
+
issue_id="easy_base_url",
|
| 234 |
+
service="payment_client",
|
| 235 |
+
description="Base URL pointing to deprecated v1 endpoint instead of v2",
|
| 236 |
+
expected_fix={"base_url": "https://api.paymentgateway.com/v2"},
|
| 237 |
+
fix_key="base_url",
|
| 238 |
+
log_hint="API v1 is deprecated",
|
| 239 |
+
category="configuration",
|
| 240 |
+
severity="warning",
|
| 241 |
+
root_cause_explanation=(
|
| 242 |
+
"The payment_client uses /v1 which is deprecated and returning 301 redirects. "
|
| 243 |
+
"The gateway v2 endpoint has different request schemas, causing deserialization errors."
|
| 244 |
+
),
|
| 245 |
+
),
|
| 246 |
+
]
|
| 247 |
+
|
| 248 |
+
# Select issues based on seed
|
| 249 |
+
if seed is not None:
|
| 250 |
+
rng = random.Random(seed)
|
| 251 |
+
issues = _select_issues(issue_pool, 2, rng)
|
| 252 |
+
else:
|
| 253 |
+
issues = issue_pool[:2] # Canonical: auth + content_type
|
| 254 |
+
|
| 255 |
+
# Build logs based on selected issues
|
| 256 |
+
client_logs = [
|
| 257 |
+
"[INFO] 2026-03-25T10:15:20Z Payment client initialized with base_url=https://api.paymentgateway.com/v2",
|
| 258 |
+
]
|
| 259 |
+
gateway_logs = [
|
| 260 |
+
"[INFO] 2026-03-25T10:15:20Z Gateway ready, accepting application/json with Bearer auth",
|
| 261 |
+
]
|
| 262 |
+
|
| 263 |
+
for issue in issues:
|
| 264 |
+
if issue.issue_id == "easy_auth":
|
| 265 |
+
client_logs.extend([
|
| 266 |
"[ERROR] 2026-03-25T10:15:23Z POST /process -> 401 Unauthorized",
|
| 267 |
"[ERROR] 2026-03-25T10:15:23Z Response: {'error': 'Missing or invalid Authorization header'}",
|
| 268 |
"[WARN] 2026-03-25T10:15:22Z Request headers: Content-Type=text/plain, Accept=application/json",
|
| 269 |
+
])
|
| 270 |
+
gateway_logs.append(
|
| 271 |
+
"[WARN] 2026-03-25T10:15:23Z Rejected request: no Authorization header present"
|
| 272 |
+
)
|
| 273 |
+
elif issue.issue_id == "easy_content_type":
|
| 274 |
+
client_logs.extend([
|
| 275 |
"[ERROR] 2026-03-25T10:15:24Z POST /process -> 415 Unsupported Media Type",
|
| 276 |
"[ERROR] 2026-03-25T10:15:24Z Response: {'error': 'Content-Type must be application/json'}",
|
| 277 |
+
])
|
| 278 |
+
gateway_logs.append(
|
| 279 |
+
"[WARN] 2026-03-25T10:15:24Z Rejected request: unsupported Content-Type 'text/plain'"
|
| 280 |
+
)
|
| 281 |
+
elif issue.issue_id == "easy_timeout":
|
| 282 |
+
client_logs.extend([
|
| 283 |
+
"[ERROR] 2026-03-25T10:15:30Z POST /process -> Request timed out after 5s",
|
| 284 |
+
"[WARN] 2026-03-25T10:15:30Z Payment processing takes 8-12s for fraud verification",
|
| 285 |
+
])
|
| 286 |
+
gateway_logs.append(
|
| 287 |
+
"[INFO] 2026-03-25T10:15:30Z Processing payment... estimated time: 10s"
|
| 288 |
+
)
|
| 289 |
+
elif issue.issue_id == "easy_base_url":
|
| 290 |
+
client_logs.extend([
|
| 291 |
+
"[ERROR] 2026-03-25T10:15:21Z GET /v1/status -> 301 Moved Permanently",
|
| 292 |
+
"[WARN] 2026-03-25T10:15:21Z API v1 is deprecated, migrate to /v2",
|
| 293 |
+
])
|
| 294 |
+
gateway_logs.append(
|
| 295 |
+
"[WARN] 2026-03-25T10:15:21Z Deprecated v1 endpoint accessed"
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
# Determine initial config based on selected issues
|
| 299 |
+
configs = {
|
| 300 |
+
"payment_client": {
|
| 301 |
+
"base_url": "https://api.paymentgateway.com/v2",
|
| 302 |
+
"headers": {
|
| 303 |
+
"Content-Type": "application/json",
|
| 304 |
+
"Accept": "application/json",
|
| 305 |
+
},
|
| 306 |
+
"timeout": 30,
|
| 307 |
+
"retry_count": 3,
|
| 308 |
},
|
| 309 |
+
"payment_gateway": {
|
| 310 |
+
"endpoint": "/process",
|
| 311 |
+
"method": "POST",
|
| 312 |
+
"required_headers": ["Authorization", "Content-Type"],
|
| 313 |
+
"accepted_content_types": ["application/json"],
|
| 314 |
+
"auth_scheme": "Bearer",
|
| 315 |
+
"processing_time_ms": "8000-12000",
|
| 316 |
+
},
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
# Apply broken config for each selected issue
|
| 320 |
+
for issue in issues:
|
| 321 |
+
if issue.issue_id == "easy_auth":
|
| 322 |
+
# Remove auth header (it shouldn't exist)
|
| 323 |
+
configs["payment_client"]["headers"].pop("Authorization", None)
|
| 324 |
+
elif issue.issue_id == "easy_content_type":
|
| 325 |
+
configs["payment_client"]["headers"]["Content-Type"] = "text/plain"
|
| 326 |
+
elif issue.issue_id == "easy_timeout":
|
| 327 |
+
configs["payment_client"]["timeout"] = 5
|
| 328 |
+
elif issue.issue_id == "easy_base_url":
|
| 329 |
+
configs["payment_client"]["base_url"] = "https://api.paymentgateway.com/v1"
|
| 330 |
+
|
| 331 |
+
# Dynamic logs: what changes after fixing each issue
|
| 332 |
+
dynamic_logs = {}
|
| 333 |
+
for issue in issues:
|
| 334 |
+
if issue.issue_id == "easy_auth":
|
| 335 |
+
dynamic_logs["easy_auth"] = {
|
| 336 |
+
"payment_client": ["[INFO] Authorization header set. Retrying request..."],
|
| 337 |
+
"payment_gateway": ["[INFO] Authentication successful for payment_client"],
|
| 338 |
+
}
|
| 339 |
+
elif issue.issue_id == "easy_content_type":
|
| 340 |
+
dynamic_logs["easy_content_type"] = {
|
| 341 |
+
"payment_client": ["[INFO] Content-Type set to application/json. Request body parsed."],
|
| 342 |
+
"payment_gateway": ["[INFO] Request body parsed successfully as JSON"],
|
| 343 |
+
}
|
| 344 |
+
elif issue.issue_id == "easy_timeout":
|
| 345 |
+
dynamic_logs["easy_timeout"] = {
|
| 346 |
+
"payment_client": ["[INFO] Timeout increased to 30s. Payment processing completing normally."],
|
| 347 |
+
}
|
| 348 |
+
elif issue.issue_id == "easy_base_url":
|
| 349 |
+
dynamic_logs["easy_base_url"] = {
|
| 350 |
+
"payment_client": ["[INFO] Migrated to v2 API endpoint. Requests routing correctly."],
|
| 351 |
+
}
|
| 352 |
+
|
| 353 |
+
# Service dependency graph
|
| 354 |
+
service_graph = {
|
| 355 |
+
"payment_client": ServiceNode(
|
| 356 |
+
name="payment_client",
|
| 357 |
+
depends_on=["payment_gateway"],
|
| 358 |
+
health_status="error",
|
| 359 |
+
),
|
| 360 |
+
"payment_gateway": ServiceNode(
|
| 361 |
+
name="payment_gateway",
|
| 362 |
+
depends_on=[],
|
| 363 |
+
health_status="healthy",
|
| 364 |
+
),
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
scenario = Scenario(
|
| 368 |
+
task_id="easy",
|
| 369 |
+
difficulty="easy",
|
| 370 |
+
description=(
|
| 371 |
+
"A payment processing API integration is failing. "
|
| 372 |
+
"The client is sending requests to the payment gateway but getting error responses. "
|
| 373 |
+
"Diagnose the root causes by inspecting error logs and service configurations, "
|
| 374 |
+
"then submit the correct configuration fixes."
|
| 375 |
+
),
|
| 376 |
+
max_steps=15,
|
| 377 |
+
services=["payment_client", "payment_gateway"],
|
| 378 |
+
configs=configs,
|
| 379 |
+
logs={"payment_client": client_logs, "payment_gateway": gateway_logs},
|
| 380 |
+
issues=issues,
|
| 381 |
+
service_graph=service_graph,
|
| 382 |
+
dynamic_logs=dynamic_logs,
|
| 383 |
+
optimal_fix_order=[i.issue_id for i in issues],
|
| 384 |
+
context=(
|
| 385 |
+
"The payment_client sends HTTP requests to payment_gateway. "
|
| 386 |
+
"payment_gateway requires Bearer authentication and JSON content type."
|
| 387 |
+
),
|
| 388 |
)
|
| 389 |
|
| 390 |
+
if seed is not None:
|
| 391 |
+
scenario = _randomize_scenario(scenario, seed)
|
| 392 |
+
|
| 393 |
+
return scenario
|
| 394 |
+
|
| 395 |
|
| 396 |
# βββ Medium Scenario βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 397 |
|
| 398 |
+
def _medium_scenario(seed: Optional[int] = None) -> Scenario:
|
| 399 |
"""
|
| 400 |
+
Medium: Webhook chain with cascading failures.
|
| 401 |
+
Service A -> Service B -> Service C, with rate limiting, retry, and auth issues.
|
| 402 |
+
|
| 403 |
+
Issue pool has 5 possible issues; canonical scenario uses 3.
|
| 404 |
+
Issues have dependencies β fixing rate_limit reveals the real retry issue.
|
| 405 |
"""
|
| 406 |
+
issue_pool = [
|
| 407 |
+
Issue(
|
| 408 |
+
issue_id="medium_rate_limit",
|
| 409 |
+
service="webhook_sender",
|
| 410 |
+
description="Rate limit too high (100/s vs receiver's 10/s limit) causing 429 responses",
|
| 411 |
+
expected_fix={"rate_limit.requests_per_second": 10},
|
| 412 |
+
fix_key="rate_limit.requests_per_second",
|
| 413 |
+
log_hint="Rate limit exceeded: 100 req/s > 10 req/s allowed",
|
| 414 |
+
category="networking",
|
| 415 |
+
severity="error",
|
| 416 |
+
root_cause_explanation=(
|
| 417 |
+
"webhook_sender fires at 100 req/s but webhook_receiver only accepts 10 req/s. "
|
| 418 |
+
"The excess requests get 429 Too Many Requests, and with only 1 retry, most events are dropped."
|
| 419 |
+
),
|
| 420 |
+
cascade_effects={
|
| 421 |
+
"webhook_receiver": "Overwhelmed with requests, dropping 90% of events",
|
| 422 |
+
"notification_service": "No events arriving downstream",
|
| 423 |
+
},
|
| 424 |
),
|
| 425 |
+
Issue(
|
| 426 |
+
issue_id="medium_retry",
|
| 427 |
+
service="webhook_sender",
|
| 428 |
+
description="Insufficient retry config: only 1 retry, no backoff, missing 429 in retry_on_status",
|
| 429 |
+
expected_fix={
|
| 430 |
+
"retry.max_retries": 3,
|
| 431 |
+
"retry.backoff_factor": 2,
|
| 432 |
+
"retry.retry_on_status": [429, 500],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
},
|
| 434 |
+
fix_key="retry",
|
| 435 |
+
log_hint="Retry attempt 1/1 failed. No more retries.",
|
| 436 |
+
depends_on=["medium_rate_limit"],
|
| 437 |
+
# The retry issue is masked by the rate limit issue β even with retries,
|
| 438 |
+
# 100 req/s would still overwhelm the receiver
|
| 439 |
+
category="configuration",
|
| 440 |
+
severity="error",
|
| 441 |
+
root_cause_explanation=(
|
| 442 |
+
"Even after fixing the rate limit, the sender only retries once with no backoff. "
|
| 443 |
+
"Transient 429s during bursts aren't retried because 429 isn't in retry_on_status. "
|
| 444 |
+
"This causes event loss on any temporary load spike."
|
| 445 |
+
),
|
| 446 |
+
),
|
| 447 |
+
Issue(
|
| 448 |
+
issue_id="medium_signature",
|
| 449 |
+
service="webhook_sender",
|
| 450 |
+
description="Webhook signature header is empty β receiver rejects unsigned events",
|
| 451 |
+
expected_fix={"headers.X-Webhook-Signature": "sha256=<computed>"},
|
| 452 |
+
fix_key="headers.X-Webhook-Signature",
|
| 453 |
+
log_hint="Signature validation FAILED: received empty signature",
|
| 454 |
+
category="authentication",
|
| 455 |
+
severity="critical",
|
| 456 |
+
root_cause_explanation=(
|
| 457 |
+
"webhook_sender has signing_secret configured but the X-Webhook-Signature header "
|
| 458 |
+
"is empty string. webhook_receiver validates signatures and drops all unsigned "
|
| 459 |
+
"events as potential spoofing attempts."
|
| 460 |
+
),
|
| 461 |
+
cascade_effects={
|
| 462 |
+
"webhook_receiver": "Dropping all events as unsigned/spoofed",
|
| 463 |
+
"notification_service": "Zero events forwarded from receiver",
|
| 464 |
},
|
| 465 |
+
),
|
| 466 |
+
Issue(
|
| 467 |
+
issue_id="medium_target_url",
|
| 468 |
+
service="webhook_sender",
|
| 469 |
+
description="Target URL pointing to wrong receiver endpoint (/webhook vs /hooks/incoming)",
|
| 470 |
+
expected_fix={"target_url": "https://receiver.internal/hooks/incoming"},
|
| 471 |
+
fix_key="target_url",
|
| 472 |
+
log_hint="404 Not Found on /webhook endpoint",
|
| 473 |
+
category="configuration",
|
| 474 |
+
severity="error",
|
| 475 |
+
root_cause_explanation=(
|
| 476 |
+
"webhook_sender posts to /webhook but the receiver listens on /hooks/incoming. "
|
| 477 |
+
"All requests get 404 Not Found."
|
| 478 |
+
),
|
| 479 |
+
),
|
| 480 |
+
Issue(
|
| 481 |
+
issue_id="medium_content_encoding",
|
| 482 |
+
service="webhook_sender",
|
| 483 |
+
description="Payload compression enabled but receiver doesn't support gzip",
|
| 484 |
+
expected_fix={"compression": "none"},
|
| 485 |
+
fix_key="compression",
|
| 486 |
+
log_hint="Unsupported Content-Encoding: gzip",
|
| 487 |
+
category="protocol",
|
| 488 |
+
severity="warning",
|
| 489 |
+
root_cause_explanation=(
|
| 490 |
+
"webhook_sender compresses payloads with gzip but webhook_receiver "
|
| 491 |
+
"doesn't have a decompression middleware. Requests fail with 415."
|
| 492 |
+
),
|
| 493 |
+
),
|
| 494 |
+
]
|
| 495 |
+
|
| 496 |
+
if seed is not None:
|
| 497 |
+
rng = random.Random(seed)
|
| 498 |
+
issues = _select_issues(issue_pool, 3, rng)
|
| 499 |
+
else:
|
| 500 |
+
issues = issue_pool[:3] # Canonical: rate_limit, retry, signature
|
| 501 |
+
|
| 502 |
+
# Build configs
|
| 503 |
+
configs = {
|
| 504 |
+
"webhook_sender": {
|
| 505 |
+
"target_url": "https://receiver.internal/hooks/incoming",
|
| 506 |
+
"headers": {
|
| 507 |
+
"Content-Type": "application/json",
|
| 508 |
+
"X-Webhook-Signature": "sha256=computed_hmac",
|
| 509 |
+
},
|
| 510 |
+
"rate_limit": {
|
| 511 |
+
"requests_per_second": 10,
|
| 512 |
+
"burst_size": 20,
|
| 513 |
+
},
|
| 514 |
+
"retry": {
|
| 515 |
+
"max_retries": 3,
|
| 516 |
+
"backoff_factor": 2,
|
| 517 |
+
"retry_on_status": [429, 500],
|
| 518 |
},
|
| 519 |
+
"signing_secret": "whsec_abc123secret",
|
| 520 |
+
"compression": "none",
|
| 521 |
},
|
| 522 |
+
"webhook_receiver": {
|
| 523 |
+
"endpoint": "/hooks/incoming",
|
| 524 |
+
"rate_limit": {
|
| 525 |
+
"requests_per_second": 10,
|
| 526 |
+
"burst_size": 20,
|
| 527 |
+
},
|
| 528 |
+
"signature_validation": True,
|
| 529 |
+
"expected_signature_header": "X-Webhook-Signature",
|
| 530 |
+
"signing_secret": "whsec_abc123secret",
|
| 531 |
+
"forward_to": "https://notifications.internal/notify",
|
| 532 |
+
"supported_encodings": ["identity"],
|
| 533 |
+
},
|
| 534 |
+
"notification_service": {
|
| 535 |
+
"endpoint": "/notify",
|
| 536 |
+
"accepts_from": ["webhook_receiver"],
|
| 537 |
+
"status": "healthy",
|
| 538 |
+
},
|
| 539 |
+
}
|
| 540 |
+
|
| 541 |
+
# Apply broken config for each selected issue
|
| 542 |
+
for issue in issues:
|
| 543 |
+
if issue.issue_id == "medium_rate_limit":
|
| 544 |
+
configs["webhook_sender"]["rate_limit"]["requests_per_second"] = 100
|
| 545 |
+
configs["webhook_sender"]["rate_limit"]["burst_size"] = 200
|
| 546 |
+
elif issue.issue_id == "medium_retry":
|
| 547 |
+
configs["webhook_sender"]["retry"] = {
|
| 548 |
+
"max_retries": 1,
|
| 549 |
+
"backoff_factor": 0,
|
| 550 |
+
"retry_on_status": [500],
|
| 551 |
+
}
|
| 552 |
+
elif issue.issue_id == "medium_signature":
|
| 553 |
+
configs["webhook_sender"]["headers"]["X-Webhook-Signature"] = ""
|
| 554 |
+
elif issue.issue_id == "medium_target_url":
|
| 555 |
+
configs["webhook_sender"]["target_url"] = "https://receiver.internal/webhook"
|
| 556 |
+
elif issue.issue_id == "medium_content_encoding":
|
| 557 |
+
configs["webhook_sender"]["compression"] = "gzip"
|
| 558 |
+
|
| 559 |
+
# Build logs based on selected issues
|
| 560 |
+
sender_logs = [
|
| 561 |
+
"[INFO] 2026-03-25T10:59:59Z Webhook sender started. Signature header: X-Webhook-Signature",
|
| 562 |
+
]
|
| 563 |
+
receiver_logs = [
|
| 564 |
+
"[INFO] 2026-03-25T10:59:59Z Receiver ready. Rate limit: 10 req/s. Signature validation: ON",
|
| 565 |
+
]
|
| 566 |
+
notif_logs = [
|
| 567 |
+
"[INFO] 2026-03-25T10:59:59Z Notification service healthy. Waiting for events.",
|
| 568 |
+
]
|
| 569 |
+
|
| 570 |
+
for issue in issues:
|
| 571 |
+
if issue.issue_id == "medium_rate_limit":
|
| 572 |
+
sender_logs.extend([
|
| 573 |
+
"[ERROR] 2026-03-25T11:00:01Z POST /hooks/incoming -> 429 Too Many Requests",
|
| 574 |
"[ERROR] 2026-03-25T11:00:01Z Rate limited. Retry-After: 5s",
|
| 575 |
+
"[WARN] 2026-03-25T11:00:00Z Sending at 100 req/s (burst=200)",
|
| 576 |
+
])
|
| 577 |
+
receiver_logs.append(
|
| 578 |
+
"[WARN] 2026-03-25T11:00:01Z Rate limit exceeded: 100 req/s > 10 req/s allowed"
|
| 579 |
+
)
|
| 580 |
+
elif issue.issue_id == "medium_retry":
|
| 581 |
+
sender_logs.extend([
|
| 582 |
"[WARN] 2026-03-25T11:00:02Z Retry attempt 1/1 failed. No more retries.",
|
| 583 |
"[ERROR] 2026-03-25T11:00:03Z Event evt_12345 dropped after retry exhaustion",
|
| 584 |
+
])
|
| 585 |
+
elif issue.issue_id == "medium_signature":
|
| 586 |
+
receiver_logs.extend([
|
|
|
|
|
|
|
| 587 |
"[ERROR] 2026-03-25T11:00:02Z Signature validation FAILED: received empty signature",
|
| 588 |
"[WARN] 2026-03-25T11:00:02Z Dropping event: invalid signature from webhook_sender",
|
| 589 |
+
])
|
| 590 |
+
elif issue.issue_id == "medium_target_url":
|
| 591 |
+
sender_logs.extend([
|
| 592 |
+
"[ERROR] 2026-03-25T11:00:01Z POST /webhook -> 404 Not Found on /webhook endpoint",
|
| 593 |
+
"[WARN] 2026-03-25T11:00:01Z Receiver endpoint may have changed",
|
| 594 |
+
])
|
| 595 |
+
elif issue.issue_id == "medium_content_encoding":
|
| 596 |
+
receiver_logs.extend([
|
| 597 |
+
"[ERROR] 2026-03-25T11:00:02Z Unsupported Content-Encoding: gzip",
|
| 598 |
+
"[WARN] 2026-03-25T11:00:02Z Cannot decompress payload from webhook_sender",
|
| 599 |
+
])
|
| 600 |
+
|
| 601 |
+
notif_logs.append("[WARN] 2026-03-25T11:00:05Z No events received in last 60s")
|
| 602 |
+
|
| 603 |
+
# Dynamic logs
|
| 604 |
+
dynamic_logs = {
|
| 605 |
+
"medium_rate_limit": {
|
| 606 |
+
"webhook_sender": ["[INFO] Rate limit adjusted to 10 req/s. 429 errors resolved."],
|
| 607 |
+
"webhook_receiver": ["[INFO] Incoming request rate normalized. Processing events."],
|
| 608 |
},
|
| 609 |
+
"medium_retry": {
|
| 610 |
+
"webhook_sender": ["[INFO] Retry config updated: 3 retries with backoff. 429 now retried."],
|
| 611 |
+
},
|
| 612 |
+
"medium_signature": {
|
| 613 |
+
"webhook_sender": ["[INFO] Webhook signature computed and attached to requests."],
|
| 614 |
+
"webhook_receiver": ["[INFO] Signature validation passed for incoming events."],
|
| 615 |
+
},
|
| 616 |
+
"medium_target_url": {
|
| 617 |
+
"webhook_sender": ["[INFO] Target URL corrected to /hooks/incoming. Requests routing OK."],
|
| 618 |
+
},
|
| 619 |
+
"medium_content_encoding": {
|
| 620 |
+
"webhook_sender": ["[INFO] Compression disabled. Receiver parsing payloads correctly."],
|
| 621 |
+
},
|
| 622 |
+
}
|
| 623 |
+
|
| 624 |
+
service_graph = {
|
| 625 |
+
"webhook_sender": ServiceNode(
|
| 626 |
+
name="webhook_sender",
|
| 627 |
+
depends_on=["webhook_receiver"],
|
| 628 |
+
health_status="error",
|
| 629 |
+
),
|
| 630 |
+
"webhook_receiver": ServiceNode(
|
| 631 |
+
name="webhook_receiver",
|
| 632 |
+
depends_on=["notification_service"],
|
| 633 |
+
health_status="degraded",
|
| 634 |
+
),
|
| 635 |
+
"notification_service": ServiceNode(
|
| 636 |
+
name="notification_service",
|
| 637 |
+
depends_on=[],
|
| 638 |
+
health_status="healthy",
|
| 639 |
+
),
|
| 640 |
+
}
|
| 641 |
+
|
| 642 |
+
# Determine optimal fix order (respect dependencies)
|
| 643 |
+
issue_ids = [i.issue_id for i in issues]
|
| 644 |
+
optimal_order = []
|
| 645 |
+
# Rate limit should be fixed before retry (dependency)
|
| 646 |
+
if "medium_rate_limit" in issue_ids:
|
| 647 |
+
optimal_order.append("medium_rate_limit")
|
| 648 |
+
if "medium_retry" in issue_ids:
|
| 649 |
+
optimal_order.append("medium_retry")
|
| 650 |
+
for iid in issue_ids:
|
| 651 |
+
if iid not in optimal_order:
|
| 652 |
+
optimal_order.append(iid)
|
| 653 |
+
|
| 654 |
+
scenario = Scenario(
|
| 655 |
+
task_id="medium",
|
| 656 |
+
difficulty="medium",
|
| 657 |
+
description=(
|
| 658 |
+
"A webhook-based notification system is dropping events. "
|
| 659 |
+
"webhook_sender sends webhooks to webhook_receiver, which forwards to notification_service. "
|
| 660 |
+
"Events are being lost due to multiple cascading failures in the webhook chain. "
|
| 661 |
+
"Fix the webhook_sender configuration to restore event delivery."
|
| 662 |
+
),
|
| 663 |
+
max_steps=25,
|
| 664 |
+
services=["webhook_sender", "webhook_receiver", "notification_service"],
|
| 665 |
+
configs=configs,
|
| 666 |
+
logs={
|
| 667 |
+
"webhook_sender": sender_logs,
|
| 668 |
+
"webhook_receiver": receiver_logs,
|
| 669 |
+
"notification_service": notif_logs,
|
| 670 |
+
},
|
| 671 |
+
issues=issues,
|
| 672 |
+
service_graph=service_graph,
|
| 673 |
+
dynamic_logs=dynamic_logs,
|
| 674 |
+
optimal_fix_order=optimal_order,
|
| 675 |
+
context=(
|
| 676 |
+
"Event flow: webhook_sender -> webhook_receiver -> notification_service. "
|
| 677 |
+
"webhook_receiver validates signatures and enforces rate limits. "
|
| 678 |
+
"Fixing upstream issues may reveal additional downstream problems."
|
| 679 |
+
),
|
| 680 |
)
|
| 681 |
|
| 682 |
+
if seed is not None:
|
| 683 |
+
scenario = _randomize_scenario(scenario, seed)
|
| 684 |
+
|
| 685 |
+
return scenario
|
| 686 |
+
|
| 687 |
|
| 688 |
# βββ Hard Scenario ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 689 |
|
| 690 |
+
def _hard_scenario(seed: Optional[int] = None) -> Scenario:
|
| 691 |
"""
|
| 692 |
+
Hard: E-commerce order processing pipeline with cascading failures.
|
| 693 |
+
order_service -> inventory_service -> shipping_service
|
| 694 |
+
Plus api_gateway and auth_service.
|
| 695 |
+
|
| 696 |
+
Issue pool has 7 possible issues; canonical scenario uses 5.
|
| 697 |
+
Multiple dependency chains make this genuinely challenging.
|
| 698 |
"""
|
| 699 |
+
issue_pool = [
|
| 700 |
+
Issue(
|
| 701 |
+
issue_id="hard_wrong_url",
|
| 702 |
+
service="order_service",
|
| 703 |
+
description="Order service calling deprecated /v1/check instead of /v2/reserve",
|
| 704 |
+
expected_fix={"inventory_url": "https://inventory.internal/v2/reserve"},
|
| 705 |
+
fix_key="inventory_url",
|
| 706 |
+
log_hint="Endpoint deprecated. Use /v2/reserve",
|
| 707 |
+
category="configuration",
|
| 708 |
+
severity="error",
|
| 709 |
+
root_cause_explanation=(
|
| 710 |
+
"order_service calls /v1/check which was deprecated. The API gateway returns "
|
| 711 |
+
"301 Moved Permanently. The redirect goes to /v2/check (read-only) instead of "
|
| 712 |
+
"/v2/reserve (write). Inventory is never actually reserved."
|
| 713 |
+
),
|
| 714 |
+
cascade_effects={
|
| 715 |
+
"inventory_service": "Receiving read-only check requests instead of reservation requests",
|
| 716 |
+
"api_gateway": "Generating 301 redirect responses for deprecated endpoints",
|
| 717 |
+
},
|
| 718 |
),
|
| 719 |
+
Issue(
|
| 720 |
+
issue_id="hard_timeout",
|
| 721 |
+
service="order_service",
|
| 722 |
+
description="Timeout too short (2s) for inventory service that takes ~4s to process",
|
| 723 |
+
expected_fix={"timeout": 10},
|
| 724 |
+
fix_key="timeout",
|
| 725 |
+
log_hint="Timeout after 2s waiting for inventory response",
|
| 726 |
+
depends_on=["hard_wrong_url"],
|
| 727 |
+
# Timeout issue is masked by wrong URL β fix URL first to see real timeout
|
| 728 |
+
category="networking",
|
| 729 |
+
severity="error",
|
| 730 |
+
root_cause_explanation=(
|
| 731 |
+
"order_service has timeout=2s but inventory_service takes ~4s for reservation "
|
| 732 |
+
"(including DB lock + stock validation). After fixing the URL, requests now reach "
|
| 733 |
+
"inventory but time out before completion."
|
| 734 |
+
),
|
| 735 |
+
cascade_effects={
|
| 736 |
+
"inventory_service": "Connections killed mid-processing, leaving orphaned DB locks",
|
| 737 |
},
|
| 738 |
+
),
|
| 739 |
+
Issue(
|
| 740 |
+
issue_id="hard_async",
|
| 741 |
+
service="order_service",
|
| 742 |
+
description="Synchronous mode causes race conditions between concurrent orders",
|
| 743 |
+
expected_fix={"async_mode": True},
|
| 744 |
+
fix_key="async_mode",
|
| 745 |
+
log_hint="Race condition: order ord_998 processed before ord_997 completed",
|
| 746 |
+
category="configuration",
|
| 747 |
+
severity="critical",
|
| 748 |
+
root_cause_explanation=(
|
| 749 |
+
"order_service runs in sync mode, blocking the main thread on each inventory call. "
|
| 750 |
+
"Concurrent orders queue up and when timeouts occur, orders are processed out of "
|
| 751 |
+
"order, causing double-reservation and stock inconsistencies."
|
| 752 |
+
),
|
| 753 |
+
),
|
| 754 |
+
Issue(
|
| 755 |
+
issue_id="hard_expired_token",
|
| 756 |
+
service="inventory_service",
|
| 757 |
+
description="Expired auth token used for shipping service requests",
|
| 758 |
+
expected_fix={"headers.Authorization": "Bearer valid_token_789"},
|
| 759 |
+
fix_key="headers.Authorization",
|
| 760 |
+
log_hint="Auth token expired_token_456 is no longer valid",
|
| 761 |
+
category="authentication",
|
| 762 |
+
severity="critical",
|
| 763 |
+
root_cause_explanation=(
|
| 764 |
+
"inventory_service uses Bearer expired_token_456 to authenticate with "
|
| 765 |
+
"shipping_service. This token expired on 2026-03-24. All shipment creation "
|
| 766 |
+
"requests fail with 401, so reserved inventory is never shipped."
|
| 767 |
+
),
|
| 768 |
+
cascade_effects={
|
| 769 |
+
"shipping_service": "Rejecting all requests from inventory_service",
|
| 770 |
+
"auth_service": "Logging repeated failed token validations",
|
| 771 |
+
},
|
| 772 |
+
),
|
| 773 |
+
Issue(
|
| 774 |
+
issue_id="hard_token_refresh",
|
| 775 |
+
service="inventory_service",
|
| 776 |
+
description="No automatic token refresh mechanism configured",
|
| 777 |
+
expected_fix={"token_refresh_url": "https://auth.internal/refresh", "auto_refresh": True},
|
| 778 |
+
fix_key="token_refresh_url",
|
| 779 |
+
log_hint="Token validation failed: expired_token_456 expired",
|
| 780 |
+
depends_on=["hard_expired_token"],
|
| 781 |
+
# Token refresh is only relevant after fixing the expired token
|
| 782 |
+
category="configuration",
|
| 783 |
+
severity="error",
|
| 784 |
+
root_cause_explanation=(
|
| 785 |
+
"Even after replacing the expired token, there's no auto-refresh mechanism. "
|
| 786 |
+
"Tokens expire every 24h, so without auto_refresh=True and a refresh URL, "
|
| 787 |
+
"the same issue will recur tomorrow."
|
| 788 |
+
),
|
| 789 |
+
),
|
| 790 |
+
Issue(
|
| 791 |
+
issue_id="hard_circuit_breaker",
|
| 792 |
+
service="order_service",
|
| 793 |
+
description="No circuit breaker β failed requests keep hammering inventory_service",
|
| 794 |
+
expected_fix={"circuit_breaker.enabled": True, "circuit_breaker.failure_threshold": 5},
|
| 795 |
+
fix_key="circuit_breaker",
|
| 796 |
+
log_hint="Circuit breaker not configured",
|
| 797 |
+
category="configuration",
|
| 798 |
+
severity="warning",
|
| 799 |
+
root_cause_explanation=(
|
| 800 |
+
"Without a circuit breaker, order_service keeps sending requests to "
|
| 801 |
+
"inventory_service even when it's consistently failing. This wastes resources "
|
| 802 |
+
"and can cause a cascading overload."
|
| 803 |
+
),
|
| 804 |
+
),
|
| 805 |
+
Issue(
|
| 806 |
+
issue_id="hard_idempotency",
|
| 807 |
+
service="order_service",
|
| 808 |
+
description="Missing idempotency key β retried requests create duplicate orders",
|
| 809 |
+
expected_fix={"headers.Idempotency-Key": "order-{order_id}"},
|
| 810 |
+
fix_key="headers.Idempotency-Key",
|
| 811 |
+
log_hint="Duplicate order detected: ord_997 submitted twice",
|
| 812 |
+
depends_on=["hard_async"],
|
| 813 |
+
category="protocol",
|
| 814 |
+
severity="error",
|
| 815 |
+
root_cause_explanation=(
|
| 816 |
+
"When async retries fire, there's no Idempotency-Key header to deduplicate "
|
| 817 |
+
"requests. inventory_service creates duplicate reservations for the same order."
|
| 818 |
+
),
|
| 819 |
+
),
|
| 820 |
+
]
|
| 821 |
+
|
| 822 |
+
if seed is not None:
|
| 823 |
+
rng = random.Random(seed)
|
| 824 |
+
issues = _select_issues(issue_pool, 5, rng)
|
| 825 |
+
else:
|
| 826 |
+
issues = issue_pool[:5] # Canonical: first 5
|
| 827 |
+
|
| 828 |
+
configs = {
|
| 829 |
+
"order_service": {
|
| 830 |
+
"name": "order_service",
|
| 831 |
+
"inventory_url": "https://inventory.internal/v2/reserve",
|
| 832 |
+
"headers": {
|
| 833 |
+
"Content-Type": "application/json",
|
| 834 |
+
"Authorization": "Bearer valid_token_123",
|
| 835 |
},
|
| 836 |
+
"timeout": 10,
|
| 837 |
+
"async_mode": True,
|
| 838 |
+
"callback_url": "https://orders.internal/callback",
|
| 839 |
+
"circuit_breaker": {
|
| 840 |
+
"enabled": True,
|
| 841 |
+
"failure_threshold": 5,
|
|
|
|
| 842 |
},
|
| 843 |
+
},
|
| 844 |
+
"inventory_service": {
|
| 845 |
+
"name": "inventory_service",
|
| 846 |
+
"endpoint_version": "v2",
|
| 847 |
+
"reserve_path": "/v2/reserve",
|
| 848 |
+
"check_path": "/v2/check",
|
| 849 |
+
"shipping_url": "https://shipping.internal/v1/create",
|
| 850 |
+
"headers": {
|
| 851 |
+
"Content-Type": "application/json",
|
| 852 |
+
"Authorization": "Bearer valid_token_789",
|
| 853 |
},
|
| 854 |
+
"timeout": 10,
|
| 855 |
+
"processing_time_avg": 4,
|
| 856 |
+
"token_refresh_url": "https://auth.internal/refresh",
|
| 857 |
+
"auto_refresh": True,
|
| 858 |
+
},
|
| 859 |
+
"shipping_service": {
|
| 860 |
+
"name": "shipping_service",
|
| 861 |
+
"create_path": "/v1/create",
|
| 862 |
+
"requires_auth": True,
|
| 863 |
+
"accepted_auth": ["Bearer"],
|
| 864 |
+
"token_validation_url": "https://auth.internal/validate",
|
| 865 |
+
"status": "healthy",
|
| 866 |
+
},
|
| 867 |
+
"api_gateway": {
|
| 868 |
+
"routes": {
|
| 869 |
+
"/v1/check": "DEPRECATED β use /v2/check",
|
| 870 |
+
"/v2/reserve": "inventory_service",
|
| 871 |
+
"/v2/check": "inventory_service",
|
| 872 |
+
"/v1/create": "shipping_service",
|
| 873 |
},
|
| 874 |
+
"timeout": 30,
|
| 875 |
},
|
| 876 |
+
"auth_service": {
|
| 877 |
+
"valid_tokens": ["valid_token_123", "valid_token_789"],
|
| 878 |
+
"expired_tokens": ["expired_token_456"],
|
| 879 |
+
"token_refresh_endpoint": "/refresh",
|
| 880 |
+
"token_ttl_hours": 24,
|
| 881 |
+
},
|
| 882 |
+
}
|
| 883 |
+
|
| 884 |
+
# Apply broken config for each selected issue
|
| 885 |
+
for issue in issues:
|
| 886 |
+
if issue.issue_id == "hard_wrong_url":
|
| 887 |
+
configs["order_service"]["inventory_url"] = "https://inventory.internal/v1/check"
|
| 888 |
+
elif issue.issue_id == "hard_timeout":
|
| 889 |
+
configs["order_service"]["timeout"] = 2
|
| 890 |
+
elif issue.issue_id == "hard_async":
|
| 891 |
+
configs["order_service"]["async_mode"] = False
|
| 892 |
+
elif issue.issue_id == "hard_expired_token":
|
| 893 |
+
configs["inventory_service"]["headers"]["Authorization"] = "Bearer expired_token_456"
|
| 894 |
+
elif issue.issue_id == "hard_token_refresh":
|
| 895 |
+
configs["inventory_service"].pop("token_refresh_url", None)
|
| 896 |
+
configs["inventory_service"]["auto_refresh"] = False
|
| 897 |
+
elif issue.issue_id == "hard_circuit_breaker":
|
| 898 |
+
configs["order_service"]["circuit_breaker"] = {"enabled": False}
|
| 899 |
+
elif issue.issue_id == "hard_idempotency":
|
| 900 |
+
configs["order_service"]["headers"].pop("Idempotency-Key", None)
|
| 901 |
+
|
| 902 |
+
# Build logs
|
| 903 |
+
order_logs = []
|
| 904 |
+
inventory_logs = []
|
| 905 |
+
shipping_logs = []
|
| 906 |
+
gateway_logs = []
|
| 907 |
+
auth_logs = [
|
| 908 |
+
"[INFO] 2026-03-25T12:00:00Z Auth service ready. Valid tokens: 2, Expired: 1",
|
| 909 |
+
]
|
| 910 |
+
|
| 911 |
+
for issue in issues:
|
| 912 |
+
if issue.issue_id == "hard_wrong_url":
|
| 913 |
+
order_logs.extend([
|
| 914 |
"[ERROR] 2026-03-25T12:00:05Z POST inventory.internal/v1/check -> 301 Moved Permanently",
|
| 915 |
"[ERROR] 2026-03-25T12:00:05Z Response: {'error': 'Endpoint deprecated. Use /v2/reserve'}",
|
| 916 |
+
])
|
| 917 |
+
inventory_logs.append(
|
| 918 |
+
"[INFO] 2026-03-25T12:00:05Z Received request on /v1/check -> redirecting to /v2/check"
|
| 919 |
+
)
|
| 920 |
+
gateway_logs.extend([
|
| 921 |
+
"[WARN] 2026-03-25T12:00:05Z Deprecated endpoint /v1/check accessed by order_service",
|
| 922 |
+
"[INFO] 2026-03-25T12:00:05Z Redirecting /v1/check -> /v2/check (301)",
|
| 923 |
+
])
|
| 924 |
+
elif issue.issue_id == "hard_timeout":
|
| 925 |
+
order_logs.extend([
|
| 926 |
"[ERROR] 2026-03-25T12:00:07Z Timeout after 2s waiting for inventory response",
|
| 927 |
"[ERROR] 2026-03-25T12:00:07Z Order ord_999 failed: inventory check timed out",
|
| 928 |
+
])
|
| 929 |
+
inventory_logs.append(
|
| 930 |
+
"[WARN] 2026-03-25T12:00:06Z Processing reservation... avg time: 4s"
|
| 931 |
+
)
|
| 932 |
+
elif issue.issue_id == "hard_async":
|
| 933 |
+
order_logs.extend([
|
| 934 |
"[WARN] 2026-03-25T12:00:08Z Synchronous mode: blocking on inventory response",
|
| 935 |
"[ERROR] 2026-03-25T12:00:09Z Race condition: order ord_998 processed before ord_997 completed",
|
| 936 |
+
])
|
| 937 |
+
elif issue.issue_id == "hard_expired_token":
|
| 938 |
+
inventory_logs.extend([
|
|
|
|
| 939 |
"[ERROR] 2026-03-25T12:00:10Z POST shipping.internal/v1/create -> 401 Unauthorized",
|
| 940 |
"[ERROR] 2026-03-25T12:00:10Z Auth token expired_token_456 is no longer valid",
|
| 941 |
"[ERROR] 2026-03-25T12:00:10Z Cannot create shipment: authentication failed",
|
| 942 |
+
])
|
| 943 |
+
shipping_logs.append(
|
| 944 |
+
"[WARN] 2026-03-25T12:00:10Z Rejected request: token 'expired_token_456' is expired"
|
| 945 |
+
)
|
| 946 |
+
auth_logs.append(
|
| 947 |
+
"[WARN] 2026-03-25T12:00:10Z Token validation failed: expired_token_456 expired at 2026-03-24T00:00:00Z"
|
| 948 |
+
)
|
| 949 |
+
elif issue.issue_id == "hard_token_refresh":
|
| 950 |
+
auth_logs.append(
|
| 951 |
+
"[WARN] 2026-03-25T12:00:11Z Token validation failed: expired_token_456 expired. No refresh configured."
|
| 952 |
+
)
|
| 953 |
+
elif issue.issue_id == "hard_circuit_breaker":
|
| 954 |
+
order_logs.extend([
|
| 955 |
+
"[WARN] 2026-03-25T12:00:12Z Circuit breaker not configured, continuing to send requests after 10 failures",
|
| 956 |
+
"[ERROR] 2026-03-25T12:00:12Z System overload: 50 pending requests to inventory_service",
|
| 957 |
+
])
|
| 958 |
+
elif issue.issue_id == "hard_idempotency":
|
| 959 |
+
order_logs.append(
|
| 960 |
+
"[ERROR] 2026-03-25T12:00:13Z Duplicate order detected: ord_997 submitted twice"
|
| 961 |
+
)
|
| 962 |
+
inventory_logs.append(
|
| 963 |
+
"[WARN] 2026-03-25T12:00:13Z Duplicate reservation request for order ord_997"
|
| 964 |
+
)
|
| 965 |
+
|
| 966 |
+
if not shipping_logs:
|
| 967 |
+
shipping_logs.append(
|
| 968 |
+
"[INFO] 2026-03-25T12:00:00Z Shipping service healthy, awaiting authenticated requests"
|
| 969 |
+
)
|
| 970 |
+
|
| 971 |
+
dynamic_logs = {
|
| 972 |
+
"hard_wrong_url": {
|
| 973 |
+
"order_service": ["[INFO] URL corrected to /v2/reserve. Inventory requests routing correctly."],
|
| 974 |
+
"api_gateway": ["[INFO] order_service now using correct /v2/reserve endpoint."],
|
| 975 |
},
|
| 976 |
+
"hard_timeout": {
|
| 977 |
+
"order_service": ["[INFO] Timeout increased to 10s. Inventory responses completing."],
|
| 978 |
+
"inventory_service": ["[INFO] Reservations completing successfully within timeout."],
|
| 979 |
+
},
|
| 980 |
+
"hard_async": {
|
| 981 |
+
"order_service": ["[INFO] Async mode enabled. Orders processing concurrently without blocking."],
|
| 982 |
+
},
|
| 983 |
+
"hard_expired_token": {
|
| 984 |
+
"inventory_service": ["[INFO] Auth token refreshed. Shipping service requests authenticated."],
|
| 985 |
+
"shipping_service": ["[INFO] Authentication successful for inventory_service."],
|
| 986 |
+
},
|
| 987 |
+
"hard_token_refresh": {
|
| 988 |
+
"inventory_service": ["[INFO] Auto token refresh configured. Tokens will be refreshed before expiry."],
|
| 989 |
+
},
|
| 990 |
+
"hard_circuit_breaker": {
|
| 991 |
+
"order_service": ["[INFO] Circuit breaker enabled. Will stop sending after 5 consecutive failures."],
|
| 992 |
+
},
|
| 993 |
+
"hard_idempotency": {
|
| 994 |
+
"order_service": ["[INFO] Idempotency keys set. Duplicate requests will be safely deduplicated."],
|
| 995 |
+
},
|
| 996 |
+
}
|
| 997 |
+
|
| 998 |
+
service_graph = {
|
| 999 |
+
"order_service": ServiceNode(
|
| 1000 |
+
name="order_service",
|
| 1001 |
+
depends_on=["inventory_service", "api_gateway"],
|
| 1002 |
+
health_status="error",
|
| 1003 |
+
),
|
| 1004 |
+
"inventory_service": ServiceNode(
|
| 1005 |
+
name="inventory_service",
|
| 1006 |
+
depends_on=["shipping_service", "auth_service"],
|
| 1007 |
+
health_status="degraded",
|
| 1008 |
+
),
|
| 1009 |
+
"shipping_service": ServiceNode(
|
| 1010 |
+
name="shipping_service",
|
| 1011 |
+
depends_on=[],
|
| 1012 |
+
health_status="healthy",
|
| 1013 |
+
),
|
| 1014 |
+
"api_gateway": ServiceNode(
|
| 1015 |
+
name="api_gateway",
|
| 1016 |
+
depends_on=[],
|
| 1017 |
+
health_status="healthy",
|
| 1018 |
+
),
|
| 1019 |
+
"auth_service": ServiceNode(
|
| 1020 |
+
name="auth_service",
|
| 1021 |
+
depends_on=[],
|
| 1022 |
+
health_status="healthy",
|
| 1023 |
+
),
|
| 1024 |
+
}
|
| 1025 |
+
|
| 1026 |
+
# Build optimal fix order respecting dependencies
|
| 1027 |
+
issue_ids = [i.issue_id for i in issues]
|
| 1028 |
+
optimal_order = []
|
| 1029 |
+
ordered_preference = [
|
| 1030 |
+
"hard_wrong_url", "hard_timeout", "hard_async",
|
| 1031 |
+
"hard_expired_token", "hard_token_refresh",
|
| 1032 |
+
"hard_circuit_breaker", "hard_idempotency",
|
| 1033 |
+
]
|
| 1034 |
+
for iid in ordered_preference:
|
| 1035 |
+
if iid in issue_ids:
|
| 1036 |
+
optimal_order.append(iid)
|
| 1037 |
+
for iid in issue_ids:
|
| 1038 |
+
if iid not in optimal_order:
|
| 1039 |
+
optimal_order.append(iid)
|
| 1040 |
+
|
| 1041 |
+
scenario = Scenario(
|
| 1042 |
+
task_id="hard",
|
| 1043 |
+
difficulty="hard",
|
| 1044 |
+
description=(
|
| 1045 |
+
"An e-commerce order processing pipeline is failing with cascading errors. "
|
| 1046 |
+
"Order Service calls Inventory Service, which calls Shipping Service. "
|
| 1047 |
+
"Multiple issues span the pipeline: wrong endpoints, timeouts, race conditions, "
|
| 1048 |
+
"expired authentication tokens, and missing resilience patterns. "
|
| 1049 |
+
"Some issues are masked by upstream failures β you must fix issues in the right "
|
| 1050 |
+
"order to diagnose the full chain."
|
| 1051 |
+
),
|
| 1052 |
+
max_steps=40,
|
| 1053 |
+
services=["order_service", "inventory_service", "shipping_service", "api_gateway", "auth_service"],
|
| 1054 |
+
configs=configs,
|
| 1055 |
+
logs={
|
| 1056 |
+
"order_service": order_logs,
|
| 1057 |
+
"inventory_service": inventory_logs,
|
| 1058 |
+
"shipping_service": shipping_logs,
|
| 1059 |
+
"api_gateway": gateway_logs,
|
| 1060 |
+
"auth_service": auth_logs,
|
| 1061 |
+
},
|
| 1062 |
+
issues=issues,
|
| 1063 |
+
service_graph=service_graph,
|
| 1064 |
+
dynamic_logs=dynamic_logs,
|
| 1065 |
+
optimal_fix_order=optimal_order,
|
| 1066 |
+
context=(
|
| 1067 |
+
"Request flow: order_service -> api_gateway -> inventory_service -> shipping_service. "
|
| 1068 |
+
"auth_service provides token validation for all inter-service calls. "
|
| 1069 |
+
"Some issues are masked by upstream failures β fixing upstream issues may reveal "
|
| 1070 |
+
"new errors downstream. Pay attention to service dependencies."
|
| 1071 |
+
),
|
| 1072 |
)
|
| 1073 |
+
|
| 1074 |
+
if seed is not None:
|
| 1075 |
+
scenario = _randomize_scenario(scenario, seed)
|
| 1076 |
+
|
| 1077 |
+
return scenario
|
server/__pycache__/__init__.cpython-313.pyc
DELETED
|
Binary file (330 Bytes)
|
|
|
server/__pycache__/api_debug_env_environment.cpython-313.pyc
DELETED
|
Binary file (25.9 kB)
|
|
|
server/__pycache__/app.cpython-313.pyc
DELETED
|
Binary file (8.14 kB)
|
|
|
server/api_debug_env_environment.py
CHANGED
|
@@ -10,10 +10,16 @@ API Integration Debugging Environment Implementation.
|
|
| 10 |
A real-world environment where an AI agent diagnoses and fixes broken
|
| 11 |
API integrations by reading error logs, inspecting configurations,
|
| 12 |
and submitting corrected configurations.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
import copy
|
| 16 |
-
from typing import Any, Dict, List, Optional, Set
|
| 17 |
from uuid import uuid4
|
| 18 |
|
| 19 |
from openenv.core.env_server.interfaces import Environment
|
|
@@ -37,8 +43,8 @@ class ApiDebugEnvironment(Environment):
|
|
| 37 |
3. Testing endpoints to observe failures
|
| 38 |
4. Submitting configuration fixes
|
| 39 |
|
| 40 |
-
Supports 3 difficulty levels (easy, medium, hard) with
|
| 41 |
-
|
| 42 |
"""
|
| 43 |
|
| 44 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
|
@@ -60,6 +66,13 @@ class ApiDebugEnvironment(Environment):
|
|
| 60 |
self._done = False
|
| 61 |
self._last_action_result = ""
|
| 62 |
self._cumulative_reward = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> ApiDebugObservation:
|
| 65 |
"""
|
|
@@ -84,6 +97,25 @@ class ApiDebugEnvironment(Environment):
|
|
| 84 |
self._done = False
|
| 85 |
self._last_action_result = ""
|
| 86 |
self._cumulative_reward = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
return ApiDebugObservation(
|
| 89 |
task_id=self._task_id,
|
|
@@ -100,6 +132,9 @@ class ApiDebugEnvironment(Environment):
|
|
| 100 |
available_targets=self._scenario.services,
|
| 101 |
done=False,
|
| 102 |
reward=0.0,
|
|
|
|
|
|
|
|
|
|
| 103 |
)
|
| 104 |
|
| 105 |
def step(self, action: ApiDebugAction) -> ApiDebugObservation: # type: ignore[override]
|
|
@@ -124,6 +159,13 @@ class ApiDebugEnvironment(Environment):
|
|
| 124 |
config_snapshot: Dict[str, Any] = {}
|
| 125 |
api_response: Optional[Dict[str, Any]] = None
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
# Validate target
|
| 128 |
if action.target not in self._scenario.services:
|
| 129 |
self._last_action_result = (
|
|
@@ -162,6 +204,11 @@ class ApiDebugEnvironment(Environment):
|
|
| 162 |
self._done = True
|
| 163 |
self._last_action_result += " β° Out of steps. Episode ended."
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
return ApiDebugObservation(
|
| 166 |
task_id=self._task_id,
|
| 167 |
task_description=self._scenario.description,
|
|
@@ -177,6 +224,9 @@ class ApiDebugEnvironment(Environment):
|
|
| 177 |
available_targets=self._scenario.services,
|
| 178 |
done=self._done,
|
| 179 |
reward=reward,
|
|
|
|
|
|
|
|
|
|
| 180 |
metadata={
|
| 181 |
"cumulative_reward": self._cumulative_reward,
|
| 182 |
"step": self._state.step_count,
|
|
@@ -195,11 +245,18 @@ class ApiDebugEnvironment(Environment):
|
|
| 195 |
def _handle_inspect_logs(self, target: str) -> tuple:
|
| 196 |
"""Return logs for a service and reward for relevant inspection."""
|
| 197 |
assert self._scenario is not None
|
| 198 |
-
logs
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
inspect_key = f"logs:{target}"
|
| 200 |
is_repeat = inspect_key in self._inspected_targets
|
| 201 |
self._inspected_targets.add(inspect_key)
|
| 202 |
|
|
|
|
|
|
|
|
|
|
| 203 |
# Check if any unfound issues have log hints in these logs
|
| 204 |
found_new = False
|
| 205 |
for issue in self._scenario.issues:
|
|
@@ -212,9 +269,12 @@ class ApiDebugEnvironment(Environment):
|
|
| 212 |
if found_new:
|
| 213 |
reward = 0.15
|
| 214 |
self._last_action_result = f"Inspected logs for '{target}'. Found relevant error patterns!"
|
| 215 |
-
elif is_repeat:
|
| 216 |
-
reward = 0.0 # No reward for re-inspecting same logs
|
| 217 |
self._last_action_result = f"Re-inspected logs for '{target}'. No new information."
|
|
|
|
|
|
|
|
|
|
| 218 |
elif logs:
|
| 219 |
reward = 0.05
|
| 220 |
self._last_action_result = f"Inspected logs for '{target}'. {len(logs)} log entries found."
|
|
@@ -232,8 +292,15 @@ class ApiDebugEnvironment(Environment):
|
|
| 232 |
is_repeat = inspect_key in self._inspected_targets
|
| 233 |
self._inspected_targets.add(inspect_key)
|
| 234 |
|
|
|
|
|
|
|
|
|
|
| 235 |
# Reward based on relevance and novelty
|
| 236 |
-
has_issues = any(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
if is_repeat:
|
| 238 |
reward = 0.0 # No reward for re-inspecting same config
|
| 239 |
self._last_action_result = f"Re-inspected config for '{target}'. No changes since last check."
|
|
@@ -247,31 +314,65 @@ class ApiDebugEnvironment(Environment):
|
|
| 247 |
return config, reward
|
| 248 |
|
| 249 |
def _handle_inspect_endpoint(self, target: str) -> tuple:
|
| 250 |
-
"""Simulate testing an endpoint
|
| 251 |
assert self._scenario is not None
|
| 252 |
|
|
|
|
|
|
|
|
|
|
| 253 |
# Find unfixed issues for this service
|
| 254 |
unfixed = [
|
| 255 |
i for i in self._scenario.issues
|
| 256 |
if i.service == target and i.issue_id not in self._issues_fixed
|
| 257 |
]
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
if unfixed:
|
| 260 |
-
# Simulate a failure based on the first unfixed issue
|
| 261 |
issue = unfixed[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
api_response = {
|
| 263 |
"status": "error",
|
| 264 |
-
"status_code":
|
| 265 |
"error": issue.description,
|
| 266 |
-
"hint": f"Check the {issue.fix_key} configuration",
|
|
|
|
| 267 |
}
|
| 268 |
reward = 0.05
|
| 269 |
-
self._last_action_result = f"Tested endpoint on '{target}'. Got error response."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
else:
|
| 271 |
api_response = {
|
| 272 |
"status": "success",
|
| 273 |
"status_code": 200,
|
| 274 |
"message": f"{target} is working correctly.",
|
|
|
|
| 275 |
}
|
| 276 |
reward = 0.02
|
| 277 |
self._last_action_result = f"Tested endpoint on '{target}'. Service responding OK."
|
|
@@ -279,7 +380,7 @@ class ApiDebugEnvironment(Environment):
|
|
| 279 |
return api_response, reward
|
| 280 |
|
| 281 |
def _handle_submit_fix(self, target: str, fix_payload: Dict[str, Any]) -> float:
|
| 282 |
-
"""Process a fix submission and
|
| 283 |
assert self._scenario is not None
|
| 284 |
|
| 285 |
if not fix_payload:
|
|
@@ -298,14 +399,28 @@ class ApiDebugEnvironment(Environment):
|
|
| 298 |
|
| 299 |
reward = 0.0
|
| 300 |
fixed_any = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
for issue in target_issues:
|
| 303 |
-
|
|
|
|
| 304 |
self._issues_fixed.add(issue.issue_id)
|
| 305 |
-
self._issues_found.add(issue.issue_id)
|
| 306 |
self._apply_fix(target, fix_payload)
|
|
|
|
|
|
|
| 307 |
reward += 0.25
|
| 308 |
fixed_any = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
if fixed_any:
|
| 311 |
fixed_count = sum(1 for i in target_issues if i.issue_id in self._issues_fixed)
|
|
@@ -314,6 +429,11 @@ class ApiDebugEnvironment(Environment):
|
|
| 314 |
f"Fixed {fixed_count} issue(s). "
|
| 315 |
f"Total fixed: {len(self._issues_fixed)}/{len(self._scenario.issues)}"
|
| 316 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
else:
|
| 318 |
self._last_action_result = (
|
| 319 |
f"Fix rejected for '{target}'. The payload doesn't address any known issues. "
|
|
@@ -323,6 +443,71 @@ class ApiDebugEnvironment(Environment):
|
|
| 323 |
|
| 324 |
return reward
|
| 325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
# βββ Helper Methods βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 327 |
|
| 328 |
@staticmethod
|
|
@@ -343,7 +528,7 @@ class ApiDebugEnvironment(Environment):
|
|
| 343 |
Supports:
|
| 344 |
- Exact match
|
| 345 |
- Case-insensitive string match
|
| 346 |
-
- Numeric tolerance
|
| 347 |
- Boolean coercion (e.g., "true" -> True)
|
| 348 |
- List containment (submitted must contain all expected elements)
|
| 349 |
- Pattern match for token-like values (Bearer <anything> matches Bearer <token>)
|
|
@@ -356,11 +541,11 @@ class ApiDebugEnvironment(Environment):
|
|
| 356 |
if norm_expected == norm_submitted:
|
| 357 |
return True
|
| 358 |
|
| 359 |
-
# Numeric comparison with tolerance
|
| 360 |
if isinstance(expected, (int, float)) and isinstance(submitted, (int, float)):
|
| 361 |
if expected == 0:
|
| 362 |
return submitted == 0
|
| 363 |
-
return abs(expected - submitted) / max(abs(expected), 1) < 0.
|
| 364 |
|
| 365 |
# Boolean coercion
|
| 366 |
if isinstance(expected, bool):
|
|
@@ -379,7 +564,6 @@ class ApiDebugEnvironment(Environment):
|
|
| 379 |
return True
|
| 380 |
# If submitted has same prefix structure
|
| 381 |
if exp_lower.startswith("bearer ") and sub_lower.startswith("bearer "):
|
| 382 |
-
# Any valid bearer token is acceptable
|
| 383 |
return len(sub_lower) > len("bearer ")
|
| 384 |
|
| 385 |
# List: submitted must contain all expected elements
|
|
@@ -388,22 +572,42 @@ class ApiDebugEnvironment(Environment):
|
|
| 388 |
|
| 389 |
return False
|
| 390 |
|
| 391 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
"""
|
| 393 |
Check if a fix payload correctly addresses an issue.
|
| 394 |
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
|
|
|
| 398 |
"""
|
|
|
|
|
|
|
| 399 |
# Direct key match with value validation
|
| 400 |
if issue.fix_key in fix_payload:
|
|
|
|
| 401 |
expected_val = issue.expected_fix.get(issue.fix_key)
|
| 402 |
if expected_val is not None:
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
-
# If the submitted value is a dict and expected_fix has nested keys
|
| 406 |
-
# validate the nested key-value pairs inside the dict
|
| 407 |
submitted_val = fix_payload[issue.fix_key]
|
| 408 |
if isinstance(submitted_val, dict):
|
| 409 |
nested_prefix = issue.fix_key + "."
|
|
@@ -413,38 +617,58 @@ class ApiDebugEnvironment(Environment):
|
|
| 413 |
if k.startswith(nested_prefix)
|
| 414 |
}
|
| 415 |
if nested_expected:
|
| 416 |
-
|
| 417 |
-
return all(
|
| 418 |
k in submitted_val and self._values_match(v, submitted_val[k])
|
| 419 |
for k, v in nested_expected.items()
|
| 420 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
-
|
|
|
|
|
|
|
| 423 |
|
| 424 |
# Check nested key (e.g., "headers.Authorization" -> check payload for "Authorization")
|
| 425 |
if "." in issue.fix_key:
|
| 426 |
parts = issue.fix_key.split(".")
|
| 427 |
leaf_key = parts[-1]
|
| 428 |
if leaf_key in fix_payload:
|
|
|
|
| 429 |
expected_val = issue.expected_fix.get(issue.fix_key)
|
| 430 |
if expected_val is not None:
|
| 431 |
-
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
|
| 434 |
# Check expected fix keys with value validation
|
| 435 |
for key, expected_val in issue.expected_fix.items():
|
| 436 |
# Direct key in payload
|
| 437 |
if key in fix_payload:
|
|
|
|
| 438 |
if self._values_match(expected_val, fix_payload[key]):
|
| 439 |
-
return
|
| 440 |
# Nested key leaf match
|
| 441 |
if "." in key:
|
| 442 |
leaf = key.split(".")[-1]
|
| 443 |
if leaf in fix_payload:
|
|
|
|
| 444 |
if self._values_match(expected_val, fix_payload[leaf]):
|
| 445 |
-
return
|
| 446 |
|
| 447 |
-
|
|
|
|
|
|
|
| 448 |
|
| 449 |
def _apply_fix(self, target: str, fix_payload: Dict[str, Any]) -> None:
|
| 450 |
"""Apply a fix to the current configuration."""
|
|
@@ -466,7 +690,7 @@ class ApiDebugEnvironment(Environment):
|
|
| 466 |
config[key] = value
|
| 467 |
|
| 468 |
def _get_hints(self) -> List[str]:
|
| 469 |
-
"""Return progressive hints based on step count."""
|
| 470 |
if self._scenario is None:
|
| 471 |
return []
|
| 472 |
|
|
@@ -478,6 +702,8 @@ class ApiDebugEnvironment(Environment):
|
|
| 478 |
if step == 0:
|
| 479 |
hints.append("Start by inspecting error logs for each service to find clues.")
|
| 480 |
hints.append(f"There are {total_issues} issues to find and fix.")
|
|
|
|
|
|
|
| 481 |
elif step > 0 and len(self._issues_found) == 0:
|
| 482 |
hints.append("Try 'inspect_logs' on different services to find error patterns.")
|
| 483 |
elif len(self._issues_found) > 0 and len(self._issues_fixed) == 0:
|
|
@@ -485,24 +711,44 @@ class ApiDebugEnvironment(Environment):
|
|
| 485 |
elif unfixed > 0:
|
| 486 |
hints.append(f"{unfixed} issue(s) remaining. Check services you haven't inspected yet.")
|
| 487 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
# Late-game hints
|
| 489 |
if self._scenario.max_steps - step <= 5 and unfixed > 0:
|
| 490 |
-
# Give more specific hints when running low on steps
|
| 491 |
for issue in self._scenario.issues:
|
| 492 |
if issue.issue_id not in self._issues_fixed:
|
| 493 |
-
hints.append(
|
|
|
|
|
|
|
| 494 |
|
| 495 |
return hints
|
| 496 |
|
| 497 |
-
# βββ Grading ββββββββββββββββββββββββββββββββββββββββ
|
| 498 |
|
| 499 |
def grade(self) -> float:
|
| 500 |
"""
|
| 501 |
-
Grade the agent's performance
|
| 502 |
|
| 503 |
-
Score = (
|
| 504 |
-
|
| 505 |
-
|
|
|
|
|
|
|
|
|
|
| 506 |
|
| 507 |
Returns:
|
| 508 |
Score strictly between 0 and 1 (exclusive): in range (0.001, 0.999)
|
|
@@ -514,18 +760,111 @@ class ApiDebugEnvironment(Environment):
|
|
| 514 |
if total == 0:
|
| 515 |
return 0.999
|
| 516 |
|
|
|
|
| 517 |
fix_ratio = len(self._issues_fixed) / total
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
remaining = max(0, self._scenario.max_steps - self._state.step_count)
|
| 519 |
-
|
| 520 |
|
| 521 |
-
#
|
| 522 |
-
|
| 523 |
|
| 524 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
|
| 526 |
# Clamp strictly to (0.001, 0.999) β NEVER exactly 0.0 or 1.0
|
| 527 |
return max(0.001, min(0.999, round(score, 4)))
|
| 528 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 529 |
def get_task_info(self) -> Dict[str, Any]:
|
| 530 |
"""Return information about the current task."""
|
| 531 |
if self._scenario is None:
|
|
@@ -538,6 +877,11 @@ class ApiDebugEnvironment(Environment):
|
|
| 538 |
"max_steps": self._scenario.max_steps,
|
| 539 |
"issues_total": len(self._scenario.issues),
|
| 540 |
"services": self._scenario.services,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
"action_schema": {
|
| 542 |
"action_type": {
|
| 543 |
"type": "string",
|
|
|
|
| 10 |
A real-world environment where an AI agent diagnoses and fixes broken
|
| 11 |
API integrations by reading error logs, inspecting configurations,
|
| 12 |
and submitting corrected configurations.
|
| 13 |
+
|
| 14 |
+
Key design features:
|
| 15 |
+
- Dynamic state: fixing issues changes service health and produces new logs
|
| 16 |
+
- Cascading failures: upstream fixes reveal downstream issues
|
| 17 |
+
- Multi-dimensional rubric grading (diagnosis, fix, efficiency, strategy)
|
| 18 |
+
- Rich reward signal with partial credit and diminishing returns
|
| 19 |
"""
|
| 20 |
|
| 21 |
import copy
|
| 22 |
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
| 23 |
from uuid import uuid4
|
| 24 |
|
| 25 |
from openenv.core.env_server.interfaces import Environment
|
|
|
|
| 43 |
3. Testing endpoints to observe failures
|
| 44 |
4. Submitting configuration fixes
|
| 45 |
|
| 46 |
+
Supports 3 difficulty levels (easy, medium, hard) with cascading
|
| 47 |
+
failure dynamics and multi-dimensional grading.
|
| 48 |
"""
|
| 49 |
|
| 50 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
|
|
|
| 66 |
self._done = False
|
| 67 |
self._last_action_result = ""
|
| 68 |
self._cumulative_reward = 0.0
|
| 69 |
+
# Dynamic state tracking
|
| 70 |
+
self._service_health: Dict[str, str] = {}
|
| 71 |
+
self._dynamic_log_buffer: Dict[str, List[str]] = {}
|
| 72 |
+
# Strategy tracking for grading
|
| 73 |
+
self._action_history: List[Dict[str, Any]] = []
|
| 74 |
+
self._diagnosed_before_fix: Set[str] = set()
|
| 75 |
+
# Track which services were inspected before a fix was submitted
|
| 76 |
|
| 77 |
def reset(self, task_id: Optional[str] = None, seed: Optional[int] = None) -> ApiDebugObservation:
|
| 78 |
"""
|
|
|
|
| 97 |
self._done = False
|
| 98 |
self._last_action_result = ""
|
| 99 |
self._cumulative_reward = 0.0
|
| 100 |
+
self._action_history = []
|
| 101 |
+
self._diagnosed_before_fix = set()
|
| 102 |
+
|
| 103 |
+
# Initialize service health from scenario graph
|
| 104 |
+
self._service_health = {}
|
| 105 |
+
for svc_name, node in self._scenario.service_graph.items():
|
| 106 |
+
self._service_health[svc_name] = node.health_status
|
| 107 |
+
# Fill in any services not in graph
|
| 108 |
+
for svc in self._scenario.services:
|
| 109 |
+
if svc not in self._service_health:
|
| 110 |
+
self._service_health[svc] = "unknown"
|
| 111 |
+
|
| 112 |
+
# Initialize dynamic log buffer
|
| 113 |
+
self._dynamic_log_buffer = {svc: [] for svc in self._scenario.services}
|
| 114 |
+
|
| 115 |
+
# Build dependency graph for observation
|
| 116 |
+
dep_graph = {}
|
| 117 |
+
for svc_name, node in self._scenario.service_graph.items():
|
| 118 |
+
dep_graph[svc_name] = node.depends_on
|
| 119 |
|
| 120 |
return ApiDebugObservation(
|
| 121 |
task_id=self._task_id,
|
|
|
|
| 132 |
available_targets=self._scenario.services,
|
| 133 |
done=False,
|
| 134 |
reward=0.0,
|
| 135 |
+
service_status=dict(self._service_health),
|
| 136 |
+
dependency_graph=dep_graph,
|
| 137 |
+
error_trace=self._build_error_trace(),
|
| 138 |
)
|
| 139 |
|
| 140 |
def step(self, action: ApiDebugAction) -> ApiDebugObservation: # type: ignore[override]
|
|
|
|
| 159 |
config_snapshot: Dict[str, Any] = {}
|
| 160 |
api_response: Optional[Dict[str, Any]] = None
|
| 161 |
|
| 162 |
+
# Record action for strategy scoring
|
| 163 |
+
self._action_history.append({
|
| 164 |
+
"step": self._state.step_count,
|
| 165 |
+
"action_type": action.action_type,
|
| 166 |
+
"target": action.target,
|
| 167 |
+
})
|
| 168 |
+
|
| 169 |
# Validate target
|
| 170 |
if action.target not in self._scenario.services:
|
| 171 |
self._last_action_result = (
|
|
|
|
| 204 |
self._done = True
|
| 205 |
self._last_action_result += " β° Out of steps. Episode ended."
|
| 206 |
|
| 207 |
+
# Build dependency graph
|
| 208 |
+
dep_graph = {}
|
| 209 |
+
for svc_name, node in self._scenario.service_graph.items():
|
| 210 |
+
dep_graph[svc_name] = node.depends_on
|
| 211 |
+
|
| 212 |
return ApiDebugObservation(
|
| 213 |
task_id=self._task_id,
|
| 214 |
task_description=self._scenario.description,
|
|
|
|
| 224 |
available_targets=self._scenario.services,
|
| 225 |
done=self._done,
|
| 226 |
reward=reward,
|
| 227 |
+
service_status=dict(self._service_health),
|
| 228 |
+
dependency_graph=dep_graph,
|
| 229 |
+
error_trace=self._build_error_trace(),
|
| 230 |
metadata={
|
| 231 |
"cumulative_reward": self._cumulative_reward,
|
| 232 |
"step": self._state.step_count,
|
|
|
|
| 245 |
def _handle_inspect_logs(self, target: str) -> tuple:
|
| 246 |
"""Return logs for a service and reward for relevant inspection."""
|
| 247 |
assert self._scenario is not None
|
| 248 |
+
# Combine static logs with dynamic logs from fixes
|
| 249 |
+
static_logs = self._scenario.logs.get(target, [])
|
| 250 |
+
dynamic_logs = self._dynamic_log_buffer.get(target, [])
|
| 251 |
+
logs = static_logs + dynamic_logs
|
| 252 |
+
|
| 253 |
inspect_key = f"logs:{target}"
|
| 254 |
is_repeat = inspect_key in self._inspected_targets
|
| 255 |
self._inspected_targets.add(inspect_key)
|
| 256 |
|
| 257 |
+
# Track that this service was inspected (for strategy scoring)
|
| 258 |
+
self._diagnosed_before_fix.add(target)
|
| 259 |
+
|
| 260 |
# Check if any unfound issues have log hints in these logs
|
| 261 |
found_new = False
|
| 262 |
for issue in self._scenario.issues:
|
|
|
|
| 269 |
if found_new:
|
| 270 |
reward = 0.15
|
| 271 |
self._last_action_result = f"Inspected logs for '{target}'. Found relevant error patterns!"
|
| 272 |
+
elif is_repeat and not dynamic_logs:
|
| 273 |
+
reward = 0.0 # No reward for re-inspecting same logs with no changes
|
| 274 |
self._last_action_result = f"Re-inspected logs for '{target}'. No new information."
|
| 275 |
+
elif is_repeat and dynamic_logs:
|
| 276 |
+
reward = 0.05 # Some reward for checking updated logs
|
| 277 |
+
self._last_action_result = f"Re-inspected logs for '{target}'. New entries found after recent fixes."
|
| 278 |
elif logs:
|
| 279 |
reward = 0.05
|
| 280 |
self._last_action_result = f"Inspected logs for '{target}'. {len(logs)} log entries found."
|
|
|
|
| 292 |
is_repeat = inspect_key in self._inspected_targets
|
| 293 |
self._inspected_targets.add(inspect_key)
|
| 294 |
|
| 295 |
+
# Track that this service was inspected (for strategy scoring)
|
| 296 |
+
self._diagnosed_before_fix.add(target)
|
| 297 |
+
|
| 298 |
# Reward based on relevance and novelty
|
| 299 |
+
has_issues = any(
|
| 300 |
+
i.service == target
|
| 301 |
+
for i in self._scenario.issues
|
| 302 |
+
if i.issue_id not in self._issues_fixed
|
| 303 |
+
)
|
| 304 |
if is_repeat:
|
| 305 |
reward = 0.0 # No reward for re-inspecting same config
|
| 306 |
self._last_action_result = f"Re-inspected config for '{target}'. No changes since last check."
|
|
|
|
| 314 |
return config, reward
|
| 315 |
|
| 316 |
def _handle_inspect_endpoint(self, target: str) -> tuple:
|
| 317 |
+
"""Simulate testing an endpoint. Response changes based on current fix state."""
|
| 318 |
assert self._scenario is not None
|
| 319 |
|
| 320 |
+
# Track that this service was inspected
|
| 321 |
+
self._diagnosed_before_fix.add(target)
|
| 322 |
+
|
| 323 |
# Find unfixed issues for this service
|
| 324 |
unfixed = [
|
| 325 |
i for i in self._scenario.issues
|
| 326 |
if i.service == target and i.issue_id not in self._issues_fixed
|
| 327 |
]
|
| 328 |
|
| 329 |
+
# Also check if any DEPENDENCY issues are unfixed (cascade simulation)
|
| 330 |
+
upstream_broken = False
|
| 331 |
+
if target in self._scenario.service_graph:
|
| 332 |
+
node = self._scenario.service_graph[target]
|
| 333 |
+
for dep_svc in node.depends_on:
|
| 334 |
+
dep_unfixed = [
|
| 335 |
+
i for i in self._scenario.issues
|
| 336 |
+
if i.service == dep_svc and i.issue_id not in self._issues_fixed
|
| 337 |
+
]
|
| 338 |
+
if dep_unfixed:
|
| 339 |
+
upstream_broken = True
|
| 340 |
+
|
| 341 |
if unfixed:
|
|
|
|
| 342 |
issue = unfixed[0]
|
| 343 |
+
# Determine status code based on issue category
|
| 344 |
+
status_codes = {
|
| 345 |
+
"authentication": 401,
|
| 346 |
+
"protocol": 415,
|
| 347 |
+
"networking": 504,
|
| 348 |
+
"configuration": 500,
|
| 349 |
+
}
|
| 350 |
+
status_code = status_codes.get(issue.category, 500)
|
| 351 |
api_response = {
|
| 352 |
"status": "error",
|
| 353 |
+
"status_code": status_code,
|
| 354 |
"error": issue.description,
|
| 355 |
+
"hint": f"Check the {issue.fix_key} configuration for '{target}'",
|
| 356 |
+
"service_health": self._service_health.get(target, "unknown"),
|
| 357 |
}
|
| 358 |
reward = 0.05
|
| 359 |
+
self._last_action_result = f"Tested endpoint on '{target}'. Got {status_code} error response."
|
| 360 |
+
elif upstream_broken:
|
| 361 |
+
api_response = {
|
| 362 |
+
"status": "degraded",
|
| 363 |
+
"status_code": 503,
|
| 364 |
+
"error": f"{target} configuration is correct but upstream dependencies are failing.",
|
| 365 |
+
"hint": "Fix upstream services first β check the dependency graph.",
|
| 366 |
+
"service_health": "degraded",
|
| 367 |
+
}
|
| 368 |
+
reward = 0.03
|
| 369 |
+
self._last_action_result = f"Tested '{target}'. Service config OK but upstream is broken."
|
| 370 |
else:
|
| 371 |
api_response = {
|
| 372 |
"status": "success",
|
| 373 |
"status_code": 200,
|
| 374 |
"message": f"{target} is working correctly.",
|
| 375 |
+
"service_health": "healthy",
|
| 376 |
}
|
| 377 |
reward = 0.02
|
| 378 |
self._last_action_result = f"Tested endpoint on '{target}'. Service responding OK."
|
|
|
|
| 380 |
return api_response, reward
|
| 381 |
|
| 382 |
def _handle_submit_fix(self, target: str, fix_payload: Dict[str, Any]) -> float:
|
| 383 |
+
"""Process a fix submission with strict validation and cascade effects."""
|
| 384 |
assert self._scenario is not None
|
| 385 |
|
| 386 |
if not fix_payload:
|
|
|
|
| 399 |
|
| 400 |
reward = 0.0
|
| 401 |
fixed_any = False
|
| 402 |
+
partial_credit = False
|
| 403 |
+
|
| 404 |
+
# Check if the agent inspected this service before submitting
|
| 405 |
+
inspected_first = target in self._diagnosed_before_fix
|
| 406 |
|
| 407 |
for issue in target_issues:
|
| 408 |
+
match_result = self._check_fix(issue, fix_payload)
|
| 409 |
+
if match_result == "exact":
|
| 410 |
self._issues_fixed.add(issue.issue_id)
|
| 411 |
+
self._issues_found.add(issue.issue_id)
|
| 412 |
self._apply_fix(target, fix_payload)
|
| 413 |
+
self._update_service_health(issue)
|
| 414 |
+
self._inject_dynamic_logs(issue)
|
| 415 |
reward += 0.25
|
| 416 |
fixed_any = True
|
| 417 |
+
# Bonus for inspecting before fixing (strategy reward)
|
| 418 |
+
if inspected_first:
|
| 419 |
+
reward += 0.05
|
| 420 |
+
elif match_result == "partial":
|
| 421 |
+
# Right key, close value β give partial credit
|
| 422 |
+
partial_credit = True
|
| 423 |
+
reward += 0.03
|
| 424 |
|
| 425 |
if fixed_any:
|
| 426 |
fixed_count = sum(1 for i in target_issues if i.issue_id in self._issues_fixed)
|
|
|
|
| 429 |
f"Fixed {fixed_count} issue(s). "
|
| 430 |
f"Total fixed: {len(self._issues_fixed)}/{len(self._scenario.issues)}"
|
| 431 |
)
|
| 432 |
+
elif partial_credit:
|
| 433 |
+
self._last_action_result = (
|
| 434 |
+
f"Fix partially correct for '{target}'. "
|
| 435 |
+
"The key is right but the value isn't quite right. Check the logs for exact values."
|
| 436 |
+
)
|
| 437 |
else:
|
| 438 |
self._last_action_result = (
|
| 439 |
f"Fix rejected for '{target}'. The payload doesn't address any known issues. "
|
|
|
|
| 443 |
|
| 444 |
return reward
|
| 445 |
|
| 446 |
+
# βββ Dynamic State Methods ββββββββββββββββββββββββββββββββββββββββββββ
|
| 447 |
+
|
| 448 |
+
def _update_service_health(self, fixed_issue: Issue) -> None:
|
| 449 |
+
"""Update service health status after an issue is fixed."""
|
| 450 |
+
assert self._scenario is not None
|
| 451 |
+
|
| 452 |
+
# Check if the fixed service has any remaining issues
|
| 453 |
+
remaining = [
|
| 454 |
+
i for i in self._scenario.issues
|
| 455 |
+
if i.service == fixed_issue.service and i.issue_id not in self._issues_fixed
|
| 456 |
+
]
|
| 457 |
+
if not remaining:
|
| 458 |
+
self._service_health[fixed_issue.service] = "healthy"
|
| 459 |
+
else:
|
| 460 |
+
self._service_health[fixed_issue.service] = "degraded"
|
| 461 |
+
|
| 462 |
+
# Update downstream services affected by cascade
|
| 463 |
+
for affected_svc, _effect in fixed_issue.cascade_effects.items():
|
| 464 |
+
if affected_svc in self._service_health:
|
| 465 |
+
# Check if the affected service still has its own issues
|
| 466 |
+
svc_issues = [
|
| 467 |
+
i for i in self._scenario.issues
|
| 468 |
+
if i.service == affected_svc and i.issue_id not in self._issues_fixed
|
| 469 |
+
]
|
| 470 |
+
if not svc_issues:
|
| 471 |
+
# Check if all upstream deps are healthy
|
| 472 |
+
if affected_svc in self._scenario.service_graph:
|
| 473 |
+
upstream_healthy = all(
|
| 474 |
+
self._service_health.get(dep, "error") == "healthy"
|
| 475 |
+
for dep in self._scenario.service_graph[affected_svc].depends_on
|
| 476 |
+
)
|
| 477 |
+
if upstream_healthy:
|
| 478 |
+
self._service_health[affected_svc] = "healthy"
|
| 479 |
+
else:
|
| 480 |
+
self._service_health[affected_svc] = "degraded"
|
| 481 |
+
else:
|
| 482 |
+
self._service_health[affected_svc] = "healthy"
|
| 483 |
+
|
| 484 |
+
def _inject_dynamic_logs(self, fixed_issue: Issue) -> None:
|
| 485 |
+
"""Inject new log entries after an issue is fixed."""
|
| 486 |
+
assert self._scenario is not None
|
| 487 |
+
if fixed_issue.issue_id in self._scenario.dynamic_logs:
|
| 488 |
+
for svc, new_logs in self._scenario.dynamic_logs[fixed_issue.issue_id].items():
|
| 489 |
+
if svc in self._dynamic_log_buffer:
|
| 490 |
+
self._dynamic_log_buffer[svc].extend(new_logs)
|
| 491 |
+
|
| 492 |
+
def _build_error_trace(self) -> List[str]:
|
| 493 |
+
"""Build an error propagation trace showing cascade chain."""
|
| 494 |
+
if self._scenario is None:
|
| 495 |
+
return []
|
| 496 |
+
|
| 497 |
+
trace = []
|
| 498 |
+
for issue in self._scenario.issues:
|
| 499 |
+
if issue.issue_id not in self._issues_fixed:
|
| 500 |
+
trace.append(
|
| 501 |
+
f"[{issue.severity.upper()}] {issue.service}: {issue.description}"
|
| 502 |
+
)
|
| 503 |
+
for affected_svc, effect in issue.cascade_effects.items():
|
| 504 |
+
trace.append(f" ββ> {affected_svc}: {effect}")
|
| 505 |
+
|
| 506 |
+
if not trace:
|
| 507 |
+
trace.append("All issues resolved. No error cascades active.")
|
| 508 |
+
|
| 509 |
+
return trace
|
| 510 |
+
|
| 511 |
# βββ Helper Methods βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 512 |
|
| 513 |
@staticmethod
|
|
|
|
| 528 |
Supports:
|
| 529 |
- Exact match
|
| 530 |
- Case-insensitive string match
|
| 531 |
+
- Numeric tolerance (10%)
|
| 532 |
- Boolean coercion (e.g., "true" -> True)
|
| 533 |
- List containment (submitted must contain all expected elements)
|
| 534 |
- Pattern match for token-like values (Bearer <anything> matches Bearer <token>)
|
|
|
|
| 541 |
if norm_expected == norm_submitted:
|
| 542 |
return True
|
| 543 |
|
| 544 |
+
# Numeric comparison with tolerance (10% β tighter than before)
|
| 545 |
if isinstance(expected, (int, float)) and isinstance(submitted, (int, float)):
|
| 546 |
if expected == 0:
|
| 547 |
return submitted == 0
|
| 548 |
+
return abs(expected - submitted) / max(abs(expected), 1) < 0.10
|
| 549 |
|
| 550 |
# Boolean coercion
|
| 551 |
if isinstance(expected, bool):
|
|
|
|
| 564 |
return True
|
| 565 |
# If submitted has same prefix structure
|
| 566 |
if exp_lower.startswith("bearer ") and sub_lower.startswith("bearer "):
|
|
|
|
| 567 |
return len(sub_lower) > len("bearer ")
|
| 568 |
|
| 569 |
# List: submitted must contain all expected elements
|
|
|
|
| 572 |
|
| 573 |
return False
|
| 574 |
|
| 575 |
+
def _values_close(self, expected: Any, submitted: Any) -> bool:
|
| 576 |
+
"""Check if values are 'close' for partial credit (same type, right ballpark)."""
|
| 577 |
+
if isinstance(expected, (int, float)) and isinstance(submitted, (int, float)):
|
| 578 |
+
if expected == 0:
|
| 579 |
+
return abs(submitted) < 5
|
| 580 |
+
return abs(expected - submitted) / max(abs(expected), 1) < 0.50
|
| 581 |
+
if isinstance(expected, str) and isinstance(submitted, str):
|
| 582 |
+
# Same prefix / similar structure
|
| 583 |
+
return expected.split("/")[0].lower() == submitted.split("/")[0].lower()
|
| 584 |
+
if isinstance(expected, bool) and isinstance(submitted, bool):
|
| 585 |
+
return True # Right type at least
|
| 586 |
+
return False
|
| 587 |
+
|
| 588 |
+
def _check_fix(self, issue: Issue, fix_payload: Dict[str, Any]) -> str:
|
| 589 |
"""
|
| 590 |
Check if a fix payload correctly addresses an issue.
|
| 591 |
|
| 592 |
+
Returns:
|
| 593 |
+
'exact' if fix is correct
|
| 594 |
+
'partial' if fix has right key but wrong value
|
| 595 |
+
'none' if fix doesn't match at all
|
| 596 |
"""
|
| 597 |
+
found_key = False
|
| 598 |
+
|
| 599 |
# Direct key match with value validation
|
| 600 |
if issue.fix_key in fix_payload:
|
| 601 |
+
found_key = True
|
| 602 |
expected_val = issue.expected_fix.get(issue.fix_key)
|
| 603 |
if expected_val is not None:
|
| 604 |
+
if self._values_match(expected_val, fix_payload[issue.fix_key]):
|
| 605 |
+
return "exact"
|
| 606 |
+
elif self._values_close(expected_val, fix_payload[issue.fix_key]):
|
| 607 |
+
return "partial"
|
| 608 |
+
return "none" # Right key, wrong value
|
| 609 |
|
| 610 |
+
# If the submitted value is a dict and expected_fix has nested keys
|
|
|
|
| 611 |
submitted_val = fix_payload[issue.fix_key]
|
| 612 |
if isinstance(submitted_val, dict):
|
| 613 |
nested_prefix = issue.fix_key + "."
|
|
|
|
| 617 |
if k.startswith(nested_prefix)
|
| 618 |
}
|
| 619 |
if nested_expected:
|
| 620 |
+
all_match = all(
|
|
|
|
| 621 |
k in submitted_val and self._values_match(v, submitted_val[k])
|
| 622 |
for k, v in nested_expected.items()
|
| 623 |
)
|
| 624 |
+
if all_match:
|
| 625 |
+
return "exact"
|
| 626 |
+
# Check partial
|
| 627 |
+
any_match = any(
|
| 628 |
+
k in submitted_val and self._values_match(v, submitted_val[k])
|
| 629 |
+
for k, v in nested_expected.items()
|
| 630 |
+
)
|
| 631 |
+
if any_match:
|
| 632 |
+
return "partial"
|
| 633 |
+
return "none"
|
| 634 |
|
| 635 |
+
# No expected value found β this shouldn't happen with well-defined issues
|
| 636 |
+
# Do NOT accept blindly β require value validation
|
| 637 |
+
return "none"
|
| 638 |
|
| 639 |
# Check nested key (e.g., "headers.Authorization" -> check payload for "Authorization")
|
| 640 |
if "." in issue.fix_key:
|
| 641 |
parts = issue.fix_key.split(".")
|
| 642 |
leaf_key = parts[-1]
|
| 643 |
if leaf_key in fix_payload:
|
| 644 |
+
found_key = True
|
| 645 |
expected_val = issue.expected_fix.get(issue.fix_key)
|
| 646 |
if expected_val is not None:
|
| 647 |
+
if self._values_match(expected_val, fix_payload[leaf_key]):
|
| 648 |
+
return "exact"
|
| 649 |
+
elif self._values_close(expected_val, fix_payload[leaf_key]):
|
| 650 |
+
return "partial"
|
| 651 |
+
return "none"
|
| 652 |
+
return "none"
|
| 653 |
|
| 654 |
# Check expected fix keys with value validation
|
| 655 |
for key, expected_val in issue.expected_fix.items():
|
| 656 |
# Direct key in payload
|
| 657 |
if key in fix_payload:
|
| 658 |
+
found_key = True
|
| 659 |
if self._values_match(expected_val, fix_payload[key]):
|
| 660 |
+
return "exact"
|
| 661 |
# Nested key leaf match
|
| 662 |
if "." in key:
|
| 663 |
leaf = key.split(".")[-1]
|
| 664 |
if leaf in fix_payload:
|
| 665 |
+
found_key = True
|
| 666 |
if self._values_match(expected_val, fix_payload[leaf]):
|
| 667 |
+
return "exact"
|
| 668 |
|
| 669 |
+
if found_key:
|
| 670 |
+
return "partial" # Found the key but value didn't match
|
| 671 |
+
return "none"
|
| 672 |
|
| 673 |
def _apply_fix(self, target: str, fix_payload: Dict[str, Any]) -> None:
|
| 674 |
"""Apply a fix to the current configuration."""
|
|
|
|
| 690 |
config[key] = value
|
| 691 |
|
| 692 |
def _get_hints(self) -> List[str]:
|
| 693 |
+
"""Return progressive hints based on step count and progress."""
|
| 694 |
if self._scenario is None:
|
| 695 |
return []
|
| 696 |
|
|
|
|
| 702 |
if step == 0:
|
| 703 |
hints.append("Start by inspecting error logs for each service to find clues.")
|
| 704 |
hints.append(f"There are {total_issues} issues to find and fix.")
|
| 705 |
+
if self._scenario.context:
|
| 706 |
+
hints.append(f"Context: {self._scenario.context}")
|
| 707 |
elif step > 0 and len(self._issues_found) == 0:
|
| 708 |
hints.append("Try 'inspect_logs' on different services to find error patterns.")
|
| 709 |
elif len(self._issues_found) > 0 and len(self._issues_fixed) == 0:
|
|
|
|
| 711 |
elif unfixed > 0:
|
| 712 |
hints.append(f"{unfixed} issue(s) remaining. Check services you haven't inspected yet.")
|
| 713 |
|
| 714 |
+
# Dependency hints
|
| 715 |
+
for issue in self._scenario.issues:
|
| 716 |
+
if issue.issue_id not in self._issues_fixed and issue.depends_on:
|
| 717 |
+
deps_met = all(d in self._issues_fixed for d in issue.depends_on)
|
| 718 |
+
if not deps_met:
|
| 719 |
+
dep_names = [
|
| 720 |
+
next((i.service for i in self._scenario.issues if i.issue_id == d), d)
|
| 721 |
+
for d in issue.depends_on
|
| 722 |
+
]
|
| 723 |
+
if len(self._issues_fixed) > 0:
|
| 724 |
+
hints.append(
|
| 725 |
+
f"Some issues may be masked by upstream failures. "
|
| 726 |
+
f"Check services: {', '.join(set(dep_names))}"
|
| 727 |
+
)
|
| 728 |
+
break
|
| 729 |
+
|
| 730 |
# Late-game hints
|
| 731 |
if self._scenario.max_steps - step <= 5 and unfixed > 0:
|
|
|
|
| 732 |
for issue in self._scenario.issues:
|
| 733 |
if issue.issue_id not in self._issues_fixed:
|
| 734 |
+
hints.append(
|
| 735 |
+
f"Hint: Check '{issue.service}' β look for '{issue.fix_key}' in the config."
|
| 736 |
+
)
|
| 737 |
|
| 738 |
return hints
|
| 739 |
|
| 740 |
+
# βββ Multi-Dimensional Grading ββββββββββββββββββββββββββββββββββββββββ
|
| 741 |
|
| 742 |
def grade(self) -> float:
|
| 743 |
"""
|
| 744 |
+
Grade the agent's performance using a multi-dimensional rubric.
|
| 745 |
|
| 746 |
+
Score = weighted_average(
|
| 747 |
+
diagnosis_score Γ 0.20, # Did the agent inspect before fixing?
|
| 748 |
+
fix_score Γ 0.40, # Issues fixed / total
|
| 749 |
+
efficiency_score Γ 0.15, # Steps used vs available
|
| 750 |
+
strategy_score Γ 0.25, # Logical debugging approach
|
| 751 |
+
)
|
| 752 |
|
| 753 |
Returns:
|
| 754 |
Score strictly between 0 and 1 (exclusive): in range (0.001, 0.999)
|
|
|
|
| 760 |
if total == 0:
|
| 761 |
return 0.999
|
| 762 |
|
| 763 |
+
# 1. Fix Score (40% weight) β most important
|
| 764 |
fix_ratio = len(self._issues_fixed) / total
|
| 765 |
+
fix_score = fix_ratio
|
| 766 |
+
|
| 767 |
+
# 2. Diagnosis Score (20% weight) β did you inspect before fixing?
|
| 768 |
+
if self._issues_fixed:
|
| 769 |
+
diagnosed_count = sum(
|
| 770 |
+
1 for issue_id in self._issues_fixed
|
| 771 |
+
if any(
|
| 772 |
+
i.service in self._diagnosed_before_fix
|
| 773 |
+
for i in self._scenario.issues
|
| 774 |
+
if i.issue_id == issue_id
|
| 775 |
+
)
|
| 776 |
+
)
|
| 777 |
+
diagnosis_score = diagnosed_count / len(self._issues_fixed)
|
| 778 |
+
else:
|
| 779 |
+
# Give partial credit for exploration even without fixes
|
| 780 |
+
diagnosis_score = min(1.0, len(self._inspected_targets) / max(1, len(self._scenario.services)))
|
| 781 |
+
|
| 782 |
+
# 3. Efficiency Score (15% weight) β faster is better
|
| 783 |
remaining = max(0, self._scenario.max_steps - self._state.step_count)
|
| 784 |
+
efficiency_score = remaining / self._scenario.max_steps
|
| 785 |
|
| 786 |
+
# 4. Strategy Score (25% weight) β logical debugging approach
|
| 787 |
+
strategy_score = self._compute_strategy_score()
|
| 788 |
|
| 789 |
+
# Weighted combination
|
| 790 |
+
score = (
|
| 791 |
+
fix_score * 0.40 +
|
| 792 |
+
diagnosis_score * 0.20 +
|
| 793 |
+
efficiency_score * 0.15 +
|
| 794 |
+
strategy_score * 0.25
|
| 795 |
+
)
|
| 796 |
|
| 797 |
# Clamp strictly to (0.001, 0.999) β NEVER exactly 0.0 or 1.0
|
| 798 |
return max(0.001, min(0.999, round(score, 4)))
|
| 799 |
|
| 800 |
+
def _compute_strategy_score(self) -> float:
|
| 801 |
+
"""
|
| 802 |
+
Score the agent's debugging strategy.
|
| 803 |
+
|
| 804 |
+
Good strategy:
|
| 805 |
+
- Inspect logs before configs (logs have more diagnostic info)
|
| 806 |
+
- Don't repeat the same inspection
|
| 807 |
+
- Fix issues in dependency order
|
| 808 |
+
- Don't submit fixes without inspecting first
|
| 809 |
+
"""
|
| 810 |
+
if not self._action_history:
|
| 811 |
+
return 0.0
|
| 812 |
+
|
| 813 |
+
score = 0.0
|
| 814 |
+
total_checks = 0
|
| 815 |
+
|
| 816 |
+
# Check 1: Did the agent inspect logs before submitting any fix?
|
| 817 |
+
first_fix_step = None
|
| 818 |
+
first_inspect_step = None
|
| 819 |
+
for action in self._action_history:
|
| 820 |
+
if action["action_type"] == "submit_fix" and first_fix_step is None:
|
| 821 |
+
first_fix_step = action["step"]
|
| 822 |
+
if action["action_type"] in ("inspect_logs", "inspect_config") and first_inspect_step is None:
|
| 823 |
+
first_inspect_step = action["step"]
|
| 824 |
+
|
| 825 |
+
total_checks += 1
|
| 826 |
+
if first_inspect_step is not None and (first_fix_step is None or first_inspect_step < first_fix_step):
|
| 827 |
+
score += 1.0 # Inspected before fixing
|
| 828 |
+
|
| 829 |
+
# Check 2: Ratio of unique inspections to total inspections
|
| 830 |
+
total_inspections = sum(
|
| 831 |
+
1 for a in self._action_history
|
| 832 |
+
if a["action_type"] in ("inspect_logs", "inspect_config", "inspect_endpoint")
|
| 833 |
+
)
|
| 834 |
+
unique_inspections = len(self._inspected_targets)
|
| 835 |
+
total_checks += 1
|
| 836 |
+
if total_inspections > 0:
|
| 837 |
+
score += min(1.0, unique_inspections / total_inspections)
|
| 838 |
+
|
| 839 |
+
# Check 3: Did fixes follow dependency order?
|
| 840 |
+
if self._scenario and self._scenario.optimal_fix_order and len(self._issues_fixed) > 1:
|
| 841 |
+
total_checks += 1
|
| 842 |
+
fix_order = []
|
| 843 |
+
for action in self._action_history:
|
| 844 |
+
if action["action_type"] == "submit_fix":
|
| 845 |
+
# Find which issue was fixed in this step
|
| 846 |
+
for issue_id in self._issues_fixed:
|
| 847 |
+
issue = next((i for i in self._scenario.issues if i.issue_id == issue_id), None)
|
| 848 |
+
if issue and issue_id not in fix_order:
|
| 849 |
+
fix_order.append(issue_id)
|
| 850 |
+
|
| 851 |
+
# Compare fix order with optimal order
|
| 852 |
+
optimal = [o for o in self._scenario.optimal_fix_order if o in fix_order]
|
| 853 |
+
if len(optimal) > 1:
|
| 854 |
+
in_order = sum(
|
| 855 |
+
1 for i in range(len(fix_order) - 1)
|
| 856 |
+
if fix_order[i] in optimal and fix_order[i+1] in optimal
|
| 857 |
+
and optimal.index(fix_order[i]) < optimal.index(fix_order[i+1])
|
| 858 |
+
)
|
| 859 |
+
score += in_order / max(1, len(fix_order) - 1)
|
| 860 |
+
|
| 861 |
+
# Check 4: Did the agent use a variety of action types?
|
| 862 |
+
total_checks += 1
|
| 863 |
+
action_types_used = set(a["action_type"] for a in self._action_history)
|
| 864 |
+
score += len(action_types_used) / 4.0 # 4 possible action types
|
| 865 |
+
|
| 866 |
+
return score / total_checks if total_checks > 0 else 0.0
|
| 867 |
+
|
| 868 |
def get_task_info(self) -> Dict[str, Any]:
|
| 869 |
"""Return information about the current task."""
|
| 870 |
if self._scenario is None:
|
|
|
|
| 877 |
"max_steps": self._scenario.max_steps,
|
| 878 |
"issues_total": len(self._scenario.issues),
|
| 879 |
"services": self._scenario.services,
|
| 880 |
+
"service_dependencies": {
|
| 881 |
+
svc: node.depends_on
|
| 882 |
+
for svc, node in self._scenario.service_graph.items()
|
| 883 |
+
},
|
| 884 |
+
"context": self._scenario.context,
|
| 885 |
"action_schema": {
|
| 886 |
"action_type": {
|
| 887 |
"type": "string",
|
server/app.py
CHANGED
|
@@ -64,8 +64,15 @@ async def root():
|
|
| 64 |
"""Root endpoint β returns environment info and available endpoints."""
|
| 65 |
return {
|
| 66 |
"name": "api_debug_env",
|
| 67 |
-
"description": "API Integration Debugging Environment",
|
|
|
|
| 68 |
"status": "running",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
"endpoints": ["/reset", "/step", "/state", "/tasks", "/grader", "/baseline", "/health", "/schema", "/docs"],
|
| 70 |
}
|
| 71 |
|
|
@@ -86,7 +93,7 @@ class BaselineRequest(BaseModel):
|
|
| 86 |
|
| 87 |
@app.get("/tasks")
|
| 88 |
async def list_tasks():
|
| 89 |
-
"""Return list of all tasks with action schema."""
|
| 90 |
tasks = []
|
| 91 |
for task_id in get_all_task_ids():
|
| 92 |
scenario = get_scenario(task_id)
|
|
@@ -97,6 +104,11 @@ async def list_tasks():
|
|
| 97 |
"max_steps": scenario.max_steps,
|
| 98 |
"issues_count": len(scenario.issues),
|
| 99 |
"services": scenario.services,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
"action_schema": {
|
| 101 |
"action_type": {
|
| 102 |
"type": "string",
|
|
@@ -129,6 +141,12 @@ async def run_grader(request: GraderRequest):
|
|
| 129 |
"issues_fixed": len(env._issues_fixed),
|
| 130 |
"issues_total": len(env._scenario.issues) if env._scenario else 0,
|
| 131 |
"steps_used": env._state.step_count,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
}
|
| 133 |
|
| 134 |
return {
|
|
@@ -142,13 +160,19 @@ async def run_grader(request: GraderRequest):
|
|
| 142 |
async def run_baseline(request: Optional[BaselineRequest] = None):
|
| 143 |
"""
|
| 144 |
Run a rule-based baseline agent on all tasks.
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
Returns baseline scores for each task.
|
| 147 |
"""
|
| 148 |
# Known fixes for each task (a heuristic baseline, not an LLM)
|
| 149 |
known_fixes = {
|
| 150 |
"easy": [
|
| 151 |
-
{"target": "payment_client", "fix": {"headers.Authorization": "Bearer sk_live_token123"
|
|
|
|
| 152 |
],
|
| 153 |
"medium": [
|
| 154 |
{"target": "webhook_sender", "fix": {"rate_limit.requests_per_second": 10}},
|
|
@@ -170,7 +194,7 @@ async def run_baseline(request: Optional[BaselineRequest] = None):
|
|
| 170 |
env = ApiDebugEnvironment(task_id=task_id)
|
| 171 |
obs = env.reset()
|
| 172 |
|
| 173 |
-
# Phase 1: Inspect all logs
|
| 174 |
for service in obs.available_targets:
|
| 175 |
if env._done:
|
| 176 |
break
|
|
@@ -179,7 +203,7 @@ async def run_baseline(request: Optional[BaselineRequest] = None):
|
|
| 179 |
target=service,
|
| 180 |
))
|
| 181 |
|
| 182 |
-
# Phase 2: Inspect
|
| 183 |
for service in obs.available_targets:
|
| 184 |
if env._done:
|
| 185 |
break
|
|
@@ -188,7 +212,16 @@ async def run_baseline(request: Optional[BaselineRequest] = None):
|
|
| 188 |
target=service,
|
| 189 |
))
|
| 190 |
|
| 191 |
-
# Phase 3:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
for fix_info in known_fixes.get(task_id, []):
|
| 193 |
if env._done:
|
| 194 |
break
|
|
|
|
| 64 |
"""Root endpoint β returns environment info and available endpoints."""
|
| 65 |
return {
|
| 66 |
"name": "api_debug_env",
|
| 67 |
+
"description": "API Integration Debugging Environment β diagnose and fix broken API integrations",
|
| 68 |
+
"version": "2.0.0",
|
| 69 |
"status": "running",
|
| 70 |
+
"features": [
|
| 71 |
+
"Cascading failure simulation",
|
| 72 |
+
"Dynamic service health tracking",
|
| 73 |
+
"Multi-dimensional rubric grading",
|
| 74 |
+
"Seed-based scenario randomization",
|
| 75 |
+
],
|
| 76 |
"endpoints": ["/reset", "/step", "/state", "/tasks", "/grader", "/baseline", "/health", "/schema", "/docs"],
|
| 77 |
}
|
| 78 |
|
|
|
|
| 93 |
|
| 94 |
@app.get("/tasks")
|
| 95 |
async def list_tasks():
|
| 96 |
+
"""Return list of all tasks with action schema and dependency info."""
|
| 97 |
tasks = []
|
| 98 |
for task_id in get_all_task_ids():
|
| 99 |
scenario = get_scenario(task_id)
|
|
|
|
| 104 |
"max_steps": scenario.max_steps,
|
| 105 |
"issues_count": len(scenario.issues),
|
| 106 |
"services": scenario.services,
|
| 107 |
+
"service_dependencies": {
|
| 108 |
+
svc: node.depends_on
|
| 109 |
+
for svc, node in scenario.service_graph.items()
|
| 110 |
+
},
|
| 111 |
+
"context": scenario.context,
|
| 112 |
"action_schema": {
|
| 113 |
"action_type": {
|
| 114 |
"type": "string",
|
|
|
|
| 141 |
"issues_fixed": len(env._issues_fixed),
|
| 142 |
"issues_total": len(env._scenario.issues) if env._scenario else 0,
|
| 143 |
"steps_used": env._state.step_count,
|
| 144 |
+
"grading_rubric": {
|
| 145 |
+
"fix_score_weight": 0.40,
|
| 146 |
+
"diagnosis_score_weight": 0.20,
|
| 147 |
+
"efficiency_score_weight": 0.15,
|
| 148 |
+
"strategy_score_weight": 0.25,
|
| 149 |
+
},
|
| 150 |
}
|
| 151 |
|
| 152 |
return {
|
|
|
|
| 160 |
async def run_baseline(request: Optional[BaselineRequest] = None):
|
| 161 |
"""
|
| 162 |
Run a rule-based baseline agent on all tasks.
|
| 163 |
+
|
| 164 |
+
The baseline follows a proper debugging strategy:
|
| 165 |
+
1. Inspect logs for each service (diagnosis phase)
|
| 166 |
+
2. Inspect configs for services with issues (investigation phase)
|
| 167 |
+
3. Submit known fixes (resolution phase)
|
| 168 |
+
|
| 169 |
Returns baseline scores for each task.
|
| 170 |
"""
|
| 171 |
# Known fixes for each task (a heuristic baseline, not an LLM)
|
| 172 |
known_fixes = {
|
| 173 |
"easy": [
|
| 174 |
+
{"target": "payment_client", "fix": {"headers.Authorization": "Bearer sk_live_token123"}},
|
| 175 |
+
{"target": "payment_client", "fix": {"headers.Content-Type": "application/json"}},
|
| 176 |
],
|
| 177 |
"medium": [
|
| 178 |
{"target": "webhook_sender", "fix": {"rate_limit.requests_per_second": 10}},
|
|
|
|
| 194 |
env = ApiDebugEnvironment(task_id=task_id)
|
| 195 |
obs = env.reset()
|
| 196 |
|
| 197 |
+
# Phase 1: Inspect all logs (proper diagnosis strategy)
|
| 198 |
for service in obs.available_targets:
|
| 199 |
if env._done:
|
| 200 |
break
|
|
|
|
| 203 |
target=service,
|
| 204 |
))
|
| 205 |
|
| 206 |
+
# Phase 2: Inspect configs for services that have issues
|
| 207 |
for service in obs.available_targets:
|
| 208 |
if env._done:
|
| 209 |
break
|
|
|
|
| 212 |
target=service,
|
| 213 |
))
|
| 214 |
|
| 215 |
+
# Phase 3: Test endpoints to observe failures
|
| 216 |
+
for service in obs.available_targets[:2]: # Just test a couple
|
| 217 |
+
if env._done:
|
| 218 |
+
break
|
| 219 |
+
obs = env.step(ApiDebugAction(
|
| 220 |
+
action_type="inspect_endpoint",
|
| 221 |
+
target=service,
|
| 222 |
+
))
|
| 223 |
+
|
| 224 |
+
# Phase 4: Submit fixes
|
| 225 |
for fix_info in known_fixes.get(task_id, []):
|
| 226 |
if env._done:
|
| 227 |
break
|
tests/__pycache__/__init__.cpython-313.pyc
DELETED
|
Binary file (165 Bytes)
|
|
|
tests/__pycache__/test_environment.cpython-313-pytest-8.4.1.pyc
DELETED
|
Binary file (66.9 kB)
|
|
|
tests/test_environment.py
CHANGED
|
@@ -7,11 +7,13 @@ Comprehensive tests for the API Integration Debugging Environment.
|
|
| 7 |
Tests cover:
|
| 8 |
- Environment reset and initialization
|
| 9 |
- Action handling (inspect_logs, inspect_config, inspect_endpoint, submit_fix)
|
| 10 |
-
-
|
| 11 |
-
- Fix validation (strict value matching)
|
| 12 |
- Episode termination conditions
|
| 13 |
- Repeated inspection penalty
|
| 14 |
-
- Seed-based reproducibility
|
|
|
|
|
|
|
| 15 |
"""
|
| 16 |
|
| 17 |
import sys
|
|
@@ -60,20 +62,20 @@ class TestScenarios:
|
|
| 60 |
s = get_scenario("hard")
|
| 61 |
assert len(s.issues) == 5
|
| 62 |
|
| 63 |
-
def
|
| 64 |
-
"""Same seed should produce same
|
| 65 |
s1 = get_scenario("easy", seed=42)
|
| 66 |
s2 = get_scenario("easy", seed=42)
|
| 67 |
-
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
#
|
| 74 |
-
#
|
| 75 |
-
|
| 76 |
-
assert
|
| 77 |
|
| 78 |
def test_each_issue_has_log_hint(self):
|
| 79 |
"""Every issue should have a corresponding log hint findable in the logs."""
|
|
@@ -90,6 +92,41 @@ class TestScenarios:
|
|
| 90 |
break
|
| 91 |
assert found, f"Issue {issue.issue_id} log_hint '{issue.log_hint}' not found in any logs"
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
# βββ Environment Reset Tests βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 95 |
|
|
@@ -126,6 +163,25 @@ class TestEnvironmentReset:
|
|
| 126 |
obs = env.reset()
|
| 127 |
assert obs.reward == 0.0
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
# βββ Action Handler Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 131 |
|
|
@@ -150,25 +206,41 @@ class TestInspectLogs:
|
|
| 150 |
target="payment_client",
|
| 151 |
))
|
| 152 |
assert obs.issues_found > 0
|
| 153 |
-
assert obs.reward > 0
|
| 154 |
|
| 155 |
def test_repeated_inspect_logs_no_reward(self):
|
| 156 |
-
"""Second inspection of same target should give 0 reward."""
|
| 157 |
env = ApiDebugEnvironment(task_id="easy")
|
| 158 |
env.reset()
|
| 159 |
-
# First inspection
|
| 160 |
obs1 = env.step(ApiDebugAction(
|
| 161 |
action_type="inspect_logs",
|
| 162 |
target="payment_client",
|
| 163 |
))
|
| 164 |
-
# Second inspection (repeat)
|
| 165 |
obs2 = env.step(ApiDebugAction(
|
| 166 |
action_type="inspect_logs",
|
| 167 |
target="payment_client",
|
| 168 |
))
|
| 169 |
-
# The step cost is -0.01, repeat inspect gives 0 + (-0.01) base
|
| 170 |
assert obs2.reward < obs1.reward
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
class TestInspectConfig:
|
| 174 |
"""Test inspect_config action."""
|
|
@@ -197,12 +269,41 @@ class TestInspectEndpoint:
|
|
| 197 |
assert obs.api_response is not None
|
| 198 |
assert obs.api_response["status"] == "error"
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
class TestSubmitFix:
|
| 202 |
-
"""Test submit_fix action with value validation."""
|
| 203 |
|
| 204 |
def test_correct_fix_accepted(self):
|
| 205 |
-
"""Submitting the right key AND value should be accepted."""
|
| 206 |
env = ApiDebugEnvironment(task_id="easy")
|
| 207 |
env.reset()
|
| 208 |
obs = env.step(ApiDebugAction(
|
|
@@ -220,10 +321,21 @@ class TestSubmitFix:
|
|
| 220 |
obs = env.step(ApiDebugAction(
|
| 221 |
action_type="submit_fix",
|
| 222 |
target="payment_client",
|
| 223 |
-
fix_payload={"headers.Content-Type": "text/xml"},
|
| 224 |
))
|
| 225 |
assert obs.issues_fixed == 0
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
def test_correct_auth_fix(self):
|
| 229 |
"""Bearer token fix should work with any valid token."""
|
|
@@ -260,13 +372,11 @@ class TestSubmitFix:
|
|
| 260 |
"""Fixing all issues should mark episode as done with completion bonus."""
|
| 261 |
env = ApiDebugEnvironment(task_id="easy")
|
| 262 |
env.reset()
|
| 263 |
-
# Fix auth
|
| 264 |
env.step(ApiDebugAction(
|
| 265 |
action_type="submit_fix",
|
| 266 |
target="payment_client",
|
| 267 |
fix_payload={"headers.Authorization": "Bearer valid_token_123"},
|
| 268 |
))
|
| 269 |
-
# Fix content-type
|
| 270 |
obs = env.step(ApiDebugAction(
|
| 271 |
action_type="submit_fix",
|
| 272 |
target="payment_client",
|
|
@@ -275,25 +385,98 @@ class TestSubmitFix:
|
|
| 275 |
assert obs.done is True
|
| 276 |
assert obs.issues_fixed == 2
|
| 277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
# βββ Grading Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 280 |
|
| 281 |
|
| 282 |
class TestGrading:
|
| 283 |
-
"""Test the grading
|
| 284 |
|
| 285 |
def test_grade_no_fixes_is_low(self):
|
| 286 |
-
"""Grade with no fixes should be
|
| 287 |
env = ApiDebugEnvironment(task_id="easy")
|
| 288 |
env.reset()
|
| 289 |
env.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
|
| 290 |
score = env.grade()
|
| 291 |
-
assert 0.0 < score < 0.
|
| 292 |
|
| 293 |
def test_grade_all_fixes_is_high(self):
|
| 294 |
"""Grade with all fixes should be high."""
|
| 295 |
env = ApiDebugEnvironment(task_id="easy")
|
| 296 |
env.reset()
|
|
|
|
|
|
|
| 297 |
env.step(ApiDebugAction(
|
| 298 |
action_type="submit_fix",
|
| 299 |
target="payment_client",
|
|
@@ -305,7 +488,7 @@ class TestGrading:
|
|
| 305 |
fix_payload={"headers.Content-Type": "application/json"},
|
| 306 |
))
|
| 307 |
score = env.grade()
|
| 308 |
-
assert score > 0.
|
| 309 |
|
| 310 |
def test_grade_strictly_between_0_and_1(self):
|
| 311 |
"""Grade must be strictly in (0, 1), never exactly 0.0 or 1.0."""
|
|
@@ -316,10 +499,11 @@ class TestGrading:
|
|
| 316 |
assert 0.0 < score < 1.0, f"Score for {task_id} was {score}"
|
| 317 |
|
| 318 |
def test_efficiency_bonus(self):
|
| 319 |
-
"""Faster solutions should score higher."""
|
| 320 |
-
#
|
| 321 |
env1 = ApiDebugEnvironment(task_id="easy")
|
| 322 |
env1.reset()
|
|
|
|
| 323 |
env1.step(ApiDebugAction(
|
| 324 |
action_type="submit_fix",
|
| 325 |
target="payment_client",
|
|
@@ -327,11 +511,11 @@ class TestGrading:
|
|
| 327 |
))
|
| 328 |
score_fast = env1.grade()
|
| 329 |
|
| 330 |
-
# Slow partial solve (many inspection steps, then fix same 1 issue)
|
| 331 |
env2 = ApiDebugEnvironment(task_id="easy")
|
| 332 |
env2.reset()
|
|
|
|
| 333 |
for _ in range(10):
|
| 334 |
-
env2.step(ApiDebugAction(action_type="inspect_logs", target="
|
| 335 |
env2.step(ApiDebugAction(
|
| 336 |
action_type="submit_fix",
|
| 337 |
target="payment_client",
|
|
@@ -341,6 +525,56 @@ class TestGrading:
|
|
| 341 |
|
| 342 |
assert score_fast > score_slow, f"Fast={score_fast} should beat Slow={score_slow}"
|
| 343 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
|
| 345 |
# βββ Episode Termination Tests ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 346 |
|
|
@@ -351,7 +585,6 @@ class TestEpisodeTermination:
|
|
| 351 |
def test_out_of_steps_ends_episode(self):
|
| 352 |
env = ApiDebugEnvironment(task_id="easy")
|
| 353 |
env.reset()
|
| 354 |
-
# Take max_steps actions
|
| 355 |
for _ in range(15):
|
| 356 |
obs = env.step(ApiDebugAction(
|
| 357 |
action_type="inspect_logs",
|
|
@@ -388,9 +621,11 @@ class TestValueMatching:
|
|
| 388 |
def test_numeric_exact(self):
|
| 389 |
assert self.env._values_match(10, 10)
|
| 390 |
|
| 391 |
-
def
|
| 392 |
-
|
| 393 |
-
assert
|
|
|
|
|
|
|
| 394 |
|
| 395 |
def test_boolean_match(self):
|
| 396 |
assert self.env._values_match(True, True)
|
|
@@ -413,32 +648,80 @@ class TestValueMatching:
|
|
| 413 |
assert not self.env._values_match(10, 100)
|
| 414 |
|
| 415 |
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
|
| 419 |
class TestFullEpisode:
|
| 420 |
-
"""Test
|
| 421 |
|
| 422 |
def test_easy_full_solve(self):
|
| 423 |
"""Run a complete easy episode from start to finish."""
|
| 424 |
env = ApiDebugEnvironment(task_id="easy")
|
| 425 |
obs = env.reset()
|
| 426 |
|
| 427 |
-
# Step 1: Inspect logs
|
| 428 |
obs = env.step(ApiDebugAction(
|
| 429 |
action_type="inspect_logs",
|
| 430 |
target="payment_client",
|
| 431 |
))
|
| 432 |
assert obs.issues_found >= 1
|
| 433 |
|
| 434 |
-
# Step 2: Inspect config
|
| 435 |
obs = env.step(ApiDebugAction(
|
| 436 |
action_type="inspect_config",
|
| 437 |
target="payment_client",
|
| 438 |
))
|
| 439 |
assert "headers" in obs.config_snapshot
|
| 440 |
|
| 441 |
-
# Step 3: Fix auth
|
| 442 |
obs = env.step(ApiDebugAction(
|
| 443 |
action_type="submit_fix",
|
| 444 |
target="payment_client",
|
|
@@ -446,7 +729,6 @@ class TestFullEpisode:
|
|
| 446 |
))
|
| 447 |
assert obs.issues_fixed >= 1
|
| 448 |
|
| 449 |
-
# Step 4: Fix content-type
|
| 450 |
obs = env.step(ApiDebugAction(
|
| 451 |
action_type="submit_fix",
|
| 452 |
target="payment_client",
|
|
@@ -455,9 +737,91 @@ class TestFullEpisode:
|
|
| 455 |
assert obs.issues_fixed == 2
|
| 456 |
assert obs.done is True
|
| 457 |
|
| 458 |
-
# Grade
|
| 459 |
score = env.grade()
|
| 460 |
-
assert score > 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
|
| 462 |
|
| 463 |
if __name__ == "__main__":
|
|
|
|
| 7 |
Tests cover:
|
| 8 |
- Environment reset and initialization
|
| 9 |
- Action handling (inspect_logs, inspect_config, inspect_endpoint, submit_fix)
|
| 10 |
+
- Multi-dimensional grading rubric
|
| 11 |
+
- Fix validation (strict value matching + partial credit)
|
| 12 |
- Episode termination conditions
|
| 13 |
- Repeated inspection penalty
|
| 14 |
+
- Seed-based reproducibility and issue pool selection
|
| 15 |
+
- Dynamic state: service health, cascading failures, dynamic logs
|
| 16 |
+
- Strategy scoring
|
| 17 |
"""
|
| 18 |
|
| 19 |
import sys
|
|
|
|
| 62 |
s = get_scenario("hard")
|
| 63 |
assert len(s.issues) == 5
|
| 64 |
|
| 65 |
+
def test_seed_randomization_reproducible(self):
|
| 66 |
+
"""Same seed should produce same scenario."""
|
| 67 |
s1 = get_scenario("easy", seed=42)
|
| 68 |
s2 = get_scenario("easy", seed=42)
|
| 69 |
+
assert [i.issue_id for i in s1.issues] == [i.issue_id for i in s2.issues]
|
| 70 |
|
| 71 |
+
def test_different_seeds_may_vary(self):
|
| 72 |
+
"""Different seeds should produce potentially different scenarios."""
|
| 73 |
+
s1 = get_scenario("easy", seed=42)
|
| 74 |
+
s2 = get_scenario("easy", seed=99)
|
| 75 |
+
# They might differ (pool has 4 issues, selecting 2)
|
| 76 |
+
# At minimum, they should both be valid
|
| 77 |
+
assert len(s1.issues) == 2
|
| 78 |
+
assert len(s2.issues) == 2
|
| 79 |
|
| 80 |
def test_each_issue_has_log_hint(self):
|
| 81 |
"""Every issue should have a corresponding log hint findable in the logs."""
|
|
|
|
| 92 |
break
|
| 93 |
assert found, f"Issue {issue.issue_id} log_hint '{issue.log_hint}' not found in any logs"
|
| 94 |
|
| 95 |
+
def test_service_graph_exists(self):
|
| 96 |
+
"""Every scenario should have a service dependency graph."""
|
| 97 |
+
for task_id in get_all_task_ids():
|
| 98 |
+
s = get_scenario(task_id)
|
| 99 |
+
assert len(s.service_graph) > 0
|
| 100 |
+
for svc in s.services:
|
| 101 |
+
assert svc in s.service_graph, f"Service {svc} missing from graph in {task_id}"
|
| 102 |
+
|
| 103 |
+
def test_dynamic_logs_defined(self):
|
| 104 |
+
"""Every scenario should have dynamic logs for at least some issues."""
|
| 105 |
+
for task_id in get_all_task_ids():
|
| 106 |
+
s = get_scenario(task_id)
|
| 107 |
+
assert len(s.dynamic_logs) > 0, f"No dynamic logs in {task_id}"
|
| 108 |
+
|
| 109 |
+
def test_optimal_fix_order_defined(self):
|
| 110 |
+
"""Every scenario should have an optimal fix order."""
|
| 111 |
+
for task_id in get_all_task_ids():
|
| 112 |
+
s = get_scenario(task_id)
|
| 113 |
+
assert len(s.optimal_fix_order) > 0
|
| 114 |
+
|
| 115 |
+
def test_issues_have_categories(self):
|
| 116 |
+
"""Every issue should have a category."""
|
| 117 |
+
for task_id in get_all_task_ids():
|
| 118 |
+
s = get_scenario(task_id)
|
| 119 |
+
for issue in s.issues:
|
| 120 |
+
assert issue.category in (
|
| 121 |
+
"configuration", "authentication", "networking", "protocol"
|
| 122 |
+
), f"Issue {issue.issue_id} has invalid category: {issue.category}"
|
| 123 |
+
|
| 124 |
+
def test_context_provided(self):
    """Every scenario must provide non-empty context."""
    for task_id in get_all_task_ids():
        scenario = get_scenario(task_id)
        assert len(scenario.context) > 0
|
| 129 |
+
|
| 130 |
|
| 131 |
# βββ Environment Reset Tests βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 132 |
|
|
|
|
| 163 |
obs = env.reset()
|
| 164 |
assert obs.reward == 0.0
|
| 165 |
|
| 166 |
+
def test_reset_includes_service_status(self):
    """The reset observation must report per-service health status."""
    observation = ApiDebugEnvironment(task_id="easy").reset()
    assert len(observation.service_status) > 0
    assert "payment_client" in observation.service_status
|
| 172 |
+
|
| 173 |
+
def test_reset_includes_dependency_graph(self):
    """The reset observation must carry the service dependency graph."""
    observation = ApiDebugEnvironment(task_id="easy").reset()
    assert len(observation.dependency_graph) > 0
|
| 178 |
+
|
| 179 |
+
def test_reset_includes_error_trace(self):
    """The reset observation must include an initial error trace."""
    observation = ApiDebugEnvironment(task_id="easy").reset()
    assert len(observation.error_trace) > 0
|
| 184 |
+
|
| 185 |
|
| 186 |
# βββ Action Handler Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 187 |
|
|
|
|
| 206 |
target="payment_client",
|
| 207 |
))
|
| 208 |
assert obs.issues_found > 0
|
| 209 |
+
assert obs.reward > 0
|
| 210 |
|
| 211 |
def test_repeated_inspect_logs_no_reward(self):
    """Re-inspecting the same target must pay less than the first look."""
    env = ApiDebugEnvironment(task_id="easy")
    env.reset()
    first = env.step(
        ApiDebugAction(action_type="inspect_logs", target="payment_client")
    )
    second = env.step(
        ApiDebugAction(action_type="inspect_logs", target="payment_client")
    )
    # The second inspection earns no discovery reward, only the step cost.
    assert second.reward < first.reward
|
| 224 |
|
| 225 |
+
def test_dynamic_logs_after_fix(self):
    """After a fix, re-inspecting should surface new dynamic log lines."""
    env = ApiDebugEnvironment(task_id="easy")
    env.reset()
    # Apply the content-type fix first.
    env.step(ApiDebugAction(
        action_type="submit_fix",
        target="payment_client",
        fix_payload={"headers.Content-Type": "application/json"},
    ))
    # A follow-up inspection should now include the original logs plus
    # dynamic entries reflecting the applied fix.
    observation = env.step(ApiDebugAction(
        action_type="inspect_logs",
        target="payment_client",
    ))
    markers = ("application/json", "parsed")
    assert any(
        marker in log.lower() for log in observation.logs for marker in markers
    )
|
| 243 |
+
|
| 244 |
|
| 245 |
class TestInspectConfig:
|
| 246 |
"""Test inspect_config action."""
|
|
|
|
| 269 |
assert obs.api_response is not None
|
| 270 |
assert obs.api_response["status"] == "error"
|
| 271 |
|
| 272 |
+
def test_inspect_endpoint_shows_success_after_fix(self):
    """Once every issue is fixed the service must be reported healthy."""
    env = ApiDebugEnvironment(task_id="easy")
    env.reset()
    for payload in (
        {"headers.Authorization": "Bearer valid_token_123"},
        {"headers.Content-Type": "application/json"},
    ):
        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload=dict(payload),
        ))
    # The episode is done at this point; verify the health map was updated.
    assert env._service_health.get("payment_client") == "healthy"
|
| 289 |
+
|
| 290 |
+
def test_inspect_endpoint_shows_category_status_code(self):
    """Endpoint errors must use category-appropriate HTTP status codes."""
    env = ApiDebugEnvironment(task_id="easy")
    env.reset()
    observation = env.step(ApiDebugAction(
        action_type="inspect_endpoint",
        target="payment_client",
    ))
    assert observation.api_response is not None
    # Must be a realistic HTTP error status for the issue category.
    assert observation.api_response["status_code"] in (401, 415, 500, 504)
|
| 301 |
+
|
| 302 |
|
| 303 |
class TestSubmitFix:
|
| 304 |
+
"""Test submit_fix action with value validation and partial credit."""
|
| 305 |
|
| 306 |
def test_correct_fix_accepted(self):
|
|
|
|
| 307 |
env = ApiDebugEnvironment(task_id="easy")
|
| 308 |
env.reset()
|
| 309 |
obs = env.step(ApiDebugAction(
|
|
|
|
| 321 |
obs = env.step(ApiDebugAction(
|
| 322 |
action_type="submit_fix",
|
| 323 |
target="payment_client",
|
| 324 |
+
fix_payload={"headers.Content-Type": "text/xml"},
|
| 325 |
))
|
| 326 |
assert obs.issues_fixed == 0
|
| 327 |
+
|
| 328 |
+
def test_partial_credit_close_value(self):
    """A correct key with a close value should earn partial credit."""
    env = ApiDebugEnvironment(task_id="easy")
    env.reset()
    observation = env.step(ApiDebugAction(
        action_type="submit_fix",
        target="payment_client",
        fix_payload={"headers.Content-Type": "application/xml"},
    ))
    # Shares the "application/" prefix, so the reward beats a full reject.
    assert observation.reward > -0.05
|
| 339 |
|
| 340 |
def test_correct_auth_fix(self):
|
| 341 |
"""Bearer token fix should work with any valid token."""
|
|
|
|
| 372 |
"""Fixing all issues should mark episode as done with completion bonus."""
|
| 373 |
env = ApiDebugEnvironment(task_id="easy")
|
| 374 |
env.reset()
|
|
|
|
| 375 |
env.step(ApiDebugAction(
|
| 376 |
action_type="submit_fix",
|
| 377 |
target="payment_client",
|
| 378 |
fix_payload={"headers.Authorization": "Bearer valid_token_123"},
|
| 379 |
))
|
|
|
|
| 380 |
obs = env.step(ApiDebugAction(
|
| 381 |
action_type="submit_fix",
|
| 382 |
target="payment_client",
|
|
|
|
| 385 |
assert obs.done is True
|
| 386 |
assert obs.issues_fixed == 2
|
| 387 |
|
| 388 |
+
def test_strategy_bonus_for_inspecting_first(self):
    """Fixing after inspecting must out-reward a blind fix."""

    def content_type_fix():
        # Fresh action (and fresh payload dict) for every submission.
        return ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        )

    # Run 1: submit the fix blind, with no prior inspection.
    blind_env = ApiDebugEnvironment(task_id="easy")
    blind_env.reset()
    blind_obs = blind_env.step(content_type_fix())

    # Run 2: inspect the logs first, then submit the identical fix.
    informed_env = ApiDebugEnvironment(task_id="easy")
    informed_env.reset()
    informed_env.step(ApiDebugAction(
        action_type="inspect_logs",
        target="payment_client",
    ))
    informed_obs = informed_env.step(content_type_fix())

    # The informed fix should earn the strategy bonus.
    assert informed_obs.reward > blind_obs.reward
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
# βββ Service Health Tests βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
class TestServiceHealth:
    """Dynamic service-health tracking across fixes."""

    def test_initial_health_reflects_issues(self):
        """A service with open issues starts as error or degraded."""
        observation = ApiDebugEnvironment(task_id="easy").reset()
        assert observation.service_status.get("payment_client") in ("error", "degraded")

    def test_health_updates_after_fix(self):
        """Clearing every issue on a service flips it to healthy."""
        env = ApiDebugEnvironment(task_id="easy")
        env.reset()
        for payload in (
            {"headers.Authorization": "Bearer valid_token_123"},
            {"headers.Content-Type": "application/json"},
        ):
            env.step(ApiDebugAction(
                action_type="submit_fix",
                target="payment_client",
                fix_payload=dict(payload),
            ))
        # Both issues are fixed, so payment_client must now be healthy.
        assert env._service_health.get("payment_client") == "healthy"

    def test_error_trace_updates(self):
        """The error trace shrinks as issues are fixed."""
        env = ApiDebugEnvironment(task_id="easy")
        baseline = len(env.reset().error_trace)

        env.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload={"headers.Content-Type": "application/json"},
        ))
        assert len(env._build_error_trace()) < baseline
|
| 458 |
+
|
| 459 |
|
| 460 |
# βββ Grading Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 461 |
|
| 462 |
|
| 463 |
class TestGrading:
|
| 464 |
+
"""Test the multi-dimensional grading rubric."""
|
| 465 |
|
| 466 |
def test_grade_no_fixes_is_low(self):
    """With no fixes the grade is low but nonzero (exploration counts)."""
    env = ApiDebugEnvironment(task_id="easy")
    env.reset()
    env.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
    # Exploration and efficiency earn partial credit even without fixes.
    assert 0.0 < env.grade() < 0.5
|
| 473 |
|
| 474 |
def test_grade_all_fixes_is_high(self):
|
| 475 |
"""Grade with all fixes should be high."""
|
| 476 |
env = ApiDebugEnvironment(task_id="easy")
|
| 477 |
env.reset()
|
| 478 |
+
env.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
|
| 479 |
+
env.step(ApiDebugAction(action_type="inspect_config", target="payment_client"))
|
| 480 |
env.step(ApiDebugAction(
|
| 481 |
action_type="submit_fix",
|
| 482 |
target="payment_client",
|
|
|
|
| 488 |
fix_payload={"headers.Content-Type": "application/json"},
|
| 489 |
))
|
| 490 |
score = env.grade()
|
| 491 |
+
assert score > 0.6
|
| 492 |
|
| 493 |
def test_grade_strictly_between_0_and_1(self):
|
| 494 |
"""Grade must be strictly in (0, 1), never exactly 0.0 or 1.0."""
|
|
|
|
| 499 |
assert 0.0 < score < 1.0, f"Score for {task_id} was {score}"
|
| 500 |
|
| 501 |
def test_efficiency_bonus(self):
|
| 502 |
+
"""Faster solutions with same fix count should score higher efficiency component."""
|
| 503 |
+
# Both inspect then fix (same strategy), but one uses more steps
|
| 504 |
env1 = ApiDebugEnvironment(task_id="easy")
|
| 505 |
env1.reset()
|
| 506 |
+
env1.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
|
| 507 |
env1.step(ApiDebugAction(
|
| 508 |
action_type="submit_fix",
|
| 509 |
target="payment_client",
|
|
|
|
| 511 |
))
|
| 512 |
score_fast = env1.grade()
|
| 513 |
|
|
|
|
| 514 |
env2 = ApiDebugEnvironment(task_id="easy")
|
| 515 |
env2.reset()
|
| 516 |
+
env2.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
|
| 517 |
for _ in range(10):
|
| 518 |
+
env2.step(ApiDebugAction(action_type="inspect_logs", target="payment_gateway"))
|
| 519 |
env2.step(ApiDebugAction(
|
| 520 |
action_type="submit_fix",
|
| 521 |
target="payment_client",
|
|
|
|
| 525 |
|
| 526 |
assert score_fast > score_slow, f"Fast={score_fast} should beat Slow={score_slow}"
|
| 527 |
|
| 528 |
+
def test_strategy_affects_grade(self):
    """Inspecting before fixing should not hurt the final grade."""
    fixes = (
        {"headers.Authorization": "Bearer token"},
        {"headers.Content-Type": "application/json"},
    )

    # Run 1: submit both fixes blind, without any inspection.
    blind = ApiDebugEnvironment(task_id="easy")
    blind.reset()
    for payload in fixes:
        blind.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload=dict(payload),
        ))
    score_no_inspect = blind.grade()

    # Run 2: inspect logs and config first, then submit the same fixes.
    informed = ApiDebugEnvironment(task_id="easy")
    informed.reset()
    informed.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
    informed.step(ApiDebugAction(action_type="inspect_config", target="payment_client"))
    for payload in fixes:
        informed.step(ApiDebugAction(
            action_type="submit_fix",
            target="payment_client",
            fix_payload=dict(payload),
        ))
    score_with_inspect = informed.grade()

    # Both runs should score decently; the strategy bonus should keep the
    # inspecting run at least close to the blind one.
    assert score_with_inspect >= score_no_inspect * 0.9
|
| 564 |
+
|
| 565 |
+
def test_grade_dimensions_nonzero(self):
    """All grading dimensions must be computable after a partial run."""
    env = ApiDebugEnvironment(task_id="easy")
    env.reset()
    env.step(ApiDebugAction(action_type="inspect_logs", target="payment_client"))
    env.step(ApiDebugAction(
        action_type="submit_fix",
        target="payment_client",
        fix_payload={"headers.Content-Type": "application/json"},
    ))
    # A partial fix plus exploration must yield a nonzero score.
    assert env.grade() > 0.001
|
| 577 |
+
|
| 578 |
|
| 579 |
# βββ Episode Termination Tests ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 580 |
|
|
|
|
| 585 |
def test_out_of_steps_ends_episode(self):
|
| 586 |
env = ApiDebugEnvironment(task_id="easy")
|
| 587 |
env.reset()
|
|
|
|
| 588 |
for _ in range(15):
|
| 589 |
obs = env.step(ApiDebugAction(
|
| 590 |
action_type="inspect_logs",
|
|
|
|
| 621 |
def test_numeric_exact(self):
    """Identical numeric values must match."""
    assert self.env._values_match(10, 10)
|
| 623 |
|
| 624 |
+
def test_numeric_tolerance_tight(self):
    """A 10% tolerance around 10 accepts 10 and 9.5 but rejects 8."""
    assert self.env._values_match(10, 10)     # exact
    assert self.env._values_match(10, 9.5)    # 5% off, inside tolerance
    assert not self.env._values_match(10, 8)  # 20% off, outside tolerance
|
| 629 |
|
| 630 |
def test_boolean_match(self):
|
| 631 |
assert self.env._values_match(True, True)
|
|
|
|
| 648 |
assert not self.env._values_match(10, 100)
|
| 649 |
|
| 650 |
|
| 651 |
+
class TestPartialCredit:
    """Partial-credit checks via _values_close and _check_fix."""

    def setup_method(self):
        self.env = ApiDebugEnvironment(task_id="easy")

    @staticmethod
    def _timeout_issue():
        # Shared fixture: an issue whose only expected fix is timeout=10.
        return Issue(
            issue_id="test",
            service="test_svc",
            description="test",
            expected_fix={"timeout": 10},
            fix_key="timeout",
            log_hint="test",
        )

    def test_numeric_close(self):
        """Numbers within 50% count as close; far-off values do not."""
        assert self.env._values_close(10, 7)
        assert not self.env._values_close(10, 100)

    def test_string_same_prefix(self):
        """Strings sharing a prefix count as close."""
        assert self.env._values_close("application/json", "application/xml")

    def test_check_fix_returns_partial(self):
        """Right key with a close value yields 'partial'."""
        outcome = self.env._check_fix(self._timeout_issue(), {"timeout": 7})
        assert outcome == "partial"

    def test_check_fix_returns_exact(self):
        """Right key with the exact value yields 'exact'."""
        outcome = self.env._check_fix(self._timeout_issue(), {"timeout": 10})
        assert outcome == "exact"

    def test_check_fix_returns_none(self):
        """An unrelated key yields 'none'."""
        outcome = self.env._check_fix(
            self._timeout_issue(), {"base_url": "http://example.com"}
        )
        assert outcome == "none"
|
| 700 |
+
|
| 701 |
+
|
| 702 |
+
# βββ Integration Tests ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 703 |
|
| 704 |
|
| 705 |
class TestFullEpisode:
|
| 706 |
+
"""Test complete episode flows."""
|
| 707 |
|
| 708 |
def test_easy_full_solve(self):
|
| 709 |
"""Run a complete easy episode from start to finish."""
|
| 710 |
env = ApiDebugEnvironment(task_id="easy")
|
| 711 |
obs = env.reset()
|
| 712 |
|
|
|
|
| 713 |
obs = env.step(ApiDebugAction(
|
| 714 |
action_type="inspect_logs",
|
| 715 |
target="payment_client",
|
| 716 |
))
|
| 717 |
assert obs.issues_found >= 1
|
| 718 |
|
|
|
|
| 719 |
obs = env.step(ApiDebugAction(
|
| 720 |
action_type="inspect_config",
|
| 721 |
target="payment_client",
|
| 722 |
))
|
| 723 |
assert "headers" in obs.config_snapshot
|
| 724 |
|
|
|
|
| 725 |
obs = env.step(ApiDebugAction(
|
| 726 |
action_type="submit_fix",
|
| 727 |
target="payment_client",
|
|
|
|
| 729 |
))
|
| 730 |
assert obs.issues_fixed >= 1
|
| 731 |
|
|
|
|
| 732 |
obs = env.step(ApiDebugAction(
|
| 733 |
action_type="submit_fix",
|
| 734 |
target="payment_client",
|
|
|
|
| 737 |
assert obs.issues_fixed == 2
|
| 738 |
assert obs.done is True
|
| 739 |
|
|
|
|
| 740 |
score = env.grade()
|
| 741 |
+
assert score > 0.6
|
| 742 |
+
|
| 743 |
+
def test_medium_full_solve(self):
    """Drive a medium episode from reset through all three fixes."""
    env = ApiDebugEnvironment(task_id="medium")
    obs = env.reset()
    assert obs.issues_total == 3

    # Survey: inspect the logs of every available service.
    for svc in obs.available_targets:
        obs = env.step(ApiDebugAction(
            action_type="inspect_logs", target=svc,
        ))

    # Look at the sender's configuration.
    obs = env.step(ApiDebugAction(
        action_type="inspect_config", target="webhook_sender",
    ))

    # Fix 1: rate limit.
    obs = env.step(ApiDebugAction(
        action_type="submit_fix",
        target="webhook_sender",
        fix_payload={"rate_limit.requests_per_second": 10},
    ))
    assert obs.issues_fixed >= 1

    # Fix 2: retry policy.
    obs = env.step(ApiDebugAction(
        action_type="submit_fix",
        target="webhook_sender",
        fix_payload={"retry": {"max_retries": 3, "backoff_factor": 2, "retry_on_status": [429, 500]}},
    ))

    # Fix 3: webhook signature.
    obs = env.step(ApiDebugAction(
        action_type="submit_fix",
        target="webhook_sender",
        fix_payload={"headers.X-Webhook-Signature": "sha256=computed_hmac"},
    ))

    assert obs.done is True
    assert env.grade() > 0.4
|
| 785 |
+
|
| 786 |
+
def test_hard_partial_solve(self):
    """A partial solve of the hard task earns strictly partial credit."""
    env = ApiDebugEnvironment(task_id="hard")
    obs = env.reset()
    assert obs.issues_total == 5

    # Repair only two of the five issues.
    env.step(ApiDebugAction(action_type="inspect_logs", target="order_service"))
    env.step(ApiDebugAction(
        action_type="submit_fix",
        target="order_service",
        fix_payload={"inventory_url": "https://inventory.internal/v2/reserve"},
    ))
    env.step(ApiDebugAction(
        action_type="submit_fix",
        target="order_service",
        fix_payload={"timeout": 10},
    ))

    score = env.grade()
    assert 0.0 < score < 0.999
    assert len(env._issues_fixed) == 2
|
| 808 |
+
|
| 809 |
+
|
| 810 |
+
class TestCascadingFailures:
    """Cascading-failure dynamics between dependent issues."""

    def test_hard_dependency_chain(self):
        """In the hard scenario, hard_timeout depends on hard_wrong_url."""
        scenario = get_scenario("hard")
        timeout_issue = next(
            issue for issue in scenario.issues if issue.issue_id == "hard_timeout"
        )
        assert "hard_wrong_url" in timeout_issue.depends_on

    def test_cascade_effects_defined(self):
        """Each scenario must have at least one issue with cascade effects."""
        for task_id in get_all_task_ids():
            scenario = get_scenario(task_id)
            has_cascade = any(
                len(issue.cascade_effects) > 0 for issue in scenario.issues
            )
            assert has_cascade, f"No cascade effects in {task_id}"
|
| 825 |
|
| 826 |
|
| 827 |
if __name__ == "__main__":
|