Spaces:
Sleeping
Sleeping
Deploy OpenEnv Submission
Browse files- Dockerfile +17 -0
- README.md +153 -6
- app/__init__.py +1 -0
- app/main.py +130 -0
- app/models.py +33 -0
- app/tasks/__init__.py +12 -0
- app/tasks/base.py +91 -0
- app/tasks/task_easy.py +88 -0
- app/tasks/task_hard.py +122 -0
- app/tasks/task_medium.py +94 -0
- inference.py +181 -0
- inference_local.py +193 -0
- openenv.yaml +62 -0
- requirements.txt +7 -0
Dockerfile
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

# Don't write .pyc files into the image layer, and keep stdout/stderr
# unbuffered so container logs stream live (important on HF Spaces).
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Set working directory
WORKDIR /app

# Install dependencies first (layer caching)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy project files
COPY . .

# HuggingFace Spaces expects port 7860
EXPOSE 7860

# Start FastAPI server
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,157 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
-
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Python Bug Fixer OpenEnv
|
| 3 |
+
emoji: 🐛
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
---
|
| 9 |
|
| 10 |
+
# Python Bug Fixer — OpenEnv
|
| 11 |
+
|
| 12 |
+
An OpenEnv-compliant environment where an AI agent must identify and fix bugs
|
| 13 |
+
in Python code to produce correct program output. Simulates real-world
|
| 14 |
+
software debugging and code review workflows.
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## Environment Description
|
| 19 |
+
|
| 20 |
+
The agent receives a buggy Python code snippet along with a description of
|
| 21 |
+
expected behavior. The agent's action is to return the corrected Python code.
|
| 22 |
+
The environment executes the code and rewards the agent based on how many
|
| 23 |
+
expected output lines are produced correctly.
|
| 24 |
+
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
## Observation Space
|
| 28 |
+
|
| 29 |
+
**Type:** Text
|
| 30 |
+
|
| 31 |
+
The observation contains:
|
| 32 |
+
- Task description and difficulty
|
| 33 |
+
- Expected stdout output (ground truth)
|
| 34 |
+
- The buggy Python code to fix
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## Action Space
|
| 39 |
+
|
| 40 |
+
**Type:** Text
|
| 41 |
+
|
| 42 |
+
The action is raw Python code (no markdown, no code fences).
|
| 43 |
+
It must be valid Python that can be executed with `python3`.
|
| 44 |
+
|
| 45 |
+
---
|
| 46 |
+
|
| 47 |
+
## Tasks
|
| 48 |
+
|
| 49 |
+
| Task ID | Name | Difficulty | Bugs | Max Steps |
|
| 50 |
+
|---------|------|-----------|------|-----------|
|
| 51 |
+
| `task_easy` | Fix Index Errors | Easy | 2 | 5 |
|
| 52 |
+
| `task_medium` | Fix Binary Search | Medium | 2 | 5 |
|
| 53 |
+
| `task_hard` | Fix DataProcessor Class | Hard | 3 | 7 |
|
| 54 |
+
|
| 55 |
+
### Reward Function
|
| 56 |
+
- Reward ∈ [0.0, 1.0]
|
| 57 |
+
- Each expected output line is worth `1 / N` reward
|
| 58 |
+
- Partial credit awarded for partially correct fixes
|
| 59 |
+
- Code that crashes with a runtime error earns half-scaled credit for expected lines printed before the crash (0.1 floor if none matched)
|
| 60 |
+
|
| 61 |
+
---
|
| 62 |
+
|
| 63 |
+
## Setup & Run Locally
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
# 1. Install dependencies
|
| 67 |
+
pip install -r requirements.txt
|
| 68 |
+
|
| 69 |
+
# 2. Start the server
|
| 70 |
+
uvicorn app.main:app --host 0.0.0.0 --port 7860
|
| 71 |
+
|
| 72 |
+
# 3. Test endpoints
|
| 73 |
+
curl http://localhost:7860/health
|
| 74 |
+
curl http://localhost:7860/tasks
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
---
|
| 78 |
+
|
| 79 |
+
## Run Inference
|
| 80 |
+
|
| 81 |
+
```bash
|
| 82 |
+
export API_BASE_URL="https://api-inference.huggingface.co/v1"
|
| 83 |
+
export MODEL_NAME="meta-llama/Meta-Llama-3-8B-Instruct"
|
| 84 |
+
export HF_TOKEN="hf_YOUR_TOKEN_HERE"
|
| 85 |
+
export SPACE_URL="https://YOUR_USERNAME-python-bug-fixer.hf.space"
|
| 86 |
+
|
| 87 |
+
python inference.py
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
Expected output format:
|
| 91 |
+
```
|
| 92 |
+
[START] {"task_id": "task_easy", "session_id": "...", "model": "...", "timestamp": "..."}
|
| 93 |
+
[STEP] {"step": 1, "reward": 1.0, "done": true, ...}
|
| 94 |
+
[END] {"task_id": "task_easy", "total_reward": 1.0, "steps": 1, "success": true, ...}
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
---
|
| 98 |
+
|
| 99 |
+
## API Reference
|
| 100 |
+
|
| 101 |
+
### `POST /reset`
|
| 102 |
+
Start a new episode.
|
| 103 |
+
```json
|
| 104 |
+
Request: { "task_id": "task_easy" }
|
| 105 |
+
Response: { "session_id": "...", "task_id": "...", "observation": "...", "info": {} }
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
### `POST /step`
|
| 109 |
+
Submit fixed code as an action.
|
| 110 |
+
```json
|
| 111 |
+
Request: { "session_id": "...", "action": "def get_last_element(lst): ..." }
|
| 112 |
+
Response: { "observation": "...", "reward": 1.0, "done": true, "info": {} }
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
### `GET /state?session_id=...`
|
| 116 |
+
Get current episode state without advancing.
|
| 117 |
+
```json
|
| 118 |
+
Response: { "session_id": "...", "task_id": "...", "steps": 1, "done": true, "current_observation": "..." }
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
### `GET /tasks`
|
| 122 |
+
List all available tasks and metadata.
|
| 123 |
+
|
| 124 |
+
### `GET /health`
|
| 125 |
+
Returns `{"status": "ok"}`.
|
| 126 |
+
|
| 127 |
+
---
|
| 128 |
+
|
| 129 |
+
## Docker
|
| 130 |
+
|
| 131 |
+
```bash
|
| 132 |
+
docker build -t python-bug-fixer .
|
| 133 |
+
docker run -p 7860:7860 python-bug-fixer
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
---
|
| 137 |
+
|
| 138 |
+
## Project Structure
|
| 139 |
+
|
| 140 |
+
```
|
| 141 |
+
my-openenv/
|
| 142 |
+
├── inference.py # Baseline inference script (root — required)
|
| 143 |
+
├── openenv.yaml # OpenEnv specification
|
| 144 |
+
├── Dockerfile # Container definition
|
| 145 |
+
├── requirements.txt # Python dependencies
|
| 146 |
+
├── README.md
|
| 147 |
+
└── app/
|
| 148 |
+
├── __init__.py
|
| 149 |
+
├── main.py # FastAPI server (reset/step/state endpoints)
|
| 150 |
+
├── models.py # Pydantic request/response models
|
| 151 |
+
└── tasks/
|
| 152 |
+
├── __init__.py # Task registry
|
| 153 |
+
├── base.py # BaseTask + safe code runner
|
| 154 |
+
├── task_easy.py # Easy task (2 index bugs)
|
| 155 |
+
├── task_medium.py # Medium task (2 binary search bugs)
|
| 156 |
+
└── task_hard.py # Hard task (3 DataProcessor bugs)
|
| 157 |
+
```
|
app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# app package
|
app/main.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Python Bug Fixer — OpenEnv-compliant FastAPI server.
|
| 3 |
+
|
| 4 |
+
Endpoints
|
| 5 |
+
---------
|
| 6 |
+
GET / → health + metadata
|
| 7 |
+
GET /health → {"status": "ok"}
|
| 8 |
+
GET /tasks → list all task IDs
|
| 9 |
+
POST /reset → start new episode body: {"task_id": "task_easy"}
|
| 10 |
+
POST /step → submit action body: {"session_id": "...", "action": "..."}
|
| 11 |
+
GET /state → current episode state ?session_id=...
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from fastapi import FastAPI, HTTPException
|
| 15 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 16 |
+
from app.models import ResetRequest, ResetResponse, StepRequest, StepResponse, StateResponse
|
| 17 |
+
from app.tasks import TASKS
|
| 18 |
+
import uuid
|
| 19 |
+
|
| 20 |
+
app = FastAPI(
|
| 21 |
+
title="Python Bug Fixer OpenEnv",
|
| 22 |
+
description="OpenEnv environment: agent fixes bugs in Python code.",
|
| 23 |
+
version="1.0.0",
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Allow external services to call the OpenEnv API
|
| 27 |
+
app.add_middleware(
|
| 28 |
+
CORSMiddleware,
|
| 29 |
+
allow_origins=["*"],
|
| 30 |
+
allow_credentials=True,
|
| 31 |
+
allow_methods=["*"],
|
| 32 |
+
allow_headers=["*"],
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
# In-memory session store {session_id: {"task_id": str, "task": BaseTask}}
|
| 36 |
+
sessions: dict = {}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ── Health / metadata ──────────────────────────────────────────────────────────
|
| 40 |
+
|
| 41 |
+
@app.get("/")
def root():
    """Service metadata: liveness flag, environment name/version, and task IDs."""
    payload = {"status": "ok", "name": "python-bug-fixer", "version": "1.0.0"}
    payload["tasks"] = list(TASKS.keys())
    return payload
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@app.get("/health")
def health():
    """Liveness probe used by the hosting platform; always reports ok."""
    return dict(status="ok")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@app.get("/tasks")
def list_tasks():
    """List every registered task with the info metadata its reset() reports."""
    # Each registered class is instantiated and reset once, purely to read
    # the "info" dict it exposes; the throwaway instance is then discarded.
    return {
        "tasks": [
            {"task_id": tid, "info": cls().reset().get("info", {})}
            for tid, cls in TASKS.items()
        ]
    }
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ── Core OpenEnv endpoints ─────────────────────────────────────────────────────
|
| 70 |
+
|
| 71 |
+
@app.post("/reset", response_model=ResetResponse)
def reset(req: ResetRequest):
    """Create a fresh episode for `req.task_id` and return its first observation."""
    task_cls = TASKS.get(req.task_id)
    if task_cls is None:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown task_id '{req.task_id}'. Valid: {list(TASKS.keys())}",
        )

    task = task_cls()
    first = task.reset()
    new_id = str(uuid.uuid4())
    # Record the live episode so /step and /state can look it up later.
    sessions[new_id] = {"task_id": req.task_id, "task": task}

    return ResetResponse(
        session_id=new_id,
        task_id=req.task_id,
        observation=first["observation"],
        info=first.get("info", {}),
    )
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@app.post("/step", response_model=StepRequest if False else StepResponse)
def step(req: StepRequest):
    """Run one environment step with the submitted action (candidate fixed code)."""
    session = sessions.get(req.session_id)
    if session is None:
        raise HTTPException(status_code=404, detail="Session not found. Call /reset first.")

    outcome = session["task"].step(req.action)

    # Finished episodes are dropped immediately so the in-memory store
    # cannot grow without bound.
    if outcome.get("done"):
        sessions.pop(req.session_id)

    return StepResponse(
        observation=outcome["observation"],
        reward=outcome["reward"],
        done=outcome["done"],
        info=outcome.get("info", {}),
    )
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@app.get("/state", response_model=StateResponse)
def state(session_id: str):
    """Read-only view of an episode: step count, done flag, latest observation."""
    try:
        entry = sessions[session_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Session not found.")

    snapshot = entry["task"].state()
    return StateResponse(
        session_id=session_id,
        task_id=entry["task_id"],
        steps=snapshot["steps"],
        done=snapshot["done"],
        current_observation=snapshot["current_observation"],
    )
|
app/models.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic request/response schemas for the OpenEnv HTTP API."""

from pydantic import BaseModel
from typing import Any, Dict, Optional


class ResetRequest(BaseModel):
    """Body of POST /reset."""
    # Must be one of the IDs registered in app.tasks.TASKS.
    task_id: str = "task_easy"


class ResetResponse(BaseModel):
    """Response of POST /reset: a new session and its first observation."""
    session_id: str
    task_id: str
    observation: str
    info: Dict[str, Any] = {}


class StepRequest(BaseModel):
    """Body of POST /step."""
    session_id: str
    # Raw Python source submitted as the agent's fix.
    action: str


class StepResponse(BaseModel):
    """Response of POST /step: feedback, reward in [0, 1], and done flag."""
    observation: str
    reward: float
    done: bool
    info: Dict[str, Any] = {}


class StateResponse(BaseModel):
    """Response of GET /state: episode progress without advancing it."""
    session_id: str
    task_id: str
    steps: int
    done: bool
    current_observation: str
|
app/tasks/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.tasks.task_easy import TaskEasy
from app.tasks.task_medium import TaskMedium
from app.tasks.task_hard import TaskHard

# Registry — maps task_id → task class. Insertion order is observable:
# it is the order tasks are listed by GET / and GET /tasks.
TASKS = {
    "task_easy": TaskEasy,
    "task_medium": TaskMedium,
    "task_hard": TaskHard,
}

__all__ = ["TASKS", "TaskEasy", "TaskMedium", "TaskHard"]
|
app/tasks/base.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
import sys
import tempfile
import os
from abc import ABC, abstractmethod
from typing import Dict, Any, List


class BaseTask(ABC):
    """
    Base class for all OpenEnv tasks.
    Every task must implement reset() and step().
    """

    def __init__(self):
        # Episode bookkeeping shared by every task.
        self.current_step: int = 0
        self.done: bool = False
        self.current_observation: str = ""

    @abstractmethod
    def reset(self) -> Dict[str, Any]:
        """
        Reset the environment and return the initial observation.
        Returns dict with keys: observation, info
        """

    @abstractmethod
    def step(self, action: str) -> Dict[str, Any]:
        """
        Take an action and return (observation, reward, done, info).
        Returns dict with keys: observation, reward, done, info
        """

    def state(self) -> Dict[str, Any]:
        """Return the current state of the environment."""
        return {
            "steps": self.current_step,
            "done": self.done,
            "current_observation": self.current_observation,
        }

    @staticmethod
    def _count_matches(stdout: str, expected_outputs: List[str]) -> int:
        """Number of expected substrings that appear in the captured stdout."""
        return sum(1 for e in expected_outputs if e in stdout)

    def _run_code_safely(self, code: str, expected_outputs: List[str]) -> float:
        """
        Execute code in a subprocess and check how many expected outputs
        appear in stdout. Returns a reward in [0.0, 1.0].

        Partial credit: each matching expected string = 1/N reward.
        A crashing program earns half-scaled credit for the lines it
        printed before failing (0.1 floor if none of them matched).
        """
        # Guard: with no expected outputs there is nothing to grade
        # (also prevents a ZeroDivisionError below).
        if not expected_outputs:
            return 0.0

        tmp_path = None
        try:
            # Write code to a temp file; delete=False so the interpreter
            # subprocess can open it after this handle is closed.
            with tempfile.NamedTemporaryFile(
                mode="w", suffix=".py", delete=False, encoding="utf-8"
            ) as f:
                f.write(code)
                tmp_path = f.name

            # Run with a timeout to contain infinite loops in submitted code.
            result = subprocess.run(
                [sys.executable, tmp_path],
                capture_output=True,
                text=True,
                timeout=10,
            )

            correct = self._count_matches(result.stdout, expected_outputs)

            # Non-zero exit = runtime error, but partial credit is possible
            # for output produced before the crash.
            if result.returncode != 0:
                if correct == 0:
                    return 0.1  # tiny credit for at least running
                return round(correct / len(expected_outputs) * 0.5, 2)

            return round(correct / len(expected_outputs), 2)

        except subprocess.TimeoutExpired:
            return 0.0
        except Exception:
            # Any sandbox/OS failure is treated as a zero-reward attempt.
            return 0.0
        finally:
            if tmp_path and os.path.exists(tmp_path):
                try:
                    os.unlink(tmp_path)
                except OSError:
                    pass
|
app/tasks/task_easy.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.tasks.base import BaseTask
|
| 2 |
+
from typing import Dict, Any
|
| 3 |
+
|
| 4 |
+
# ── Buggy code the agent must fix ──────────────────────────────────────────────
|
| 5 |
+
BUGGY_CODE = '''\
|
| 6 |
+
def get_last_element(lst):
|
| 7 |
+
# BUG 1: len(lst) is out of range — should be len(lst) - 1
|
| 8 |
+
return lst[len(lst)]
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def compute_sum(numbers):
|
| 12 |
+
total = 0
|
| 13 |
+
# BUG 2: range(len(numbers) + 1) goes one too far → IndexError
|
| 14 |
+
for i in range(len(numbers) + 1):
|
| 15 |
+
total += numbers[i]
|
| 16 |
+
return total
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
result = get_last_element([1, 2, 3, 4, 5])
|
| 20 |
+
print(result)
|
| 21 |
+
|
| 22 |
+
total = compute_sum([10, 20, 30])
|
| 23 |
+
print(total)
|
| 24 |
+
'''
|
| 25 |
+
|
| 26 |
+
# ── What the agent sees ────────────────────────────────────────────────────────
|
| 27 |
+
DESCRIPTION = """\
|
| 28 |
+
=== TASK: Fix Index Errors (EASY) ===
|
| 29 |
+
|
| 30 |
+
The code below contains 2 bugs — both are off-by-one index errors.
|
| 31 |
+
Fix them so the program runs without errors and prints the expected output.
|
| 32 |
+
|
| 33 |
+
Expected output (exactly 2 lines):
|
| 34 |
+
5
|
| 35 |
+
60
|
| 36 |
+
|
| 37 |
+
Buggy code:
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
TASK_INFO = {
|
| 41 |
+
"task_id": "task_easy",
|
| 42 |
+
"difficulty": "easy",
|
| 43 |
+
"num_bugs": 2,
|
| 44 |
+
"max_steps": 5,
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
# Expected strings that must appear in stdout to earn full reward
|
| 48 |
+
EXPECTED_OUTPUTS = ["5\n", "60\n"]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class TaskEasy(BaseTask):
    """
    Easy task: fix two off-by-one index errors.
    Reward: 0.5 per correct output line → max 1.0
    """

    def reset(self) -> Dict[str, Any]:
        # Fresh episode: clear counters and show the prompt plus buggy code.
        self.current_step = 0
        self.done = False
        self.current_observation = DESCRIPTION + BUGGY_CODE
        return {"observation": self.current_observation, "info": TASK_INFO}

    def step(self, action: str) -> Dict[str, Any]:
        self.current_step += 1
        reward = self._run_code_safely(action, EXPECTED_OUTPUTS)

        # Episode ends on a perfect fix or once the step budget is spent.
        if reward >= 1.0 or self.current_step >= TASK_INFO["max_steps"]:
            self.done = True

        if reward >= 1.0:
            obs = f"✓ All outputs correct! Reward: {reward:.2f}. Task complete."
        elif reward > 0.0:
            obs = f"Partial credit. Reward: {reward:.2f}. Some outputs still wrong. Try again."
        else:
            obs = f"Code failed to run or produced wrong output. Reward: {reward:.2f}. Try again."

        self.current_observation = obs
        return {
            "observation": obs,
            "reward": reward,
            "done": self.done,
            "info": {"step": self.current_step},
        }
|
app/tasks/task_hard.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.tasks.base import BaseTask
|
| 2 |
+
from typing import Dict, Any
|
| 3 |
+
|
| 4 |
+
# ── Buggy code ─────────────────────────────────────────────────────────────────
|
| 5 |
+
BUGGY_CODE = '''\
|
| 6 |
+
class DataProcessor:
|
| 7 |
+
"""Processes a list of employee records."""
|
| 8 |
+
|
| 9 |
+
def __init__(self):
|
| 10 |
+
self.data = []
|
| 11 |
+
|
| 12 |
+
def add_record(self, record: dict):
|
| 13 |
+
self.data.append(record)
|
| 14 |
+
|
| 15 |
+
def get_average(self, field: str) -> float:
|
| 16 |
+
"""Return the average value of a numeric field."""
|
| 17 |
+
if not self.data:
|
| 18 |
+
return 0.0
|
| 19 |
+
return sum(r[field] for r in self.data) / len(self.data)
|
| 20 |
+
|
| 21 |
+
def filter_records(self, field: str, value):
|
| 22 |
+
"""Return all records where record[field] == value."""
|
| 23 |
+
# BUG 1: single = is assignment, not comparison — SyntaxError
|
| 24 |
+
return [r for r in self.data if r[field] = value]
|
| 25 |
+
|
| 26 |
+
def get_sorted(self, field: str, reverse: bool = False):
|
| 27 |
+
"""Return records sorted by field. reverse=True means descending."""
|
| 28 |
+
# BUG 2: reverse logic is inverted — passing `not reverse` flips the sort
|
| 29 |
+
return sorted(self.data, key=lambda x: x[field], reverse=not reverse)
|
| 30 |
+
|
| 31 |
+
def get_max(self, field: str) -> dict:
|
| 32 |
+
"""Return the record with the highest value for field."""
|
| 33 |
+
# BUG 3: "field" is a string literal, not the variable — always uses key "field"
|
| 34 |
+
return max(self.data, key=lambda x: x["field"])
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# ── Test harness ────────────────────────────────────────────────────────────────
|
| 38 |
+
p = DataProcessor()
|
| 39 |
+
p.add_record({"name": "Alice", "score": 85})
|
| 40 |
+
p.add_record({"name": "Bob", "score": 92})
|
| 41 |
+
p.add_record({"name": "Charlie", "score": 78})
|
| 42 |
+
|
| 43 |
+
print(round(p.get_average("score"), 1)) # Expected: 85.0
|
| 44 |
+
print(len(p.filter_records("name", "Alice"))) # Expected: 1
|
| 45 |
+
print(p.get_sorted("score", reverse=True)[0]["name"]) # Expected: Bob
|
| 46 |
+
print(p.get_max("score")["name"]) # Expected: Bob
|
| 47 |
+
'''
|
| 48 |
+
|
| 49 |
+
DESCRIPTION = """\
|
| 50 |
+
=== TASK: Fix DataProcessor Class (HARD) ===
|
| 51 |
+
|
| 52 |
+
The DataProcessor class below has exactly 3 bugs — one in each of three methods.
|
| 53 |
+
Fix all 3 bugs so the test harness at the bottom runs and prints the expected output.
|
| 54 |
+
|
| 55 |
+
Hint — methods with bugs:
|
| 56 |
+
filter_records → SyntaxError
|
| 57 |
+
get_sorted → wrong sort direction
|
| 58 |
+
get_max → wrong key lookup
|
| 59 |
+
|
| 60 |
+
Expected output (exactly 4 lines):
|
| 61 |
+
85.0
|
| 62 |
+
1
|
| 63 |
+
Bob
|
| 64 |
+
Bob
|
| 65 |
+
|
| 66 |
+
Buggy code:
|
| 67 |
+
"""
|
| 68 |
+
|
| 69 |
+
TASK_INFO = {
|
| 70 |
+
"task_id": "task_hard",
|
| 71 |
+
"difficulty": "hard",
|
| 72 |
+
"num_bugs": 3,
|
| 73 |
+
"max_steps": 7,
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
EXPECTED_OUTPUTS = ["85.0\n", "1\n", "Bob\nBob\n"]
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class TaskHard(BaseTask):
    """
    Hard task: fix 3 bugs in a DataProcessor class (one per method).
    Reward: partial credit per correct output.
    """

    def reset(self) -> Dict[str, Any]:
        # Fresh episode: clear counters and present prompt + buggy class.
        self.current_step = 0
        self.done = False
        self.current_observation = DESCRIPTION + BUGGY_CODE
        return {"observation": self.current_observation, "info": TASK_INFO}

    def step(self, action: str) -> Dict[str, Any]:
        self.current_step += 1
        reward = self._run_code_safely(action, EXPECTED_OUTPUTS)

        # Done on a perfect score or when the step budget runs out.
        if reward >= 1.0 or self.current_step >= TASK_INFO["max_steps"]:
            self.done = True

        if reward >= 1.0:
            obs = f"✓ All 3 bugs fixed! Perfect output. Reward: {reward:.2f}. Task complete."
        elif reward > 0.0:
            obs = (
                f"Partial fix. Reward: {reward:.2f}. "
                f"Some outputs still wrong — check all 3 buggy methods."
            )
        else:
            obs = (
                f"Code failed or produced wrong output. Reward: {reward:.2f}. "
                f"Look for SyntaxError in filter_records first."
            )

        self.current_observation = obs
        return {
            "observation": obs,
            "reward": reward,
            "done": self.done,
            "info": {"step": self.current_step},
        }
|
app/tasks/task_medium.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.tasks.base import BaseTask
|
| 2 |
+
from typing import Dict, Any
|
| 3 |
+
|
| 4 |
+
# ── Buggy code ─────────────────────────────────────────────────────────────────
|
| 5 |
+
BUGGY_CODE = '''\
|
| 6 |
+
def binary_search(arr, target):
|
| 7 |
+
# BUG 1: right should be len(arr) - 1, not len(arr)
|
| 8 |
+
# This causes an IndexError on the first iteration
|
| 9 |
+
left, right = 0, len(arr)
|
| 10 |
+
|
| 11 |
+
while left <= right:
|
| 12 |
+
mid = (left + right) // 2
|
| 13 |
+
if arr[mid] == target:
|
| 14 |
+
return mid
|
| 15 |
+
elif arr[mid] < target:
|
| 16 |
+
# BUG 2: must be mid + 1 — using just mid causes infinite loop
|
| 17 |
+
left = mid
|
| 18 |
+
else:
|
| 19 |
+
right = mid - 1
|
| 20 |
+
|
| 21 |
+
return -1
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
arr = [1, 3, 5, 7, 9, 11, 13]
|
| 25 |
+
print(binary_search(arr, 7)) # Expected: 3
|
| 26 |
+
print(binary_search(arr, 11)) # Expected: 5
|
| 27 |
+
print(binary_search(arr, 4)) # Expected: -1
|
| 28 |
+
'''
|
| 29 |
+
|
| 30 |
+
DESCRIPTION = """\
|
| 31 |
+
=== TASK: Fix Binary Search (MEDIUM) ===
|
| 32 |
+
|
| 33 |
+
The binary search function below has 2 bugs:
|
| 34 |
+
Bug 1 — the right boundary is wrong (causes IndexError)
|
| 35 |
+
Bug 2 — the left pointer never advances (causes infinite loop)
|
| 36 |
+
|
| 37 |
+
Fix both bugs so the function works correctly.
|
| 38 |
+
|
| 39 |
+
Expected output (exactly 3 lines):
|
| 40 |
+
3
|
| 41 |
+
5
|
| 42 |
+
-1
|
| 43 |
+
|
| 44 |
+
Buggy code:
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
TASK_INFO = {
|
| 48 |
+
"task_id": "task_medium",
|
| 49 |
+
"difficulty": "medium",
|
| 50 |
+
"num_bugs": 2,
|
| 51 |
+
"max_steps": 5,
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
EXPECTED_OUTPUTS = ["3\n", "5\n", "-1\n"]
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class TaskMedium(BaseTask):
    """
    Medium task: fix 2 bugs in a binary search implementation.
    Reward: 1/3 per correct output line → max 1.0
    """

    def reset(self) -> Dict[str, Any]:
        # Fresh episode: clear counters and show prompt + buggy search code.
        self.current_step = 0
        self.done = False
        self.current_observation = DESCRIPTION + BUGGY_CODE
        return {"observation": self.current_observation, "info": TASK_INFO}

    def step(self, action: str) -> Dict[str, Any]:
        self.current_step += 1
        reward = self._run_code_safely(action, EXPECTED_OUTPUTS)

        # Done on a perfect score or when the step budget runs out.
        if reward >= 1.0 or self.current_step >= TASK_INFO["max_steps"]:
            self.done = True

        if reward >= 1.0:
            obs = f"✓ Perfect! All search results correct. Reward: {reward:.2f}. Task complete."
        elif reward > 0.0:
            obs = f"Partial credit. Reward: {reward:.2f}. Some search results are still wrong."
        else:
            obs = f"Code failed or produced wrong output. Reward: {reward:.2f}. Check both bugs."

        self.current_observation = obs
        return {
            "observation": obs,
            "reward": reward,
            "done": self.done,
            "info": {"step": self.current_step},
        }
|
inference.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
inference.py — Baseline inference script for the Python Bug Fixer OpenEnv.

Drives each task end to end: resets the task on the environment server,
asks an OpenAI-compatible chat model for fixed code, and submits the fix
until the episode reports done.

Usage:
    export API_BASE_URL="https://api-inference.huggingface.co/v1"
    export MODEL_NAME="meta-llama/Meta-Llama-3-8B-Instruct"
    export HF_TOKEN="hf_YOUR_TOKEN"
    export SPACE_URL="https://YOUR_USERNAME-python-bug-fixer.hf.space"
    python inference.py

Log format (required — do not change):
    [START] {...json...}
    [STEP] {...json...}
    [END] {...json...}
"""

import json
import os
from datetime import datetime, timezone

import requests
from openai import OpenAI

# ── Configuration via environment variables ───────────────────────────────────
# The defaults below are placeholders only — set real values via the env vars.
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Meta-Llama-3-8B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN", "hf_YOUR_TOKEN")
SPACE_URL = os.getenv("SPACE_URL", "http://localhost:7860")

# OpenAI-compatible client pointed at API_BASE_URL, authenticated with HF_TOKEN.
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

# Tasks to evaluate, in order.
TASK_IDS = ["task_easy", "task_medium", "task_hard"]

# System prompt for the debugger agent.
SYSTEM_PROMPT = (
    "You are an expert Python developer and debugger. "
    "You will be shown buggy Python code along with the expected output. "
    "Your job is to return ONLY the corrected Python code — raw Python, "
    "no explanations, no markdown, no code fences (no ```). "
    "The code you return will be executed directly. Make it print the exact expected output."
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ── Helper functions ───────────────────────────────────────────────────────────
|
| 50 |
+
|
| 51 |
+
def now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    utc_now = datetime.now(timezone.utc)
    return utc_now.isoformat()
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def reset_task(task_id: str) -> dict:
    """POST /reset for *task_id* and return the decoded JSON response.

    Raises requests.HTTPError when the server answers with a non-2xx status.
    """
    response = requests.post(
        f"{SPACE_URL}/reset",
        json={"task_id": task_id},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def step_task(session_id: str, action: str) -> dict:
    """POST /step with the candidate fixed code; return the decoded JSON.

    Raises requests.HTTPError when the server answers with a non-2xx status.
    """
    payload = {"session_id": session_id, "action": action}
    response = requests.post(f"{SPACE_URL}/step", json=payload, timeout=30)
    response.raise_for_status()
    return response.json()
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def get_fixed_code(observation: str) -> str:
    """
    Call the LLM with the buggy-code observation and return fixed code.

    Uses the module-level OpenAI client configured via API_BASE_URL +
    MODEL_NAME. The returned text is executed directly by the environment,
    so although SYSTEM_PROMPT forbids markdown, a surrounding ``` fence is
    stripped defensively — many chat models emit one anyway, and a fenced
    answer would otherwise score 0 on a perfectly correct fix.
    """
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": observation},
        ],
        max_tokens=1000,
        temperature=0.1,
    )
    code = response.choices[0].message.content.strip()

    # Defensive fence stripping: drop the opening ``` line (which may carry a
    # language tag like ```python) and a trailing ``` line if present.
    if code.startswith("```"):
        lines = code.splitlines()[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        code = "\n".join(lines).strip()

    return code
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# ── Core task runner ───────────────────────────────────────────────────────────
|
| 95 |
+
|
| 96 |
+
def run_task(task_id: str) -> dict:
    """
    Run a single task episode from reset until the environment reports done.

    Emits [START], [STEP], [END] logs to stdout (required format).
    Returns a summary dict: {task_id, reward, steps, success}.
    """
    # Reset the environment and open a session.
    reset_data = reset_task(task_id)
    session_id = reset_data["session_id"]
    observation = reset_data["observation"]

    # [START] log — required format
    start_log = {
        "task_id": task_id,
        "session_id": session_id,
        "model": MODEL_NAME,
        "timestamp": now_iso(),
    }
    print(f"[START] {json.dumps(start_log)}", flush=True)

    step_num = 0
    reward = 0.0
    done = False

    # Client-side safety cap: the environment enforces max_steps (5–7 per
    # openenv.yaml), but if a misbehaving server never sets done=True this
    # guard prevents an infinite loop and unbounded LLM calls.
    max_client_steps = 50

    while not done and step_num < max_client_steps:
        step_num += 1

        # Get a candidate fix from the LLM for the current observation.
        action = get_fixed_code(observation)

        # Submit the candidate to the environment and read the transition.
        result = step_task(session_id, action)
        observation = result["observation"]
        reward = result["reward"]
        done = result["done"]

        # [STEP] log — required format
        step_log = {
            "step": step_num,
            "action_chars": len(action),
            "reward": reward,
            "done": done,
            "observation": observation[:200],  # truncated for log readability
        }
        print(f"[STEP] {json.dumps(step_log)}", flush=True)

    # [END] log — required format
    end_log = {
        "task_id": task_id,
        "session_id": session_id,
        "total_reward": reward,
        "steps": step_num,
        "success": reward >= 0.8,
        "timestamp": now_iso(),
    }
    print(f"[END] {json.dumps(end_log)}", flush=True)

    return {"task_id": task_id, "reward": reward, "steps": step_num, "success": reward >= 0.8}
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ── Entry point ────────────────────────────────────────────────────────────────
|
| 157 |
+
|
| 158 |
+
def main():
    """Run every task in TASK_IDS and print a pass/fail summary."""
    print(f"Starting inference — model={MODEL_NAME} space={SPACE_URL}", flush=True)
    print("-" * 60, flush=True)

    results = []
    for task_id in TASK_IDS:
        results.append(run_task(task_id))
        print("-" * 60, flush=True)

    # Summary
    print("\n=== SUMMARY ===")
    total_reward = 0.0
    for outcome in results:
        status = "PASS" if outcome["success"] else "FAIL"
        print(f" [{status}] {outcome['task_id']:15s} reward={outcome['reward']:.2f} steps={outcome['steps']}")
        total_reward += outcome["reward"]
    avg = total_reward / len(results)
    print(f"\n Average reward: {avg:.2f}")
    print("=== END SUMMARY ===")


if __name__ == "__main__":
    main()
|
inference_local.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
inference_local.py — Local inference for Python Bug Fixer OpenEnv.

This script runs all 3 tasks using pre-written correct solutions
(no external LLM required). Demonstrates the full environment loop.

Usage:
    # 1. Start the server first:
    #    uvicorn app.main:app --host 0.0.0.0 --port 7860
    # 2. Run this script:
    #    python inference_local.py
"""

import json
import requests
from datetime import datetime, timezone

# Environment server address; the local server started via uvicorn (see Usage).
SPACE_URL = "http://localhost:7860"

# ── Pre-written correct solutions for each task ───────────────────────────────
# Maps task_id -> corrected Python source. Each string is submitted verbatim
# as the action; the environment executes it and compares stdout against the
# task's expected output. Inner docstrings in task_hard are escaped (\"\"\")
# because they sit inside the outer triple-quoted literal.

SOLUTIONS = {
    "task_easy": """\
def get_last_element(lst):
    return lst[len(lst) - 1]


def compute_sum(numbers):
    total = 0
    for i in range(len(numbers)):
        total += numbers[i]
    return total


result = get_last_element([1, 2, 3, 4, 5])
print(result)

total = compute_sum([10, 20, 30])
print(total)
""",

    "task_medium": """\
def binary_search(arr, target):
    left, right = 0, len(arr) - 1

    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1

    return -1


arr = [1, 3, 5, 7, 9, 11, 13]
print(binary_search(arr, 7))
print(binary_search(arr, 11))
print(binary_search(arr, 4))
""",

    "task_hard": """\
class DataProcessor:
    \"\"\"Processes a list of employee records.\"\"\"

    def __init__(self):
        self.data = []

    def add_record(self, record: dict):
        self.data.append(record)

    def get_average(self, field: str) -> float:
        \"\"\"Return the average value of a numeric field.\"\"\"
        if not self.data:
            return 0.0
        return sum(r[field] for r in self.data) / len(self.data)

    def filter_records(self, field: str, value):
        \"\"\"Return all records where record[field] == value.\"\"\"
        return [r for r in self.data if r[field] == value]

    def get_sorted(self, field: str, reverse: bool = False):
        \"\"\"Return records sorted by field. reverse=True means descending.\"\"\"
        return sorted(self.data, key=lambda x: x[field], reverse=reverse)

    def get_max(self, field: str) -> dict:
        \"\"\"Return the record with the highest value for field.\"\"\"
        return max(self.data, key=lambda x: x[field])


p = DataProcessor()
p.add_record({"name": "Alice", "score": 85})
p.add_record({"name": "Bob", "score": 92})
p.add_record({"name": "Charlie", "score": 78})

print(round(p.get_average("score"), 1))
print(len(p.filter_records("name", "Alice")))
print(p.get_sorted("score", reverse=True)[0]["name"])
print(p.get_max("score")["name"])
""",
}

# Tasks to run, in order (must match the ids served by the environment).
TASK_IDS = ["task_easy", "task_medium", "task_hard"]
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(timezone.utc)
    return current.isoformat()
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def run_task(task_id: str) -> dict:
    """Run one task: reset → submit fixed code → check reward."""

    # Reset: open a fresh session for this task.
    reset_resp = requests.post(f"{SPACE_URL}/reset", json={"task_id": task_id}, timeout=30)
    reset_resp.raise_for_status()
    session_id = reset_resp.json()["session_id"]

    start_log = {
        "task_id": task_id,
        "session_id": session_id,
        "model": "local-solver",
        "timestamp": now_iso(),
    }
    print(f"[START] {json.dumps(start_log)}", flush=True)

    # Submit the pre-written correct solution as the single action.
    action = SOLUTIONS[task_id]
    step_resp = requests.post(
        f"{SPACE_URL}/step",
        json={"session_id": session_id, "action": action},
        timeout=30,
    )
    step_resp.raise_for_status()
    result = step_resp.json()

    reward = result["reward"]
    success = reward >= 0.8

    step_log = {
        "step": 1,
        "action_chars": len(action),
        "reward": reward,
        "done": result["done"],
        "observation": result["observation"][:200],
    }
    print(f"[STEP] {json.dumps(step_log)}", flush=True)

    end_log = {
        "task_id": task_id,
        "session_id": session_id,
        "total_reward": reward,
        "steps": 1,
        "success": success,
        "timestamp": now_iso(),
    }
    print(f"[END] {json.dumps(end_log)}", flush=True)

    return {
        "task_id": task_id,
        "reward": reward,
        "steps": 1,
        "success": success,
    }
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def main():
    """Run every task with its known-good solution and print a summary."""
    banner = "=" * 60
    print(banner)
    print(" Python Bug Fixer — Local Inference (no LLM needed)")
    print(banner)
    print(f" Server: {SPACE_URL}")
    print(f" Tasks: {', '.join(TASK_IDS)}")
    print("-" * 60)

    results = []
    for task_id in TASK_IDS:
        results.append(run_task(task_id))
        print("-" * 60)

    # Summary
    print("\n=== SUMMARY ===")
    total_reward = 0.0
    for outcome in results:
        status = "✅ PASS" if outcome["success"] else "❌ FAIL"
        print(f" [{status}] {outcome['task_id']:15s} reward={outcome['reward']:.2f} steps={outcome['steps']}")
        total_reward += outcome["reward"]
    avg = total_reward / len(results)
    print(f"\n Average reward: {avg:.2f}")
    print("=== END SUMMARY ===")


if __name__ == "__main__":
    main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: python-bug-fixer
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
A real-world environment where an AI agent must identify and fix bugs
|
| 5 |
+
in Python code snippets. The agent receives buggy code along with a
|
| 6 |
+
description of expected behavior and must return corrected code that
|
| 7 |
+
runs without errors and produces the correct output.
|
| 8 |
+
Simulates real-world software debugging and code review workflows.
|
| 9 |
+
|
| 10 |
+
observation_space:
|
| 11 |
+
type: text
|
| 12 |
+
description: >
|
| 13 |
+
A buggy Python code snippet with a description of the expected behavior
|
| 14 |
+
and expected stdout output. May contain 1–3 bugs of varying types
|
| 15 |
+
(SyntaxError, IndexError, LogicError).
|
| 16 |
+
|
| 17 |
+
action_space:
|
| 18 |
+
type: text
|
| 19 |
+
description: >
|
| 20 |
+
The corrected Python code as a raw string. Must be valid Python that
|
| 21 |
+
can be executed directly with python3. No markdown, no code fences.
|
| 22 |
+
|
| 23 |
+
tasks:
|
| 24 |
+
- id: task_easy
|
| 25 |
+
name: "Fix Index Errors"
|
| 26 |
+
difficulty: easy
|
| 27 |
+
max_steps: 5
|
| 28 |
+
reward_threshold: 0.5
|
| 29 |
+
description: "Fix 2 off-by-one index errors in a list-processing script."
|
| 30 |
+
|
| 31 |
+
- id: task_medium
|
| 32 |
+
name: "Fix Binary Search Logic"
|
| 33 |
+
difficulty: medium
|
| 34 |
+
max_steps: 5
|
| 35 |
+
reward_threshold: 0.7
|
| 36 |
+
description: "Fix 2 bugs in a binary search implementation (boundary + infinite loop)."
|
| 37 |
+
|
| 38 |
+
- id: task_hard
|
| 39 |
+
name: "Fix DataProcessor Class"
|
| 40 |
+
difficulty: hard
|
| 41 |
+
max_steps: 7
|
| 42 |
+
reward_threshold: 0.8
|
| 43 |
+
description: "Fix 3 bugs across 3 methods of a DataProcessor class."
|
| 44 |
+
|
| 45 |
+
reward_range: [0.0, 1.0]
|
| 46 |
+
reward_description: >
|
| 47 |
+
Reward is computed by running the agent's fixed code in a sandboxed subprocess
|
| 48 |
+
and checking how many expected output strings appear in stdout.
|
| 49 |
+
Each expected output line is worth an equal fraction of 1.0.
|
| 50 |
+
Partial credit is awarded for partially correct fixes.
|
| 51 |
+
|
| 52 |
+
endpoints:
|
| 53 |
+
reset: "POST /reset"
|
| 54 |
+
step: "POST /step"
|
| 55 |
+
state: "GET /state"
|
| 56 |
+
tasks: "GET /tasks"
|
| 57 |
+
health: "GET /health"
|
| 58 |
+
|
| 59 |
+
runtime:
|
| 60 |
+
max_inference_minutes: 20
|
| 61 |
+
max_vcpu: 2
|
| 62 |
+
max_memory_gb: 8
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fastapi>=0.110
uvicorn[standard]>=0.29
pydantic>=2.6
openai>=1.30
PyYAML>=6.0
requests>=2.31
python-multipart>=0.0.9
|