Spaces:
Sleeping
Sleeping
Commit ·
635be3f
1
Parent(s): a5c1fa0
v3.0 — Gradio UI + run_agent.py + full OpenEnv compliance
Browse files- Dockerfile +3 -12
- README.md +120 -91
- app.py +344 -0
- requirements.txt +8 -6
- run_agent.py +290 -0
- test_e2e.py +56 -0
Dockerfile
CHANGED
|
@@ -1,31 +1,22 @@
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
| 3 |
-
# Create non-root user for security — MANDATORY for running agent code safely
|
| 4 |
RUN useradd -m -u 1000 envuser
|
| 5 |
|
| 6 |
WORKDIR /app
|
| 7 |
|
| 8 |
-
|
| 9 |
-
RUN apt-get update && apt-get install -y \
|
| 10 |
-
git \
|
| 11 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
|
| 13 |
-
# Copy and install Python dependencies first (layer caching)
|
| 14 |
COPY requirements.txt .
|
| 15 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
|
| 17 |
-
# Copy project
|
| 18 |
COPY . .
|
| 19 |
|
| 20 |
-
# Make repo_templates readable
|
| 21 |
RUN chmod -R 755 repo_templates/
|
| 22 |
-
|
| 23 |
-
# Create temp directory for working copies
|
| 24 |
RUN mkdir -p /tmp/openenv_work && chmod 777 /tmp/openenv_work
|
| 25 |
|
| 26 |
-
# Switch to non-root for security
|
| 27 |
USER envuser
|
| 28 |
|
| 29 |
EXPOSE 7860
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
|
|
|
|
| 3 |
RUN useradd -m -u 1000 envuser
|
| 4 |
|
| 5 |
WORKDIR /app
|
| 6 |
|
| 7 |
+
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
|
|
|
| 8 |
|
|
|
|
| 9 |
COPY requirements.txt .
|
| 10 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
|
|
|
|
| 12 |
COPY . .
|
| 13 |
|
|
|
|
| 14 |
RUN chmod -R 755 repo_templates/
|
|
|
|
|
|
|
| 15 |
RUN mkdir -p /tmp/openenv_work && chmod 777 /tmp/openenv_work
|
| 16 |
|
|
|
|
| 17 |
USER envuser
|
| 18 |
|
| 19 |
EXPOSE 7860
|
| 20 |
|
| 21 |
+
# Entry point: Gradio app that also mounts FastAPI endpoints
|
| 22 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -13,50 +13,92 @@ tags:
|
|
| 13 |
- coding-agent
|
| 14 |
---
|
| 15 |
|
| 16 |
-
# Codebase Navigation & Repair — OpenEnv
|
| 17 |
|
| 18 |
-
**
|
| 19 |
|
| 20 |
-
|
| 21 |
|
| 22 |
-
|
| 23 |
|
| 24 |
-
|
| 25 |
|
| 26 |
-
|
| 27 |
-
- Did it verify its fixes before submitting?
|
| 28 |
-
- Can it resist misleading comments and prompt injection?
|
| 29 |
-
- How efficiently does it use its context window?
|
| 30 |
|
| 31 |
-
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
```
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
│ └─────────────┘ └──────────────┘ └─────────────────┘ │
|
| 46 |
-
│ ┌─────────────┐ ┌──────────────┐ ┌─────────────────┐ │
|
| 47 |
-
│ │ Fault │ │ Memory │ │ Grader │ │
|
| 48 |
-
│ │ Injector │ │ Tracker │ │ (pytest) │ │
|
| 49 |
-
│ └─────────────┘ └──────────────┘ └─────────────────┘ │
|
| 50 |
-
└──────────────────────────────────────────────────────────┘
|
| 51 |
```
|
| 52 |
|
| 53 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
| task2 | Medium | Cross-module interface bug + regression test (5 variants) |
|
| 59 |
-
| task3 | Hard | Feature implementation from spec (5 variants) |
|
| 60 |
|
| 61 |
## API Endpoints
|
| 62 |
|
|
@@ -68,76 +110,63 @@ Every coding agent (Devin, Cursor, Copilot, Codex) fails ~25%+ on complex tasks.
|
|
| 68 |
| `/state` | GET | Get current state |
|
| 69 |
| `/health` | GET | Health check |
|
| 70 |
|
| 71 |
-
### Evaluation Layer
|
| 72 |
| Endpoint | Method | Description |
|
| 73 |
|----------|--------|-------------|
|
| 74 |
-
| `/trajectory` | GET | Full action log with timing
|
| 75 |
| `/evaluate` | GET | Multi-dimensional scores (6 axes) |
|
| 76 |
-
| `/metrics` | GET |
|
| 77 |
-
| `/fault-config` | POST | Enable fault injection
|
| 78 |
-
|
| 79 |
-
## Multi-Dimensional Evaluation
|
| 80 |
-
|
| 81 |
-
The `/evaluate` endpoint scores agents across **6 quality dimensions**:
|
| 82 |
|
| 83 |
-
|
| 84 |
-
|-----------|--------|-----------------|
|
| 85 |
-
| Efficiency | 20% | Steps used vs optimal path |
|
| 86 |
-
| Navigation | 15% | Read relevant files first? Explored strategically? |
|
| 87 |
-
| Correctness | 30% | Final test pass rate + regression detection |
|
| 88 |
-
| Reasoning | 15% | read→write→test pattern adherence |
|
| 89 |
-
| Robustness | 10% | Error recovery + fault injection handling |
|
| 90 |
-
| Security | 10% | Unsafe code detection + prompt injection resistance |
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
# Next reset will inject:
|
| 101 |
-
# - Misleading "BUG:" comments on correct lines
|
| 102 |
-
# - Red herring files that look buggy but aren't
|
| 103 |
-
# - Noisy docstrings claiming code is correct
|
| 104 |
```
|
| 105 |
|
| 106 |
-
##
|
| 107 |
|
| 108 |
-
### Local
|
| 109 |
-
```bash
|
| 110 |
-
pip install -r requirements.txt
|
| 111 |
-
uvicorn server.app:app --host 0.0.0.0 --port 7860
|
| 112 |
```
|
| 113 |
-
|
| 114 |
-
#
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
```
|
| 119 |
|
| 120 |
-
##
|
| 121 |
-
```bash
|
| 122 |
-
export HF_TOKEN=your_token
|
| 123 |
-
export ENV_BASE_URL=http://localhost:7860
|
| 124 |
-
python inference.py
|
| 125 |
-
```
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
"correctness": {"score": 0.714, "evidence": ["No test regressions"]},
|
| 135 |
-
"reasoning": {"score": 1.0, "evidence": ["Agent tested after writing"]},
|
| 136 |
-
"robustness": {"score": 1.0, "evidence": ["Clean execution"]},
|
| 137 |
-
"security": {"score": 1.0, "evidence": ["No security violations"]}
|
| 138 |
-
}
|
| 139 |
-
}
|
| 140 |
-
```
|
| 141 |
|
| 142 |
## License
|
| 143 |
|
|
|
|
| 13 |
- coding-agent
|
| 14 |
---
|
| 15 |
|
| 16 |
+
# 🔍 Codebase Navigation & Repair — OpenEnv
|
| 17 |
|
| 18 |
+
> **The system that makes AI coding agents reliable, testable, and debuggable.**
|
| 19 |
|
| 20 |
+
## The Problem
|
| 21 |
|
| 22 |
+
AI coding agents (Copilot, Devin, Cursor) fail ~25%+ on complex tasks. Current benchmarks tell you the score but not **why** the agent failed. Was it poor navigation? Wasted steps? Hallucinated code? There is no way to know.
|
| 23 |
|
| 24 |
+
## Our Solution
|
| 25 |
|
| 26 |
+
An RL environment where agents navigate unfamiliar Python repos, find bugs, and fix them — graded by **actual pytest execution** with **process-level evaluation**.
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
Unlike existing benchmarks, we evaluate **how** the agent works, not just the final output:
|
| 29 |
|
| 30 |
+
| What We Measure | Why It Matters |
|
| 31 |
+
|----------------|---------------|
|
| 32 |
+
| Navigation efficiency | Did it read relevant files first? |
|
| 33 |
+
| Reasoning patterns | Did it follow read→write→test? |
|
| 34 |
+
| Context usage | How much of what it read was useful? |
|
| 35 |
+
| Security | Did it write safe code? |
|
| 36 |
+
| Robustness | Can it handle misleading comments? |
|
| 37 |
+
|
| 38 |
+
## How It Works
|
| 39 |
+
|
| 40 |
+
```
|
| 41 |
+
Agent resets environment → sees repo file tree (NOT contents)
|
| 42 |
+
→ reads files one at a time (costs steps)
|
| 43 |
+
→ identifies bugs in source code
|
| 44 |
+
→ writes fixed code
|
| 45 |
+
→ runs tests to verify
|
| 46 |
+
→ submits for final grade
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
### Tasks
|
| 50 |
+
|
| 51 |
+
| Task | Difficulty | Description | Variants |
|
| 52 |
+
|------|-----------|-------------|----------|
|
| 53 |
+
| task1 | Easy | Single-file bug repair | 5 |
|
| 54 |
+
| task2 | Medium | Cross-module interface bug + regression test | 5 |
|
| 55 |
+
| task3 | Hard | Feature implementation from spec | 5 |
|
| 56 |
+
|
| 57 |
+
Each variant has structurally different code, so the agent can't memorize solutions.
|
| 58 |
+
|
| 59 |
+
## Quick Start
|
| 60 |
+
|
| 61 |
+
### 1. Run Locally (No Docker)
|
| 62 |
+
```bash
|
| 63 |
+
pip install -r requirements.txt
|
| 64 |
+
python app.py # Gradio UI at http://localhost:7860
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
### 2. Run Agent (No LLM needed)
|
| 68 |
+
```bash
|
| 69 |
+
python run_agent.py # deterministic agent demo
|
| 70 |
+
python run_agent.py --all-tasks # run all 3 tasks
|
| 71 |
```
|
| 72 |
+
|
| 73 |
+
### 3. Run Agent with LLM
|
| 74 |
+
```bash
|
| 75 |
+
export HF_TOKEN=hf_xxxxx
|
| 76 |
+
python run_agent.py --llm --task task1
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
### 4. Docker
|
| 80 |
+
```bash
|
| 81 |
+
docker build -t codebase-nav-env .
|
| 82 |
+
docker run -p 7860:7860 codebase-nav-env
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
```
|
| 84 |
|
| 85 |
+
### 5. API Usage
|
| 86 |
+
```bash
|
| 87 |
+
# Reset
|
| 88 |
+
curl -X POST "http://localhost:7860/reset?task=task1"
|
| 89 |
+
|
| 90 |
+
# Take action
|
| 91 |
+
curl -X POST http://localhost:7860/step \
|
| 92 |
+
-H "Content-Type: application/json" \
|
| 93 |
+
-d '{"action_type":"read_file","path":"src/auth.py"}'
|
| 94 |
+
|
| 95 |
+
# Submit
|
| 96 |
+
curl -X POST http://localhost:7860/step \
|
| 97 |
+
-d '{"action_type":"submit"}'
|
| 98 |
|
| 99 |
+
# Get evaluation
|
| 100 |
+
curl http://localhost:7860/evaluate
|
| 101 |
+
```
|
|
|
|
|
|
|
| 102 |
|
| 103 |
## API Endpoints
|
| 104 |
|
|
|
|
| 110 |
| `/state` | GET | Get current state |
|
| 111 |
| `/health` | GET | Health check |
|
| 112 |
|
| 113 |
+
### Evaluation Layer
|
| 114 |
| Endpoint | Method | Description |
|
| 115 |
|----------|--------|-------------|
|
| 116 |
+
| `/trajectory` | GET | Full action log with timing and diffs |
|
| 117 |
| `/evaluate` | GET | Multi-dimensional scores (6 axes) |
|
| 118 |
+
| `/metrics` | GET | Memory, security, timeline stats |
|
| 119 |
+
| `/fault-config` | POST | Enable fault injection |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
+
## Evaluation Dimensions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
+
```
|
| 124 |
+
efficiency [████████████████░░░░] 0.800 — 5 steps vs 4 optimal
|
| 125 |
+
navigation [████████████████████] 1.000 — read relevant files first
|
| 126 |
+
correctness [██████████████░░░░░░] 0.714 — 71.4% tests passing
|
| 127 |
+
reasoning [████████████████████] 1.000 — correct read→write→test pattern
|
| 128 |
+
robustness [████████████████████] 1.000 — no errors encountered
|
| 129 |
+
security [████████████████████] 1.000 — no unsafe code detected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
```
|
| 131 |
|
| 132 |
+
## Project Structure
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
```
|
| 135 |
+
codebase-nav-env/
|
| 136 |
+
├── app.py # Gradio UI + FastAPI (HF Space entry point)
|
| 137 |
+
├── run_agent.py # Standalone HF agent (deterministic + LLM)
|
| 138 |
+
├── inference.py # OpenEnv inference script ([START]/[STEP]/[END])
|
| 139 |
+
├── server/
|
| 140 |
+
│ ├── app.py # FastAPI endpoints
|
| 141 |
+
│ ├── environment.py # Core RL environment
|
| 142 |
+
│ ├── models.py # Pydantic models
|
| 143 |
+
│ ├── grader.py # pytest runner
|
| 144 |
+
│ ├── repo_loader.py # Template loader
|
| 145 |
+
│ ├── sandbox.py # Secure subprocess
|
| 146 |
+
│ ├── trajectory.py # Full trajectory recording
|
| 147 |
+
│ ├── evaluator.py # 6-dimension scoring engine
|
| 148 |
+
│ ├── fault_injection.py # Robustness testing
|
| 149 |
+
│ ├── security.py # Unsafe code detection
|
| 150 |
+
│ └── memory.py # Context efficiency tracking
|
| 151 |
+
├── repo_templates/ # 15 task variants
|
| 152 |
+
│ ├── task1/ # 5 single-file bug variants
|
| 153 |
+
│ ├── task2/ # 5 cross-module bug variants
|
| 154 |
+
│ └── task3/ # 5 feature implementation variants
|
| 155 |
+
├── openenv.yaml # Environment metadata
|
| 156 |
+
├── Dockerfile # Docker build
|
| 157 |
+
├── requirements.txt # Dependencies
|
| 158 |
+
└── README.md # This file
|
| 159 |
```
|
| 160 |
|
| 161 |
+
## Why This Is Real-World
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
+
This isn't a toy benchmark. It tests the **exact capabilities** production coding agents need:
|
| 164 |
+
|
| 165 |
+
- **Navigate unfamiliar code** — agent sees only file names, not contents
|
| 166 |
+
- **Budget exploration** — finite steps mean strategic reading matters
|
| 167 |
+
- **Verify fixes** — must run tests, not just hope the fix works
|
| 168 |
+
- **Handle noise** — real repos have misleading comments and dead code
|
| 169 |
+
- **Write safe code** — production agents can't `eval()` or `os.system()`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
## License
|
| 172 |
|
app.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
app.py — Gradio UI + FastAPI endpoints for the OpenEnv environment.
|
| 4 |
+
This is the HF Space entry point.
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import gradio as gr
|
| 9 |
+
from server.environment import CodebaseNavEnvironment
|
| 10 |
+
from server.models import RepoAction
|
| 11 |
+
|
| 12 |
+
# ── Global environment instance ──────────────────────────────────────────────
|
| 13 |
+
env = CodebaseNavEnvironment()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# ── Gradio callback functions ────────────────────────────────────────────────
|
| 17 |
+
|
| 18 |
+
def reset_environment(task: str):
|
| 19 |
+
"""Reset environment and return initial state."""
|
| 20 |
+
try:
|
| 21 |
+
result = env.reset(task=task)
|
| 22 |
+
obs = result.observation
|
| 23 |
+
tree = "\n".join(f" 📄 {f}" for f in obs.repo_tree)
|
| 24 |
+
failing = ", ".join(obs.failing_tests) if obs.failing_tests else "None listed"
|
| 25 |
+
info_data = result.info
|
| 26 |
+
|
| 27 |
+
status_text = (
|
| 28 |
+
f"✅ Episode started\n"
|
| 29 |
+
f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
|
| 30 |
+
f"Task: {task}\n"
|
| 31 |
+
f"Variant: {info_data.get('variant_id', 'unknown')}\n"
|
| 32 |
+
f"Steps remaining: {obs.steps_remaining}\n"
|
| 33 |
+
f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
|
| 34 |
+
f"📁 Repository Files:\n{tree}\n\n"
|
| 35 |
+
f"🔴 Failing Tests: {failing}\n\n"
|
| 36 |
+
f"📋 Task: {obs.task_description}"
|
| 37 |
+
)
|
| 38 |
+
return status_text, "", "0", "0.000"
|
| 39 |
+
except Exception as e:
|
| 40 |
+
return f"❌ Error: {e}", "", "0", "0.000"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def take_step(action_type: str, path: str, query: str, content: str):
|
| 44 |
+
"""Execute one agent step."""
|
| 45 |
+
if env.done:
|
| 46 |
+
return "❌ Episode is done. Reset first.", "", "", ""
|
| 47 |
+
|
| 48 |
+
try:
|
| 49 |
+
action = RepoAction(
|
| 50 |
+
action_type=action_type,
|
| 51 |
+
path=path if path.strip() else None,
|
| 52 |
+
query=query if query.strip() else None,
|
| 53 |
+
content=content if content.strip() else None,
|
| 54 |
+
)
|
| 55 |
+
result = env.step(action)
|
| 56 |
+
obs = result.observation
|
| 57 |
+
|
| 58 |
+
action_result = obs.last_action_result or "No output"
|
| 59 |
+
error = obs.last_action_error or ""
|
| 60 |
+
if error:
|
| 61 |
+
error = f"⚠️ {error}"
|
| 62 |
+
|
| 63 |
+
status = (
|
| 64 |
+
f"Step {result.info['steps_taken']} | "
|
| 65 |
+
f"Reward: {result.reward:+.3f} | "
|
| 66 |
+
f"Steps left: {obs.steps_remaining}"
|
| 67 |
+
)
|
| 68 |
+
if result.done:
|
| 69 |
+
status += f"\n\n🏁 EPISODE DONE — Final Score: {result.info['final_score']:.3f}"
|
| 70 |
+
|
| 71 |
+
flags = result.info.get("security_flags", [])
|
| 72 |
+
if flags:
|
| 73 |
+
status += f"\n🔒 Security: {flags}"
|
| 74 |
+
|
| 75 |
+
return (
|
| 76 |
+
status,
|
| 77 |
+
action_result[:3000],
|
| 78 |
+
str(result.info["steps_taken"]),
|
| 79 |
+
f"{result.info.get('cumulative_reward', 0):.3f}",
|
| 80 |
+
)
|
| 81 |
+
except Exception as e:
|
| 82 |
+
return f"❌ Error: {e}", "", "", ""
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def get_evaluation():
|
| 86 |
+
"""Get multi-dimensional evaluation report."""
|
| 87 |
+
try:
|
| 88 |
+
ev = env.get_evaluation()
|
| 89 |
+
if "error" in ev:
|
| 90 |
+
return "No evaluation available. Run an episode first."
|
| 91 |
+
|
| 92 |
+
lines = [
|
| 93 |
+
f"🎯 Composite Score: {ev['composite_score']:.3f}",
|
| 94 |
+
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
|
| 95 |
+
]
|
| 96 |
+
for name, dim in ev.get("dimensions", {}).items():
|
| 97 |
+
bar = "█" * int(dim["score"] * 20) + "░" * (20 - int(dim["score"] * 20))
|
| 98 |
+
lines.append(f" {name:15s} [{bar}] {dim['score']:.3f}")
|
| 99 |
+
for e in dim.get("evidence", []):
|
| 100 |
+
lines.append(f" → {e}")
|
| 101 |
+
|
| 102 |
+
if ev.get("strengths"):
|
| 103 |
+
lines.append("\n💪 Strengths:")
|
| 104 |
+
for s in ev["strengths"]:
|
| 105 |
+
lines.append(f" ✅ {s}")
|
| 106 |
+
|
| 107 |
+
if ev.get("failure_analysis"):
|
| 108 |
+
lines.append("\n⚠️ Failures:")
|
| 109 |
+
for f in ev["failure_analysis"]:
|
| 110 |
+
lines.append(f" ❌ {f}")
|
| 111 |
+
|
| 112 |
+
if ev.get("recommendations"):
|
| 113 |
+
lines.append("\n💡 Recommendations:")
|
| 114 |
+
for r in ev["recommendations"]:
|
| 115 |
+
lines.append(f" → {r}")
|
| 116 |
+
|
| 117 |
+
return "\n".join(lines)
|
| 118 |
+
except Exception as e:
|
| 119 |
+
return f"Error: {e}"
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def get_metrics():
|
| 123 |
+
"""Get comprehensive metrics."""
|
| 124 |
+
try:
|
| 125 |
+
m = env.get_metrics()
|
| 126 |
+
return json.dumps(m, indent=2, default=str)
|
| 127 |
+
except Exception as e:
|
| 128 |
+
return f"Error: {e}"
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def get_trajectory():
|
| 132 |
+
"""Get full trajectory."""
|
| 133 |
+
try:
|
| 134 |
+
t = env.get_trajectory()
|
| 135 |
+
if not t:
|
| 136 |
+
return "No trajectory available."
|
| 137 |
+
|
| 138 |
+
lines = [
|
| 139 |
+
f"Episode: {t.get('episode_id', 'N/A')}",
|
| 140 |
+
f"Task: {t.get('task', 'N/A')} | Variant: {t.get('variant_id', 'N/A')}",
|
| 141 |
+
f"Duration: {t.get('duration_seconds', 'N/A')}s | Score: {t.get('final_score', 0):.3f}",
|
| 142 |
+
"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━",
|
| 143 |
+
]
|
| 144 |
+
for step in t.get("steps", []):
|
| 145 |
+
emoji = "📖" if step["action_type"] == "read_file" else \
|
| 146 |
+
"✏️" if step["action_type"] == "write_file" else \
|
| 147 |
+
"🧪" if step["action_type"] == "run_tests" else \
|
| 148 |
+
"🔍" if step["action_type"] == "search_code" else "🏁"
|
| 149 |
+
path = step.get("action_path") or step.get("action_query") or ""
|
| 150 |
+
err = f" ❌ {step['error']}" if step.get("error") else ""
|
| 151 |
+
lines.append(
|
| 152 |
+
f" {emoji} Step {step['step_number']:2d}: "
|
| 153 |
+
f"{step['action_type']:12s} {path:30s} "
|
| 154 |
+
f"reward={step['reward']:+.3f} "
|
| 155 |
+
f"({step['duration_ms']:.0f}ms){err}"
|
| 156 |
+
)
|
| 157 |
+
return "\n".join(lines)
|
| 158 |
+
except Exception as e:
|
| 159 |
+
return f"Error: {e}"
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def run_builtin_agent(task: str):
|
| 163 |
+
"""Run the built-in deterministic agent for a quick demo."""
|
| 164 |
+
try:
|
| 165 |
+
# Reset
|
| 166 |
+
result = env.reset(task=task)
|
| 167 |
+
obs = result.observation
|
| 168 |
+
log_lines = [f"🚀 Starting {task} (variant: {result.info.get('variant_id')})"]
|
| 169 |
+
log_lines.append(f" Files: {obs.repo_tree}")
|
| 170 |
+
log_lines.append(f" Failing: {obs.failing_tests}")
|
| 171 |
+
|
| 172 |
+
# Strategy: read test file → read source → fix → run tests → submit
|
| 173 |
+
test_files = [f for f in obs.repo_tree if f.startswith("tests/")]
|
| 174 |
+
src_files = [f for f in obs.repo_tree if f.startswith("src/") and f.endswith(".py")]
|
| 175 |
+
spec_files = [f for f in obs.repo_tree if f.endswith(".md")]
|
| 176 |
+
|
| 177 |
+
steps_done = 0
|
| 178 |
+
max_demo_steps = 15
|
| 179 |
+
|
| 180 |
+
# Step 1: read spec or test
|
| 181 |
+
if task == "task3" and spec_files:
|
| 182 |
+
target = spec_files[0]
|
| 183 |
+
elif test_files:
|
| 184 |
+
target = test_files[0]
|
| 185 |
+
else:
|
| 186 |
+
target = obs.repo_tree[0]
|
| 187 |
+
|
| 188 |
+
step_result = env.step(RepoAction(action_type="read_file", path=target))
|
| 189 |
+
steps_done += 1
|
| 190 |
+
log_lines.append(f" Step {steps_done}: read_file {target} → reward={step_result.reward:+.3f}")
|
| 191 |
+
|
| 192 |
+
# Step 2+: read all source files
|
| 193 |
+
for sf in src_files:
|
| 194 |
+
if env.done or steps_done >= max_demo_steps - 2:
|
| 195 |
+
break
|
| 196 |
+
step_result = env.step(RepoAction(action_type="read_file", path=sf))
|
| 197 |
+
steps_done += 1
|
| 198 |
+
log_lines.append(f" Step {steps_done}: read_file {sf} → reward={step_result.reward:+.3f}")
|
| 199 |
+
|
| 200 |
+
# Step N-1: run tests
|
| 201 |
+
if not env.done and steps_done < max_demo_steps - 1:
|
| 202 |
+
step_result = env.step(RepoAction(action_type="run_tests"))
|
| 203 |
+
steps_done += 1
|
| 204 |
+
log_lines.append(f" Step {steps_done}: run_tests → reward={step_result.reward:+.3f}")
|
| 205 |
+
|
| 206 |
+
# Step N: submit
|
| 207 |
+
if not env.done:
|
| 208 |
+
step_result = env.step(RepoAction(action_type="submit"))
|
| 209 |
+
steps_done += 1
|
| 210 |
+
log_lines.append(f" Step {steps_done}: submit → reward={step_result.reward:+.3f}")
|
| 211 |
+
|
| 212 |
+
log_lines.append(f"\n🏁 Final Score: {env.final_score:.3f}")
|
| 213 |
+
log_lines.append(f" Total Steps: {steps_done}")
|
| 214 |
+
log_lines.append(f" Cumulative Reward: {env.cumulative_reward:.3f}")
|
| 215 |
+
|
| 216 |
+
return "\n".join(log_lines)
|
| 217 |
+
except Exception as e:
|
| 218 |
+
return f"❌ Error: {e}"
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# ── Build the Gradio UI ─────────────────────────────────────────────────────
|
| 222 |
+
|
| 223 |
+
with gr.Blocks(
|
| 224 |
+
title="Codebase Navigation & Repair — OpenEnv",
|
| 225 |
+
) as demo:
|
| 226 |
+
gr.Markdown(
|
| 227 |
+
"# 🔍 Codebase Navigation & Repair — OpenEnv\n"
|
| 228 |
+
"**RL environment for testing AI coding agents.** "
|
| 229 |
+
"Agents navigate repos, find bugs, and fix them — graded by actual pytest execution."
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
with gr.Tabs():
|
| 233 |
+
# ── Tab 1: Interactive Environment ────────────────────────────────
|
| 234 |
+
with gr.TabItem("🎮 Interactive"):
|
| 235 |
+
with gr.Row():
|
| 236 |
+
with gr.Column(scale=1):
|
| 237 |
+
task_select = gr.Dropdown(
|
| 238 |
+
choices=["task1", "task2", "task3"],
|
| 239 |
+
value="task1",
|
| 240 |
+
label="Task",
|
| 241 |
+
info="task1=single-file bugs, task2=cross-module, task3=feature impl"
|
| 242 |
+
)
|
| 243 |
+
reset_btn = gr.Button("🔄 Reset Environment", variant="primary")
|
| 244 |
+
|
| 245 |
+
gr.Markdown("### Take an Action")
|
| 246 |
+
action_type = gr.Dropdown(
|
| 247 |
+
choices=["read_file", "write_file", "run_tests", "search_code", "submit"],
|
| 248 |
+
value="read_file",
|
| 249 |
+
label="Action Type",
|
| 250 |
+
)
|
| 251 |
+
action_path = gr.Textbox(label="Path (for read/write/run_tests)", placeholder="src/auth.py")
|
| 252 |
+
action_query = gr.Textbox(label="Query (for search_code)", placeholder="validate_token")
|
| 253 |
+
action_content = gr.Textbox(label="Content (for write_file)", lines=5, placeholder="# new file content...")
|
| 254 |
+
step_btn = gr.Button("▶️ Execute Step", variant="secondary")
|
| 255 |
+
|
| 256 |
+
with gr.Column(scale=2):
|
| 257 |
+
status_box = gr.Textbox(label="Status", lines=15, interactive=False)
|
| 258 |
+
result_box = gr.Textbox(label="Last Action Result", lines=10, interactive=False)
|
| 259 |
+
with gr.Row():
|
| 260 |
+
steps_box = gr.Textbox(label="Steps Taken", value="0", interactive=False)
|
| 261 |
+
reward_box = gr.Textbox(label="Cumulative Reward", value="0.000", interactive=False)
|
| 262 |
+
|
| 263 |
+
reset_btn.click(
|
| 264 |
+
reset_environment, inputs=[task_select],
|
| 265 |
+
outputs=[status_box, result_box, steps_box, reward_box],
|
| 266 |
+
)
|
| 267 |
+
step_btn.click(
|
| 268 |
+
take_step,
|
| 269 |
+
inputs=[action_type, action_path, action_query, action_content],
|
| 270 |
+
outputs=[status_box, result_box, steps_box, reward_box],
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
+
# ── Tab 2: Run Agent ─────────────────────────────────────────────
|
| 274 |
+
with gr.TabItem("🤖 Run Agent"):
|
| 275 |
+
gr.Markdown(
|
| 276 |
+
"### Built-in Demonstration Agent\n"
|
| 277 |
+
"Runs a deterministic read-all-then-submit agent. "
|
| 278 |
+
"For LLM-based agent, use `run_agent.py` or `inference.py`."
|
| 279 |
+
)
|
| 280 |
+
agent_task = gr.Dropdown(
|
| 281 |
+
choices=["task1", "task2", "task3"], value="task1", label="Task"
|
| 282 |
+
)
|
| 283 |
+
run_btn = gr.Button("🚀 Run Agent", variant="primary")
|
| 284 |
+
agent_output = gr.Textbox(label="Agent Log", lines=20, interactive=False)
|
| 285 |
+
run_btn.click(run_builtin_agent, inputs=[agent_task], outputs=[agent_output])
|
| 286 |
+
|
| 287 |
+
# ── Tab 3: Evaluation Dashboard ──────────────────────────────────
|
| 288 |
+
with gr.TabItem("📊 Evaluation"):
|
| 289 |
+
with gr.Row():
|
| 290 |
+
eval_btn = gr.Button("🎯 Get Evaluation", variant="primary")
|
| 291 |
+
metrics_btn = gr.Button("📈 Get Metrics", variant="secondary")
|
| 292 |
+
traj_btn = gr.Button("🗺️ Get Trajectory", variant="secondary")
|
| 293 |
+
eval_output = gr.Textbox(label="Evaluation Report", lines=25, interactive=False)
|
| 294 |
+
eval_btn.click(get_evaluation, outputs=[eval_output])
|
| 295 |
+
metrics_btn.click(get_metrics, outputs=[eval_output])
|
| 296 |
+
traj_btn.click(get_trajectory, outputs=[eval_output])
|
| 297 |
+
|
| 298 |
+
# ── Tab 4: API Docs ──────────────────────────────────────────────
|
| 299 |
+
with gr.TabItem("📖 API"):
|
| 300 |
+
gr.Markdown("""
|
| 301 |
+
### REST API Endpoints
|
| 302 |
+
|
| 303 |
+
The FastAPI endpoints are mounted alongside this UI at `/api/`.
|
| 304 |
+
|
| 305 |
+
| Endpoint | Method | Description |
|
| 306 |
+
|----------|--------|-------------|
|
| 307 |
+
| `/api/reset?task=task1` | POST | Start new episode |
|
| 308 |
+
| `/api/step` | POST | Take action (JSON body) |
|
| 309 |
+
| `/api/state` | GET | Get current state |
|
| 310 |
+
| `/api/health` | GET | Health check |
|
| 311 |
+
| `/api/trajectory` | GET | Full action log |
|
| 312 |
+
| `/api/evaluate` | GET | Multi-dimensional scores |
|
| 313 |
+
| `/api/metrics` | GET | Comprehensive stats |
|
| 314 |
+
| `/api/fault-config` | POST | Enable fault injection |
|
| 315 |
+
|
| 316 |
+
### Example: Reset + Read + Submit
|
| 317 |
+
```bash
|
| 318 |
+
BASE="https://YOUR-SPACE.hf.space/api"
|
| 319 |
+
|
| 320 |
+
# Reset
|
| 321 |
+
curl -X POST "$BASE/reset?task=task1"
|
| 322 |
+
|
| 323 |
+
# Read a file
|
| 324 |
+
curl -X POST "$BASE/step" -H "Content-Type: application/json" \\
|
| 325 |
+
-d '{"action_type":"read_file","path":"src/auth.py"}'
|
| 326 |
+
|
| 327 |
+
# Submit
|
| 328 |
+
curl -X POST "$BASE/step" -H "Content-Type: application/json" \\
|
| 329 |
+
-d '{"action_type":"submit"}'
|
| 330 |
+
|
| 331 |
+
# Get evaluation
|
| 332 |
+
curl "$BASE/evaluate"
|
| 333 |
+
```
|
| 334 |
+
""")
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
# ── Mount FastAPI under /api ─────────────────────────────────────────────────
|
| 338 |
+
from server.app import app as fastapi_app
|
| 339 |
+
|
| 340 |
+
gr_app = gr.mount_gradio_app(fastapi_app, demo, path="/")
|
| 341 |
+
|
| 342 |
+
if __name__ == "__main__":
|
| 343 |
+
import uvicorn
|
| 344 |
+
uvicorn.run(fastapi_app, host="0.0.0.0", port=7860)
|
requirements.txt
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
-
fastapi
|
| 2 |
-
uvicorn[standard]
|
| 3 |
-
pydantic
|
| 4 |
-
openai
|
| 5 |
-
httpx
|
| 6 |
-
pytest
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn[standard]
|
| 3 |
+
pydantic
|
| 4 |
+
openai
|
| 5 |
+
httpx
|
| 6 |
+
pytest
|
| 7 |
+
gradio>=4.0
|
| 8 |
+
huggingface_hub
|
run_agent.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
run_agent.py — Standalone HF Inference agent for OpenEnv.
|
| 4 |
+
|
| 5 |
+
Uses Hugging Face InferenceClient (NOT OpenAI SDK).
|
| 6 |
+
Runs directly against the environment in-process — no server needed.
|
| 7 |
+
Solves bug-fixing tasks step-by-step and prints the full execution trace.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python run_agent.py # uses built-in env
|
| 11 |
+
HF_TOKEN=hf_xxx python run_agent.py # with LLM agent
|
| 12 |
+
HF_TOKEN=hf_xxx python run_agent.py --task task2 # specific task
|
| 13 |
+
"""
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
import json
|
| 17 |
+
import argparse
|
| 18 |
+
import textwrap
|
| 19 |
+
from typing import List, Optional
|
| 20 |
+
|
| 21 |
+
# Add project root to path
|
| 22 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 23 |
+
|
| 24 |
+
from server.environment import CodebaseNavEnvironment
|
| 25 |
+
from server.models import RepoAction
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ── Configuration ────────────────────────────────────────────────────────────
|
| 29 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 30 |
+
MODEL_ID = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
|
| 31 |
+
MAX_STEPS = {"task1": 12, "task2": 18, "task3": 22}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ── HF Inference Client (lazy import) ───────────────────────────────────────
def get_hf_client():
    """Build a Hugging Face InferenceClient, or return None when unavailable.

    Returns None when HF_TOKEN is unset, when huggingface_hub is not
    installed, or when client construction fails for any other reason —
    callers then fall back to the deterministic agent.
    """
    client = None
    if HF_TOKEN:
        try:
            # Imported lazily so the module works without huggingface_hub.
            from huggingface_hub import InferenceClient
            client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
        except ImportError:
            print("[WARN] huggingface_hub not installed. Using deterministic agent.", flush=True)
        except Exception as e:
            print(f"[WARN] Could not create HF client: {e}. Using deterministic agent.", flush=True)
    return client
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# ── Prompts ──────────────────────────────────────────────────────────────────
# System prompt sent verbatim to the LLM on every step. It defines the JSON
# action schema; llm_action() parses the reply with json.loads, so the
# "ONLY a JSON object" instruction is load-bearing — do not reword casually.
SYSTEM_PROMPT = textwrap.dedent("""
You are an expert Python developer debugging a code repository.
You interact with the repo via JSON actions. Reply with ONLY a JSON object.

Available actions:
{"action_type": "read_file", "path": "src/file.py"}
{"action_type": "write_file", "path": "src/file.py", "content": "...full file..."}
{"action_type": "run_tests", "path": "tests/test_file.py"}
{"action_type": "search_code", "query": "keyword"}
{"action_type": "submit"}

Strategy:
1. Read the failing test first to understand expected behavior
2. Read the buggy source file(s) identified by test imports
3. Fix the bug by writing the corrected file
4. Run tests to verify your fix
5. Submit when all tests pass

RESPOND WITH ONLY A JSON OBJECT. No markdown, no explanation.
""").strip()
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def build_prompt(obs: dict, step: int, history: List[str]) -> str:
    """Render the per-step user prompt from the current observation dict.

    The last action result is truncated to 1500 characters and only the
    five most recent history entries are shown, keeping the prompt bounded.
    Missing observation fields degrade to "none"/"unknown" placeholders.
    """
    file_listing = "\n".join(obs.get("repo_tree", []))
    already_read = ", ".join(obs.get("files_read", [])) or "none"
    failing_tests = ", ".join(obs.get("failing_tests", [])) or "unknown"
    last_result = (obs.get("last_action_result") or "none")[:1500]
    last_error = obs.get("last_action_error") or "none"
    remaining = obs.get("steps_remaining", 0)
    recent = "\n".join(history[-5:]) if history else "none"

    # Assemble as blank-line-separated sections.
    sections = [
        f"Step {step} | Task: {obs.get('current_task')} | Steps left: {remaining}",
        f"Description: {obs.get('task_description')}",
        f"Files:\n{file_listing}",
        f"Already read: {already_read}\nFailing tests: {failing_tests}",
        f"Last result:\n{last_result}\n\nLast error: {last_error}",
        f"History:\n{recent}",
        "Next action? Reply with ONLY a JSON object.",
    ]
    return "\n\n".join(sections)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def llm_action(client, obs: dict, step: int, history: List[str]) -> dict:
    """Ask the HF Inference API for the next action as a JSON dict.

    Sends SYSTEM_PROMPT plus the rendered observation prompt, strips a
    leading markdown code fence from the reply, and parses it as JSON.
    Any parse failure or API error falls back to a submit action so the
    episode always terminates.
    """
    text = ""
    try:
        reply = client.chat_completion(
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": build_prompt(obs, step, history)},
            ],
            max_tokens=800,
            temperature=0.2,
        )
        text = reply.choices[0].message.content.strip()

        # Drop a leading ``` / ```json fence if the model wrapped its reply.
        if text.startswith("```"):
            text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]
        text = text.strip()

        return json.loads(text)
    except json.JSONDecodeError:
        print(f" [PARSE ERROR] Could not parse: {text[:100]}")
        return {"action_type": "submit"}
    except Exception as e:
        print(f" [LLM ERROR] {e}")
        return {"action_type": "submit"}
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# ── Deterministic Agent (no LLM needed) ─────────────────────────────────────
def deterministic_agent(obs: dict, step: int, files_read: set) -> dict:
    """Rule-based fallback agent that needs no LLM.

    Reads files once each in a fixed order (spec for task3, then tests,
    then sources), runs the test suite while within a small step budget,
    and finally submits. Useful for smoke tests and demos.
    """
    listing = obs.get("repo_tree", [])
    task = obs.get("current_task", "task1")
    tests = sorted(f for f in listing if f.startswith("tests/"))
    sources = sorted(f for f in listing if f.startswith("src/") and f.endswith(".py"))
    specs = sorted(f for f in listing if f.endswith("FEATURE_SPEC.md"))

    # Phases 1–2: read spec (task3 only), then tests, then sources — first unseen wins.
    queue = list(tests)
    if task == "task3":
        queue = specs + queue
    queue += sources
    unseen = next((path for path in queue if path not in files_read), None)
    if unseen is not None:
        return {"action_type": "read_file", "path": unseen}

    # Phase 3: run tests while within the exploration budget.
    if step <= 2 + len(sources) + len(tests):
        if tests:
            return {"action_type": "run_tests", "path": tests[0]}
        return {"action_type": "run_tests"}

    # Phase 4: give up exploring and submit.
    return {"action_type": "submit"}
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ── Main Runner ──────────────────────────────────────────────────────────────
def run_episode(env: CodebaseNavEnvironment, task: str, use_llm: bool = False) -> tuple:
    """Run one complete episode of *task* against *env* and print a trace.

    Picks each action either from the HF LLM (when use_llm and a client is
    available) or from the deterministic agent, steps the environment, and
    logs per-step rewards. If the step budget runs out before the episode
    ends, a final submit is forced so a score is always produced.

    Returns (final_score, total_steps, rewards).
    """
    hf_client = get_hf_client() if use_llm else None
    # Silently fall back to the deterministic agent if no client was built.
    using_llm = hf_client is not None

    max_steps = MAX_STEPS.get(task, 15)  # per-task budget; 15 for unknown tasks
    history = []        # human-readable step log, fed back into LLM prompts
    files_read = set()  # paths already targeted; drives deterministic phases

    print(f"\n{'='*60}")
    print(f" [START] task={task} agent={'llm' if using_llm else 'deterministic'}")
    print(f"{'='*60}")

    # Reset the environment for this task and capture the initial observation.
    reset_result = env.reset(task=task)
    obs = reset_result.observation
    variant = reset_result.info.get("variant_id", "?")

    print(f" Variant: {variant}")
    print(f" Files: {obs.repo_tree}")
    print(f" Failing: {obs.failing_tests}")
    print(f" Steps budget: {obs.steps_remaining}")
    print()

    rewards = []
    final_score = 0.0

    for step_num in range(1, max_steps + 1):
        if env.done:
            break

        # Get action from LLM or deterministic agent (both consume a plain dict).
        obs_dict = obs.model_dump()
        if using_llm:
            action_dict = llm_action(hf_client, obs_dict, step_num, history)
        else:
            action_dict = deterministic_agent(obs_dict, step_num, files_read)

        action_type = action_dict.get("action_type", "submit")
        action_path = action_dict.get("path")

        # Construct the typed action; absent fields stay None.
        action = RepoAction(
            action_type=action_type,
            path=action_dict.get("path"),
            query=action_dict.get("query"),
            content=action_dict.get("content"),
        )

        # Execute step
        result = env.step(action)
        obs = result.observation
        reward = result.reward

        rewards.append(reward)
        # NOTE(review): any action carrying a path (incl. run_tests/write_file)
        # is recorded here; harmless for the deterministic agent's unseen check.
        if action_path:
            files_read.add(action_path)

        # Print step log
        detail = action_path or action_dict.get("query") or ""
        err = f" ❌ {obs.last_action_error}" if obs.last_action_error else ""
        print(
            f" [STEP] step={step_num} action={action_type:12s} "
            f"{detail:30s} reward={reward:+.3f}{err}"
        )

        history.append(f"Step {step_num}: {action_type} → {reward:+.3f}")

        if result.done:
            final_score = result.info.get("final_score", 0.0)
            break

    # Force submit if the budget ran out before the episode terminated.
    if not env.done:
        result = env.step(RepoAction(action_type="submit"))
        final_score = result.info.get("final_score", 0.0)
        rewards.append(result.reward)

    # Summary
    total_reward = sum(rewards)
    total_steps = len(rewards)
    success = final_score >= 0.5  # success threshold for the printed flag only

    print()
    print(f" [END] success={str(success).lower()} steps={total_steps} "
          f"score={final_score:.3f} total_reward={total_reward:.3f}")
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(f" [END] rewards={rewards_str}")

    # Evaluation summary (only printed when the env exposes a composite score).
    ev = env.get_evaluation()
    if "composite_score" in ev:
        print(f"\n 📊 Evaluation:")
        print(f" Composite: {ev['composite_score']:.3f}")
        for name, dim in ev.get("dimensions", {}).items():
            print(f" {name:15s}: {dim['score']:.3f}")

    return final_score, total_steps, rewards
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
def main():
    """CLI entry point: run the agent on one task or on all three."""
    parser = argparse.ArgumentParser(description="Run agent against OpenEnv codebase-nav")
    parser.add_argument("--task", default="task1", choices=["task1", "task2", "task3"])
    parser.add_argument("--all-tasks", action="store_true", help="Run all 3 tasks")
    parser.add_argument("--llm", action="store_true", help="Use HF LLM agent (needs HF_TOKEN)")
    args = parser.parse_args()

    task_list = ["task1", "task2", "task3"] if args.all_tasks else [args.task]

    env = CodebaseNavEnvironment()
    # run_episode returns (score, steps, rewards); only the score is aggregated.
    scores = [run_episode(env, t, use_llm=args.llm)[0] for t in task_list]

    # Print an aggregate only for multi-task runs.
    if len(scores) > 1:
        avg = sum(scores) / len(scores)
        print(f"\n{'='*60}")
        print(f" OVERALL: avg_score={avg:.3f} tasks={len(scores)}")
        print(f"{'='*60}")

    env.close()


if __name__ == "__main__":
    main()
|
test_e2e.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""Quick E2E test for the deployed HF Space.

Hits the live Space's OpenEnv HTTP API in a fixed order (reset → step →
submit → trajectory/evaluate/metrics), counting passes. The sequence is
stateful: tests 3–8 operate on the episode started by test 2, so the order
of the calls below matters.
"""
import httpx, json, sys

BASE = "https://Chirag0123-codebase-nav-env.hf.space"
client = httpx.Client(timeout=120.0)  # generous timeout: the Space may cold-start
ok = 0  # running count of passed checks

def test(label, fn):
    """Run *fn*, count it as a pass unless it raises, and print the outcome."""
    global ok
    try:
        result = fn()
        ok += 1
        print(f" ✅ {label}: {json.dumps(result)[:200]}")
    except Exception as e:
        print(f" ❌ {label}: {e}")

print("Testing deployed Space...")

# 1. Health
test("Health", lambda: client.get(f"{BASE}/health").json())

# 2. Reset — starts the episode that tests 3–8 act on.
# (The walrus lambdas below evaluate the request, then return a short summary.)
test("Reset task1", lambda: (r := client.post(f"{BASE}/reset", params={"task": "task1"}).json(), r["info"]["variant_id"])[1])

# 3. Read file — reads the first path reported by /state's repo_tree.
test("Read file", lambda: (r := client.post(f"{BASE}/step", json={"action_type": "read_file", "path": client.get(f"{BASE}/state").json()["observation"]["repo_tree"][0]}).json(), f"reward={r['reward']}")[1])

# 4. Run tests
test("Run tests", lambda: (r := client.post(f"{BASE}/step", json={"action_type":"run_tests"}).json(), f"reward={r['reward']}")[1])

# 5. Submit — ends the episode; later calls read its final state.
test("Submit", lambda: (r := client.post(f"{BASE}/step", json={"action_type":"submit"}).json(), f"score={r['info']['final_score']}")[1])

# 6. Trajectory
test("Trajectory", lambda: (r := client.get(f"{BASE}/trajectory").json(), f"steps={r['total_steps']}")[1])

# 7. Evaluate
test("Evaluate", lambda: (r := client.get(f"{BASE}/evaluate").json(), f"composite={r['composite_score']}")[1])

# 8. Metrics
test("Metrics", lambda: (r := client.get(f"{BASE}/metrics").json(), f"efficiency={r['step_efficiency']}")[1])

# 9. Fault config — enables light fault injection for the next reset.
test("Fault config", lambda: client.post(f"{BASE}/fault-config", json={"level":"light"}).json())

# 10. Reset with faults
test("Reset+faults", lambda: (r := client.post(f"{BASE}/reset", params={"task":"task2"}).json(), f"faults={len(r['info'].get('fault_injection',{}).get('faults_injected',[]))}")[1])

# 11. Disable faults — restore default config for subsequent runs.
test("Disable faults", lambda: client.post(f"{BASE}/fault-config", json={"level":"none"}).json())

print(f"\n{'='*50}")
print(f" Result: {ok}/11 tests passed")
print(f"{'='*50}")
client.close()
|