Spaces:
Sleeping
Sleeping
Commit ·
90fc756
0
Parent(s):
Build SQL Query Reviewer environment
Browse files- .dockerignore +14 -0
- .github/workflows/ci.yml +40 -0
- .github/workflows/sync-to-hf.yml +36 -0
- .gitignore +18 -0
- Dockerfile +20 -0
- README.md +175 -0
- client.py +4 -0
- files/00-winning-plan.md +200 -0
- files/01-problem-statement.md +32 -0
- files/02-requirements.md +58 -0
- files/03-information-architecture.md +66 -0
- files/04-system-architecture.md +54 -0
- files/05-database-schema.md +52 -0
- files/06-api-contracts.md +96 -0
- files/07-monorepo-structure.md +65 -0
- files/08-computation-engine-spec.md +86 -0
- files/09-engineering-scope-definition.md +39 -0
- files/10-development-phases.md +48 -0
- files/11-environment-and-devops.md +77 -0
- files/12-testing-strategy.md +52 -0
- files/architecture-diagram.md +61 -0
- files/project-design.md +40 -0
- files/project-readme.md +91 -0
- inference.py +131 -0
- models.py +22 -0
- openenv.yaml +23 -0
- pyproject.toml +39 -0
- server/__init__.py +5 -0
- server/app.py +59 -0
- server/environment.py +185 -0
- server/grader.py +91 -0
- server/reward.py +36 -0
- sql_query_reviewer/__init__.py +25 -0
- sql_query_reviewer/client.py +95 -0
- sql_query_reviewer/models.py +99 -0
- tasks/easy_tasks.json +148 -0
- tasks/hard_tasks.json +158 -0
- tasks/medium_tasks.json +152 -0
- tests/test_api.py +93 -0
- tests/test_grader.py +38 -0
- tests/test_inference.py +82 -0
- tests/test_models.py +21 -0
- uv.lock +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.github
|
| 3 |
+
.pytest_cache
|
| 4 |
+
.venv
|
| 5 |
+
__pycache__
|
| 6 |
+
*.pyc
|
| 7 |
+
.coverage
|
| 8 |
+
build
|
| 9 |
+
dist
|
| 10 |
+
files
|
| 11 |
+
htmlcov
|
| 12 |
+
outputs
|
| 13 |
+
tests
|
| 14 |
+
|
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: ["main"]
|
| 6 |
+
pull_request:
|
| 7 |
+
|
| 8 |
+
jobs:
|
| 9 |
+
test:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
|
| 12 |
+
steps:
|
| 13 |
+
- name: Check out repository
|
| 14 |
+
uses: actions/checkout@v4
|
| 15 |
+
|
| 16 |
+
- name: Set up Python
|
| 17 |
+
uses: actions/setup-python@v5
|
| 18 |
+
with:
|
| 19 |
+
python-version: "3.11"
|
| 20 |
+
|
| 21 |
+
- name: Install package and test dependencies
|
| 22 |
+
run: |
|
| 23 |
+
python -m pip install --upgrade pip
|
| 24 |
+
python -m pip install -e .[dev]
|
| 25 |
+
|
| 26 |
+
- name: Run unit and integration tests
|
| 27 |
+
run: pytest
|
| 28 |
+
|
| 29 |
+
- name: Build Docker image
|
| 30 |
+
run: docker build -t sql-query-reviewer .
|
| 31 |
+
|
| 32 |
+
- name: Attempt OpenEnv validation
|
| 33 |
+
run: |
|
| 34 |
+
python -m pip install "git+https://github.com/meta-pytorch/OpenEnv.git" || true
|
| 35 |
+
if command -v openenv >/dev/null 2>&1; then
|
| 36 |
+
openenv validate
|
| 37 |
+
else
|
| 38 |
+
echo "OpenEnv CLI unavailable; skipping openenv validate"
|
| 39 |
+
fi
|
| 40 |
+
|
.github/workflows/sync-to-hf.yml
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Sync To Hugging Face
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: ["main"]
|
| 6 |
+
workflow_dispatch:
|
| 7 |
+
|
| 8 |
+
jobs:
|
| 9 |
+
sync-to-space:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
|
| 12 |
+
steps:
|
| 13 |
+
- name: Check out repository
|
| 14 |
+
uses: actions/checkout@v4
|
| 15 |
+
with:
|
| 16 |
+
fetch-depth: 0
|
| 17 |
+
lfs: true
|
| 18 |
+
|
| 19 |
+
- name: Push repository to Hugging Face Space
|
| 20 |
+
env:
|
| 21 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 22 |
+
HF_SPACE_ID: ${{ vars.HF_SPACE_ID }}
|
| 23 |
+
GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
|
| 24 |
+
run: |
|
| 25 |
+
set -euo pipefail
|
| 26 |
+
if [ -z "${HF_TOKEN:-}" ]; then
|
| 27 |
+
echo "HF_TOKEN is not configured; skipping Hugging Face sync."
|
| 28 |
+
exit 0
|
| 29 |
+
fi
|
| 30 |
+
|
| 31 |
+
space_id="${HF_SPACE_ID:-${GITHUB_REPOSITORY_OWNER}/sql-query-reviewer}"
|
| 32 |
+
|
| 33 |
+
git config user.email "actions@github.com"
|
| 34 |
+
git config user.name "github-actions[bot]"
|
| 35 |
+
git remote add hf "https://oauth2:${HF_TOKEN}@huggingface.co/spaces/${space_id}"
|
| 36 |
+
git push --force hf HEAD:main
|
.gitignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
.coverage
|
| 3 |
+
.mypy_cache/
|
| 4 |
+
.pytest_cache/
|
| 5 |
+
.ruff_cache/
|
| 6 |
+
.venv/
|
| 7 |
+
*.egg-info/
|
| 8 |
+
*.pyc
|
| 9 |
+
build/
|
| 10 |
+
dist/
|
| 11 |
+
htmlcov/
|
| 12 |
+
outputs/
|
| 13 |
+
.env
|
| 14 |
+
.env.*
|
| 15 |
+
.DS_Store
|
| 16 |
+
.idea/
|
| 17 |
+
.vscode/
|
| 18 |
+
|
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
+
PYTHONUNBUFFERED=1 \
|
| 5 |
+
PORT=8000
|
| 6 |
+
|
| 7 |
+
WORKDIR /app
|
| 8 |
+
|
| 9 |
+
COPY pyproject.toml README.md models.py client.py openenv.yaml inference.py ./
|
| 10 |
+
COPY sql_query_reviewer ./sql_query_reviewer
|
| 11 |
+
COPY server ./server
|
| 12 |
+
COPY tasks ./tasks
|
| 13 |
+
|
| 14 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 15 |
+
pip install --no-cache-dir .
|
| 16 |
+
|
| 17 |
+
EXPOSE 8000
|
| 18 |
+
|
| 19 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 20 |
+
|
README.md
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: SQL Query Reviewer
|
| 3 |
+
colorFrom: blue
|
| 4 |
+
colorTo: green
|
| 5 |
+
sdk: docker
|
| 6 |
+
app_port: 8000
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# SQL Query Reviewer
|
| 11 |
+
|
| 12 |
+
`Meta-hackathon` is the GitHub source repo for `sql-query-reviewer`, an OpenEnv-style environment where an agent reviews SQL queries for correctness, performance, and security issues.
|
| 13 |
+
|
| 14 |
+
The same repository is designed to work in both places:
|
| 15 |
+
- GitHub is the canonical source, CI surface, and collaboration home.
|
| 16 |
+
- Hugging Face Spaces runs the Dockerized FastAPI environment directly from this repo layout.
|
| 17 |
+
|
| 18 |
+
## What The Environment Does
|
| 19 |
+
|
| 20 |
+
Each episode gives the agent:
|
| 21 |
+
- a SQL query
|
| 22 |
+
- schema context when it matters
|
| 23 |
+
- a short explanation of the query's intended purpose
|
| 24 |
+
|
| 25 |
+
The agent responds step by step with one of four actions:
|
| 26 |
+
- `identify_issue`
|
| 27 |
+
- `suggest_fix`
|
| 28 |
+
- `approve`
|
| 29 |
+
- `request_more_context`
|
| 30 |
+
|
| 31 |
+
Rewards are deterministic and shaped for partial progress:
|
| 32 |
+
- correct issue identification earns severity-weighted reward
|
| 33 |
+
- valid fixes earn bonus reward
|
| 34 |
+
- false positives are penalized
|
| 35 |
+
- approving with missed issues is penalized
|
| 36 |
+
|
| 37 |
+
## Repository Layout
|
| 38 |
+
|
| 39 |
+
```text
|
| 40 |
+
.
|
| 41 |
+
|-- .github/workflows/
|
| 42 |
+
|-- client.py
|
| 43 |
+
|-- Dockerfile
|
| 44 |
+
|-- inference.py
|
| 45 |
+
|-- models.py
|
| 46 |
+
|-- openenv.yaml
|
| 47 |
+
|-- pyproject.toml
|
| 48 |
+
|-- server/
|
| 49 |
+
|-- sql_query_reviewer/
|
| 50 |
+
|-- tasks/
|
| 51 |
+
`-- tests/
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## Task Bank
|
| 55 |
+
|
| 56 |
+
The environment ships with 15 tasks:
|
| 57 |
+
- 5 easy syntax and basic logic reviews
|
| 58 |
+
- 5 medium schema-aware performance reviews
|
| 59 |
+
- 5 hard security and advanced optimization reviews
|
| 60 |
+
|
| 61 |
+
Task data lives in:
|
| 62 |
+
- `tasks/easy_tasks.json`
|
| 63 |
+
- `tasks/medium_tasks.json`
|
| 64 |
+
- `tasks/hard_tasks.json`
|
| 65 |
+
|
| 66 |
+
## Local Development
|
| 67 |
+
|
| 68 |
+
Install dependencies:
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
python -m venv .venv
|
| 72 |
+
.venv\Scripts\activate
|
| 73 |
+
python -m pip install --upgrade pip
|
| 74 |
+
python -m pip install -e .[dev]
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
Run the API locally:
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
uvicorn server.app:app --reload --port 8000
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
Smoke-test the API:
|
| 84 |
+
|
| 85 |
+
```bash
|
| 86 |
+
curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d "{\"task_id\":\"easy_001\"}"
|
| 87 |
+
curl http://localhost:8000/state
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
Run tests:
|
| 91 |
+
|
| 92 |
+
```bash
|
| 93 |
+
pytest
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
Build the container:
|
| 97 |
+
|
| 98 |
+
```bash
|
| 99 |
+
docker build -t sql-query-reviewer .
|
| 100 |
+
docker run -p 8000:8000 sql-query-reviewer
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
## Inference Script
|
| 104 |
+
|
| 105 |
+
`inference.py` uses the OpenAI Python client against any OpenAI-compatible endpoint.
|
| 106 |
+
|
| 107 |
+
Expected environment variables:
|
| 108 |
+
|
| 109 |
+
```bash
|
| 110 |
+
set ENV_BASE_URL=http://localhost:8000
|
| 111 |
+
set API_BASE_URL=https://router.huggingface.co/v1
|
| 112 |
+
set MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 113 |
+
set HF_TOKEN=hf_xxx
|
| 114 |
+
python inference.py
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
The script emits structured logs using:
|
| 118 |
+
- `[START]`
|
| 119 |
+
- `[STEP]`
|
| 120 |
+
- `[END]`
|
| 121 |
+
|
| 122 |
+
## Hugging Face Spaces
|
| 123 |
+
|
| 124 |
+
This repo is Space-ready because:
|
| 125 |
+
- the README starts with Hugging Face YAML front matter
|
| 126 |
+
- the repo includes a root `Dockerfile`
|
| 127 |
+
- the API listens on port `8000`
|
| 128 |
+
|
| 129 |
+
To deploy manually from a local machine with git:
|
| 130 |
+
|
| 131 |
+
```bash
|
| 132 |
+
git remote add hf https://huggingface.co/spaces/<hf-username>/sql-query-reviewer
|
| 133 |
+
git push hf main
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
If you install the OpenEnv CLI, you can also use:
|
| 137 |
+
|
| 138 |
+
```bash
|
| 139 |
+
python -m pip install "git+https://github.com/meta-pytorch/OpenEnv.git"
|
| 140 |
+
openenv push --repo-id <hf-username>/sql-query-reviewer
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
## GitHub Actions
|
| 144 |
+
|
| 145 |
+
CI runs tests and a Docker build on pushes and pull requests.
|
| 146 |
+
|
| 147 |
+
The Hugging Face sync workflow expects:
|
| 148 |
+
- GitHub secret `HF_TOKEN`
|
| 149 |
+
- optional GitHub variable `HF_SPACE_ID`
|
| 150 |
+
|
| 151 |
+
If `HF_SPACE_ID` is not set, the workflow defaults to:
|
| 152 |
+
|
| 153 |
+
```text
|
| 154 |
+
<github-repository-owner>/sql-query-reviewer
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
## Usage Example
|
| 158 |
+
|
| 159 |
+
```python
|
| 160 |
+
from sql_query_reviewer import SQLReviewAction, SQLReviewEnv
|
| 161 |
+
|
| 162 |
+
with SQLReviewEnv(base_url="http://localhost:8000").sync() as env:
|
| 163 |
+
result = env.reset(task_id="easy_001")
|
| 164 |
+
result = env.step(
|
| 165 |
+
SQLReviewAction(
|
| 166 |
+
action_type="identify_issue",
|
| 167 |
+
issue_category="syntax",
|
| 168 |
+
issue_description="SELCT is misspelled and should be SELECT",
|
| 169 |
+
suggested_fix="SELECT * FROM users WHERE id = 1;",
|
| 170 |
+
confidence=0.98,
|
| 171 |
+
)
|
| 172 |
+
)
|
| 173 |
+
print(result.reward)
|
| 174 |
+
print(result.observation.feedback)
|
| 175 |
+
```
|
client.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sql_query_reviewer.client import SQLReviewEnv, SyncSQLReviewEnv
|
| 2 |
+
|
| 3 |
+
__all__ = ["SQLReviewEnv", "SyncSQLReviewEnv"]
|
| 4 |
+
|
files/00-winning-plan.md
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenEnv Hackathon — Winning Plan
|
| 2 |
+
|
| 3 |
+
**Participant:** Ravi (Solo)
|
| 4 |
+
**Deadline:** April 12, 2026, 11:59 PM IST
|
| 5 |
+
**Goal:** Top 3,000 out of 20,000 teams → Finale April 25–26
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Chosen Domain: **SQL Query Optimizer Review**
|
| 10 |
+
|
| 11 |
+
An environment where an AI agent reviews SQL queries for correctness, performance, and security issues — then suggests fixes. This scores high on real-world utility (30% weight), is novel in OpenEnv, has natural difficulty progression, and produces clear measurable rewards.
|
| 12 |
+
|
| 13 |
+
**Why this wins:**
|
| 14 |
+
- Every engineering team at Meta deals with SQL/data pipelines daily — maximum relevance
|
| 15 |
+
- Clear grading: each query has known issues, agent either finds them or doesn't → partial credit is natural
|
| 16 |
+
- Difficulty scales cleanly: syntax errors (easy) → performance anti-patterns (medium) → subtle injection vulnerabilities + schema-aware optimization (hard)
|
| 17 |
+
- Novel domain not seen in existing OpenEnv environments (creativity 10%)
|
| 18 |
+
- Deterministic grading with score variance (agents that find more issues score higher)
|
| 19 |
+
|
| 20 |
+
---
|
| 21 |
+
|
| 22 |
+
## Timeline
|
| 23 |
+
|
| 24 |
+
| When | What |
|
| 25 |
+
|---|---|
|
| 26 |
+
| **Apr 10, Morning** | Complete prep modules 1-4 on Colab, watch bootcamp recording |
|
| 27 |
+
| **Apr 10, Afternoon** | Install prerequisites, study sample inference script, study echo env code |
|
| 28 |
+
| **Apr 10, Evening** | Scaffold project with `openenv init`, define Pydantic models, implement core env logic |
|
| 29 |
+
| **Apr 11, Morning** | Implement 3 tasks (easy/medium/hard) with graders and reward functions |
|
| 30 |
+
| **Apr 11, Afternoon** | Write `inference.py`, test locally, iterate on reward shaping |
|
| 31 |
+
| **Apr 11, Evening** | Dockerize, deploy to HF Spaces, run pre-validation script |
|
| 32 |
+
| **Apr 12, Morning** | Write README, final testing, fix issues |
|
| 33 |
+
| **Apr 12, Afternoon** | Final pre-validation, submit |
|
| 34 |
+
| **Apr 12, Before 11:59 PM** | Verify HF Space is live and responding |
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## Phase 0: Preparation (Today — First 3 Hours)
|
| 39 |
+
|
| 40 |
+
### Step 1: Complete Prep Course Modules
|
| 41 |
+
- Module 1: Interface basics (`reset()`, `step()`, `state()`)
|
| 42 |
+
- Module 2: Using existing environments, typed models
|
| 43 |
+
- Module 3: Deployment to HF Spaces with `openenv push`
|
| 44 |
+
- Module 4: **Building your own environment** — most critical, take detailed notes
|
| 45 |
+
|
| 46 |
+
### Step 2: Watch Bootcamp Recording
|
| 47 |
+
- Note tips from Ben Burtenshaw (HF) and Pulkit Aneja about what judges look for
|
| 48 |
+
|
| 49 |
+
### Step 3: Install Prerequisites
|
| 50 |
+
```bash
|
| 51 |
+
pip install openenv-core huggingface_hub openai pydantic
|
| 52 |
+
pip install docker # or ensure Docker Desktop is running
|
| 53 |
+
huggingface-cli login
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
### Step 4: Study the Sample Inference Script
|
| 57 |
+
- Memorize the `[START]`, `[STEP]`, `[END]` stdout format
|
| 58 |
+
- Any deviation in field names/ordering = incorrect evaluation scoring
|
| 59 |
+
|
| 60 |
+
### Step 5: Study Existing Environments
|
| 61 |
+
- Clone `https://github.com/meta-pytorch/OpenEnv`
|
| 62 |
+
- Study `envs/echo_env/` structure: models.py, client.py, server/environment.py, server/app.py, server/Dockerfile
|
| 63 |
+
|
| 64 |
+
---
|
| 65 |
+
|
| 66 |
+
## Phase 1: Build the Environment
|
| 67 |
+
|
| 68 |
+
### Project Structure
|
| 69 |
+
```
|
| 70 |
+
sql-query-reviewer/
|
| 71 |
+
├── openenv.yaml
|
| 72 |
+
├── models.py # Action, Observation, State Pydantic models
|
| 73 |
+
├── client.py # EnvClient subclass
|
| 74 |
+
├── inference.py # Baseline inference script (root!)
|
| 75 |
+
├── README.md
|
| 76 |
+
├── tasks/
|
| 77 |
+
│ ├── easy_tasks.json # Syntax error queries
|
| 78 |
+
│ ├── medium_tasks.json # Performance anti-pattern queries
|
| 79 |
+
│ └── hard_tasks.json # Security + schema-aware optimization queries
|
| 80 |
+
└── server/
|
| 81 |
+
├── environment.py # Core environment logic
|
| 82 |
+
├── grader.py # Deterministic grading functions
|
| 83 |
+
├── app.py # FastAPI server
|
| 84 |
+
├── Dockerfile
|
| 85 |
+
└── requirements.txt
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### Pydantic Models Design
|
| 89 |
+
|
| 90 |
+
**Observation:**
|
| 91 |
+
- `query`: The SQL query to review
|
| 92 |
+
- `schema_info`: Table/column definitions (for medium/hard tasks)
|
| 93 |
+
- `context`: What the query is supposed to do
|
| 94 |
+
- `issues_found_so_far`: List of issues already identified
|
| 95 |
+
- `remaining_actions`: How many review steps remain
|
| 96 |
+
- `difficulty`: easy | medium | hard
|
| 97 |
+
|
| 98 |
+
**Action:**
|
| 99 |
+
- `action_type`: "identify_issue" | "suggest_fix" | "approve" | "request_more_context"
|
| 100 |
+
- `issue_category`: "syntax" | "performance" | "security" | "logic" | "style"
|
| 101 |
+
- `issue_description`: Free text description of the issue
|
| 102 |
+
- `suggested_fix`: The corrected SQL (optional)
|
| 103 |
+
- `confidence`: Float 0.0-1.0
|
| 104 |
+
|
| 105 |
+
**Reward:** Float 0.0-1.0 with partial credit
|
| 106 |
+
|
| 107 |
+
### Three Tasks with Progressive Difficulty
|
| 108 |
+
|
| 109 |
+
**Task 1 — Easy: Syntax & Basic Logic Errors**
|
| 110 |
+
- Queries with missing keywords, wrong joins, typos in column names
|
| 111 |
+
- Agent identifies each error → 0.2 reward per correct identification
|
| 112 |
+
- Suggesting a valid fix → bonus 0.1 per fix
|
| 113 |
+
- Expected baseline score: 0.7-0.9
|
| 114 |
+
|
| 115 |
+
**Task 2 — Medium: Performance Anti-Patterns**
|
| 116 |
+
- SELECT *, missing indexes, N+1 patterns, unnecessary subqueries, missing WHERE clauses on large tables
|
| 117 |
+
- Requires understanding schema context
|
| 118 |
+
- Agent identifies anti-pattern + suggests optimization → partial credit
|
| 119 |
+
- Expected baseline score: 0.4-0.6
|
| 120 |
+
|
| 121 |
+
**Task 3 — Hard: Security Vulnerabilities + Schema-Aware Optimization**
|
| 122 |
+
- SQL injection vectors, privilege escalation, data leakage, plus complex optimization (query plan awareness)
|
| 123 |
+
- Requires multi-step reasoning about schema relationships
|
| 124 |
+
- Expected baseline score: 0.2-0.4
|
| 125 |
+
|
| 126 |
+
### Reward Function Design
|
| 127 |
+
- Per-step rewards (not just end-of-episode)
|
| 128 |
+
- Correct issue identification: +0.2 (scaled by issue severity)
|
| 129 |
+
- Valid fix suggestion: +0.1
|
| 130 |
+
- False positive (flagging non-issue): -0.1
|
| 131 |
+
- Missing critical issue at episode end: -0.15
|
| 132 |
+
- Approving a query with unfound issues: -0.2
|
| 133 |
+
- Smooth, informative signal throughout the trajectory
|
| 134 |
+
|
| 135 |
+
### Grader Design
|
| 136 |
+
- Each task has a ground-truth list of issues with categories and severity
|
| 137 |
+
- Grader compares agent's identified issues against ground truth using fuzzy matching on descriptions
|
| 138 |
+
- Score = (correctly_identified × severity_weight) / total_possible_score
|
| 139 |
+
- Deterministic: same agent output → same score every time
|
| 140 |
+
- Returns float in [0.0, 1.0]
|
| 141 |
+
- Never returns the same score for all inputs (variety of queries ensures variance)
|
| 142 |
+
|
| 143 |
+
---
|
| 144 |
+
|
| 145 |
+
## Phase 2: Inference Script
|
| 146 |
+
|
| 147 |
+
Key requirements:
|
| 148 |
+
- Named `inference.py` in root directory
|
| 149 |
+
- Uses OpenAI Client for all LLM calls
|
| 150 |
+
- Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from env vars
|
| 151 |
+
- Emits `[START]`, `[STEP]`, `[END]` logs exactly per spec
|
| 152 |
+
- Completes in <20 minutes on 2 vCPU, 8GB RAM
|
| 153 |
+
- Reproducible scores
|
| 154 |
+
|
| 155 |
+
---
|
| 156 |
+
|
| 157 |
+
## Phase 3: Containerize & Deploy
|
| 158 |
+
|
| 159 |
+
```bash
|
| 160 |
+
# Build and test locally
|
| 161 |
+
docker build -t sql-query-reviewer ./server
|
| 162 |
+
docker run -p 8000:8000 sql-query-reviewer
|
| 163 |
+
|
| 164 |
+
# Verify endpoints
|
| 165 |
+
curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{}'
|
| 166 |
+
|
| 167 |
+
# Deploy to HF Spaces
|
| 168 |
+
openenv push --repo-id ravi/sql-query-reviewer
|
| 169 |
+
|
| 170 |
+
# Verify deployed version
|
| 171 |
+
curl -X POST https://ravi-sql-query-reviewer.hf.space/reset
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
---
|
| 175 |
+
|
| 176 |
+
## Phase 4: Pre-Submission QA
|
| 177 |
+
|
| 178 |
+
Run pre-validation script:
|
| 179 |
+
```bash
|
| 180 |
+
./validate-submission.sh https://ravi-sql-query-reviewer.hf.space .
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
Checklist:
|
| 184 |
+
- [ ] HF Space deploys and responds to `/reset` with 200
|
| 185 |
+
- [ ] `openenv validate` passes
|
| 186 |
+
- [ ] Dockerfile builds cleanly
|
| 187 |
+
- [ ] Inference script runs without errors, produces scores
|
| 188 |
+
- [ ] 3+ tasks, each grader returns scores in 0.0-1.0 range
|
| 189 |
+
- [ ] Scores are reproducible across runs
|
| 190 |
+
- [ ] README is compelling and complete
|
| 191 |
+
|
| 192 |
+
---
|
| 193 |
+
|
| 194 |
+
## Winning Differentiators
|
| 195 |
+
|
| 196 |
+
1. **Real-world utility (30%)**: SQL review is something every data team needs — immediate value for the RL/agent community
|
| 197 |
+
2. **Score variance**: Different agent capabilities produce meaningfully different scores — a basic agent catches syntax errors but misses security issues
|
| 198 |
+
3. **Reward shaping**: Per-step partial credit signals, not binary end-of-episode
|
| 199 |
+
4. **Novelty**: No SQL review environment exists in OpenEnv yet
|
| 200 |
+
5. **Spec compliance**: Bulletproof adherence to every technical requirement — this alone eliminates most competitors
|
files/01-problem-statement.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 01 — Problem Statement & Domain Selection
|
| 2 |
+
|
| 3 |
+
## Domain: SQL Query Review Environment
|
| 4 |
+
|
| 5 |
+
### The Real-World Problem
|
| 6 |
+
Every software team reviews SQL queries — in code reviews, database migrations, ETL pipeline audits, and security assessments. This is a genuine, high-frequency task that requires:
|
| 7 |
+
- Pattern recognition (anti-patterns, vulnerabilities)
|
| 8 |
+
- Domain knowledge (schema relationships, indexing strategies)
|
| 9 |
+
- Multi-step reasoning (understanding query intent before evaluating correctness)
|
| 10 |
+
|
| 11 |
+
### Why This Domain Wins
|
| 12 |
+
|
| 13 |
+
| Evaluation Criteria | Weight | How We Score |
|
| 14 |
+
|---|---|---|
|
| 15 |
+
| Real-world utility | 30% | SQL review is universal — Meta runs millions of queries daily. Fills a real gap in agent evaluation. |
|
| 16 |
+
| Task & grader quality | 25% | Clear ground truth per query, deterministic grading, natural difficulty progression |
|
| 17 |
+
| Environment design | 20% | Clean state (per-query episode), rich observations, well-typed actions, per-step rewards |
|
| 18 |
+
| Code quality & spec compliance | 15% | Full OpenEnv spec, clean project structure, Docker, typed models |
|
| 19 |
+
| Creativity & novelty | 10% | No SQL review env exists in OpenEnv. Reward design uses severity-weighted partial credit. |
|
| 20 |
+
|
| 21 |
+
### What the Agent Does
|
| 22 |
+
1. Receives a SQL query + optional schema context
|
| 23 |
+
2. Reviews it step-by-step, identifying issues (syntax, performance, security, logic)
|
| 24 |
+
3. Suggests fixes for each identified issue
|
| 25 |
+
4. Decides when to approve or flag the query
|
| 26 |
+
5. Gets rewarded for correctly identified issues and penalized for false positives
|
| 27 |
+
|
| 28 |
+
### Scope Boundaries
|
| 29 |
+
- **In scope**: SELECT, INSERT, UPDATE, DELETE queries; joins; subqueries; CTEs; window functions
|
| 30 |
+
- **Out of scope**: Stored procedures, database-specific dialect features, real database execution
|
| 31 |
+
- **Episode length**: 3-8 steps depending on query complexity
|
| 32 |
+
- **No external dependencies**: All query analysis is rule-based and deterministic
|
files/02-requirements.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 02 — Requirements Specification
|
| 2 |
+
|
| 3 |
+
## Functional Requirements
|
| 4 |
+
|
| 5 |
+
### FR-1: Real-World Task Simulation
|
| 6 |
+
- Simulates SQL query review — a task humans do daily in engineering teams
|
| 7 |
+
- No games, no toys — purely professional/practical domain
|
| 8 |
+
|
| 9 |
+
### FR-2: OpenEnv Spec Compliance
|
| 10 |
+
- Typed Pydantic models for Observation, Action, State
|
| 11 |
+
- `step(action)` → returns observation, reward, done, info
|
| 12 |
+
- `reset()` → returns initial observation
|
| 13 |
+
- `state()` → returns current internal state
|
| 14 |
+
- Valid `openenv.yaml` with metadata
|
| 15 |
+
- Passes `openenv validate`
|
| 16 |
+
|
| 17 |
+
### FR-3: Minimum 3 Tasks with Agent Graders
|
| 18 |
+
- **Task 1 (Easy):** Syntax & basic logic errors — expected agent score 0.7-0.9
|
| 19 |
+
- **Task 2 (Medium):** Performance anti-patterns — expected agent score 0.4-0.6
|
| 20 |
+
- **Task 3 (Hard):** Security vulnerabilities + schema-aware optimization — expected agent score 0.2-0.4
|
| 21 |
+
- Each grader: deterministic, returns float in [0.0, 1.0], reproducible
|
| 22 |
+
|
| 23 |
+
### FR-4: Meaningful Reward Function
|
| 24 |
+
- Per-step rewards (not just end-of-episode binary)
|
| 25 |
+
- Partial credit for partial issue identification
|
| 26 |
+
- Penalties for false positives and missed critical issues
|
| 27 |
+
- Smooth signal that guides learning
|
| 28 |
+
|
| 29 |
+
### FR-5: Baseline Inference Script
|
| 30 |
+
- Named `inference.py` in project root
|
| 31 |
+
- Uses OpenAI Client for LLM calls
|
| 32 |
+
- Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from env vars
|
| 33 |
+
- Emits `[START]`, `[STEP]`, `[END]` structured stdout logs
|
| 34 |
+
- Produces reproducible baseline scores on all 3 tasks
|
| 35 |
+
|
| 36 |
+
## Non-Functional Requirements
|
| 37 |
+
|
| 38 |
+
### NFR-1: Deploys to Hugging Face Space
|
| 39 |
+
- Containerized HF Space tagged with `openenv`
|
| 40 |
+
- Returns 200 and responds to `/reset` POST
|
| 41 |
+
|
| 42 |
+
### NFR-2: Containerized Execution
|
| 43 |
+
- Working Dockerfile
|
| 44 |
+
- Builds with `docker build`, runs with `docker run`
|
| 45 |
+
- Starts cleanly, responds to HTTP requests
|
| 46 |
+
|
| 47 |
+
### NFR-3: Infrastructure Constraints
|
| 48 |
+
- Inference script runtime < 20 minutes
|
| 49 |
+
- Runs on 2 vCPU, 8GB RAM machine
|
| 50 |
+
|
| 51 |
+
### NFR-4: Documentation
|
| 52 |
+
- README with: environment description, motivation, action/observation space definitions, task descriptions with difficulty, setup instructions, baseline scores
|
| 53 |
+
|
| 54 |
+
## Disqualification Criteria (Must Avoid)
|
| 55 |
+
- ❌ Environment does not deploy or respond
|
| 56 |
+
- ❌ Plagiarized or trivially modified existing environments
|
| 57 |
+
- ❌ Graders that always return the same score
|
| 58 |
+
- ❌ No baseline inference script
|
files/03-information-architecture.md
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 03 — Information Architecture
|
| 2 |
+
|
| 3 |
+
## Data Flow
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
[Task JSON] → reset() → [Observation: query + schema + context]
|
| 7 |
+
↓
|
| 8 |
+
Agent decides action
|
| 9 |
+
↓
|
| 10 |
+
step(Action) → [Observation + Reward + Done]
|
| 11 |
+
↓
|
| 12 |
+
(repeat until done or max_steps)
|
| 13 |
+
↓
|
| 14 |
+
close() → Grader computes final score
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
## Task Data Structure
|
| 18 |
+
|
| 19 |
+
Each task is a JSON object:
|
| 20 |
+
```json
|
| 21 |
+
{
|
| 22 |
+
"task_id": "easy_001",
|
| 23 |
+
"difficulty": "easy",
|
| 24 |
+
"query": "SELCT * FORM users WEHRE id = 1",
|
| 25 |
+
"schema": {
|
| 26 |
+
"users": {"id": "INT PRIMARY KEY", "name": "VARCHAR(255)", "email": "VARCHAR(255)"}
|
| 27 |
+
},
|
| 28 |
+
"context": "Fetch user by ID for profile page",
|
| 29 |
+
"ground_truth_issues": [
|
| 30 |
+
{"category": "syntax", "description": "SELCT should be SELECT", "severity": 0.3, "fix": "SELECT"},
|
| 31 |
+
{"category": "syntax", "description": "FORM should be FROM", "severity": 0.3, "fix": "FROM"},
|
| 32 |
+
{"category": "syntax", "description": "WEHRE should be WHERE", "severity": 0.3, "fix": "WHERE"},
|
| 33 |
+
{"category": "performance", "description": "SELECT * fetches unnecessary columns", "severity": 0.1, "fix": "SELECT id, name, email"}
|
| 34 |
+
],
|
| 35 |
+
"max_steps": 5
|
| 36 |
+
}
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## State Management
|
| 40 |
+
|
| 41 |
+
| Field | Type | Description |
|
| 42 |
+
|---|---|---|
|
| 43 |
+
| `task_id` | str | Current task identifier |
|
| 44 |
+
| `query` | str | The SQL query under review |
|
| 45 |
+
| `issues_identified` | list | Issues the agent has found so far |
|
| 46 |
+
| `fixes_suggested` | list | Fixes the agent has proposed |
|
| 47 |
+
| `step_count` | int | Current step number |
|
| 48 |
+
| `total_reward` | float | Accumulated reward |
|
| 49 |
+
| `done` | bool | Whether episode is complete |
|
| 50 |
+
| `approved` | bool | Whether agent approved the query |
|
| 51 |
+
|
| 52 |
+
## Observation Space
|
| 53 |
+
- `query`: The full SQL query text
|
| 54 |
+
- `schema_info`: Dict of table → column definitions (empty for easy tasks)
|
| 55 |
+
- `context`: Natural language description of query intent
|
| 56 |
+
- `issues_found_so_far`: List of previously identified issues in this episode
|
| 57 |
+
- `remaining_actions`: Max steps minus current step
|
| 58 |
+
- `difficulty`: "easy" | "medium" | "hard"
|
| 59 |
+
- `feedback`: Result of last action ("correct identification", "false positive", "already identified", etc.)
|
| 60 |
+
|
| 61 |
+
## Action Space
|
| 62 |
+
- `action_type`: enum — "identify_issue" | "suggest_fix" | "approve" | "request_more_context"
|
| 63 |
+
- `issue_category`: enum — "syntax" | "performance" | "security" | "logic" | "style"
|
| 64 |
+
- `issue_description`: str — what the agent thinks is wrong
|
| 65 |
+
- `suggested_fix`: str (optional) — corrected SQL fragment
|
| 66 |
+
- `confidence`: float 0.0-1.0
|
files/04-system-architecture.md
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 04 — System Architecture
|
| 2 |
+
|
| 3 |
+
## Components
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
┌─────────────────────────────────────────────┐
|
| 7 |
+
│ HF Space │
|
| 8 |
+
│ ┌─────────────────────────────────────┐ │
|
| 9 |
+
│ │ FastAPI Server │ │
|
| 10 |
+
│ │ (app.py — Uvicorn) │ │
|
| 11 |
+
│ │ │ │
|
| 12 |
+
│ │ POST /reset → environment.reset() │ │
|
| 13 |
+
│ │ POST /step → environment.step() │ │
|
| 14 |
+
│ │ GET /state → environment.state() │ │
|
| 15 |
+
│ └──────────┬──────────────────────────┘ │
|
| 16 |
+
│ │ │
|
| 17 |
+
│ ┌──────────▼──────────────────────────┐ │
|
| 18 |
+
│ │ SQLReviewEnvironment │ │
|
| 19 |
+
│ │ - task_bank (easy/medium/hard JSON) │ │
|
| 20 |
+
│ │ - grader (deterministic scoring) │ │
|
| 21 |
+
│ │ - reward_fn (per-step signals) │ │
|
| 22 |
+
│ └─────────────────────────────────────┘ │
|
| 23 |
+
│ │
|
| 24 |
+
│ Dockerfile (Python 3.10-slim + deps) │
|
| 25 |
+
└─────────────────────────────────────────────┘
|
| 26 |
+
|
| 27 |
+
┌─────────────────────────────────────────────┐
|
| 28 |
+
│ inference.py (Client) │
|
| 29 |
+
│ - OpenAI Client → LLM API │
|
| 30 |
+
│ - SQLReviewEnvClient → HF Space │
|
| 31 |
+
│ - Structured stdout logging │
|
| 32 |
+
└─────────────────────────────────────────────┘
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## Technology Stack
|
| 36 |
+
- **Runtime:** Python 3.10+
|
| 37 |
+
- **Framework:** FastAPI + Uvicorn
|
| 38 |
+
- **Models:** Pydantic v2
|
| 39 |
+
- **Container:** Docker (python:3.10-slim base)
|
| 40 |
+
- **Deployment:** Hugging Face Spaces (Docker SDK)
|
| 41 |
+
- **LLM Client:** OpenAI Python SDK
|
| 42 |
+
- **Environment SDK:** openenv-core
|
| 43 |
+
|
| 44 |
+
## Communication Protocol
|
| 45 |
+
- WebSocket at `/ws` for persistent sessions (OpenEnv standard)
|
| 46 |
+
- HTTP POST endpoints as fallback: `/reset`, `/step`
|
| 47 |
+
- HTTP GET: `/state`
|
| 48 |
+
- JSON request/response bodies matching typed Pydantic models
|
| 49 |
+
|
| 50 |
+
## Episode Lifecycle
|
| 51 |
+
1. Client calls `reset(task_id="easy_001")` → server loads task, returns initial observation
|
| 52 |
+
2. Client calls `step(action)` → server validates action, computes reward, returns observation
|
| 53 |
+
3. Repeat until `done=True` (all issues found, agent approves, or max_steps reached)
|
| 54 |
+
4. Client calls `close()` → server runs grader, returns final score
|
files/05-database-schema.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 05 — Task Bank Schema
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
Tasks are stored as JSON files, not a database. Each difficulty level has its own file with 3-5 queries.
|
| 5 |
+
|
| 6 |
+
## Easy Tasks (`tasks/easy_tasks.json`)
|
| 7 |
+
|
| 8 |
+
Queries with obvious syntax errors, wrong keywords, basic logic mistakes. An LLM should score 0.7-0.9.
|
| 9 |
+
|
| 10 |
+
Example queries:
|
| 11 |
+
1. Misspelled keywords (SELCT, FORM, WEHRE)
|
| 12 |
+
2. Missing FROM clause
|
| 13 |
+
3. Wrong column names that don't exist in schema
|
| 14 |
+
4. Missing semicolons / unclosed quotes
|
| 15 |
+
5. Using = NULL instead of IS NULL
|
| 16 |
+
|
| 17 |
+
## Medium Tasks (`tasks/medium_tasks.json`)
|
| 18 |
+
|
| 19 |
+
Queries with performance anti-patterns. Requires understanding schema context. Target score: 0.4-0.6.
|
| 20 |
+
|
| 21 |
+
Example queries:
|
| 22 |
+
1. SELECT * on a 50-column table when only 2 columns needed
|
| 23 |
+
2. Missing index hint on a JOIN with large table
|
| 24 |
+
3. Correlated subquery that could be a JOIN
|
| 25 |
+
4. Missing LIMIT on unbounded query
|
| 26 |
+
5. Redundant DISTINCT on a column with UNIQUE constraint
|
| 27 |
+
|
| 28 |
+
## Hard Tasks (`tasks/hard_tasks.json`)
|
| 29 |
+
|
| 30 |
+
Security vulnerabilities + complex optimization. Target score: 0.2-0.4.
|
| 31 |
+
|
| 32 |
+
Example queries:
|
| 33 |
+
1. String concatenation enabling SQL injection
|
| 34 |
+
2. Privilege escalation via UNION with system tables
|
| 35 |
+
3. Data leakage through unfiltered JOIN exposing PII
|
| 36 |
+
4. Query that could use window functions instead of self-join (10x perf gain)
|
| 37 |
+
5. Missing transaction isolation causing phantom reads
|
| 38 |
+
|
| 39 |
+
## Ground Truth Format
|
| 40 |
+
|
| 41 |
+
Each issue in ground truth:
|
| 42 |
+
```json
|
| 43 |
+
{
|
| 44 |
+
"category": "security",
|
| 45 |
+
"description": "String concatenation in WHERE clause enables SQL injection",
|
| 46 |
+
"severity": 1.0,
|
| 47 |
+
"fix": "Use parameterized query with ? placeholder",
|
| 48 |
+
"keywords": ["injection", "concatenation", "user input", "unsanitized"]
|
| 49 |
+
}
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
The `keywords` field is used by the grader for fuzzy matching against agent responses.
|
files/06-api-contracts.md
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 06 — API Contracts
|
| 2 |
+
|
| 3 |
+
## OpenEnv Standard Endpoints
|
| 4 |
+
|
| 5 |
+
### POST /reset
|
| 6 |
+
**Request:**
|
| 7 |
+
```json
|
| 8 |
+
{"task_id": "easy_001"}
|
| 9 |
+
```
|
| 10 |
+
**Response (StepResult):**
|
| 11 |
+
```json
|
| 12 |
+
{
|
| 13 |
+
"observation": {
|
| 14 |
+
"query": "SELCT * FORM users WEHRE id = 1",
|
| 15 |
+
"schema_info": {"users": {"id": "INT PK", "name": "VARCHAR(255)", "email": "VARCHAR(255)"}},
|
| 16 |
+
"context": "Fetch user by ID for profile page",
|
| 17 |
+
"issues_found_so_far": [],
|
| 18 |
+
"remaining_actions": 5,
|
| 19 |
+
"difficulty": "easy",
|
| 20 |
+
"feedback": "Review this SQL query and identify any issues."
|
| 21 |
+
},
|
| 22 |
+
"reward": 0.0,
|
| 23 |
+
"done": false,
|
| 24 |
+
"info": {}
|
| 25 |
+
}
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
### POST /step
|
| 29 |
+
**Request (Action):**
|
| 30 |
+
```json
|
| 31 |
+
{
|
| 32 |
+
"action_type": "identify_issue",
|
| 33 |
+
"issue_category": "syntax",
|
| 34 |
+
"issue_description": "SELCT is misspelled, should be SELECT",
|
| 35 |
+
"suggested_fix": "SELECT",
|
| 36 |
+
"confidence": 0.95
|
| 37 |
+
}
|
| 38 |
+
```
|
| 39 |
+
**Response (StepResult):**
|
| 40 |
+
```json
|
| 41 |
+
{
|
| 42 |
+
"observation": {
|
| 43 |
+
"query": "SELCT * FORM users WEHRE id = 1",
|
| 44 |
+
"schema_info": {"users": {"id": "INT PK", "name": "VARCHAR(255)", "email": "VARCHAR(255)"}},
|
| 45 |
+
"context": "Fetch user by ID for profile page",
|
| 46 |
+
"issues_found_so_far": [{"category": "syntax", "description": "SELCT should be SELECT"}],
|
| 47 |
+
"remaining_actions": 4,
|
| 48 |
+
"difficulty": "easy",
|
| 49 |
+
"feedback": "Correct! SELCT is indeed a syntax error. 3 issues remaining."
|
| 50 |
+
},
|
| 51 |
+
"reward": 0.25,
|
| 52 |
+
"done": false,
|
| 53 |
+
"info": {"match_type": "exact", "severity": 0.3}
|
| 54 |
+
}
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
### GET /state
|
| 58 |
+
**Response (State):**
|
| 59 |
+
```json
|
| 60 |
+
{
|
| 61 |
+
"task_id": "easy_001",
|
| 62 |
+
"step_count": 1,
|
| 63 |
+
"issues_identified": [{"category": "syntax", "description": "SELCT should be SELECT"}],
|
| 64 |
+
"total_reward": 0.25,
|
| 65 |
+
"done": false,
|
| 66 |
+
"approved": false
|
| 67 |
+
}
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
## Pydantic Models
|
| 71 |
+
|
| 72 |
+
```python
|
| 73 |
+
class SQLReviewAction(Action):
|
| 74 |
+
action_type: Literal["identify_issue", "suggest_fix", "approve", "request_more_context"]
|
| 75 |
+
issue_category: Optional[Literal["syntax", "performance", "security", "logic", "style"]] = None
|
| 76 |
+
issue_description: Optional[str] = None
|
| 77 |
+
suggested_fix: Optional[str] = None
|
| 78 |
+
confidence: float = 0.5
|
| 79 |
+
|
| 80 |
+
class SQLReviewObservation(Observation):
|
| 81 |
+
query: str
|
| 82 |
+
schema_info: Dict[str, Dict[str, str]]
|
| 83 |
+
context: str
|
| 84 |
+
issues_found_so_far: List[Dict[str, str]]
|
| 85 |
+
remaining_actions: int
|
| 86 |
+
difficulty: str
|
| 87 |
+
feedback: str
|
| 88 |
+
|
| 89 |
+
class SQLReviewState(State):
|
| 90 |
+
task_id: str
|
| 91 |
+
step_count: int
|
| 92 |
+
issues_identified: List[Dict[str, str]]
|
| 93 |
+
total_reward: float
|
| 94 |
+
done: bool
|
| 95 |
+
approved: bool
|
| 96 |
+
```
|
files/07-monorepo-structure.md
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 07 — Monorepo Structure
|
| 2 |
+
|
| 3 |
+
```
|
| 4 |
+
sql-query-reviewer/
|
| 5 |
+
│
|
| 6 |
+
├── openenv.yaml # Environment metadata manifest
|
| 7 |
+
├── models.py # Pydantic: SQLReviewAction, SQLReviewObservation, SQLReviewState
|
| 8 |
+
├── client.py # EnvClient subclass for external consumers
|
| 9 |
+
├── inference.py # MANDATORY: Baseline inference script (root directory!)
|
| 10 |
+
├── README.md # Environment documentation
|
| 11 |
+
├── pyproject.toml # Package config
|
| 12 |
+
│
|
| 13 |
+
├── tasks/
|
| 14 |
+
│ ├── easy_tasks.json # 5 syntax/logic error queries
|
| 15 |
+
│ ├── medium_tasks.json # 5 performance anti-pattern queries
|
| 16 |
+
│ └── hard_tasks.json # 5 security + optimization queries
|
| 17 |
+
│
|
| 18 |
+
└── server/
|
| 19 |
+
├── __init__.py
|
| 20 |
+
├── environment.py # SQLReviewEnvironment(Environment) — core logic
|
| 21 |
+
├── grader.py # Deterministic grading: fuzzy match agent output vs ground truth
|
| 22 |
+
├── reward.py # Per-step reward computation
|
| 23 |
+
├── app.py # FastAPI server (create_app with routes)
|
| 24 |
+
├── Dockerfile # Python 3.10-slim, install deps, expose port
|
| 25 |
+
└── requirements.txt # openenv-core, fastapi, uvicorn, pydantic
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Key Files Explained
|
| 29 |
+
|
| 30 |
+
| File | Purpose | Critical? |
|
| 31 |
+
|---|---|---|
|
| 32 |
+
| `openenv.yaml` | Metadata: name, description, author, tasks list | Yes — validated by `openenv validate` |
|
| 33 |
+
| `models.py` | Typed Action/Observation/State contracts | Yes — spec compliance |
|
| 34 |
+
| `inference.py` | Baseline agent using OpenAI Client | Yes — DQ if missing |
|
| 35 |
+
| `server/environment.py` | `reset()`, `step()`, `state()` implementation | Yes — core logic |
|
| 36 |
+
| `server/grader.py` | Score computation per task | Yes — must return 0.0-1.0 |
|
| 37 |
+
| `server/Dockerfile` | Container definition | Yes — must build cleanly |
|
| 38 |
+
| `README.md` | Human-readable documentation | Yes — judges read this first |
|
| 39 |
+
|
| 40 |
+
## openenv.yaml
|
| 41 |
+
|
| 42 |
+
```yaml
|
| 43 |
+
name: sql-query-reviewer
|
| 44 |
+
description: "AI agent reviews SQL queries for correctness, performance, and security"
|
| 45 |
+
author: ravi
|
| 46 |
+
version: "1.0.0"
|
| 47 |
+
tags:
|
| 48 |
+
- openenv
|
| 49 |
+
- sql
|
| 50 |
+
- code-review
|
| 51 |
+
- security
|
| 52 |
+
tasks:
|
| 53 |
+
- id: easy_syntax
|
| 54 |
+
name: "Syntax Error Detection"
|
| 55 |
+
difficulty: easy
|
| 56 |
+
description: "Find and fix obvious SQL syntax errors"
|
| 57 |
+
- id: medium_performance
|
| 58 |
+
name: "Performance Anti-Pattern Review"
|
| 59 |
+
difficulty: medium
|
| 60 |
+
description: "Identify performance issues requiring schema awareness"
|
| 61 |
+
- id: hard_security
|
| 62 |
+
name: "Security & Optimization Audit"
|
| 63 |
+
difficulty: hard
|
| 64 |
+
description: "Find SQL injection vectors and complex optimization opportunities"
|
| 65 |
+
```
|
files/08-computation-engine-spec.md
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 08 — Reward & Grading Engine Spec
|
| 2 |
+
|
| 3 |
+
## Per-Step Reward Function
|
| 4 |
+
|
| 5 |
+
```python
|
| 6 |
+
def compute_reward(action, ground_truth_issues, already_found):
|
| 7 |
+
if action.action_type == "identify_issue":
|
| 8 |
+
match = fuzzy_match(action.issue_description, ground_truth_issues, already_found)
|
| 9 |
+
if match:
|
| 10 |
+
base = match["severity"] # 0.1 - 1.0
|
| 11 |
+
fix_bonus = 0.1 if action.suggested_fix and is_valid_fix(action.suggested_fix, match) else 0.0
|
| 12 |
+
confidence_bonus = 0.05 * action.confidence  # match is guaranteed truthy in this branch
|
| 13 |
+
return min(base + fix_bonus + confidence_bonus, 0.4) # cap per-step
|
| 14 |
+
else:
|
| 15 |
+
return -0.1 # false positive penalty
|
| 16 |
+
|
| 17 |
+
elif action.action_type == "approve":
|
| 18 |
+
unfound = len(ground_truth_issues) - len(already_found)
|
| 19 |
+
if unfound == 0:
|
| 20 |
+
return 0.2 # correct approval
|
| 21 |
+
else:
|
| 22 |
+
return -0.15 * unfound # penalty per missed issue
|
| 23 |
+
|
| 24 |
+
elif action.action_type == "suggest_fix":
|
| 25 |
+
if not already_found:
|
| 26 |
+
return -0.05 # fixing without identifying first
|
| 27 |
+
last_issue = already_found[-1]
|
| 28 |
+
if is_valid_fix(action.suggested_fix, last_issue):
|
| 29 |
+
return 0.1
|
| 30 |
+
return 0.0
|
| 31 |
+
|
| 32 |
+
elif action.action_type == "request_more_context":
|
| 33 |
+
return 0.0 # neutral — no reward, no penalty
|
| 34 |
+
|
| 35 |
+
return 0.0
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Fuzzy Matching Algorithm
|
| 39 |
+
|
| 40 |
+
```python
|
| 41 |
+
def fuzzy_match(agent_description, ground_truth_issues, already_found, agent_category=None):
|
| 42 |
+
"""Match agent's issue description to a ground truth issue."""
|
| 43 |
+
best_match = None
|
| 44 |
+
best_score = 0.0
|
| 45 |
+
|
| 46 |
+
for issue in ground_truth_issues:
|
| 47 |
+
if issue in already_found:
|
| 48 |
+
continue
|
| 49 |
+
# Keyword overlap score
|
| 50 |
+
agent_words = set(agent_description.lower().split())
|
| 51 |
+
truth_words = set(issue["keywords"])
|
| 52 |
+
overlap = len(agent_words & truth_words) / max(len(truth_words), 1)
|
| 53 |
+
# Category match bonus
|
| 54 |
+
category_bonus = 0.3 if agent_category == issue["category"] else 0.0
|
| 55 |
+
score = overlap + category_bonus
|
| 56 |
+
if score > best_score and score > 0.3: # threshold
|
| 57 |
+
best_score = score
|
| 58 |
+
best_match = issue
|
| 59 |
+
|
| 60 |
+
return best_match
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
## End-of-Episode Grader
|
| 64 |
+
|
| 65 |
+
```python
|
| 66 |
+
def grade_episode(issues_found, ground_truth_issues, total_steps, max_steps):
|
| 67 |
+
"""Deterministic grader returning float in [0.0, 1.0]."""
|
| 68 |
+
if not ground_truth_issues:
|
| 69 |
+
return 1.0 if not issues_found else 0.5
|
| 70 |
+
|
| 71 |
+
total_severity = sum(i["severity"] for i in ground_truth_issues)
|
| 72 |
+
found_severity = sum(i["severity"] for i in issues_found if i in ground_truth_issues)
|
| 73 |
+
|
| 74 |
+
coverage_score = found_severity / total_severity # 0.0 - 1.0
|
| 75 |
+
efficiency_bonus = max(0, 0.1 * (1 - total_steps / max_steps)) # reward fewer steps
|
| 76 |
+
false_positive_penalty = 0.05 * count_false_positives(issues_found, ground_truth_issues)
|
| 77 |
+
|
| 78 |
+
score = coverage_score + efficiency_bonus - false_positive_penalty
|
| 79 |
+
return max(0.0, min(1.0, score))
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
## Score Variance Guarantee
|
| 83 |
+
- Easy tasks: 5 different queries with 2-5 issues each → scores range from 0.4 to 1.0
|
| 84 |
+
- Medium tasks: different anti-patterns → scores range from 0.2 to 0.8
|
| 85 |
+
- Hard tasks: varied security issues → scores range from 0.0 to 0.6
|
| 86 |
+
- A grader that always returns the same score = instant DQ. Our design inherently prevents this because different queries have different ground truth issues.
|
files/09-engineering-scope-definition.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 09 — Engineering Scope Definition
|
| 2 |
+
|
| 3 |
+
## In Scope (Must Build)
|
| 4 |
+
1. **Environment server** — `environment.py` with `reset()`, `step()`, `state()`
|
| 5 |
+
2. **Pydantic models** — `models.py` with typed Action, Observation, State
|
| 6 |
+
3. **Client** — `client.py` with EnvClient subclass
|
| 7 |
+
4. **Task bank** — 15 SQL queries (5 easy, 5 medium, 5 hard) with ground truth
|
| 8 |
+
5. **Grader** — Deterministic scoring function per task
|
| 9 |
+
6. **Reward function** — Per-step partial credit with penalties
|
| 10 |
+
7. **Inference script** — `inference.py` using OpenAI Client
|
| 11 |
+
8. **Dockerfile** — Working container that builds and runs
|
| 12 |
+
9. **HF Space deployment** — Live, tagged with `openenv`
|
| 13 |
+
10. **README** — Complete documentation
|
| 14 |
+
11. **openenv.yaml** — Valid metadata manifest
|
| 15 |
+
|
| 16 |
+
## Out of Scope (Don't Build)
|
| 17 |
+
- Real database execution (all analysis is pattern-matching based)
|
| 18 |
+
- Custom LLM fine-tuning
|
| 19 |
+
- Web UI beyond OpenEnv's built-in web interface
|
| 20 |
+
- Multiple language SQL dialects (stick to standard SQL)
|
| 21 |
+
- Integration tests against real databases
|
| 22 |
+
|
| 23 |
+
## Effort Estimates
|
| 24 |
+
|
| 25 |
+
| Component | Hours | Priority |
|
| 26 |
+
|---|---|---|
|
| 27 |
+
| Prep course + bootcamp | 3.0 | P0 |
|
| 28 |
+
| Task bank creation (15 queries + ground truth) | 2.5 | P0 |
|
| 29 |
+
| Pydantic models | 0.5 | P0 |
|
| 30 |
+
| Environment logic (reset/step/state) | 3.0 | P0 |
|
| 31 |
+
| Grader + reward function | 2.0 | P0 |
|
| 32 |
+
| Inference script | 1.5 | P0 |
|
| 33 |
+
| Dockerfile + local testing | 1.0 | P0 |
|
| 34 |
+
| HF Space deployment | 0.5 | P0 |
|
| 35 |
+
| README | 1.0 | P0 |
|
| 36 |
+
| Pre-validation + bug fixes | 2.0 | P0 |
|
| 37 |
+
| **Total** | **~17 hours** | |
|
| 38 |
+
|
| 39 |
+
Fits within the 2-day window with buffer for debugging.
|
files/10-development-phases.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 10 — Development Phases
|
| 2 |
+
|
| 3 |
+
## Phase 1: Learn (Apr 10, 9 AM – 12 PM)
|
| 4 |
+
- [ ] Complete Module 1: Interface basics
|
| 5 |
+
- [ ] Complete Module 2: Using existing environments
|
| 6 |
+
- [ ] Complete Module 3: Deployment to HF Spaces
|
| 7 |
+
- [ ] Complete Module 4: Building your own environment
|
| 8 |
+
- [ ] Watch bootcamp recording, note judge preferences
|
| 9 |
+
- [ ] Study sample inference script format
|
| 10 |
+
|
| 11 |
+
## Phase 2: Scaffold (Apr 10, 12 PM – 2 PM)
|
| 12 |
+
- [ ] `pip install openenv-core huggingface_hub openai`
|
| 13 |
+
- [ ] `openenv init sql-query-reviewer`
|
| 14 |
+
- [ ] Clone and study echo env for reference
|
| 15 |
+
- [ ] Set up project structure per 07-monorepo-structure.md
|
| 16 |
+
|
| 17 |
+
## Phase 3: Core Build (Apr 10, 2 PM – Apr 11, 12 PM)
|
| 18 |
+
- [ ] Write `models.py` — Action, Observation, State
|
| 19 |
+
- [ ] Create task bank — 5 easy, 5 medium, 5 hard queries with ground truth
|
| 20 |
+
- [ ] Implement `environment.py` — reset(), step(), state()
|
| 21 |
+
- [ ] Implement `grader.py` — deterministic scoring
|
| 22 |
+
- [ ] Implement `reward.py` — per-step reward computation
|
| 23 |
+
- [ ] Implement fuzzy matching for issue identification
|
| 24 |
+
- [ ] Write `app.py` — FastAPI routes
|
| 25 |
+
- [ ] Local testing: `uv run server` → test all endpoints manually
|
| 26 |
+
|
| 27 |
+
## Phase 4: Inference (Apr 11, 12 PM – 3 PM)
|
| 28 |
+
- [ ] Write `inference.py` following sample script format exactly
|
| 29 |
+
- [ ] System prompt design for SQL review agent
|
| 30 |
+
- [ ] Test with free HF Inference API
|
| 31 |
+
- [ ] Verify `[START]`, `[STEP]`, `[END]` output format
|
| 32 |
+
- [ ] Run 3x to verify reproducible scores
|
| 33 |
+
|
| 34 |
+
## Phase 5: Containerize & Deploy (Apr 11, 3 PM – 6 PM)
|
| 35 |
+
- [ ] Write Dockerfile (python:3.10-slim base)
|
| 36 |
+
- [ ] `docker build -t sql-query-reviewer ./server`
|
| 37 |
+
- [ ] `docker run -p 8000:8000 sql-query-reviewer`
|
| 38 |
+
- [ ] Test `/reset`, `/step`, `/state` against running container
|
| 39 |
+
- [ ] `openenv push --repo-id ravi/sql-query-reviewer`
|
| 40 |
+
- [ ] Verify HF Space returns 200 on `/reset`
|
| 41 |
+
|
| 42 |
+
## Phase 6: Polish & Submit (Apr 11, 6 PM – Apr 12, 11:59 PM)
|
| 43 |
+
- [ ] Write compelling README
|
| 44 |
+
- [ ] Run `openenv validate`
|
| 45 |
+
- [ ] Run `validate-submission.sh`
|
| 46 |
+
- [ ] Fix any issues
|
| 47 |
+
- [ ] Submit early, iterate if time permits
|
| 48 |
+
- [ ] Final verification: HF Space live and responding
|
files/11-environment-and-devops.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 11 — Environment & DevOps
|
| 2 |
+
|
| 3 |
+
## Local Development Setup
|
| 4 |
+
|
| 5 |
+
```bash
|
| 6 |
+
# Python environment
|
| 7 |
+
python3.10 -m venv .venv
|
| 8 |
+
source .venv/bin/activate
|
| 9 |
+
pip install openenv-core fastapi uvicorn pydantic openai huggingface_hub
|
| 10 |
+
|
| 11 |
+
# Run locally
|
| 12 |
+
cd server && uvicorn app:app --reload --port 8000
|
| 13 |
+
|
| 14 |
+
# Test endpoints
|
| 15 |
+
curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{"task_id": "easy_001"}'
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
## Dockerfile
|
| 19 |
+
|
| 20 |
+
```dockerfile
|
| 21 |
+
FROM python:3.10-slim
|
| 22 |
+
|
| 23 |
+
WORKDIR /app
|
| 24 |
+
|
| 25 |
+
COPY server/requirements.txt .
|
| 26 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 27 |
+
|
| 28 |
+
COPY models.py .
|
| 29 |
+
COPY tasks/ ./tasks/
|
| 30 |
+
COPY server/ ./server/
|
| 31 |
+
COPY openenv.yaml .
|
| 32 |
+
|
| 33 |
+
EXPOSE 8000
|
| 34 |
+
|
| 35 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## server/requirements.txt
|
| 39 |
+
|
| 40 |
+
```
|
| 41 |
+
openenv-core>=0.1.0
|
| 42 |
+
fastapi>=0.100.0
|
| 43 |
+
uvicorn>=0.23.0
|
| 44 |
+
pydantic>=2.0.0
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
## HF Space Deployment
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
# Login
|
| 51 |
+
huggingface-cli login
|
| 52 |
+
|
| 53 |
+
# Deploy
|
| 54 |
+
openenv push --repo-id ravi/sql-query-reviewer
|
| 55 |
+
|
| 56 |
+
# Verify
|
| 57 |
+
curl -s -o /dev/null -w "%{http_code}" -X POST https://ravi-sql-query-reviewer.hf.space/reset -H "Content-Type: application/json" -d '{}'
|
| 58 |
+
# Expected: 200
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## Environment Variables for Inference
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 65 |
+
export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
|
| 66 |
+
export HF_TOKEN="hf_xxxxxxxxxxxxx"
|
| 67 |
+
export IMAGE_NAME="sql-query-reviewer"
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
## Pre-Validation
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
chmod +x validate-submission.sh
|
| 74 |
+
./validate-submission.sh https://ravi-sql-query-reviewer.hf.space .
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
Expected output: All 3/3 checks passed.
|
files/12-testing-strategy.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 12 — Testing Strategy
|
| 2 |
+
|
| 3 |
+
## Level 1: Unit Tests (During Build)
|
| 4 |
+
- **Models:** Validate Pydantic models accept/reject correct/incorrect data
|
| 5 |
+
- **Grader:** Test with known inputs → known scores. Verify determinism (run 10x, same result).
|
| 6 |
+
- **Reward function:** Test each action type returns expected reward range
|
| 7 |
+
- **Fuzzy matcher:** Test with exact match, partial match, no match, already-found cases
|
| 8 |
+
|
| 9 |
+
## Level 2: Integration Tests (Before Docker)
|
| 10 |
+
- Run `uv run server` locally
|
| 11 |
+
- POST `/reset` with each task ID → verify valid observation returned
|
| 12 |
+
- POST `/step` with valid action → verify reward, done, observation
|
| 13 |
+
- POST `/step` with invalid action → verify graceful error handling
|
| 14 |
+
- GET `/state` → verify state matches expectations
|
| 15 |
+
- Run full episode: reset → steps → done → verify final grader score
|
| 16 |
+
|
| 17 |
+
## Level 3: Container Tests (Before Deploy)
|
| 18 |
+
```bash
|
| 19 |
+
docker build -t sql-query-reviewer ./server
|
| 20 |
+
docker run -d -p 8000:8000 sql-query-reviewer
|
| 21 |
+
# Wait for startup
|
| 22 |
+
sleep 5
|
| 23 |
+
# Test reset
|
| 24 |
+
curl -X POST http://localhost:8000/reset -d '{}' | python -m json.tool
|
| 25 |
+
# Test step
|
| 26 |
+
curl -X POST http://localhost:8000/step -d '{"action_type":"identify_issue","issue_category":"syntax","issue_description":"test"}' | python -m json.tool
|
| 27 |
+
docker stop $(docker ps -q)
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
## Level 4: Validation Tests (Before Submit)
|
| 31 |
+
- `openenv validate` — must pass
|
| 32 |
+
- `validate-submission.sh <url> .` — all 3 checks must pass
|
| 33 |
+
- Run `inference.py` 3 times → verify scores are consistent
|
| 34 |
+
- Verify stdout format matches `[START]`, `[STEP]`, `[END]` exactly
|
| 35 |
+
- Check memory usage stays under 8GB
|
| 36 |
+
- Check runtime stays under 20 minutes
|
| 37 |
+
|
| 38 |
+
## Level 5: Score Variance Check
|
| 39 |
+
- Run inference on all 3 tasks → verify different scores
|
| 40 |
+
- Confirm no grader returns the same score for different inputs
|
| 41 |
+
- Verify easy > medium > hard in terms of baseline agent performance
|
| 42 |
+
|
| 43 |
+
## DQ Prevention Checklist
|
| 44 |
+
- [ ] HF Space returns 200 on POST /reset
|
| 45 |
+
- [ ] openenv.yaml is valid
|
| 46 |
+
- [ ] Typed models work
|
| 47 |
+
- [ ] Dockerfile builds
|
| 48 |
+
- [ ] 3+ tasks with graders returning 0.0-1.0
|
| 49 |
+
- [ ] Graders DON'T always return the same score
|
| 50 |
+
- [ ] inference.py exists in root
|
| 51 |
+
- [ ] Baseline produces reproducible scores
|
| 52 |
+
- [ ] Not plagiarized from existing environments
|
files/architecture-diagram.md
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Architecture Diagram
|
| 2 |
+
|
| 3 |
+
## High-Level Flow
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
┌──────────────┐ ┌───────────────────────────────────┐
|
| 7 |
+
│ │ │ HF Space (Docker) │
|
| 8 |
+
│ inference.py│ │ │
|
| 9 |
+
│ (Agent) │ │ ┌──────────────────────────┐ │
|
| 10 |
+
│ │ WS │ │ FastAPI Server │ │
|
| 11 |
+
│ ┌────────┐ ├────►│ │ (app.py) │ │
|
| 12 |
+
│ │ OpenAI │ │ │ │ │ │
|
| 13 |
+
│ │ Client │ │ │ │ /reset → load task │ │
|
| 14 |
+
│ │ ↕ │ │◄────┤ │ /step → grade action │ │
|
| 15 |
+
│ │ LLM │ │ │ │ /state → return state │ │
|
| 16 |
+
│ └────────┘ │ │ └──────────┬───────────────┘ │
|
| 17 |
+
│ │ │ │ │
|
| 18 |
+
│ stdout: │ │ ┌──────────▼───────────────┐ │
|
| 19 |
+
│ [START] │ │ │ SQLReviewEnvironment │ │
|
| 20 |
+
│ [STEP] │ │ │ - task_bank (JSON) │ │
|
| 21 |
+
│ [END] │ │ │ - fuzzy_matcher │ │
|
| 22 |
+
│ │ │ │ - reward_fn │ │
|
| 23 |
+
└──────────────┘ │ │ - grader │ │
|
| 24 |
+
│ └──────────────────────────┘ │
|
| 25 |
+
└───────────────────────────────────┘
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Episode Sequence
|
| 29 |
+
|
| 30 |
+
```
|
| 31 |
+
Agent Environment
|
| 32 |
+
│ │
|
| 33 |
+
│──── reset(task_id) ──────────►│ Load task from JSON
|
| 34 |
+
│◄─── observation ──────────────│ Return query + schema + context
|
| 35 |
+
│ │
|
| 36 |
+
│──── step(identify_issue) ────►│ Fuzzy match vs ground truth
|
| 37 |
+
│◄─── obs + reward + done ──────│ Return feedback + reward
|
| 38 |
+
│ │
|
| 39 |
+
│──── step(suggest_fix) ───────►│ Validate fix
|
| 40 |
+
│◄─── obs + reward + done ──────│ Return feedback + reward
|
| 41 |
+
│ │
|
| 42 |
+
│──── step(approve) ───────────►│ Check remaining issues
|
| 43 |
+
│◄─── obs + reward + done=true──│ Episode ends
|
| 44 |
+
│ │
|
| 45 |
+
│──── close() ─────────────────►│ Run grader → final score
|
| 46 |
+
│◄─── final_score ──────────────│
|
| 47 |
+
│ │
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## Evaluation Pipeline (Hackathon Judges)
|
| 51 |
+
|
| 52 |
+
```
|
| 53 |
+
Phase 1: Automated Validation
|
| 54 |
+
└─ HF Space responds? → openenv validate? → Docker builds? → inference.py runs? → 3+ tasks?
|
| 55 |
+
|
| 56 |
+
Phase 2: Agentic Evaluation
|
| 57 |
+
└─ Run Nemotron 3 Super against all envs → check score variance
|
| 58 |
+
|
| 59 |
+
Phase 3: Human Review
|
| 60 |
+
└─ Meta + HF engineers review for utility, creativity, exploit checks
|
| 61 |
+
```
|
files/project-design.md
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Project Design
|
| 2 |
+
|
| 3 |
+
## Design Principles
|
| 4 |
+
|
| 5 |
+
1. **Spec compliance first, creativity second.** Most teams will fail on automated validation. Perfect adherence to the OpenEnv spec is the highest-ROI activity.
|
| 6 |
+
|
| 7 |
+
2. **Reward shaping is the differentiator.** Binary end-of-episode rewards are common. Per-step, severity-weighted, partial-credit rewards are what separate top submissions.
|
| 8 |
+
|
| 9 |
+
3. **Score variance is mandatory.** The environment must produce different scores for different agent capabilities. Our design inherently ensures this: different queries have different issues, so no two episodes produce identical scores.
|
| 10 |
+
|
| 11 |
+
4. **Domain authenticity wins the 30%.** Real-world utility is the highest-weighted criterion. SQL review is a task every Meta engineer knows and values. The task bank should contain queries that feel like real code review findings, not synthetic puzzles.
|
| 12 |
+
|
| 13 |
+
## Key Design Decisions
|
| 14 |
+
|
| 15 |
+
| Decision | Choice | Rationale |
|
| 16 |
+
|---|---|---|
|
| 17 |
+
| Domain | SQL Query Review | Universal relevance, clear grading, natural difficulty progression |
|
| 18 |
+
| Task count | 15 queries (5/5/5) | Well above minimum 3, shows depth |
|
| 19 |
+
| Matching | Fuzzy keyword matching | Robust to LLM phrasing variation while staying deterministic |
|
| 20 |
+
| Reward | Per-step partial credit | Provides learning signal throughout trajectory |
|
| 21 |
+
| Episode length | 3-8 steps | Short enough for 20-min inference limit across all tasks |
|
| 22 |
+
| Grader | Severity-weighted coverage | Rewards finding critical issues more than trivial ones |
|
| 23 |
+
|
| 24 |
+
## Risk Mitigation
|
| 25 |
+
|
| 26 |
+
| Risk | Mitigation |
|
| 27 |
+
|---|---|
|
| 28 |
+
| Fuzzy matching too loose → inflated scores | Require 30% keyword overlap threshold + category match |
|
| 29 |
+
| Fuzzy matching too strict → no agent can score | Include broad keywords list, test with actual LLM output |
|
| 30 |
+
| Inference timeout | 15 queries × 5-8 steps × ~3s per LLM call = ~6 min. Well under 20 min. |
|
| 31 |
+
| Docker build fails on HF | Use minimal dependencies, test Dockerfile locally first |
|
| 32 |
+
| Grader returns same score | Impossible with varied queries — but verify during testing |
|
| 33 |
+
|
| 34 |
+
## What Judges Will See
|
| 35 |
+
|
| 36 |
+
1. **README** — Clear, compelling, explains why SQL review matters and how the env works
|
| 37 |
+
2. **HF Space** — Live, responds instantly to `/reset`
|
| 38 |
+
3. **Code** — Clean, well-structured, typed models, deterministic graders
|
| 39 |
+
4. **Scores** — Meaningful variance: easy ~0.8, medium ~0.5, hard ~0.3
|
| 40 |
+
5. **Novelty** — No existing SQL review env in OpenEnv ecosystem
|
files/project-readme.md
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SQL Query Reviewer — OpenEnv Environment
|
| 2 |
+
|
| 3 |
+
An AI agent environment for reviewing SQL queries for correctness, performance, and security issues.
|
| 4 |
+
|
| 5 |
+
## Why This Matters
|
| 6 |
+
|
| 7 |
+
Every engineering team reviews SQL queries daily — in code reviews, migration scripts, ETL pipelines, and security audits. This environment lets you train and evaluate AI agents on a task that directly maps to real engineering workflows. Unlike toy benchmarks, the queries here reflect genuine patterns found in production codebases: misspelled keywords, N+1 anti-patterns, missing indexes, SQL injection vectors, and schema-aware optimization opportunities.
|
| 8 |
+
|
| 9 |
+
## Environment Overview
|
| 10 |
+
|
| 11 |
+
The agent receives a SQL query (plus optional schema context) and must identify issues through a multi-step review process. It earns rewards for correctly flagging problems and suggesting fixes, while being penalized for false positives or approving buggy queries.
|
| 12 |
+
|
| 13 |
+
## Action Space
|
| 14 |
+
|
| 15 |
+
| Action Type | Description |
|
| 16 |
+
|---|---|
|
| 17 |
+
| `identify_issue` | Flag a specific issue with category and description |
|
| 18 |
+
| `suggest_fix` | Propose corrected SQL for a previously identified issue |
|
| 19 |
+
| `approve` | Mark the query as acceptable (ends episode) |
|
| 20 |
+
| `request_more_context` | Ask for additional schema information |
|
| 21 |
+
|
| 22 |
+
**Fields:** `action_type`, `issue_category` (syntax/performance/security/logic/style), `issue_description`, `suggested_fix`, `confidence` (0.0-1.0)
|
| 23 |
+
|
| 24 |
+
## Observation Space
|
| 25 |
+
|
| 26 |
+
| Field | Type | Description |
|
| 27 |
+
|---|---|---|
|
| 28 |
+
| `query` | str | The SQL query under review |
|
| 29 |
+
| `schema_info` | dict | Table/column definitions (richer for harder tasks) |
|
| 30 |
+
| `context` | str | What the query is supposed to do |
|
| 31 |
+
| `issues_found_so_far` | list | Previously identified issues this episode |
|
| 32 |
+
| `remaining_actions` | int | Steps left before episode ends |
|
| 33 |
+
| `difficulty` | str | easy, medium, or hard |
|
| 34 |
+
| `feedback` | str | Result of last action |
|
| 35 |
+
|
| 36 |
+
## Tasks
|
| 37 |
+
|
| 38 |
+
### Task 1: Syntax Error Detection (Easy)
|
| 39 |
+
Queries with obvious typos, missing keywords, wrong column names. A baseline agent should score **0.7-0.9**.
|
| 40 |
+
|
| 41 |
+
### Task 2: Performance Anti-Pattern Review (Medium)
|
| 42 |
+
Queries with SELECT *, missing indexes, correlated subqueries, unbounded queries. Requires schema awareness. Expected score: **0.4-0.6**.
|
| 43 |
+
|
| 44 |
+
### Task 3: Security & Optimization Audit (Hard)
|
| 45 |
+
SQL injection vectors, privilege escalation, data leakage, complex optimization. Requires multi-step reasoning. Expected score: **0.2-0.4**.
|
| 46 |
+
|
| 47 |
+
## Reward Design
|
| 48 |
+
- Per-step partial credit (not binary end-of-episode)
|
| 49 |
+
- Correct issue identification: +0.1 to +0.4 (scaled by severity)
|
| 50 |
+
- Valid fix suggestion: +0.1 bonus
|
| 51 |
+
- False positive: -0.1 penalty
|
| 52 |
+
- Approving a query with unfound issues: -0.15 per missed issue
|
| 53 |
+
- Correct approval of clean query: +0.2
|
| 54 |
+
|
| 55 |
+
## Setup
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
# Install
|
| 59 |
+
pip install openenv-core
|
| 60 |
+
pip install git+https://huggingface.co/spaces/ravi/sql-query-reviewer
|
| 61 |
+
|
| 62 |
+
```

```python
# Use
|
| 63 |
+
from sql_query_reviewer import SQLReviewEnv, SQLReviewAction
|
| 64 |
+
|
| 65 |
+
with SQLReviewEnv(base_url="https://ravi-sql-query-reviewer.hf.space").sync() as env:
|
| 66 |
+
result = env.reset()
|
| 67 |
+
result = env.step(SQLReviewAction(
|
| 68 |
+
action_type="identify_issue",
|
| 69 |
+
issue_category="syntax",
|
| 70 |
+
issue_description="SELCT should be SELECT"
|
| 71 |
+
))
|
| 72 |
+
print(result.observation.feedback)
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
## Docker
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
docker build -t sql-query-reviewer ./server
|
| 79 |
+
docker run -p 8000:8000 sql-query-reviewer
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
## Baseline Scores
|
| 83 |
+
|
| 84 |
+
| Task | Difficulty | Baseline Score |
|
| 85 |
+
|---|---|---|
|
| 86 |
+
| Syntax Error Detection | Easy | ~0.82 |
|
| 87 |
+
| Performance Anti-Pattern Review | Medium | ~0.51 |
|
| 88 |
+
| Security & Optimization Audit | Hard | ~0.29 |
|
| 89 |
+
|
| 90 |
+
## Author
|
| 91 |
+
**Ravi** — Solo participant, Meta PyTorch OpenEnv Hackathon 2026
|
inference.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
|
| 9 |
+
from sql_query_reviewer.client import SyncSQLReviewEnv
|
| 10 |
+
from sql_query_reviewer.models import SQLReviewAction, SQLReviewObservation
|
| 11 |
+
|
| 12 |
+
DEFAULT_TASK_IDS = ("easy_001", "medium_001", "hard_001")
|
| 13 |
+
|
| 14 |
+
SYSTEM_PROMPT = """You are reviewing a SQL query for correctness, performance, and security.
|
| 15 |
+
Return exactly one JSON object with these keys:
|
| 16 |
+
- action_type: identify_issue, suggest_fix, approve, or request_more_context
|
| 17 |
+
- issue_category: syntax, performance, security, logic, or style when relevant
|
| 18 |
+
- issue_description: concise issue statement when relevant
|
| 19 |
+
- suggested_fix: corrected SQL or corrected fragment when relevant
|
| 20 |
+
- confidence: float between 0.0 and 1.0
|
| 21 |
+
|
| 22 |
+
Guidelines:
|
| 23 |
+
- Prefer identify_issue until you have high confidence all important issues are covered.
|
| 24 |
+
- Use approve only when the query looks acceptable or all issues have already been identified.
|
| 25 |
+
- Keep the JSON valid and do not wrap it in prose.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def print_event(prefix: str, payload: dict[str, Any]) -> None:
    """Emit one tagged, key-sorted JSON log line to stdout."""
    serialized = json.dumps(payload, sort_keys=True)
    print(f"[{prefix}] {serialized}")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def build_user_prompt(observation: SQLReviewObservation) -> str:
    """Serialize the current observation into a pretty-printed JSON user message."""
    issue_dicts = [issue.model_dump() for issue in observation.issues_found_so_far]
    fields = {
        "query": observation.query,
        "schema_info": observation.schema_info,
        "context": observation.context,
        "issues_found_so_far": issue_dicts,
        "remaining_actions": observation.remaining_actions,
        "difficulty": observation.difficulty,
        "feedback": observation.feedback,
    }
    return json.dumps(fields, indent=2)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def extract_json(content: str) -> dict[str, Any]:
    """Pull the outermost brace-delimited JSON object out of an LLM reply.

    Markdown code fences are stripped first. Raises ValueError when no
    JSON object can be located in the response text.
    """
    text = content.strip()
    if text.startswith("```"):
        kept = (line for line in text.splitlines() if not line.startswith("```"))
        text = "\n".join(kept).strip()

    opening = text.find("{")
    closing = text.rfind("}")
    if opening == -1 or closing == -1 or closing <= opening:
        raise ValueError(f"Could not find JSON object in model response: {content!r}")
    return json.loads(text[opening : closing + 1])
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def choose_action(llm_client: Any, model_name: str, observation: SQLReviewObservation) -> SQLReviewAction:
    """Ask the LLM for the next review action and parse it into a typed action."""
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_user_prompt(observation)},
    ]
    # temperature=0 keeps rollouts as deterministic as the backend allows.
    response = llm_client.chat.completions.create(
        model=model_name,
        temperature=0,
        messages=conversation,
    )
    raw_text = response.choices[0].message.content or ""
    return SQLReviewAction.model_validate(extract_json(raw_text))
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def run_episode(env: Any, llm_client: Any, model_name: str, task_id: str) -> dict[str, Any]:
    """Roll out one full review episode for *task_id* and return its summary.

    Logs START, per-step STEP, and END events to stdout along the way.
    """
    result = env.reset(task_id=task_id)
    print_event(
        "START",
        {
            "difficulty": result.observation.difficulty,
            "remaining_actions": result.observation.remaining_actions,
            "task_id": task_id,
        },
    )

    while True:
        action = choose_action(llm_client=llm_client, model_name=model_name, observation=result.observation)
        result = env.step(action)
        step_payload = {
            "action": action.model_dump(exclude_none=True),
            "done": result.done,
            "feedback": result.observation.feedback,
            "reward": result.reward,
            "task_id": task_id,
        }
        print_event("STEP", step_payload)
        if not result.done:
            continue

        # Episode finished — pull the final server-side state for the summary.
        state = env.state()
        summary = {
            "final_score": state.final_score,
            "steps": state.step_count,
            "task_id": task_id,
            "total_reward": state.total_reward,
        }
        print_event("END", summary)
        return summary
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def main() -> int:
    """Run the scripted reviewer agent over every configured task.

    Configuration comes from environment variables: ENV_BASE_URL,
    API_BASE_URL, MODEL_NAME, TASK_IDS, and HF_TOKEN / OPENAI_API_KEY.
    Returns 0 on completion; exits early when no API key is set.
    """
    env_base_url = os.getenv("ENV_BASE_URL", "http://localhost:8000")
    api_base_url = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
    model_name = os.getenv("MODEL_NAME", "gpt-4o-mini")
    api_key = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise SystemExit("Set HF_TOKEN or OPENAI_API_KEY before running inference.py")

    # Comma-separated task list; blanks are ignored.
    raw_task_ids = os.getenv("TASK_IDS", ",".join(DEFAULT_TASK_IDS))
    task_ids = tuple(piece.strip() for piece in raw_task_ids.split(",") if piece.strip())

    llm_client = OpenAI(api_key=api_key, base_url=api_base_url)
    with SyncSQLReviewEnv(base_url=env_base_url) as env:
        for task_id in task_ids:
            run_episode(env=env, llm_client=llm_client, model_name=model_name, task_id=task_id)
    return 0
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
if __name__ == "__main__":
|
| 130 |
+
raise SystemExit(main())
|
| 131 |
+
|
models.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Top-level convenience shim: re-export the data models from the
# sql_query_reviewer package so `import models` resolves to the same types.
from sql_query_reviewer.models import (
    GroundTruthIssue,
    IdentifiedIssue,
    ResetRequest,
    SQLReviewAction,
    SQLReviewObservation,
    SQLReviewState,
    StepResult,
    TaskRecord,
)

# Explicit public API of this shim module.
__all__ = [
    "GroundTruthIssue",
    "IdentifiedIssue",
    "ResetRequest",
    "SQLReviewAction",
    "SQLReviewObservation",
    "SQLReviewState",
    "StepResult",
    "TaskRecord",
]
|
| 22 |
+
|
openenv.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: sql-query-reviewer
|
| 2 |
+
description: "AI agent reviews SQL queries for correctness, performance, and security."
|
| 3 |
+
author: Hellinferno
|
| 4 |
+
version: "0.1.0"
|
| 5 |
+
tags:
|
| 6 |
+
- openenv
|
| 7 |
+
- sql
|
| 8 |
+
- code-review
|
| 9 |
+
- security
|
| 10 |
+
tasks:
|
| 11 |
+
- id: easy_syntax
|
| 12 |
+
name: Syntax Error Detection
|
| 13 |
+
difficulty: easy
|
| 14 |
+
description: Find obvious SQL syntax and logic defects.
|
| 15 |
+
- id: medium_performance
|
| 16 |
+
name: Performance Anti-Pattern Review
|
| 17 |
+
difficulty: medium
|
| 18 |
+
description: Identify schema-aware performance problems.
|
| 19 |
+
- id: hard_security
|
| 20 |
+
name: Security and Optimization Audit
|
| 21 |
+
difficulty: hard
|
| 22 |
+
description: Detect injection, data exposure, and advanced optimization issues.
|
| 23 |
+
|
pyproject.toml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "sql-query-reviewer"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "An OpenEnv-style SQL review environment for correctness, performance, and security auditing."
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.11"
|
| 11 |
+
license = { text = "MIT" }
|
| 12 |
+
authors = [
|
| 13 |
+
{ name = "Hellinferno" }
|
| 14 |
+
]
|
| 15 |
+
dependencies = [
|
| 16 |
+
"fastapi>=0.115,<1.0",
|
| 17 |
+
"httpx>=0.27,<1.0",
|
| 18 |
+
"openenv-core>=0.2.0",
|
| 19 |
+
"openai>=2.7.2,<3.0",
|
| 20 |
+
"pydantic>=2.8,<3.0",
|
| 21 |
+
"uvicorn>=0.30,<1.0",
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
[project.scripts]
|
| 25 |
+
server = "server.app:main"
|
| 26 |
+
|
| 27 |
+
[project.optional-dependencies]
|
| 28 |
+
dev = [
|
| 29 |
+
"pytest>=8.3,<9.0",
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
[tool.setuptools.packages.find]
|
| 33 |
+
include = ["sql_query_reviewer*"]
|
| 34 |
+
|
| 35 |
+
[tool.fastapi]
|
| 36 |
+
entrypoint = "server.app:app"
|
| 37 |
+
|
| 38 |
+
[tool.pytest.ini_options]
|
| 39 |
+
testpaths = ["tests"]
|
server/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Package facade: expose the FastAPI app, the app factory, and the
# environment class at the `server` package root.
from server.app import app, create_app
from server.environment import SQLReviewEnvironment

__all__ = ["SQLReviewEnvironment", "app", "create_app"]
|
| 5 |
+
|
server/app.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from typing import Annotated
|
| 5 |
+
|
| 6 |
+
from fastapi import Body, FastAPI, HTTPException
|
| 7 |
+
import uvicorn
|
| 8 |
+
|
| 9 |
+
from sql_query_reviewer.models import ResetRequest, SQLReviewAction, SQLReviewState, StepResult
|
| 10 |
+
from server.environment import SQLReviewEnvironment
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def create_app(environment: SQLReviewEnvironment | None = None) -> FastAPI:
    """Build the FastAPI application that serves the review environment.

    A pre-built environment may be injected (useful in tests); otherwise a
    fresh one is constructed from the bundled task files.
    """
    app = FastAPI(
        title="SQL Query Reviewer",
        description="OpenEnv-style SQL review environment served over FastAPI.",
        version="0.1.0",
    )
    active_env = environment if environment is not None else SQLReviewEnvironment()

    @app.get("/health")
    async def health() -> dict[str, str]:
        # Liveness probe used by the hosting platform.
        return {"status": "ok"}

    @app.post("/reset", response_model=StepResult)
    async def reset_environment(request: Annotated[ResetRequest | None, Body()] = None) -> StepResult:
        try:
            # A missing body means "pick the next task in round-robin order".
            return active_env.reset(task_id=request.task_id if request else None)
        except ValueError as exc:
            # Unknown task ids surface as 404s rather than server errors.
            raise HTTPException(status_code=404, detail=str(exc)) from exc

    @app.post("/step", response_model=StepResult)
    async def step_environment(action: SQLReviewAction) -> StepResult:
        try:
            return active_env.step(action)
        except RuntimeError as exc:
            # Stepping before reset / after episode end is a client error.
            raise HTTPException(status_code=400, detail=str(exc)) from exc

    @app.get("/state", response_model=SQLReviewState)
    async def get_state() -> SQLReviewState:
        try:
            return active_env.state()
        except RuntimeError as exc:
            raise HTTPException(status_code=400, detail=str(exc)) from exc

    return app
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
app = create_app()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def main() -> None:
    """Launch uvicorn, binding HOST:PORT from the environment (default 0.0.0.0:8000)."""
    bind_host = os.getenv("HOST", "0.0.0.0")
    bind_port = int(os.getenv("PORT", "8000"))
    uvicorn.run("server.app:app", host=bind_host, port=bind_port)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
if __name__ == "__main__":
|
| 59 |
+
main()
|
server/environment.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from sql_query_reviewer.models import (
|
| 7 |
+
IdentifiedIssue,
|
| 8 |
+
SQLReviewAction,
|
| 9 |
+
SQLReviewObservation,
|
| 10 |
+
SQLReviewState,
|
| 11 |
+
StepResult,
|
| 12 |
+
TaskRecord,
|
| 13 |
+
)
|
| 14 |
+
from server.grader import grade_episode, match_issue, validate_fix
|
| 15 |
+
from server.reward import compute_reward
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class SQLReviewEnvironment:
    """Episodic SQL-review environment.

    Each episode presents one task's query (plus optional schema context);
    the agent identifies issues, suggests fixes, requests context, or
    approves. Per-step rewards come from ``compute_reward`` and the final
    severity-weighted score from ``grade_episode``.
    """

    def __init__(self, task_directory: Path | None = None) -> None:
        """Load every task bank from *task_directory* (defaults to <repo>/tasks)."""
        self.task_directory = task_directory or Path(__file__).resolve().parent.parent / "tasks"
        self.tasks = self._load_tasks()
        # Sorted ids give a deterministic round-robin when no task_id is given.
        self.task_order = sorted(self.tasks)
        self.current_task: TaskRecord | None = None
        self.current_state: SQLReviewState | None = None
        self._reset_index = 0

    def available_task_ids(self) -> list[str]:
        """All loadable task ids, in deterministic (sorted) order."""
        return list(self.task_order)

    def reset(self, task_id: str | None = None) -> StepResult:
        """Start a new episode; round-robins through tasks when *task_id* is None.

        Raises ValueError for an unknown task_id.
        """
        selected_task_id = task_id or self._next_task_id()
        if selected_task_id not in self.tasks:
            raise ValueError(f"Unknown task_id: {selected_task_id}")

        self.current_task = self.tasks[selected_task_id]
        self.current_state = SQLReviewState(task_id=self.current_task.task_id)
        observation = self._build_observation(
            feedback="Review this SQL query and identify correctness, performance, or security issues."
        )
        return StepResult(observation=observation, reward=0.0, done=False, info={})

    def step(self, action: SQLReviewAction) -> StepResult:
        """Apply one agent action; returns observation, shaped reward, done flag.

        Raises RuntimeError when called before reset() or after the episode ended.
        """
        task = self._require_task()
        state = self._require_state()
        if state.done:
            raise RuntimeError("Episode already finished. Call reset() before taking more steps.")

        found_ids = {issue.issue_id for issue in state.issues_identified}
        reward = 0.0
        info: dict[str, object] = {}
        feedback = "No-op."
        state.step_count += 1

        if action.action_type == "identify_issue":
            # First match against ALL issues (empty exclusion set) so a
            # re-report of an already-found issue is treated as a duplicate
            # rather than as a brand-new false positive.
            duplicate_issue, duplicate_score = match_issue(action, task.ground_truth_issues, set())
            if duplicate_issue is not None and duplicate_issue.id in found_ids:
                reward = compute_reward(action, duplicate_issue, duplicate_issue=True)
                feedback = f"Issue '{duplicate_issue.id}' was already identified earlier in the episode."
                info = {"match_score": round(duplicate_score, 3), "match_type": "duplicate", "issue_id": duplicate_issue.id}
            else:
                matched_issue, score = match_issue(action, task.ground_truth_issues, found_ids)
                if matched_issue is None:
                    state.false_positive_count += 1
                    reward = compute_reward(action, None)
                    feedback = "No matching issue found for that description."
                    info = {"match_score": round(score, 3), "match_type": "none"}
                else:
                    # An inline suggested_fix earns a bonus when it is valid.
                    fix_valid = validate_fix(action.suggested_fix, matched_issue)
                    state.issues_identified.append(
                        IdentifiedIssue(
                            issue_id=matched_issue.id,
                            category=matched_issue.category,
                            description=matched_issue.description,
                        )
                    )
                    reward = compute_reward(action, matched_issue, fix_valid=fix_valid)
                    remaining = len(task.ground_truth_issues) - len(state.issues_identified)
                    feedback = f"Matched {matched_issue.category} issue '{matched_issue.id}'. {remaining} issue(s) remaining."
                    info = {
                        "match_score": round(score, 3),
                        "match_type": "fuzzy",
                        "severity": matched_issue.severity,
                        "issue_id": matched_issue.id,
                        "all_issues_found": remaining == 0,
                    }
                    if fix_valid and action.suggested_fix:
                        state.fixes_suggested.append(action.suggested_fix)

        elif action.action_type == "suggest_fix":
            if not state.issues_identified:
                reward = compute_reward(action, None, has_previous_issue=False)
                feedback = "Identify an issue before suggesting a fix."
            else:
                # Fixes always target the most recently identified issue.
                last_issue_id = state.issues_identified[-1].issue_id
                last_issue = next(issue for issue in task.ground_truth_issues if issue.id == last_issue_id)
                fix_valid = validate_fix(action.suggested_fix, last_issue)
                reward = compute_reward(action, last_issue, fix_valid=fix_valid, has_previous_issue=True)
                feedback = "Fix accepted for the last identified issue." if fix_valid else "Suggested fix did not match the expected remediation."
                info = {"issue_id": last_issue.id, "fix_valid": fix_valid}
                if fix_valid and action.suggested_fix:
                    state.fixes_suggested.append(action.suggested_fix)

        elif action.action_type == "approve":
            # Approval always ends the episode; missed issues are penalized.
            remaining_unfound = len(task.ground_truth_issues) - len(found_ids)
            reward = compute_reward(action, None, remaining_unfound=remaining_unfound)
            state.approved = True
            state.done = True
            feedback = (
                "Query approved with full issue coverage."
                if remaining_unfound == 0
                else f"Query approved too early. {remaining_unfound} issue(s) were missed."
            )
            info = {"remaining_unfound": remaining_unfound}

        else:
            # request_more_context (and any unrecognized action type): reveal
            # which tables have schema details, at zero reward.
            feedback = self._schema_feedback(task)
            # BUGFIX: was `bool(task.schema)`. Everywhere else this class uses
            # `task.schema_info`; on a pydantic model `.schema` likely resolves
            # to the deprecated BaseModel.schema classmethod (always truthy),
            # so context_shared could never be False. TODO confirm TaskRecord
            # declares no `schema` field.
            info = {"context_shared": bool(task.schema_info)}

        state.total_reward += reward

        # Hitting the step budget force-ends the episode.
        if state.step_count >= task.max_steps and not state.done:
            state.done = True
            feedback = f"{feedback} Maximum step count reached."

        if state.done:
            state.final_score = grade_episode(
                found_issue_ids={issue.issue_id for issue in state.issues_identified},
                ground_truth_issues=task.ground_truth_issues,
                total_steps=state.step_count,
                max_steps=task.max_steps,
                false_positive_count=state.false_positive_count,
            )
            info["final_score"] = state.final_score

        observation = self._build_observation(feedback=feedback)
        return StepResult(observation=observation, reward=reward, done=state.done, info=info)

    def state(self) -> SQLReviewState:
        """Deep copy of the live episode state (safe for callers to mutate)."""
        return self._require_state().model_copy(deep=True)

    def _load_tasks(self) -> dict[str, TaskRecord]:
        """Parse every *_tasks.json file in the task directory into TaskRecords.

        Raises RuntimeError when the directory yields no tasks at all.
        """
        tasks: dict[str, TaskRecord] = {}
        for file_path in sorted(self.task_directory.glob("*_tasks.json")):
            with file_path.open("r", encoding="utf-8") as handle:
                for raw_task in json.load(handle):
                    task = TaskRecord.model_validate(raw_task)
                    tasks[task.task_id] = task
        if not tasks:
            raise RuntimeError(f"No task files found in {self.task_directory}")
        return tasks

    def _next_task_id(self) -> str:
        """Advance the round-robin cursor and return the next task id."""
        task_id = self.task_order[self._reset_index % len(self.task_order)]
        self._reset_index += 1
        return task_id

    def _build_observation(self, feedback: str) -> SQLReviewObservation:
        """Snapshot the current task plus episode progress into an observation."""
        task = self._require_task()
        state = self._require_state()
        remaining_actions = max(task.max_steps - state.step_count, 0)
        return SQLReviewObservation(
            query=task.query,
            schema_info=task.schema_info,
            context=task.context,
            issues_found_so_far=state.issues_identified,
            remaining_actions=remaining_actions,
            difficulty=task.difficulty,
            feedback=feedback,
        )

    def _schema_feedback(self, task: TaskRecord) -> str:
        """Human-readable summary of which tables have schema context."""
        if not task.schema_info:
            return "No additional schema context is available for this task."
        tables = ", ".join(sorted(task.schema_info))
        return f"Schema context available for: {tables}."

    def _require_task(self) -> TaskRecord:
        # Guard: step()/observation helpers are invalid before the first reset().
        if self.current_task is None:
            raise RuntimeError("Environment has no active task. Call reset() first.")
        return self.current_task

    def _require_state(self) -> SQLReviewState:
        # Guard: mirrors _require_task for the per-episode state object.
        if self.current_state is None:
            raise RuntimeError("Environment has no active state. Call reset() first.")
        return self.current_state
|
server/grader.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
from sql_query_reviewer.models import GroundTruthIssue, IssueCategory, SQLReviewAction
|
| 6 |
+
|
| 7 |
+
TOKEN_RE = re.compile(r"[a-zA-Z0-9_]+")
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def clamp(value: float, minimum: float, maximum: float) -> float:
    """Bound *value* to the closed interval [minimum, maximum]."""
    upper_bounded = min(maximum, value)
    return max(minimum, upper_bounded)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def normalize_text(value: str) -> str:
    """Lowercase *value* and rejoin its word tokens with single spaces."""
    tokens = TOKEN_RE.findall(value.lower())
    return " ".join(tokens)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def tokenize(value: str) -> set[str]:
    """Return the set of lowercase word tokens appearing in *value*."""
    return {token for token in TOKEN_RE.findall(value.lower())}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _set_overlap(candidate: set[str], target: set[str]) -> float:
|
| 23 |
+
if not candidate or not target:
|
| 24 |
+
return 0.0
|
| 25 |
+
return len(candidate & target) / max(len(target), 1)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def score_issue_match(description: str, category: IssueCategory | None, issue: GroundTruthIssue) -> float:
    """Fuzzy 0-1 similarity between a claimed issue and one ground-truth issue.

    Weighted blend: 60% keyword coverage, 25% coverage of the canonical
    description's tokens, plus a flat 0.2 bonus for the right category.
    """
    claimed_tokens = tokenize(description)
    keyword_coverage = _set_overlap(claimed_tokens, set(issue.keywords))
    description_coverage = _set_overlap(claimed_tokens, tokenize(issue.description))
    bonus = 0.2 if category == issue.category else 0.0
    blended = (keyword_coverage * 0.6) + (description_coverage * 0.25) + bonus
    return clamp(blended, 0.0, 1.0)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def match_issue(
    action: SQLReviewAction,
    ground_truth_issues: list[GroundTruthIssue],
    already_found_ids: set[str],
) -> tuple[GroundTruthIssue | None, float]:
    """Find the not-yet-found ground-truth issue best matching *action*.

    Returns (issue, score). The issue is None when no description was given
    or the best score falls below the 0.35 acceptance threshold.
    """
    description = action.issue_description
    if not description:
        return None, 0.0

    top_issue: GroundTruthIssue | None = None
    top_score = 0.0
    # Issues already credited this episode are excluded from matching.
    candidates = (issue for issue in ground_truth_issues if issue.id not in already_found_ids)
    for issue in candidates:
        candidate_score = score_issue_match(description, action.issue_category, issue)
        if candidate_score > top_score:
            top_score = candidate_score
            top_issue = issue

    if top_issue is None or top_score < 0.35:
        return None, top_score
    return top_issue, top_score
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def validate_fix(suggested_fix: str | None, issue: GroundTruthIssue) -> bool:
    """Accept a fix whose tokens cover >= 50% of the canonical fix, or
    >= 60% of the issue description."""
    if not suggested_fix:
        return False
    proposed_tokens = tokenize(suggested_fix)
    canonical_tokens = tokenize(issue.fix)
    if not proposed_tokens or not canonical_tokens:
        return False
    if _set_overlap(proposed_tokens, canonical_tokens) >= 0.5:
        return True
    return _set_overlap(proposed_tokens, tokenize(issue.description)) >= 0.6
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def grade_episode(
    found_issue_ids: set[str],
    ground_truth_issues: list[GroundTruthIssue],
    total_steps: int,
    max_steps: int,
    false_positive_count: int,
) -> float:
    """Severity-weighted final score in [0, 1] for a finished episode.

    Coverage = found severity / total severity, plus up to 0.1 for finishing
    under the step budget, minus 0.05 per false positive.
    """
    if not ground_truth_issues:
        # Clean-query task: only false positives can lower the score.
        if false_positive_count == 0:
            return 1.0
        return clamp(1.0 - (0.1 * false_positive_count), 0.0, 1.0)

    total_severity = sum(issue.severity for issue in ground_truth_issues)
    found_severity = sum(issue.severity for issue in ground_truth_issues if issue.id in found_issue_ids)
    coverage = found_severity / total_severity if total_severity else 0.0
    # Small bonus for using fewer steps than the budget allows.
    early_finish_bonus = max(0.0, 0.1 * (1 - (total_steps / max(max_steps, 1))))
    penalty = 0.05 * false_positive_count
    raw_score = coverage + early_finish_bonus - penalty
    return clamp(raw_score, 0.0, 1.0)
|
| 91 |
+
|
server/reward.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from sql_query_reviewer.models import GroundTruthIssue, SQLReviewAction
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def compute_reward(
    action: SQLReviewAction,
    matched_issue: GroundTruthIssue | None,
    *,
    fix_valid: bool = False,
    duplicate_issue: bool = False,
    remaining_unfound: int = 0,
    has_previous_issue: bool = False,
) -> float:
    """Return the per-step reward for a single review action.

    - identify_issue: rewarded by matched-issue severity (capped), with small
      bonuses for a valid fix and high confidence; penalized for duplicates
      and non-matches.
    - suggest_fix: rewarded only when a prior issue exists and the fix is valid.
    - approve: rewarded when no issues remain; penalized per unfound issue.
    - anything else: neutral.
    """
    kind = action.action_type

    if kind == "approve":
        if remaining_unfound == 0:
            return 0.2
        # -0.15 per missed issue, floored at -1.0.
        return max(-1.0, -0.15 * remaining_unfound)

    if kind == "suggest_fix":
        if not has_previous_issue:
            return -0.05
        return 0.1 if fix_valid else 0.0

    if kind == "identify_issue":
        if duplicate_issue:
            return -0.02
        if matched_issue is None:
            return -0.1
        reward = min(matched_issue.severity, 0.35)
        if fix_valid:
            reward += 0.08
        reward += min(0.05, action.confidence * 0.05)
        # Total identify reward is capped at 0.4 regardless of bonuses.
        return min(reward, 0.4)

    return 0.0
|
| 36 |
+
|
sql_query_reviewer/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sql_query_reviewer.client import SQLReviewEnv, SyncSQLReviewEnv
|
| 2 |
+
from sql_query_reviewer.models import (
|
| 3 |
+
GroundTruthIssue,
|
| 4 |
+
IdentifiedIssue,
|
| 5 |
+
ResetRequest,
|
| 6 |
+
SQLReviewAction,
|
| 7 |
+
SQLReviewObservation,
|
| 8 |
+
SQLReviewState,
|
| 9 |
+
StepResult,
|
| 10 |
+
TaskRecord,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
# Public package API: re-exports the client classes and all request/response
# models so callers can `from sql_query_reviewer import ...` directly.
__all__ = [
    "GroundTruthIssue",
    "IdentifiedIssue",
    "ResetRequest",
    "SQLReviewAction",
    "SQLReviewEnv",
    "SQLReviewObservation",
    "SQLReviewState",
    "StepResult",
    "SyncSQLReviewEnv",
    "TaskRecord",
]
|
| 25 |
+
|
sql_query_reviewer/client.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
import httpx
|
| 6 |
+
|
| 7 |
+
from sql_query_reviewer.models import ResetRequest, SQLReviewAction, SQLReviewState, StepResult
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class SQLReviewEnv:
    """Async httpx-based client for the SQL review environment HTTP API.

    Must be used as an async context manager; the underlying connection pool
    is opened on entry and disposed on exit.
    """

    def __init__(self, base_url: str, timeout: float = 30.0) -> None:
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._client: httpx.AsyncClient | None = None

    async def __aenter__(self) -> "SQLReviewEnv":
        # Open the pooled HTTP client on context entry.
        self._client = httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout)
        return self

    async def __aexit__(self, *_: Any) -> None:
        await self.close()

    async def close(self) -> None:
        """Dispose of the HTTP client if one is open; safe to call repeatedly."""
        client, self._client = self._client, None
        if client is not None:
            await client.aclose()

    def sync(self) -> "SyncSQLReviewEnv":
        """Return a blocking client configured with the same endpoint settings."""
        return SyncSQLReviewEnv(base_url=self.base_url, timeout=self.timeout)

    async def reset(self, task_id: str | None = None) -> StepResult:
        """Start a new episode, optionally pinned to a specific task id."""
        payload = ResetRequest(task_id=task_id).model_dump(exclude_none=True)
        resp = await self._require_client().post("/reset", json=payload)
        resp.raise_for_status()
        return StepResult.model_validate(resp.json())

    async def step(self, action: SQLReviewAction) -> StepResult:
        """Apply one review action to the current episode."""
        resp = await self._require_client().post("/step", json=action.model_dump(exclude_none=True))
        resp.raise_for_status()
        return StepResult.model_validate(resp.json())

    async def state(self) -> SQLReviewState:
        """Fetch the server-side view of the current episode state."""
        resp = await self._require_client().get("/state")
        resp.raise_for_status()
        return SQLReviewState.model_validate(resp.json())

    def _require_client(self) -> httpx.AsyncClient:
        # Guard against use outside the async context manager.
        if self._client is None:
            raise RuntimeError("Use SQLReviewEnv as an async context manager before calling it.")
        return self._client
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class SyncSQLReviewEnv:
    """Blocking httpx-based client mirroring :class:`SQLReviewEnv`.

    Must be used as a (synchronous) context manager; the HTTP client is
    created on entry and closed on exit.
    """

    def __init__(self, base_url: str, timeout: float = 30.0) -> None:
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._client: httpx.Client | None = None

    def __enter__(self) -> "SyncSQLReviewEnv":
        # Open the pooled HTTP client on context entry.
        self._client = httpx.Client(base_url=self.base_url, timeout=self.timeout)
        return self

    def __exit__(self, *_: Any) -> None:
        self.close()

    def close(self) -> None:
        """Dispose of the HTTP client if one is open; safe to call repeatedly."""
        client, self._client = self._client, None
        if client is not None:
            client.close()

    def reset(self, task_id: str | None = None) -> StepResult:
        """Start a new episode, optionally pinned to a specific task id."""
        payload = ResetRequest(task_id=task_id).model_dump(exclude_none=True)
        resp = self._require_client().post("/reset", json=payload)
        resp.raise_for_status()
        return StepResult.model_validate(resp.json())

    def step(self, action: SQLReviewAction) -> StepResult:
        """Apply one review action to the current episode."""
        resp = self._require_client().post("/step", json=action.model_dump(exclude_none=True))
        resp.raise_for_status()
        return StepResult.model_validate(resp.json())

    def state(self) -> SQLReviewState:
        """Fetch the server-side view of the current episode state."""
        resp = self._require_client().get("/state")
        resp.raise_for_status()
        return SQLReviewState.model_validate(resp.json())

    def _require_client(self) -> httpx.Client:
        # Guard against use outside the context manager.
        if self._client is None:
            raise RuntimeError("Use SyncSQLReviewEnv as a context manager before calling it.")
        return self._client
|
| 95 |
+
|
sql_query_reviewer/models.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any, Literal
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
| 6 |
+
|
| 7 |
+
# Episode difficulty buckets, matching the tasks/{easy,medium,hard}_tasks.json files.
Difficulty = Literal["easy", "medium", "hard"]
# Action verbs an agent may submit on each step.
ActionType = Literal["identify_issue", "suggest_fix", "approve", "request_more_context"]
# Issue taxonomy shared by ground-truth tasks and agent actions.
IssueCategory = Literal["syntax", "performance", "security", "logic", "style"]
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class StrictModel(BaseModel):
    """Shared base model: rejects unknown fields and accepts field-name population for aliased fields."""

    model_config = ConfigDict(extra="forbid", populate_by_name=True)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class GroundTruthIssue(StrictModel):
    """A known defect in a task's SQL query, used for grading agent findings."""

    id: str = Field(min_length=1)
    category: IssueCategory
    description: str = Field(min_length=1)
    # Weight in (0, 1] used for reward and severity-weighted coverage scoring.
    severity: float = Field(gt=0.0, le=1.0)
    # Canonical corrected SQL for this issue.
    fix: str = Field(min_length=1)
    keywords: list[str] = Field(default_factory=list)

    @field_validator("keywords")
    @classmethod
    def normalize_keywords(cls, value: list[str]) -> list[str]:
        """Lowercase, strip, and de-duplicate keywords while preserving order."""
        deduped: list[str] = []
        for keyword in value:
            normalized = keyword.strip().lower()
            # Skip blanks and repeats; first occurrence wins.
            if normalized and normalized not in deduped:
                deduped.append(normalized)
        return deduped
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class TaskRecord(StrictModel):
    """One review task loaded from the tasks/*.json fixtures."""

    task_id: str = Field(min_length=1)
    difficulty: Difficulty
    # The SQL under review (may intentionally contain the issues below).
    query: str = Field(min_length=1)
    # Table -> column -> column-type text; serialized under the "schema" key.
    schema_info: dict[str, dict[str, str]] = Field(default_factory=dict, alias="schema")
    context: str = Field(min_length=1)
    ground_truth_issues: list[GroundTruthIssue] = Field(default_factory=list)
    # Step budget for the episode (tasks use 4-7).
    max_steps: int = Field(ge=1, le=12)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class IdentifiedIssue(StrictModel):
    """An issue the agent has successfully matched during an episode."""

    issue_id: str = Field(min_length=1)
    category: IssueCategory
    description: str = Field(min_length=1)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class SQLReviewAction(StrictModel):
    """A single agent action posted to /step.

    Field requirements depend on ``action_type`` and are enforced by
    :meth:`validate_action`.
    """

    action_type: ActionType
    issue_category: IssueCategory | None = None
    issue_description: str | None = None
    suggested_fix: str | None = None
    # Agent's self-reported confidence in [0, 1]; contributes a small reward bonus.
    confidence: float = Field(default=0.5, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def validate_action(self) -> "SQLReviewAction":
        """Reject actions missing the fields their action_type requires."""
        if self.action_type == "identify_issue":
            if not self.issue_category or not self.issue_description:
                raise ValueError("identify_issue requires issue_category and issue_description")
        elif self.action_type == "suggest_fix":
            if not self.suggested_fix:
                raise ValueError("suggest_fix requires suggested_fix")
        return self
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class SQLReviewObservation(StrictModel):
    """What the agent sees after reset/step: the task plus episode progress."""

    query: str
    schema_info: dict[str, dict[str, str]] = Field(default_factory=dict)
    context: str
    issues_found_so_far: list[IdentifiedIssue] = Field(default_factory=list)
    # Steps left in the episode budget.
    remaining_actions: int = Field(ge=0)
    difficulty: Difficulty
    # Human-readable message describing the result of the last action.
    feedback: str
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class SQLReviewState(StrictModel):
    """Server-side episode state exposed via GET /state."""

    task_id: str
    step_count: int = Field(default=0, ge=0)
    issues_identified: list[IdentifiedIssue] = Field(default_factory=list)
    # Cumulative per-step reward over the episode.
    total_reward: float = 0.0
    done: bool = False
    approved: bool = False
    fixes_suggested: list[str] = Field(default_factory=list)
    false_positive_count: int = Field(default=0, ge=0)
    # Episode grade in [0, 1]; None until the episode finishes.
    final_score: float | None = Field(default=None, ge=0.0, le=1.0)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class StepResult(StrictModel):
    """Response payload for /reset and /step: observation, reward, done flag, and extras."""

    observation: SQLReviewObservation
    reward: float
    done: bool
    # Auxiliary metadata (e.g. matched issue ids) that varies by action.
    info: dict[str, Any] = Field(default_factory=dict)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class ResetRequest(StrictModel):
    """Request body for /reset; omitting task_id lets the server choose a task."""

    task_id: str | None = None
|
tasks/easy_tasks.json
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"task_id": "easy_001",
|
| 4 |
+
"difficulty": "easy",
|
| 5 |
+
"query": "SELCT * FORM users WEHRE id = 1;",
|
| 6 |
+
"schema": {
|
| 7 |
+
"users": {
|
| 8 |
+
"id": "INT PRIMARY KEY",
|
| 9 |
+
"name": "VARCHAR(255)",
|
| 10 |
+
"email": "VARCHAR(255)"
|
| 11 |
+
}
|
| 12 |
+
},
|
| 13 |
+
"context": "Fetch a user profile by id for the account page.",
|
| 14 |
+
"ground_truth_issues": [
|
| 15 |
+
{
|
| 16 |
+
"id": "easy_001_select",
|
| 17 |
+
"category": "syntax",
|
| 18 |
+
"description": "SELCT should be SELECT.",
|
| 19 |
+
"severity": 0.35,
|
| 20 |
+
"fix": "SELECT * FROM users WHERE id = 1;",
|
| 21 |
+
"keywords": ["selct", "select", "misspelled keyword", "syntax"]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"id": "easy_001_from",
|
| 25 |
+
"category": "syntax",
|
| 26 |
+
"description": "FORM should be FROM.",
|
| 27 |
+
"severity": 0.35,
|
| 28 |
+
"fix": "SELECT * FROM users WHERE id = 1;",
|
| 29 |
+
"keywords": ["form", "from", "misspelled keyword", "syntax"]
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"id": "easy_001_where",
|
| 33 |
+
"category": "syntax",
|
| 34 |
+
"description": "WEHRE should be WHERE.",
|
| 35 |
+
"severity": 0.25,
|
| 36 |
+
"fix": "SELECT * FROM users WHERE id = 1;",
|
| 37 |
+
"keywords": ["wehre", "where", "misspelled keyword", "syntax"]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"id": "easy_001_projection",
|
| 41 |
+
"category": "performance",
|
| 42 |
+
"description": "SELECT * fetches unnecessary columns for a profile lookup.",
|
| 43 |
+
"severity": 0.15,
|
| 44 |
+
"fix": "SELECT id, name, email FROM users WHERE id = 1;",
|
| 45 |
+
"keywords": ["select *", "unnecessary columns", "projection", "performance"]
|
| 46 |
+
}
|
| 47 |
+
],
|
| 48 |
+
"max_steps": 5
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"task_id": "easy_002",
|
| 52 |
+
"difficulty": "easy",
|
| 53 |
+
"query": "SELECT id, email users WHERE active = 1;",
|
| 54 |
+
"schema": {
|
| 55 |
+
"users": {
|
| 56 |
+
"id": "INT PRIMARY KEY",
|
| 57 |
+
"email": "VARCHAR(255)",
|
| 58 |
+
"active": "BOOLEAN"
|
| 59 |
+
}
|
| 60 |
+
},
|
| 61 |
+
"context": "List active user emails for a notification job.",
|
| 62 |
+
"ground_truth_issues": [
|
| 63 |
+
{
|
| 64 |
+
"id": "easy_002_missing_from",
|
| 65 |
+
"category": "syntax",
|
| 66 |
+
"description": "The query is missing the FROM clause before users.",
|
| 67 |
+
"severity": 0.6,
|
| 68 |
+
"fix": "SELECT id, email FROM users WHERE active = 1;",
|
| 69 |
+
"keywords": ["missing from", "from clause", "syntax", "users"]
|
| 70 |
+
}
|
| 71 |
+
],
|
| 72 |
+
"max_steps": 4
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"task_id": "easy_003",
|
| 76 |
+
"difficulty": "easy",
|
| 77 |
+
"query": "SELECT order_id, total FROM orders WHERE shipped_at = NULL;",
|
| 78 |
+
"schema": {
|
| 79 |
+
"orders": {
|
| 80 |
+
"order_id": "INT PRIMARY KEY",
|
| 81 |
+
"total": "DECIMAL(10,2)",
|
| 82 |
+
"shipped_at": "TIMESTAMP NULL"
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"context": "Find orders that still need to ship.",
|
| 86 |
+
"ground_truth_issues": [
|
| 87 |
+
{
|
| 88 |
+
"id": "easy_003_null_check",
|
| 89 |
+
"category": "logic",
|
| 90 |
+
"description": "NULL must be compared with IS NULL instead of = NULL.",
|
| 91 |
+
"severity": 0.7,
|
| 92 |
+
"fix": "SELECT order_id, total FROM orders WHERE shipped_at IS NULL;",
|
| 93 |
+
"keywords": ["is null", "= null", "null comparison", "logic"]
|
| 94 |
+
}
|
| 95 |
+
],
|
| 96 |
+
"max_steps": 4
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"task_id": "easy_004",
|
| 100 |
+
"difficulty": "easy",
|
| 101 |
+
"query": "SELECT name FROM customers WHERE city = 'Boston;",
|
| 102 |
+
"schema": {
|
| 103 |
+
"customers": {
|
| 104 |
+
"id": "INT PRIMARY KEY",
|
| 105 |
+
"name": "VARCHAR(255)",
|
| 106 |
+
"city": "VARCHAR(128)"
|
| 107 |
+
}
|
| 108 |
+
},
|
| 109 |
+
"context": "Filter customers who live in Boston.",
|
| 110 |
+
"ground_truth_issues": [
|
| 111 |
+
{
|
| 112 |
+
"id": "easy_004_unclosed_quote",
|
| 113 |
+
"category": "syntax",
|
| 114 |
+
"description": "The string literal is not terminated with a closing quote.",
|
| 115 |
+
"severity": 0.75,
|
| 116 |
+
"fix": "SELECT name FROM customers WHERE city = 'Boston';",
|
| 117 |
+
"keywords": ["unclosed quote", "unterminated string", "syntax", "quote"]
|
| 118 |
+
}
|
| 119 |
+
],
|
| 120 |
+
"max_steps": 4
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"task_id": "easy_005",
|
| 124 |
+
"difficulty": "easy",
|
| 125 |
+
"query": "SELECT id, statuz FROM orders WHERE status = 'paid';",
|
| 126 |
+
"schema": {
|
| 127 |
+
"orders": {
|
| 128 |
+
"id": "INT PRIMARY KEY",
|
| 129 |
+
"status": "VARCHAR(32)",
|
| 130 |
+
"total": "DECIMAL(10,2)",
|
| 131 |
+
"created_at": "TIMESTAMP"
|
| 132 |
+
}
|
| 133 |
+
},
|
| 134 |
+
"context": "List paid orders for revenue accounting.",
|
| 135 |
+
"ground_truth_issues": [
|
| 136 |
+
{
|
| 137 |
+
"id": "easy_005_bad_column",
|
| 138 |
+
"category": "logic",
|
| 139 |
+
"description": "Column statuz does not exist; the intended column is status.",
|
| 140 |
+
"severity": 0.65,
|
| 141 |
+
"fix": "SELECT id, status FROM orders WHERE status = 'paid';",
|
| 142 |
+
"keywords": ["unknown column", "statuz", "status", "column name"]
|
| 143 |
+
}
|
| 144 |
+
],
|
| 145 |
+
"max_steps": 4
|
| 146 |
+
}
|
| 147 |
+
]
|
| 148 |
+
|
tasks/hard_tasks.json
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"task_id": "hard_001",
|
| 4 |
+
"difficulty": "hard",
|
| 5 |
+
"query": "SELECT * FROM users WHERE email = '${user_email}' AND password = '${password}';",
|
| 6 |
+
"schema": {
|
| 7 |
+
"users": {
|
| 8 |
+
"id": "INT PRIMARY KEY",
|
| 9 |
+
"email": "VARCHAR(255) UNIQUE",
|
| 10 |
+
"password_hash": "VARCHAR(255)",
|
| 11 |
+
"role": "VARCHAR(32)",
|
| 12 |
+
"created_at": "TIMESTAMP"
|
| 13 |
+
}
|
| 14 |
+
},
|
| 15 |
+
"context": "Authenticate a user during login.",
|
| 16 |
+
"ground_truth_issues": [
|
| 17 |
+
{
|
| 18 |
+
"id": "hard_001_sql_injection",
|
| 19 |
+
"category": "security",
|
| 20 |
+
"description": "Interpolating user_email and password directly into the SQL creates a SQL injection vulnerability.",
|
| 21 |
+
"severity": 1.0,
|
| 22 |
+
"fix": "SELECT id, email, role FROM users WHERE email = ? AND password_hash = ?;",
|
| 23 |
+
"keywords": ["sql injection", "interpolation", "user input", "parameterized", "security"]
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"id": "hard_001_select_star_sensitive",
|
| 27 |
+
"category": "security",
|
| 28 |
+
"description": "SELECT * returns sensitive columns such as password hashes that the login flow does not need.",
|
| 29 |
+
"severity": 0.4,
|
| 30 |
+
"fix": "SELECT id, email, role FROM users WHERE email = ? AND password_hash = ?;",
|
| 31 |
+
"keywords": ["select *", "sensitive columns", "password hash", "least privilege", "security"]
|
| 32 |
+
}
|
| 33 |
+
],
|
| 34 |
+
"max_steps": 6
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"task_id": "hard_002",
|
| 38 |
+
"difficulty": "hard",
|
| 39 |
+
"query": "SELECT id, email FROM customers UNION SELECT id, secret_value FROM admin_secrets;",
|
| 40 |
+
"schema": {
|
| 41 |
+
"customers": {
|
| 42 |
+
"id": "INT PRIMARY KEY",
|
| 43 |
+
"email": "VARCHAR(255)"
|
| 44 |
+
},
|
| 45 |
+
"admin_secrets": {
|
| 46 |
+
"id": "INT PRIMARY KEY",
|
| 47 |
+
"secret_value": "TEXT"
|
| 48 |
+
}
|
| 49 |
+
},
|
| 50 |
+
"context": "Prepare a data export for a customer marketing campaign.",
|
| 51 |
+
"ground_truth_issues": [
|
| 52 |
+
{
|
| 53 |
+
"id": "hard_002_secret_exfiltration",
|
| 54 |
+
"category": "security",
|
| 55 |
+
"description": "The UNION includes admin_secrets and leaks privileged data into a customer-facing export.",
|
| 56 |
+
"severity": 0.95,
|
| 57 |
+
"fix": "SELECT id, email FROM customers;",
|
| 58 |
+
"keywords": ["union", "admin_secrets", "secret_value", "data leakage", "security"]
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"id": "hard_002_mixed_data_domains",
|
| 62 |
+
"category": "logic",
|
| 63 |
+
"description": "The query mixes unrelated datasets with incompatible semantics, producing an invalid export.",
|
| 64 |
+
"severity": 0.45,
|
| 65 |
+
"fix": "SELECT id, email FROM customers;",
|
| 66 |
+
"keywords": ["union", "invalid export", "mixed dataset", "logic"]
|
| 67 |
+
}
|
| 68 |
+
],
|
| 69 |
+
"max_steps": 6
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"task_id": "hard_003",
|
| 73 |
+
"difficulty": "hard",
|
| 74 |
+
"query": "SELECT c.id, c.full_name, c.ssn, c.email, t.subject FROM customers c JOIN support_tickets t ON t.customer_id = c.id WHERE t.status = 'open';",
|
| 75 |
+
"schema": {
|
| 76 |
+
"customers": {
|
| 77 |
+
"id": "INT PRIMARY KEY",
|
| 78 |
+
"full_name": "VARCHAR(255)",
|
| 79 |
+
"ssn": "VARCHAR(32)",
|
| 80 |
+
"email": "VARCHAR(255)"
|
| 81 |
+
},
|
| 82 |
+
"support_tickets": {
|
| 83 |
+
"id": "INT PRIMARY KEY",
|
| 84 |
+
"customer_id": "INT INDEX",
|
| 85 |
+
"subject": "VARCHAR(255)",
|
| 86 |
+
"status": "VARCHAR(32)"
|
| 87 |
+
}
|
| 88 |
+
},
|
| 89 |
+
"context": "Show open support tickets to an agent dashboard.",
|
| 90 |
+
"ground_truth_issues": [
|
| 91 |
+
{
|
| 92 |
+
"id": "hard_003_pii_leak",
|
| 93 |
+
"category": "security",
|
| 94 |
+
"description": "The dashboard query exposes SSNs even though the ticket workflow only needs identity and ticket context.",
|
| 95 |
+
"severity": 0.9,
|
| 96 |
+
"fix": "SELECT c.id, c.full_name, c.email, t.subject FROM customers c JOIN support_tickets t ON t.customer_id = c.id WHERE t.status = 'open';",
|
| 97 |
+
"keywords": ["ssn", "pii", "sensitive data", "least privilege", "security"]
|
| 98 |
+
}
|
| 99 |
+
],
|
| 100 |
+
"max_steps": 6
|
| 101 |
+
},
|
| 102 |
+
{
|
| 103 |
+
"task_id": "hard_004",
|
| 104 |
+
"difficulty": "hard",
|
| 105 |
+
"query": "SELECT e1.department_id, e1.id, COUNT(e2.salary) + 1 AS salary_rank FROM employees e1 LEFT JOIN employees e2 ON e1.department_id = e2.department_id AND e2.salary > e1.salary GROUP BY e1.department_id, e1.id;",
|
| 106 |
+
"schema": {
|
| 107 |
+
"employees": {
|
| 108 |
+
"id": "INT PRIMARY KEY",
|
| 109 |
+
"department_id": "INT INDEX",
|
| 110 |
+
"salary": "DECIMAL(10,2)"
|
| 111 |
+
}
|
| 112 |
+
},
|
| 113 |
+
"context": "Rank employees by salary within each department.",
|
| 114 |
+
"ground_truth_issues": [
|
| 115 |
+
{
|
| 116 |
+
"id": "hard_004_self_join_ranking",
|
| 117 |
+
"category": "performance",
|
| 118 |
+
"description": "The self-join ranking pattern is expensive and should use a window function such as DENSE_RANK().",
|
| 119 |
+
"severity": 0.8,
|
| 120 |
+
"fix": "SELECT department_id, id, DENSE_RANK() OVER (PARTITION BY department_id ORDER BY salary DESC) AS salary_rank FROM employees;",
|
| 121 |
+
"keywords": ["self join", "window function", "dense_rank", "ranking", "performance"]
|
| 122 |
+
}
|
| 123 |
+
],
|
| 124 |
+
"max_steps": 7
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"task_id": "hard_005",
|
| 128 |
+
"difficulty": "hard",
|
| 129 |
+
"query": "UPDATE accounts SET balance = balance - 100 WHERE user_id = 10; UPDATE accounts SET balance = balance + 100 WHERE user_id = 11;",
|
| 130 |
+
"schema": {
|
| 131 |
+
"accounts": {
|
| 132 |
+
"user_id": "INT PRIMARY KEY",
|
| 133 |
+
"balance": "DECIMAL(10,2)"
|
| 134 |
+
}
|
| 135 |
+
},
|
| 136 |
+
"context": "Transfer money between two account balances.",
|
| 137 |
+
"ground_truth_issues": [
|
| 138 |
+
{
|
| 139 |
+
"id": "hard_005_missing_transaction",
|
| 140 |
+
"category": "security",
|
| 141 |
+
"description": "The transfer uses two updates without a transaction, so a partial failure can corrupt balances.",
|
| 142 |
+
"severity": 0.9,
|
| 143 |
+
"fix": "BEGIN; UPDATE accounts SET balance = balance - 100 WHERE user_id = 10 AND balance >= 100; UPDATE accounts SET balance = balance + 100 WHERE user_id = 11; COMMIT;",
|
| 144 |
+
"keywords": ["transaction", "partial failure", "atomic", "commit", "security"]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"id": "hard_005_no_balance_guard",
|
| 148 |
+
"category": "logic",
|
| 149 |
+
"description": "The debit statement does not verify sufficient funds before subtracting the balance.",
|
| 150 |
+
"severity": 0.55,
|
| 151 |
+
"fix": "BEGIN; UPDATE accounts SET balance = balance - 100 WHERE user_id = 10 AND balance >= 100; UPDATE accounts SET balance = balance + 100 WHERE user_id = 11; COMMIT;",
|
| 152 |
+
"keywords": ["balance guard", "insufficient funds", "where balance >=", "logic"]
|
| 153 |
+
}
|
| 154 |
+
],
|
| 155 |
+
"max_steps": 7
|
| 156 |
+
}
|
| 157 |
+
]
|
| 158 |
+
|
tasks/medium_tasks.json
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"task_id": "medium_001",
|
| 4 |
+
"difficulty": "medium",
|
| 5 |
+
"query": "SELECT * FROM events ORDER BY created_at DESC;",
|
| 6 |
+
"schema": {
|
| 7 |
+
"events": {
|
| 8 |
+
"id": "BIGINT PRIMARY KEY",
|
| 9 |
+
"event_name": "VARCHAR(255)",
|
| 10 |
+
"payload": "JSON",
|
| 11 |
+
"created_at": "TIMESTAMP INDEX",
|
| 12 |
+
"actor_id": "BIGINT",
|
| 13 |
+
"metadata": "JSON"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"context": "Show the most recent events on an admin dashboard.",
|
| 17 |
+
"ground_truth_issues": [
|
| 18 |
+
{
|
| 19 |
+
"id": "medium_001_select_star",
|
| 20 |
+
"category": "performance",
|
| 21 |
+
"description": "SELECT * pulls a wide payload when the dashboard only needs a few columns.",
|
| 22 |
+
"severity": 0.3,
|
| 23 |
+
"fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
|
| 24 |
+
"keywords": ["select *", "wide table", "projection", "performance"]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"id": "medium_001_missing_limit",
|
| 28 |
+
"category": "performance",
|
| 29 |
+
"description": "The dashboard query is missing a LIMIT and can scan far more rows than necessary.",
|
| 30 |
+
"severity": 0.3,
|
| 31 |
+
"fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
|
| 32 |
+
"keywords": ["limit", "unbounded query", "dashboard", "performance"]
|
| 33 |
+
}
|
| 34 |
+
],
|
| 35 |
+
"max_steps": 5
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"task_id": "medium_002",
|
| 39 |
+
"difficulty": "medium",
|
| 40 |
+
"query": "SELECT c.id, c.name, (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.id) AS order_count FROM customers c;",
|
| 41 |
+
"schema": {
|
| 42 |
+
"customers": {
|
| 43 |
+
"id": "INT PRIMARY KEY",
|
| 44 |
+
"name": "VARCHAR(255)"
|
| 45 |
+
},
|
| 46 |
+
"orders": {
|
| 47 |
+
"id": "INT PRIMARY KEY",
|
| 48 |
+
"customer_id": "INT INDEX",
|
| 49 |
+
"total": "DECIMAL(10,2)"
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
"context": "Show each customer with the number of orders they have placed.",
|
| 53 |
+
"ground_truth_issues": [
|
| 54 |
+
{
|
| 55 |
+
"id": "medium_002_correlated_subquery",
|
| 56 |
+
"category": "performance",
|
| 57 |
+
"description": "The correlated subquery re-counts orders per row and should be rewritten as a join with GROUP BY.",
|
| 58 |
+
"severity": 0.6,
|
| 59 |
+
"fix": "SELECT c.id, c.name, COUNT(o.id) AS order_count FROM customers c LEFT JOIN orders o ON o.customer_id = c.id GROUP BY c.id, c.name;",
|
| 60 |
+
"keywords": ["correlated subquery", "group by", "join", "count", "performance"]
|
| 61 |
+
}
|
| 62 |
+
],
|
| 63 |
+
"max_steps": 6
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"task_id": "medium_003",
|
| 67 |
+
"difficulty": "medium",
|
| 68 |
+
"query": "SELECT DISTINCT email FROM users WHERE email IS NOT NULL;",
|
| 69 |
+
"schema": {
|
| 70 |
+
"users": {
|
| 71 |
+
"id": "INT PRIMARY KEY",
|
| 72 |
+
"email": "VARCHAR(255) UNIQUE",
|
| 73 |
+
"last_login_at": "TIMESTAMP NULL"
|
| 74 |
+
}
|
| 75 |
+
},
|
| 76 |
+
"context": "Export non-null user emails for a CRM sync.",
|
| 77 |
+
"ground_truth_issues": [
|
| 78 |
+
{
|
| 79 |
+
"id": "medium_003_redundant_distinct",
|
| 80 |
+
"category": "performance",
|
| 81 |
+
"description": "DISTINCT is redundant because users.email is already unique.",
|
| 82 |
+
"severity": 0.45,
|
| 83 |
+
"fix": "SELECT email FROM users WHERE email IS NOT NULL;",
|
| 84 |
+
"keywords": ["distinct", "unique", "redundant", "email", "performance"]
|
| 85 |
+
}
|
| 86 |
+
],
|
| 87 |
+
"max_steps": 5
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"task_id": "medium_004",
|
| 91 |
+
"difficulty": "medium",
|
| 92 |
+
"query": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE DATE(o.created_at) = '2026-04-10';",
|
| 93 |
+
"schema": {
|
| 94 |
+
"orders": {
|
| 95 |
+
"id": "INT PRIMARY KEY",
|
| 96 |
+
"user_id": "INT INDEX",
|
| 97 |
+
"created_at": "TIMESTAMP INDEX",
|
| 98 |
+
"total": "DECIMAL(10,2)"
|
| 99 |
+
},
|
| 100 |
+
"users": {
|
| 101 |
+
"id": "INT PRIMARY KEY",
|
| 102 |
+
"name": "VARCHAR(255)"
|
| 103 |
+
}
|
| 104 |
+
},
|
| 105 |
+
"context": "List orders placed on a specific date with the user name attached.",
|
| 106 |
+
"ground_truth_issues": [
|
| 107 |
+
{
|
| 108 |
+
"id": "medium_004_function_on_indexed_column",
|
| 109 |
+
"category": "performance",
|
| 110 |
+
"description": "Wrapping created_at with DATE() prevents efficient use of the created_at index.",
|
| 111 |
+
"severity": 0.6,
|
| 112 |
+
"fix": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE o.created_at >= '2026-04-10' AND o.created_at < '2026-04-11';",
|
| 113 |
+
"keywords": ["date()", "function on column", "index", "range predicate", "performance"]
|
| 114 |
+
}
|
| 115 |
+
],
|
| 116 |
+
"max_steps": 6
|
| 117 |
+
},
|
| 118 |
+
{
|
| 119 |
+
"task_id": "medium_005",
|
| 120 |
+
"difficulty": "medium",
|
| 121 |
+
"query": "SELECT id, name FROM products WHERE LOWER(name) LIKE '%pro%';",
|
| 122 |
+
"schema": {
|
| 123 |
+
"products": {
|
| 124 |
+
"id": "INT PRIMARY KEY",
|
| 125 |
+
"name": "VARCHAR(255) INDEX",
|
| 126 |
+
"category_id": "INT",
|
| 127 |
+
"price": "DECIMAL(10,2)"
|
| 128 |
+
}
|
| 129 |
+
},
|
| 130 |
+
"context": "Search products whose names contain the text pro.",
|
| 131 |
+
"ground_truth_issues": [
|
| 132 |
+
{
|
| 133 |
+
"id": "medium_005_lower_blocks_index",
|
| 134 |
+
"category": "performance",
|
| 135 |
+
"description": "Applying LOWER(name) on every row prevents the index on name from being used efficiently.",
|
| 136 |
+
"severity": 0.35,
|
| 137 |
+
"fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
|
| 138 |
+
"keywords": ["lower", "function on column", "index", "performance"]
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"id": "medium_005_leading_wildcard",
|
| 142 |
+
"category": "performance",
|
| 143 |
+
"description": "The leading wildcard in LIKE '%pro%' forces a full scan instead of an index-friendly prefix lookup.",
|
| 144 |
+
"severity": 0.35,
|
| 145 |
+
"fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
|
| 146 |
+
"keywords": ["leading wildcard", "%pro%", "full scan", "prefix lookup", "performance"]
|
| 147 |
+
}
|
| 148 |
+
],
|
| 149 |
+
"max_steps": 6
|
| 150 |
+
}
|
| 151 |
+
]
|
| 152 |
+
|
tests/test_api.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi.testclient import TestClient
|
| 2 |
+
|
| 3 |
+
from server.app import create_app
|
| 4 |
+
from server.environment import SQLReviewEnvironment
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def build_client() -> TestClient:
|
| 8 |
+
return TestClient(create_app(SQLReviewEnvironment()))
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_reset_returns_initial_observation() -> None:
|
| 12 |
+
client = build_client()
|
| 13 |
+
|
| 14 |
+
response = client.post("/reset", json={"task_id": "easy_001"})
|
| 15 |
+
|
| 16 |
+
assert response.status_code == 200
|
| 17 |
+
payload = response.json()
|
| 18 |
+
assert payload["observation"]["difficulty"] == "easy"
|
| 19 |
+
assert payload["reward"] == 0.0
|
| 20 |
+
assert payload["done"] is False
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_identify_issue_returns_positive_reward_for_match() -> None:
|
| 24 |
+
client = build_client()
|
| 25 |
+
client.post("/reset", json={"task_id": "easy_002"})
|
| 26 |
+
|
| 27 |
+
response = client.post(
|
| 28 |
+
"/step",
|
| 29 |
+
json={
|
| 30 |
+
"action_type": "identify_issue",
|
| 31 |
+
"issue_category": "syntax",
|
| 32 |
+
"issue_description": "The query is missing the FROM clause before users.",
|
| 33 |
+
"confidence": 0.95,
|
| 34 |
+
},
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
assert response.status_code == 200
|
| 38 |
+
payload = response.json()
|
| 39 |
+
assert payload["reward"] > 0
|
| 40 |
+
assert payload["info"]["issue_id"] == "easy_002_missing_from"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_suggest_fix_without_identifying_issue_is_penalized() -> None:
|
| 44 |
+
client = build_client()
|
| 45 |
+
client.post("/reset", json={"task_id": "easy_002"})
|
| 46 |
+
|
| 47 |
+
response = client.post(
|
| 48 |
+
"/step",
|
| 49 |
+
json={
|
| 50 |
+
"action_type": "suggest_fix",
|
| 51 |
+
"suggested_fix": "SELECT id, email FROM users WHERE active = 1;",
|
| 52 |
+
"confidence": 0.8,
|
| 53 |
+
},
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
assert response.status_code == 200
|
| 57 |
+
assert response.json()["reward"] < 0
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def test_approve_with_missed_issues_ends_episode_with_penalty() -> None:
|
| 61 |
+
client = build_client()
|
| 62 |
+
client.post("/reset", json={"task_id": "easy_001"})
|
| 63 |
+
|
| 64 |
+
response = client.post("/step", json={"action_type": "approve", "confidence": 0.8})
|
| 65 |
+
|
| 66 |
+
assert response.status_code == 200
|
| 67 |
+
payload = response.json()
|
| 68 |
+
assert payload["done"] is True
|
| 69 |
+
assert payload["reward"] < 0
|
| 70 |
+
assert payload["info"]["final_score"] is not None
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def test_identify_then_approve_can_finish_successfully() -> None:
|
| 74 |
+
client = build_client()
|
| 75 |
+
client.post("/reset", json={"task_id": "easy_002"})
|
| 76 |
+
client.post(
|
| 77 |
+
"/step",
|
| 78 |
+
json={
|
| 79 |
+
"action_type": "identify_issue",
|
| 80 |
+
"issue_category": "syntax",
|
| 81 |
+
"issue_description": "The query is missing the FROM clause before users.",
|
| 82 |
+
"confidence": 0.95,
|
| 83 |
+
},
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
response = client.post("/step", json={"action_type": "approve", "confidence": 0.9})
|
| 87 |
+
|
| 88 |
+
assert response.status_code == 200
|
| 89 |
+
payload = response.json()
|
| 90 |
+
assert payload["done"] is True
|
| 91 |
+
assert payload["reward"] > 0
|
| 92 |
+
assert payload["info"]["final_score"] is not None
|
| 93 |
+
|
tests/test_grader.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sql_query_reviewer.models import SQLReviewAction, TaskRecord
|
| 2 |
+
from server.grader import grade_episode, match_issue, validate_fix
|
| 3 |
+
from server.environment import SQLReviewEnvironment
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_match_issue_finds_expected_easy_issue() -> None:
|
| 7 |
+
environment = SQLReviewEnvironment()
|
| 8 |
+
task = environment.tasks["easy_002"]
|
| 9 |
+
action = SQLReviewAction(
|
| 10 |
+
action_type="identify_issue",
|
| 11 |
+
issue_category="syntax",
|
| 12 |
+
issue_description="The query is missing the FROM clause before users.",
|
| 13 |
+
confidence=0.95,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
match, score = match_issue(action, task.ground_truth_issues, set())
|
| 17 |
+
|
| 18 |
+
assert match is not None
|
| 19 |
+
assert match.id == "easy_002_missing_from"
|
| 20 |
+
assert score >= 0.35
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_validate_fix_accepts_expected_remediation() -> None:
|
| 24 |
+
environment = SQLReviewEnvironment()
|
| 25 |
+
task = environment.tasks["easy_003"]
|
| 26 |
+
assert validate_fix("SELECT order_id, total FROM orders WHERE shipped_at IS NULL;", task.ground_truth_issues[0])
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def test_grade_episode_is_deterministic_and_bounded() -> None:
|
| 30 |
+
environment = SQLReviewEnvironment()
|
| 31 |
+
task = environment.tasks["medium_001"]
|
| 32 |
+
|
| 33 |
+
first = grade_episode({"medium_001_select_star"}, task.ground_truth_issues, total_steps=2, max_steps=5, false_positive_count=1)
|
| 34 |
+
second = grade_episode({"medium_001_select_star"}, task.ground_truth_issues, total_steps=2, max_steps=5, false_positive_count=1)
|
| 35 |
+
|
| 36 |
+
assert first == second
|
| 37 |
+
assert 0.0 <= first <= 1.0
|
| 38 |
+
|
tests/test_inference.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from types import SimpleNamespace
|
| 2 |
+
|
| 3 |
+
import inference
|
| 4 |
+
from sql_query_reviewer.models import SQLReviewObservation, SQLReviewState, StepResult
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_extract_json_handles_code_fence() -> None:
|
| 8 |
+
payload = inference.extract_json(
|
| 9 |
+
"""```json
|
| 10 |
+
{"action_type":"approve","confidence":0.8}
|
| 11 |
+
```"""
|
| 12 |
+
)
|
| 13 |
+
assert payload["action_type"] == "approve"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_run_episode_emits_start_step_end_logs(capsys) -> None:
|
| 17 |
+
class DummyEnv:
|
| 18 |
+
def reset(self, task_id: str) -> StepResult:
|
| 19 |
+
return StepResult(
|
| 20 |
+
observation=SQLReviewObservation(
|
| 21 |
+
query="SELECT 1;",
|
| 22 |
+
schema_info={},
|
| 23 |
+
context="Health check query.",
|
| 24 |
+
issues_found_so_far=[],
|
| 25 |
+
remaining_actions=1,
|
| 26 |
+
difficulty="easy",
|
| 27 |
+
feedback="Review this query.",
|
| 28 |
+
),
|
| 29 |
+
reward=0.0,
|
| 30 |
+
done=False,
|
| 31 |
+
info={},
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
def step(self, action):
|
| 35 |
+
assert action.action_type == "approve"
|
| 36 |
+
return StepResult(
|
| 37 |
+
observation=SQLReviewObservation(
|
| 38 |
+
query="SELECT 1;",
|
| 39 |
+
schema_info={},
|
| 40 |
+
context="Health check query.",
|
| 41 |
+
issues_found_so_far=[],
|
| 42 |
+
remaining_actions=0,
|
| 43 |
+
difficulty="easy",
|
| 44 |
+
feedback="Query approved with full issue coverage.",
|
| 45 |
+
),
|
| 46 |
+
reward=0.2,
|
| 47 |
+
done=True,
|
| 48 |
+
info={},
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
def state(self) -> SQLReviewState:
|
| 52 |
+
return SQLReviewState(
|
| 53 |
+
task_id="easy_999",
|
| 54 |
+
step_count=1,
|
| 55 |
+
total_reward=0.2,
|
| 56 |
+
done=True,
|
| 57 |
+
approved=True,
|
| 58 |
+
final_score=1.0,
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
class DummyCompletions:
|
| 62 |
+
def create(self, **_kwargs):
|
| 63 |
+
return SimpleNamespace(
|
| 64 |
+
choices=[
|
| 65 |
+
SimpleNamespace(
|
| 66 |
+
message=SimpleNamespace(content='{"action_type":"approve","confidence":0.9}')
|
| 67 |
+
)
|
| 68 |
+
]
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
class DummyClient:
|
| 72 |
+
def __init__(self) -> None:
|
| 73 |
+
self.chat = SimpleNamespace(completions=DummyCompletions())
|
| 74 |
+
|
| 75 |
+
summary = inference.run_episode(DummyEnv(), DummyClient(), "dummy-model", "easy_999")
|
| 76 |
+
captured = capsys.readouterr().out
|
| 77 |
+
|
| 78 |
+
assert "[START]" in captured
|
| 79 |
+
assert "[STEP]" in captured
|
| 80 |
+
assert "[END]" in captured
|
| 81 |
+
assert summary["final_score"] == 1.0
|
| 82 |
+
|
tests/test_models.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from pydantic import ValidationError
|
| 3 |
+
|
| 4 |
+
from sql_query_reviewer.models import SQLReviewAction
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_identify_issue_requires_category_and_description() -> None:
|
| 8 |
+
with pytest.raises(ValidationError):
|
| 9 |
+
SQLReviewAction(action_type="identify_issue", confidence=0.8)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_suggest_fix_requires_fix_text() -> None:
|
| 13 |
+
with pytest.raises(ValidationError):
|
| 14 |
+
SQLReviewAction(action_type="suggest_fix")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_approve_action_is_valid_without_optional_fields() -> None:
|
| 18 |
+
action = SQLReviewAction(action_type="approve", confidence=0.9)
|
| 19 |
+
assert action.action_type == "approve"
|
| 20 |
+
assert action.issue_description is None
|
| 21 |
+
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|