hellinferno committed
Commit 90fc756 · 0 Parent(s)

Build SQL Query Reviewer environment
.dockerignore ADDED
@@ -0,0 +1,14 @@
.git
.github
.pytest_cache
.venv
__pycache__
*.pyc
.coverage
build
dist
files
htmlcov
outputs
tests
.github/workflows/ci.yml ADDED
@@ -0,0 +1,40 @@
name: CI

on:
  push:
    branches: ["main"]
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install package and test dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -e .[dev]

      - name: Run unit and integration tests
        run: pytest

      - name: Build Docker image
        run: docker build -t sql-query-reviewer .

      - name: Attempt OpenEnv validation
        run: |
          python -m pip install "git+https://github.com/meta-pytorch/OpenEnv.git" || true
          if command -v openenv >/dev/null 2>&1; then
            openenv validate
          else
            echo "OpenEnv CLI unavailable; skipping openenv validate"
          fi
.github/workflows/sync-to-hf.yml ADDED
@@ -0,0 +1,36 @@
name: Sync To Hugging Face

on:
  push:
    branches: ["main"]
  workflow_dispatch:

jobs:
  sync-to-space:
    runs-on: ubuntu-latest

    steps:
      - name: Check out repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true

      - name: Push repository to Hugging Face Space
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_SPACE_ID: ${{ vars.HF_SPACE_ID }}
          GITHUB_REPOSITORY_OWNER: ${{ github.repository_owner }}
        run: |
          set -euo pipefail
          if [ -z "${HF_TOKEN:-}" ]; then
            echo "HF_TOKEN is not configured; skipping Hugging Face sync."
            exit 0
          fi

          space_id="${HF_SPACE_ID:-${GITHUB_REPOSITORY_OWNER}/sql-query-reviewer}"

          git config user.email "actions@github.com"
          git config user.name "github-actions[bot]"
          git remote add hf "https://oauth2:${HF_TOKEN}@huggingface.co/spaces/${space_id}"
          git push --force hf HEAD:main
.gitignore ADDED
@@ -0,0 +1,18 @@
__pycache__/
.coverage
.mypy_cache/
.pytest_cache/
.ruff_cache/
.venv/
*.egg-info/
*.pyc
build/
dist/
htmlcov/
outputs/
.env
.env.*
.DS_Store
.idea/
.vscode/
Dockerfile ADDED
@@ -0,0 +1,20 @@
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=8000

WORKDIR /app

COPY pyproject.toml README.md models.py client.py openenv.yaml inference.py ./
COPY sql_query_reviewer ./sql_query_reviewer
COPY server ./server
COPY tasks ./tasks

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir .

EXPOSE 8000

CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
README.md ADDED
@@ -0,0 +1,175 @@
---
title: SQL Query Reviewer
colorFrom: blue
colorTo: green
sdk: docker
app_port: 8000
pinned: false
---

# SQL Query Reviewer

`Meta-hackathon` is the GitHub source repo for `sql-query-reviewer`, an OpenEnv-style environment where an agent reviews SQL queries for correctness, performance, and security issues.

The same repository is designed to work in both places:
- GitHub is the canonical source, CI surface, and collaboration home.
- Hugging Face Spaces runs the Dockerized FastAPI environment directly from this repo layout.

## What The Environment Does

Each episode gives the agent:
- a SQL query
- schema context when it matters
- a short explanation of the query's intended purpose

The agent responds step by step with one of four actions:
- `identify_issue`
- `suggest_fix`
- `approve`
- `request_more_context`

Rewards are deterministic and shaped for partial progress:
- correct issue identification earns severity-weighted reward
- valid fixes earn bonus reward
- false positives are penalized
- approving with missed issues is penalized

## Repository Layout

```text
.
|-- .github/workflows/
|-- client.py
|-- Dockerfile
|-- inference.py
|-- models.py
|-- openenv.yaml
|-- pyproject.toml
|-- server/
|-- sql_query_reviewer/
|-- tasks/
`-- tests/
```

## Task Bank

The environment ships with 15 tasks:
- 5 easy syntax and basic logic reviews
- 5 medium schema-aware performance reviews
- 5 hard security and advanced optimization reviews

Task data lives in:
- `tasks/easy_tasks.json`
- `tasks/medium_tasks.json`
- `tasks/hard_tasks.json`

## Local Development

Install dependencies:

```bash
python -m venv .venv
source .venv/bin/activate  # Windows: .venv\Scripts\activate
python -m pip install --upgrade pip
python -m pip install -e .[dev]
```

Run the API locally:

```bash
uvicorn server.app:app --reload --port 8000
```

Smoke-test the API:

```bash
curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{"task_id":"easy_001"}'
curl http://localhost:8000/state
```

Run tests:

```bash
pytest
```

Build the container:

```bash
docker build -t sql-query-reviewer .
docker run -p 8000:8000 sql-query-reviewer
```

## Inference Script

`inference.py` uses the OpenAI Python client against any OpenAI-compatible endpoint.

Expected environment variables (use `set` instead of `export` on Windows):

```bash
export ENV_BASE_URL=http://localhost:8000
export API_BASE_URL=https://router.huggingface.co/v1
export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
export HF_TOKEN=hf_xxx
python inference.py
```

The script emits structured logs using:
- `[START]`
- `[STEP]`
- `[END]`

## Hugging Face Spaces

This repo is Space-ready because:
- the README starts with Hugging Face YAML front matter
- the repo includes a root `Dockerfile`
- the API listens on port `8000`

To deploy manually from a local machine with git:

```bash
git remote add hf https://huggingface.co/spaces/<hf-username>/sql-query-reviewer
git push hf main
```

If you install the OpenEnv CLI, you can also use:

```bash
python -m pip install "git+https://github.com/meta-pytorch/OpenEnv.git"
openenv push --repo-id <hf-username>/sql-query-reviewer
```

## GitHub Actions

CI runs tests and a Docker build on pushes and pull requests.

The Hugging Face sync workflow expects:
- GitHub secret `HF_TOKEN`
- optional GitHub variable `HF_SPACE_ID`

If `HF_SPACE_ID` is not set, the workflow defaults to:

```text
<github-repository-owner>/sql-query-reviewer
```

## Usage Example

```python
from sql_query_reviewer import SQLReviewAction, SQLReviewEnv

with SQLReviewEnv(base_url="http://localhost:8000").sync() as env:
    result = env.reset(task_id="easy_001")
    result = env.step(
        SQLReviewAction(
            action_type="identify_issue",
            issue_category="syntax",
            issue_description="SELCT is misspelled and should be SELECT",
            suggested_fix="SELECT * FROM users WHERE id = 1;",
            confidence=0.98,
        )
    )
    print(result.reward)
    print(result.observation.feedback)
```
client.py ADDED
@@ -0,0 +1,4 @@
from sql_query_reviewer.client import SQLReviewEnv, SyncSQLReviewEnv

__all__ = ["SQLReviewEnv", "SyncSQLReviewEnv"]
files/00-winning-plan.md ADDED
@@ -0,0 +1,200 @@
# OpenEnv Hackathon — Winning Plan

**Participant:** Ravi (Solo)
**Deadline:** April 12, 2026, 11:59 PM IST
**Goal:** Top 3,000 out of 20,000 teams → Finale April 25–26

---

## Chosen Domain: **SQL Query Optimizer Review**

An environment where an AI agent reviews SQL queries for correctness, performance, and security issues — then suggests fixes. This scores high on real-world utility (30% weight), is novel in OpenEnv, has natural difficulty progression, and produces clear, measurable rewards.

**Why this wins:**
- Every engineering team at Meta deals with SQL/data pipelines daily — maximum relevance
- Clear grading: each query has known issues, and the agent either finds them or doesn't → partial credit is natural
- Difficulty scales cleanly: syntax errors (easy) → performance anti-patterns (medium) → subtle injection vulnerabilities + schema-aware optimization (hard)
- Novel domain not seen in existing OpenEnv environments (creativity 10%)
- Deterministic grading with score variance (agents that find more issues score higher)

---

## Timeline

| When | What |
|---|---|
| **Apr 10, Morning** | Complete prep modules 1-4 on Colab, watch bootcamp recording |
| **Apr 10, Afternoon** | Install prerequisites, study sample inference script, study echo env code |
| **Apr 10, Evening** | Scaffold project with `openenv init`, define Pydantic models, implement core env logic |
| **Apr 11, Morning** | Implement 3 tasks (easy/medium/hard) with graders and reward functions |
| **Apr 11, Afternoon** | Write `inference.py`, test locally, iterate on reward shaping |
| **Apr 11, Evening** | Dockerize, deploy to HF Spaces, run pre-validation script |
| **Apr 12, Morning** | Write README, final testing, fix issues |
| **Apr 12, Afternoon** | Final pre-validation, submit |
| **Apr 12, Before 11:59 PM** | Verify HF Space is live and responding |

---

## Phase 0: Preparation (Today — First 3 Hours)

### Step 1: Complete Prep Course Modules
- Module 1: Interface basics (`reset()`, `step()`, `state()`)
- Module 2: Using existing environments, typed models
- Module 3: Deployment to HF Spaces with `openenv push`
- Module 4: **Building your own environment** — most critical, take detailed notes

### Step 2: Watch Bootcamp Recording
- Note tips from Ben Burtenshaw (HF) and Pulkit Aneja about what judges look for

### Step 3: Install Prerequisites
```bash
pip install openenv-core huggingface_hub openai pydantic
pip install docker  # or ensure Docker Desktop is running
huggingface-cli login
```

### Step 4: Study the Sample Inference Script
- Memorize the `[START]`, `[STEP]`, `[END]` stdout format
- Any deviation in field names/ordering = incorrect evaluation scoring

### Step 5: Study Existing Environments
- Clone `https://github.com/meta-pytorch/OpenEnv`
- Study `envs/echo_env/` structure: models.py, client.py, server/environment.py, server/app.py, server/Dockerfile

---

## Phase 1: Build the Environment

### Project Structure
```
sql-query-reviewer/
├── openenv.yaml
├── models.py              # Action, Observation, State Pydantic models
├── client.py              # EnvClient subclass
├── inference.py           # Baseline inference script (root!)
├── README.md
├── tasks/
│   ├── easy_tasks.json    # Syntax error queries
│   ├── medium_tasks.json  # Performance anti-pattern queries
│   └── hard_tasks.json    # Security + schema-aware optimization queries
└── server/
    ├── environment.py     # Core environment logic
    ├── grader.py          # Deterministic grading functions
    ├── app.py             # FastAPI server
    ├── Dockerfile
    └── requirements.txt
```

### Pydantic Models Design

**Observation:**
- `query`: The SQL query to review
- `schema_info`: Table/column definitions (for medium/hard tasks)
- `context`: What the query is supposed to do
- `issues_found_so_far`: List of issues already identified
- `remaining_actions`: How many review steps remain
- `difficulty`: easy | medium | hard

**Action:**
- `action_type`: "identify_issue" | "suggest_fix" | "approve" | "request_more_context"
- `issue_category`: "syntax" | "performance" | "security" | "logic" | "style"
- `issue_description`: Free text description of the issue
- `suggested_fix`: The corrected SQL (optional)
- `confidence`: Float 0.0-1.0

**Reward:** Float 0.0-1.0 with partial credit

### Three Tasks with Progressive Difficulty

**Task 1 — Easy: Syntax & Basic Logic Errors**
- Queries with missing keywords, wrong joins, typos in column names
- Agent identifies each error → 0.2 reward per correct identification
- Suggesting a valid fix → bonus 0.1 per fix
- Expected baseline score: 0.7-0.9

**Task 2 — Medium: Performance Anti-Patterns**
- SELECT *, missing indexes, N+1 patterns, unnecessary subqueries, missing WHERE clauses on large tables
- Requires understanding schema context
- Agent identifies anti-pattern + suggests optimization → partial credit
- Expected baseline score: 0.4-0.6

**Task 3 — Hard: Security Vulnerabilities + Schema-Aware Optimization**
- SQL injection vectors, privilege escalation, data leakage, plus complex optimization (query plan awareness)
- Requires multi-step reasoning about schema relationships
- Expected baseline score: 0.2-0.4

### Reward Function Design
- Per-step rewards (not just end-of-episode)
- Correct issue identification: +0.2 (scaled by issue severity)
- Valid fix suggestion: +0.1
- False positive (flagging non-issue): -0.1
- Missing critical issue at episode end: -0.15
- Approving a query with unfound issues: -0.2
- Smooth, informative signal throughout the trajectory
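The bullets above translate into a small lookup. The sketch below is illustrative only — the event names and the exact severity scaling are assumptions, not the shipped reward code:

```python
# Per-step reward shaping for the SQL review environment (sketch).
# Event names and severity scaling are illustrative assumptions.
PENALTIES = {
    "false_positive": -0.1,
    "missed_critical_at_end": -0.15,
    "approved_with_unfound_issues": -0.2,
}

def step_reward(event: str, severity: float = 1.0) -> float:
    """Map a graded event to a shaped per-step reward."""
    if event == "correct_identification":
        return 0.2 * severity  # base +0.2, scaled by issue severity
    if event == "valid_fix":
        return 0.1             # bonus for a working fix suggestion
    return PENALTIES.get(event, 0.0)
```

Because each step returns a signed scalar rather than a terminal pass/fail, the trajectory gets the smooth signal described above.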

### Grader Design
- Each task has a ground-truth list of issues with categories and severity
- Grader compares agent's identified issues against ground truth using fuzzy matching on descriptions
- Score = (correctly_identified × severity_weight) / total_possible_score
- Deterministic: same agent output → same score every time
- Returns float in [0.0, 1.0]
- Never returns the same score for all inputs (variety of queries ensures variance)
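A minimal version of that fuzzy-matching, severity-weighted grader can be sketched with `difflib`; this is an illustration of the design bullets, not the actual `grader.py` (which may normalize or match on keywords instead):

```python
from difflib import SequenceMatcher

def grade_episode(identified: list[str], ground_truth: list[dict],
                  threshold: float = 0.6) -> float:
    """Severity-weighted score in [0, 1]; deterministic for fixed inputs."""
    total = sum(issue["severity"] for issue in ground_truth) or 1.0
    earned = 0.0
    matched: set[int] = set()
    for desc in identified:
        for i, issue in enumerate(ground_truth):
            if i in matched:
                continue  # each ground-truth issue is credited once
            ratio = SequenceMatcher(
                None, desc.lower(), issue["description"].lower()
            ).ratio()
            if ratio >= threshold:
                earned += issue["severity"]
                matched.add(i)
                break
    return min(earned / total, 1.0)
```

Determinism follows directly: `SequenceMatcher` is a pure function of its inputs, so the same agent output always yields the same score.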

---

## Phase 2: Inference Script

Key requirements:
- Named `inference.py` in root directory
- Uses OpenAI Client for all LLM calls
- Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from env vars
- Emits `[START]`, `[STEP]`, `[END]` logs exactly per spec
- Completes in <20 minutes on 2 vCPU, 8GB RAM
- Reproducible scores
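A skeleton of that loop, hedged: the exact `[START]`/`[STEP]`/`[END]` field layout comes from the hackathon spec, `choose_action` stands in for the real OpenAI-client call, and the env client is assumed to expose `reset`/`step` results with `done`, `reward`, and `observation` attributes:

```python
import json

def log(tag: str, payload: dict) -> None:
    """Emit one structured stdout line, e.g. `[STEP] {...}`."""
    print(f"[{tag}] {json.dumps(payload)}", flush=True)

def run_episode(env, task_id: str, choose_action) -> float:
    """Illustrative episode driver; `choose_action` maps an observation
    to an action (in the real script, via the OpenAI client against
    API_BASE_URL / MODEL_NAME)."""
    result = env.reset(task_id=task_id)
    log("START", {"task_id": task_id})
    total, step = 0.0, 0
    while not result.done:
        step += 1
        result = env.step(choose_action(result.observation))
        total += result.reward
        log("STEP", {"step": step, "reward": result.reward})
    log("END", {"task_id": task_id, "total_reward": total})
    return total
```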

---

## Phase 3: Containerize & Deploy

```bash
# Build and test locally
docker build -t sql-query-reviewer ./server
docker run -p 8000:8000 sql-query-reviewer

# Verify endpoints
curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{}'

# Deploy to HF Spaces
openenv push --repo-id ravi/sql-query-reviewer

# Verify deployed version
curl -X POST https://ravi-sql-query-reviewer.hf.space/reset
```

---

## Phase 4: Pre-Submission QA

Run pre-validation script:
```bash
./validate-submission.sh https://ravi-sql-query-reviewer.hf.space .
```

Checklist:
- [ ] HF Space deploys and responds to `/reset` with 200
- [ ] `openenv validate` passes
- [ ] Dockerfile builds cleanly
- [ ] Inference script runs without errors, produces scores
- [ ] 3+ tasks, each grader returns scores in 0.0-1.0 range
- [ ] Scores are reproducible across runs
- [ ] README is compelling and complete

---

## Winning Differentiators

1. **Real-world utility (30%)**: SQL review is something every data team needs — immediate value for the RL/agent community
2. **Score variance**: Different agent capabilities produce meaningfully different scores — a basic agent catches syntax errors but misses security issues
3. **Reward shaping**: Per-step partial credit signals, not binary end-of-episode
4. **Novelty**: No SQL review environment exists in OpenEnv yet
5. **Spec compliance**: Bulletproof adherence to every technical requirement — this alone eliminates most competitors
files/01-problem-statement.md ADDED
@@ -0,0 +1,32 @@
# 01 — Problem Statement & Domain Selection

## Domain: SQL Query Review Environment

### The Real-World Problem
Every software team reviews SQL queries — in code reviews, database migrations, ETL pipeline audits, and security assessments. This is a genuine, high-frequency task that requires:
- Pattern recognition (anti-patterns, vulnerabilities)
- Domain knowledge (schema relationships, indexing strategies)
- Multi-step reasoning (understanding query intent before evaluating correctness)

### Why This Domain Wins

| Evaluation Criteria | Weight | How We Score |
|---|---|---|
| Real-world utility | 30% | SQL review is universal — Meta runs millions of queries daily. Fills a real gap in agent evaluation. |
| Task & grader quality | 25% | Clear ground truth per query, deterministic grading, natural difficulty progression |
| Environment design | 20% | Clean state (per-query episode), rich observations, well-typed actions, per-step rewards |
| Code quality & spec compliance | 15% | Full OpenEnv spec, clean project structure, Docker, typed models |
| Creativity & novelty | 10% | No SQL review env exists in OpenEnv. Reward design uses severity-weighted partial credit. |

### What the Agent Does
1. Receives a SQL query + optional schema context
2. Reviews it step-by-step, identifying issues (syntax, performance, security, logic)
3. Suggests fixes for each identified issue
4. Decides when to approve or flag the query
5. Gets rewarded for correctly identified issues and penalized for false positives

### Scope Boundaries
- **In scope**: SELECT, INSERT, UPDATE, DELETE queries; joins; subqueries; CTEs; window functions
- **Out of scope**: Stored procedures, database-specific dialect features, real database execution
- **Episode length**: 3-8 steps depending on query complexity
- **No external dependencies**: All query analysis is rule-based and deterministic
files/02-requirements.md ADDED
@@ -0,0 +1,58 @@
# 02 — Requirements Specification

## Functional Requirements

### FR-1: Real-World Task Simulation
- Simulates SQL query review — a task humans do daily in engineering teams
- No games, no toys — purely professional/practical domain

### FR-2: OpenEnv Spec Compliance
- Typed Pydantic models for Observation, Action, State
- `step(action)` → returns observation, reward, done, info
- `reset()` → returns initial observation
- `state()` → returns current internal state
- Valid `openenv.yaml` with metadata
- Passes `openenv validate`

### FR-3: Minimum 3 Tasks with Agent Graders
- **Task 1 (Easy):** Syntax & basic logic errors — expected agent score 0.7-0.9
- **Task 2 (Medium):** Performance anti-patterns — expected agent score 0.4-0.6
- **Task 3 (Hard):** Security vulnerabilities + schema-aware optimization — expected agent score 0.2-0.4
- Each grader: deterministic, returns float in [0.0, 1.0], reproducible

### FR-4: Meaningful Reward Function
- Per-step rewards (not just end-of-episode binary)
- Partial credit for partial issue identification
- Penalties for false positives and missed critical issues
- Smooth signal that guides learning

### FR-5: Baseline Inference Script
- Named `inference.py` in project root
- Uses OpenAI Client for LLM calls
- Reads `API_BASE_URL`, `MODEL_NAME`, `HF_TOKEN` from env vars
- Emits `[START]`, `[STEP]`, `[END]` structured stdout logs
- Produces reproducible baseline scores on all 3 tasks

## Non-Functional Requirements

### NFR-1: Deploys to Hugging Face Space
- Containerized HF Space tagged with `openenv`
- Returns 200 and responds to `/reset` POST

### NFR-2: Containerized Execution
- Working Dockerfile
- Builds with `docker build`, runs with `docker run`
- Starts cleanly, responds to HTTP requests

### NFR-3: Infrastructure Constraints
- Inference script runtime < 20 minutes
- Runs on 2 vCPU, 8GB RAM machine

### NFR-4: Documentation
- README with: environment description, motivation, action/observation space definitions, task descriptions with difficulty, setup instructions, baseline scores

## Disqualification Criteria (Must Avoid)
- ❌ Environment does not deploy or respond
- ❌ Plagiarized or trivially modified existing environments
- ❌ Graders that always return the same score
- ❌ No baseline inference script
files/03-information-architecture.md ADDED
@@ -0,0 +1,66 @@
# 03 — Information Architecture

## Data Flow

```
[Task JSON] → reset() → [Observation: query + schema + context]

Agent decides action

step(Action) → [Observation + Reward + Done]

(repeat until done or max_steps)

close() → Grader computes final score
```

## Task Data Structure

Each task is a JSON object:
```json
{
  "task_id": "easy_001",
  "difficulty": "easy",
  "query": "SELCT * FORM users WEHRE id = 1",
  "schema": {
    "users": {"id": "INT PRIMARY KEY", "name": "VARCHAR(255)", "email": "VARCHAR(255)"}
  },
  "context": "Fetch user by ID for profile page",
  "ground_truth_issues": [
    {"category": "syntax", "description": "SELCT should be SELECT", "severity": 0.3, "fix": "SELECT"},
    {"category": "syntax", "description": "FORM should be FROM", "severity": 0.3, "fix": "FROM"},
    {"category": "syntax", "description": "WEHRE should be WHERE", "severity": 0.3, "fix": "WHERE"},
    {"category": "performance", "description": "SELECT * fetches unnecessary columns", "severity": 0.1, "fix": "SELECT id, name, email"}
  ],
  "max_steps": 5
}
```

## State Management

| Field | Type | Description |
|---|---|---|
| `task_id` | str | Current task identifier |
| `query` | str | The SQL query under review |
| `issues_identified` | list | Issues the agent has found so far |
| `fixes_suggested` | list | Fixes the agent has proposed |
| `step_count` | int | Current step number |
| `total_reward` | float | Accumulated reward |
| `done` | bool | Whether episode is complete |
| `approved` | bool | Whether agent approved the query |

## Observation Space
- `query`: The full SQL query text
- `schema_info`: Dict of table → column definitions (empty for easy tasks)
- `context`: Natural language description of query intent
- `issues_found_so_far`: List of previously identified issues in this episode
- `remaining_actions`: Max steps minus current step
- `difficulty`: "easy" | "medium" | "hard"
- `feedback`: Result of last action ("correct identification", "false positive", "already identified", etc.)

## Action Space
- `action_type`: enum — "identify_issue" | "suggest_fix" | "approve" | "request_more_context"
- `issue_category`: enum — "syntax" | "performance" | "security" | "logic" | "style"
- `issue_description`: str — what the agent thinks is wrong
- `suggested_fix`: str (optional) — corrected SQL fragment
- `confidence`: float 0.0-1.0
files/04-system-architecture.md ADDED
@@ -0,0 +1,54 @@
# 04 — System Architecture

## Components

```
┌─────────────────────────────────────────────┐
│                  HF Space                   │
│  ┌─────────────────────────────────────┐    │
│  │  FastAPI Server                     │    │
│  │  (app.py — Uvicorn)                 │    │
│  │                                     │    │
│  │  POST /reset → environment.reset()  │    │
│  │  POST /step  → environment.step()   │    │
│  │  GET  /state → environment.state()  │    │
│  └──────────┬──────────────────────────┘    │
│             │                               │
│  ┌──────────▼──────────────────────────┐    │
│  │  SQLReviewEnvironment               │    │
│  │  - task_bank (easy/medium/hard JSON)│    │
│  │  - grader (deterministic scoring)   │    │
│  │  - reward_fn (per-step signals)     │    │
│  └─────────────────────────────────────┘    │
│                                             │
│  Dockerfile (Python 3.11-slim + deps)       │
└─────────────────────────────────────────────┘

┌─────────────────────────────────────────────┐
│  inference.py (Client)                      │
│  - OpenAI Client → LLM API                  │
│  - SQLReviewEnvClient → HF Space            │
│  - Structured stdout logging                │
└─────────────────────────────────────────────┘
```

## Technology Stack
- **Runtime:** Python 3.11
- **Framework:** FastAPI + Uvicorn
- **Models:** Pydantic v2
- **Container:** Docker (python:3.11-slim base)
- **Deployment:** Hugging Face Spaces (Docker SDK)
- **LLM Client:** OpenAI Python SDK
- **Environment SDK:** openenv-core

## Communication Protocol
- WebSocket at `/ws` for persistent sessions (OpenEnv standard)
- HTTP POST endpoints as fallback: `/reset`, `/step`
- HTTP GET: `/state`
- JSON request/response bodies matching typed Pydantic models

## Episode Lifecycle
1. Client calls `reset(task_id="easy_001")` → server loads task, returns initial observation
2. Client calls `step(action)` → server validates action, computes reward, returns observation
3. Repeat until `done=True` (all issues found, agent approves, or max_steps reached)
4. Client calls `close()` → server runs grader, returns final score
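The lifecycle above can be driven over the fallback HTTP endpoints with just the standard library. This is a sketch assuming the JSON bodies shown in 06-api-contracts; the transport function is injected so the loop can be exercised without a running server:

```python
import json
from urllib import request

def http_post(base: str, path: str, body: dict) -> dict:
    """POST a JSON body and decode the JSON response."""
    req = request.Request(
        base + path,
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    with request.urlopen(req) as resp:
        return json.load(resp)

def run_lifecycle(post, max_steps: int = 8) -> float:
    """Drive reset → step → ... → done through any `post(path, body)` transport.
    The fixed action below is illustrative, not a real review policy."""
    result = post("/reset", {"task_id": "easy_001"})
    total = result["reward"]
    while not result["done"] and max_steps:
        max_steps -= 1
        result = post("/step", {
            "action_type": "identify_issue",
            "issue_category": "syntax",
            "issue_description": "SELCT should be SELECT",
            "confidence": 0.9,
        })
        total += result["reward"]
    return total
```

Against a live server: `run_lifecycle(lambda p, b: http_post("http://localhost:8000", p, b))`.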
files/05-database-schema.md ADDED
@@ -0,0 +1,52 @@
# 05 — Task Bank Schema

## Overview
Tasks are stored as JSON files, not a database. Each difficulty level has its own file with 3-5 queries.

## Easy Tasks (`tasks/easy_tasks.json`)

Queries with obvious syntax errors, wrong keywords, basic logic mistakes. An LLM should score 0.7-0.9.

Example queries:
1. Misspelled keywords (SELCT, FORM, WEHRE)
2. Missing FROM clause
3. Wrong column names that don't exist in schema
4. Missing semicolons / unclosed quotes
5. Using = NULL instead of IS NULL

## Medium Tasks (`tasks/medium_tasks.json`)

Queries with performance anti-patterns. Requires understanding schema context. Target score: 0.4-0.6.

Example queries:
1. SELECT * on a 50-column table when only 2 columns needed
2. Missing index hint on a JOIN with large table
3. Correlated subquery that could be a JOIN
4. Missing LIMIT on unbounded query
5. Redundant DISTINCT on a column with UNIQUE constraint

## Hard Tasks (`tasks/hard_tasks.json`)

Security vulnerabilities + complex optimization. Target score: 0.2-0.4.

Example queries:
1. String concatenation enabling SQL injection
2. Privilege escalation via UNION with system tables
3. Data leakage through unfiltered JOIN exposing PII
4. Query that could use window functions instead of self-join (10x perf gain)
5. Missing transaction isolation causing phantom reads

## Ground Truth Format

Each issue in ground truth:
```json
{
  "category": "security",
  "description": "String concatenation in WHERE clause enables SQL injection",
  "severity": 1.0,
  "fix": "Use parameterized query with ? placeholder",
  "keywords": ["injection", "concatenation", "user input", "unsanitized"]
}
```

The `keywords` field is used by the grader for fuzzy matching against agent responses.
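One plausible way the `keywords` field can drive that match (illustrative only — the real grader may stem, weight, or combine keyword hits with description similarity):

```python
def keyword_match(agent_description: str, issue: dict, min_hits: int = 2) -> bool:
    """Count ground-truth keywords appearing in the agent's free-text
    description; require a minimum number of hits to credit the issue."""
    text = agent_description.lower()
    hits = sum(1 for kw in issue["keywords"] if kw.lower() in text)
    return hits >= min_hits
```

Requiring more than one keyword keeps a single incidental word ("input", say) from crediting a security finding the agent never actually made.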
files/06-api-contracts.md ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 06 — API Contracts
2
+
3
+ ## OpenEnv Standard Endpoints
4
+
5
+ ### POST /reset
6
+ **Request:**
7
+ ```json
8
+ {"task_id": "easy_001"}
9
+ ```
10
+ **Response (StepResult):**
11
+ ```json
12
+ {
13
+ "observation": {
14
+ "query": "SELCT * FORM users WEHRE id = 1",
15
+ "schema_info": {"users": {"id": "INT PK", "name": "VARCHAR(255)", "email": "VARCHAR(255)"}},
16
+ "context": "Fetch user by ID for profile page",
17
+ "issues_found_so_far": [],
18
+ "remaining_actions": 5,
19
+ "difficulty": "easy",
20
+ "feedback": "Review this SQL query and identify any issues."
21
+ },
22
+ "reward": 0.0,
23
+ "done": false,
24
+ "info": {}
25
+ }
26
+ ```
27
+
28
+ ### POST /step
29
+ **Request (Action):**
30
+ ```json
31
+ {
32
+ "action_type": "identify_issue",
33
+ "issue_category": "syntax",
34
+ "issue_description": "SELCT is misspelled, should be SELECT",
35
+ "suggested_fix": "SELECT",
36
+ "confidence": 0.95
37
+ }
38
+ ```
39
+ **Response (StepResult):**
40
+ ```json
41
+ {
42
+ "observation": {
43
+ "query": "SELCT * FORM users WEHRE id = 1",
44
+ "schema_info": {"users": {"id": "INT PK", "name": "VARCHAR(255)", "email": "VARCHAR(255)"}},
45
+ "context": "Fetch user by ID for profile page",
46
+ "issues_found_so_far": [{"category": "syntax", "description": "SELCT should be SELECT"}],
47
+ "remaining_actions": 4,
48
+ "difficulty": "easy",
49
+ "feedback": "Correct! SELCT is indeed a syntax error. 3 issues remaining."
50
+ },
51
+ "reward": 0.25,
52
+ "done": false,
53
+ "info": {"match_type": "exact", "severity": 0.3}
54
+ }
55
+ ```
56
+
57
+ ### GET /state
58
+ **Response (State):**
59
+ ```json
60
+ {
61
+ "task_id": "easy_001",
62
+ "step_count": 1,
63
+ "issues_identified": [{"category": "syntax", "description": "SELCT should be SELECT"}],
64
+ "total_reward": 0.25,
65
+ "done": false,
66
+ "approved": false
67
+ }
68
+ ```
69
+
70
+ ## Pydantic Models
71
+
72
+ ```python
73
+ from typing import Dict, List, Literal, Optional
+
+ # Action, Observation, State are the openenv-core base classes.
+
+ class SQLReviewAction(Action):
+     action_type: Literal["identify_issue", "suggest_fix", "approve", "request_more_context"]
+     issue_category: Optional[Literal["syntax", "performance", "security", "logic", "style"]] = None
+     issue_description: Optional[str] = None
+     suggested_fix: Optional[str] = None
+     confidence: float = 0.5
+
+ class SQLReviewObservation(Observation):
+     query: str
+     schema_info: Dict[str, Dict[str, str]]
+     context: str
+     issues_found_so_far: List[Dict[str, str]]
+     remaining_actions: int
+     difficulty: str
+     feedback: str
+
+ class SQLReviewState(State):
+     task_id: str
+     step_count: int
+     issues_identified: List[Dict[str, str]]
+     total_reward: float
+     done: bool
+     approved: bool
+ ```
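For readers outside the repo, the contract can be exercised with plain Pydantic. The sketch below substitutes `BaseModel` for the openenv-core base class so it runs standalone; that substitution is the only assumption.

```python
from typing import Literal, Optional

from pydantic import BaseModel, ValidationError


# Standalone stand-in for the Action contract above; BaseModel replaces the
# openenv-core base class so this runs without openenv-core installed.
class SQLReviewAction(BaseModel):
    action_type: Literal["identify_issue", "suggest_fix", "approve", "request_more_context"]
    issue_category: Optional[Literal["syntax", "performance", "security", "logic", "style"]] = None
    issue_description: Optional[str] = None
    suggested_fix: Optional[str] = None
    confidence: float = 0.5


# A valid /step request body parses cleanly...
action = SQLReviewAction.model_validate({
    "action_type": "identify_issue",
    "issue_category": "syntax",
    "issue_description": "SELCT is misspelled, should be SELECT",
    "confidence": 0.95,
})
print(action.action_type)

# ...while an unknown action_type is rejected before it reaches the environment.
try:
    SQLReviewAction.model_validate({"action_type": "merge"})
except ValidationError:
    print("rejected")
```

This is also why the typed models matter for spec compliance: malformed actions fail at the request boundary instead of inside `step()`.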
files/07-monorepo-structure.md ADDED
@@ -0,0 +1,65 @@
1
+ # 07 — Monorepo Structure
2
+
3
+ ```
4
+ sql-query-reviewer/
5
+
6
+ ├── openenv.yaml # Environment metadata manifest
7
+ ├── models.py # Pydantic: SQLReviewAction, SQLReviewObservation, SQLReviewState
8
+ ├── client.py # EnvClient subclass for external consumers
9
+ ├── inference.py # MANDATORY: Baseline inference script (root directory!)
10
+ ├── README.md # Environment documentation
11
+ ├── pyproject.toml # Package config
12
+
13
+ ├── tasks/
14
+ │ ├── easy_tasks.json # 5 syntax/logic error queries
15
+ │ ├── medium_tasks.json # 5 performance anti-pattern queries
16
+ │ └── hard_tasks.json # 5 security + optimization queries
17
+
18
+ └── server/
19
+ ├── __init__.py
20
+ ├── environment.py # SQLReviewEnvironment(Environment) — core logic
21
+ ├── grader.py # Deterministic grading: fuzzy match agent output vs ground truth
22
+ ├── reward.py # Per-step reward computation
23
+ ├── app.py # FastAPI server (create_app with routes)
24
+ ├── Dockerfile # Python 3.11-slim, install deps, expose port
25
+ └── requirements.txt # openenv-core, fastapi, uvicorn, pydantic
26
+ ```
27
+
28
+ ## Key Files Explained
29
+
30
+ | File | Purpose | Critical? |
31
+ |---|---|---|
32
+ | `openenv.yaml` | Metadata: name, description, author, tasks list | Yes — validated by `openenv validate` |
33
+ | `models.py` | Typed Action/Observation/State contracts | Yes — spec compliance |
34
+ | `inference.py` | Baseline agent using OpenAI Client | Yes — DQ if missing |
35
+ | `server/environment.py` | `reset()`, `step()`, `state()` implementation | Yes — core logic |
36
+ | `server/grader.py` | Score computation per task | Yes — must return 0.0-1.0 |
37
+ | `server/Dockerfile` | Container definition | Yes — must build cleanly |
38
+ | `README.md` | Human-readable documentation | Yes — judges read this first |
39
+
40
+ ## openenv.yaml
41
+
42
+ ```yaml
43
+ name: sql-query-reviewer
44
+ description: "AI agent reviews SQL queries for correctness, performance, and security"
45
+ author: ravi
46
+ version: "1.0.0"
47
+ tags:
48
+ - openenv
49
+ - sql
50
+ - code-review
51
+ - security
52
+ tasks:
53
+ - id: easy_syntax
54
+ name: "Syntax Error Detection"
55
+ difficulty: easy
56
+ description: "Find and fix obvious SQL syntax errors"
57
+ - id: medium_performance
58
+ name: "Performance Anti-Pattern Review"
59
+ difficulty: medium
60
+ description: "Identify performance issues requiring schema awareness"
61
+ - id: hard_security
62
+ name: "Security & Optimization Audit"
63
+ difficulty: hard
64
+ description: "Find SQL injection vectors and complex optimization opportunities"
65
+ ```
files/08-computation-engine-spec.md ADDED
@@ -0,0 +1,86 @@
1
+ # 08 — Reward & Grading Engine Spec
2
+
3
+ ## Per-Step Reward Function
4
+
5
+ ```python
6
+ def compute_reward(action, ground_truth_issues, already_found):
+     if action.action_type == "identify_issue":
+         match = fuzzy_match(
+             action.issue_description,
+             ground_truth_issues,
+             already_found,
+             agent_category=action.issue_category,
+         )
+         if match:
+             base = match["severity"]  # 0.1 - 1.0
+             fix_bonus = 0.1 if action.suggested_fix and is_valid_fix(action.suggested_fix, match) else 0.0
+             confidence_bonus = 0.05 * action.confidence
+             return min(base + fix_bonus + confidence_bonus, 0.4)  # cap per-step
+         else:
+             return -0.1  # false positive penalty
+
+     elif action.action_type == "approve":
+         unfound = len(ground_truth_issues) - len(already_found)
+         if unfound == 0:
+             return 0.2  # correct approval
+         else:
+             return -0.15 * unfound  # penalty per missed issue
+
+     elif action.action_type == "suggest_fix":
+         if not already_found:
+             return -0.05  # fixing without identifying first
+         last_issue = already_found[-1]
+         if is_valid_fix(action.suggested_fix, last_issue):
+             return 0.1
+         return 0.0
+
+     elif action.action_type == "request_more_context":
+         return 0.0  # neutral: no reward, no penalty
+
+     return 0.0
+ ```
+
+ ## Fuzzy Matching Algorithm
+
+ ```python
+ def fuzzy_match(agent_description, ground_truth_issues, already_found, agent_category=None):
+     """Match the agent's issue description to an unfound ground-truth issue."""
+     best_match = None
+     best_score = 0.0
+
+     for issue in ground_truth_issues:
+         if issue in already_found:
+             continue
+         # Keyword overlap score
+         agent_words = set(agent_description.lower().split())
+         truth_words = set(issue["keywords"])
+         overlap = len(agent_words & truth_words) / max(len(truth_words), 1)
+         # Category match bonus
+         category_bonus = 0.3 if agent_category == issue["category"] else 0.0
+         score = overlap + category_bonus
+         if score > best_score and score > 0.3:  # acceptance threshold
+             best_score = score
+             best_match = issue
+
+     return best_match
61
+ ```
62
+
63
+ ## End-of-Episode Grader
64
+
65
+ ```python
66
+ def grade_episode(issues_found, ground_truth_issues, total_steps, max_steps):
+     """Deterministic grader returning a float in [0.0, 1.0]."""
+     if not ground_truth_issues:
+         return 1.0 if not issues_found else 0.5
+
+     matched = [i for i in issues_found if i in ground_truth_issues]
+     total_severity = sum(i["severity"] for i in ground_truth_issues)
+     found_severity = sum(i["severity"] for i in matched)
+
+     coverage_score = found_severity / total_severity  # 0.0 - 1.0
+     efficiency_bonus = max(0, 0.1 * (1 - total_steps / max_steps))  # reward fewer steps
+     false_positive_penalty = 0.05 * (len(issues_found) - len(matched))  # unmatched reports
+
+     score = coverage_score + efficiency_bonus - false_positive_penalty
+     return max(0.0, min(1.0, score))
80
+ ```
81
+
82
+ ## Score Variance Guarantee
83
+ - Easy tasks: 5 different queries with 2-5 issues each → scores range from 0.4 to 1.0
84
+ - Medium tasks: different anti-patterns → scores range from 0.2 to 0.8
85
+ - Hard tasks: varied security issues → scores range from 0.0 to 0.6
86
+ - A grader that always returns the same score = instant DQ. Our design inherently prevents this because different queries have different ground truth issues.
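As a sanity check on the variance claim, grading two episodes with different coverage against the same ground truth should yield different scores. A self-contained sketch (the grader body is inlined from above; issue ids and severities are made up):

```python
# Self-contained copy of the grader above, with false positives counted inline.
def grade_episode(issues_found, ground_truth_issues, total_steps, max_steps):
    if not ground_truth_issues:
        return 1.0 if not issues_found else 0.5
    matched = [i for i in issues_found if i in ground_truth_issues]
    total_severity = sum(i["severity"] for i in ground_truth_issues)
    found_severity = sum(i["severity"] for i in matched)
    coverage = found_severity / total_severity
    efficiency_bonus = max(0, 0.1 * (1 - total_steps / max_steps))
    fp_penalty = 0.05 * (len(issues_found) - len(matched))
    return max(0.0, min(1.0, coverage + efficiency_bonus - fp_penalty))


truth = [
    {"id": "syntax_selct", "severity": 0.3},
    {"id": "missing_where", "severity": 0.7},
]

# Weak agent: finds only the low-severity issue; strong agent: finds both.
weak = grade_episode([truth[0]], truth, total_steps=5, max_steps=5)
strong = grade_episode(truth, truth, total_steps=3, max_steps=5)

assert 0.0 <= weak < strong <= 1.0  # different capability, different score
```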
files/09-engineering-scope-definition.md ADDED
@@ -0,0 +1,39 @@
1
+ # 09 — Engineering Scope Definition
2
+
3
+ ## In Scope (Must Build)
4
+ 1. **Environment server** — `environment.py` with `reset()`, `step()`, `state()`
5
+ 2. **Pydantic models** — `models.py` with typed Action, Observation, State
6
+ 3. **Client** — `client.py` with EnvClient subclass
7
+ 4. **Task bank** — 15 SQL queries (5 easy, 5 medium, 5 hard) with ground truth
8
+ 5. **Grader** — Deterministic scoring function per task
9
+ 6. **Reward function** — Per-step partial credit with penalties
10
+ 7. **Inference script** — `inference.py` using OpenAI Client
11
+ 8. **Dockerfile** — Working container that builds and runs
12
+ 9. **HF Space deployment** — Live, tagged with `openenv`
13
+ 10. **README** — Complete documentation
14
+ 11. **openenv.yaml** — Valid metadata manifest
15
+
16
+ ## Out of Scope (Don't Build)
17
+ - Real database execution (all analysis is pattern-matching based)
18
+ - Custom LLM fine-tuning
19
+ - Web UI beyond OpenEnv's built-in web interface
20
+ - Multiple language SQL dialects (stick to standard SQL)
21
+ - Integration tests against real databases
22
+
23
+ ## Effort Estimates
24
+
25
+ | Component | Hours | Priority |
26
+ |---|---|---|
27
+ | Prep course + bootcamp | 3.0 | P0 |
28
+ | Task bank creation (15 queries + ground truth) | 2.5 | P0 |
29
+ | Pydantic models | 0.5 | P0 |
30
+ | Environment logic (reset/step/state) | 3.0 | P0 |
31
+ | Grader + reward function | 2.0 | P0 |
32
+ | Inference script | 1.5 | P0 |
33
+ | Dockerfile + local testing | 1.0 | P0 |
34
+ | HF Space deployment | 0.5 | P0 |
35
+ | README | 1.0 | P0 |
36
+ | Pre-validation + bug fixes | 2.0 | P0 |
37
+ | **Total** | **~17 hours** | |
38
+
39
+ Fits within the 2-day window with buffer for debugging.
files/10-development-phases.md ADDED
@@ -0,0 +1,48 @@
1
+ # 10 — Development Phases
2
+
3
+ ## Phase 1: Learn (Apr 10, 9 AM – 12 PM)
4
+ - [ ] Complete Module 1: Interface basics
5
+ - [ ] Complete Module 2: Using existing environments
6
+ - [ ] Complete Module 3: Deployment to HF Spaces
7
+ - [ ] Complete Module 4: Building your own environment
8
+ - [ ] Watch bootcamp recording, note judge preferences
9
+ - [ ] Study sample inference script format
10
+
11
+ ## Phase 2: Scaffold (Apr 10, 12 PM – 2 PM)
12
+ - [ ] `pip install openenv-core huggingface_hub openai`
13
+ - [ ] `openenv init sql-query-reviewer`
14
+ - [ ] Clone and study echo env for reference
15
+ - [ ] Set up project structure per 07-monorepo-structure.md
16
+
17
+ ## Phase 3: Core Build (Apr 10, 2 PM – Apr 11, 12 PM)
18
+ - [ ] Write `models.py` — Action, Observation, State
19
+ - [ ] Create task bank — 5 easy, 5 medium, 5 hard queries with ground truth
20
+ - [ ] Implement `environment.py` — reset(), step(), state()
21
+ - [ ] Implement `grader.py` — deterministic scoring
22
+ - [ ] Implement `reward.py` — per-step reward computation
23
+ - [ ] Implement fuzzy matching for issue identification
24
+ - [ ] Write `app.py` — FastAPI routes
25
+ - [ ] Local testing: `uv run server` → test all endpoints manually
26
+
27
+ ## Phase 4: Inference (Apr 11, 12 PM – 3 PM)
28
+ - [ ] Write `inference.py` following sample script format exactly
29
+ - [ ] System prompt design for SQL review agent
30
+ - [ ] Test with free HF Inference API
31
+ - [ ] Verify `[START]`, `[STEP]`, `[END]` output format
32
+ - [ ] Run 3x to verify reproducible scores
33
+
34
+ ## Phase 5: Containerize & Deploy (Apr 11, 3 PM – 6 PM)
35
+ - [ ] Write Dockerfile (python:3.11-slim base)
36
+ - [ ] `docker build -f server/Dockerfile -t sql-query-reviewer .`
37
+ - [ ] `docker run -p 8000:8000 sql-query-reviewer`
38
+ - [ ] Test `/reset`, `/step`, `/state` against running container
39
+ - [ ] `openenv push --repo-id ravi/sql-query-reviewer`
40
+ - [ ] Verify HF Space returns 200 on `/reset`
41
+
42
+ ## Phase 6: Polish & Submit (Apr 11, 6 PM – Apr 12, 11:59 PM)
43
+ - [ ] Write compelling README
44
+ - [ ] Run `openenv validate`
45
+ - [ ] Run `validate-submission.sh`
46
+ - [ ] Fix any issues
47
+ - [ ] Submit early, iterate if time permits
48
+ - [ ] Final verification: HF Space live and responding
files/11-environment-and-devops.md ADDED
@@ -0,0 +1,77 @@
1
+ # 11 — Environment & DevOps
2
+
3
+ ## Local Development Setup
4
+
5
+ ```bash
6
+ # Python environment
7
+ python3.11 -m venv .venv
8
+ source .venv/bin/activate
9
+ pip install openenv-core fastapi uvicorn pydantic openai huggingface_hub
10
+
11
+ # Run locally
12
+ uvicorn server.app:app --reload --port 8000
13
+
14
+ # Test endpoints
15
+ curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{"task_id": "easy_001"}'
16
+ ```
17
+
18
+ ## Dockerfile
19
+
20
+ ```dockerfile
21
+ FROM python:3.11-slim
22
+
23
+ WORKDIR /app
24
+
25
+ COPY server/requirements.txt .
26
+ RUN pip install --no-cache-dir -r requirements.txt
27
+
28
+ COPY models.py .
29
+ COPY tasks/ ./tasks/
30
+ COPY server/ ./server/
31
+ COPY openenv.yaml .
32
+
33
+ EXPOSE 8000
34
+
35
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
36
+ ```
37
+
38
+ ## server/requirements.txt
39
+
40
+ ```
41
+ openenv-core>=0.1.0
42
+ fastapi>=0.100.0
43
+ uvicorn>=0.23.0
44
+ pydantic>=2.0.0
45
+ ```
46
+
47
+ ## HF Space Deployment
48
+
49
+ ```bash
50
+ # Login
51
+ huggingface-cli login
52
+
53
+ # Deploy
54
+ openenv push --repo-id ravi/sql-query-reviewer
55
+
56
+ # Verify
57
+ curl -s -o /dev/null -w "%{http_code}" -X POST https://ravi-sql-query-reviewer.hf.space/reset -H "Content-Type: application/json" -d '{}'
58
+ # Expected: 200
59
+ ```
60
+
61
+ ## Environment Variables for Inference
62
+
63
+ ```bash
64
+ export API_BASE_URL="https://router.huggingface.co/v1"
65
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct"
66
+ export HF_TOKEN="hf_xxxxxxxxxxxxx"
67
+ export IMAGE_NAME="sql-query-reviewer"
68
+ ```
69
+
70
+ ## Pre-Validation
71
+
72
+ ```bash
73
+ chmod +x validate-submission.sh
74
+ ./validate-submission.sh https://ravi-sql-query-reviewer.hf.space .
75
+ ```
76
+
77
+ Expected output: All 3/3 checks passed.
files/12-testing-strategy.md ADDED
@@ -0,0 +1,52 @@
1
+ # 12 — Testing Strategy
2
+
3
+ ## Level 1: Unit Tests (During Build)
4
+ - **Models:** Validate Pydantic models accept/reject correct/incorrect data
5
+ - **Grader:** Test with known inputs → known scores. Verify determinism (run 10x, same result).
6
+ - **Reward function:** Test each action type returns expected reward range
7
+ - **Fuzzy matcher:** Test with exact match, partial match, no match, already-found cases
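The fuzzy-matcher cases above can be written as one small test. The sketch below inlines a stand-in matcher with the same keyword-overlap logic; in the repo it would be imported from `server.grader`, and the keyword list here is invented:

```python
# Stand-in fuzzy matcher: keyword overlap plus a category bonus, 0.3 threshold.
def fuzzy_match(description, ground_truth_issues, already_found, category=None):
    best, best_score = None, 0.0
    words = set(description.lower().split())
    for issue in ground_truth_issues:
        if issue in already_found:
            continue
        overlap = len(words & set(issue["keywords"])) / max(len(issue["keywords"]), 1)
        score = overlap + (0.3 if category == issue["category"] else 0.0)
        if score > best_score and score > 0.3:
            best, best_score = issue, score
    return best


truth = [{"category": "syntax", "keywords": ["selct", "misspelled", "select"]}]

hit = fuzzy_match("selct is misspelled", truth, already_found=[], category="syntax")
miss = fuzzy_match("query looks slow", truth, already_found=[], category="performance")
dup = fuzzy_match("selct is misspelled", truth, already_found=truth, category="syntax")

assert hit is truth[0]  # partial keyword overlap + category bonus matches
assert miss is None     # unrelated description stays below the threshold
assert dup is None      # already-found issues are not matched twice
```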
8
+
9
+ ## Level 2: Integration Tests (Before Docker)
10
+ - Run `uv run server` locally
11
+ - POST `/reset` with each task ID → verify valid observation returned
12
+ - POST `/step` with valid action → verify reward, done, observation
13
+ - POST `/step` with invalid action → verify graceful error handling
14
+ - GET `/state` → verify state matches expectations
15
+ - Run full episode: reset → steps → done → verify final grader score
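The full-episode check can be scripted as a loop. The sketch below runs it against an in-process stub rather than the HTTP server, so the control flow (reset, step until done, accumulate reward) is the part to reuse:

```python
from dataclasses import dataclass, field


# Minimal in-process stub standing in for the environment; a real Level 2 run
# would POST to /reset and /step on the locally running server instead.
@dataclass
class StubEnv:
    truth: list = field(default_factory=lambda: [{"id": "selct", "severity": 0.3}])
    found: list = field(default_factory=list)
    max_steps: int = 5
    steps: int = 0

    def reset(self):
        self.found, self.steps = [], 0
        return {"remaining_actions": self.max_steps, "done": False}

    def step(self, action):
        self.steps += 1
        reward = 0.0
        if action["action_type"] == "identify_issue" and self.truth and self.truth[0] not in self.found:
            self.found.append(self.truth[0])
            reward = self.truth[0]["severity"]
        done = action["action_type"] == "approve" or self.steps >= self.max_steps
        return {"reward": reward, "done": done}


env = StubEnv()
env.reset()
total = 0.0
result = {"done": False}
while not result["done"]:
    # Identify until every known issue is found, then approve to end the episode.
    action = {"action_type": "identify_issue"} if len(env.found) < len(env.truth) else {"action_type": "approve"}
    result = env.step(action)
    total += result["reward"]

assert result["done"] and total > 0.0  # episode terminates with positive reward
```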
16
+
17
+ ## Level 3: Container Tests (Before Deploy)
18
+ ```bash
19
+ docker build -f server/Dockerfile -t sql-query-reviewer .
+ docker run -d -p 8000:8000 sql-query-reviewer
+ # Wait for startup
+ sleep 5
+ # Test reset
+ curl -X POST http://localhost:8000/reset -H "Content-Type: application/json" -d '{}' | python -m json.tool
+ # Test step
+ curl -X POST http://localhost:8000/step -H "Content-Type: application/json" -d '{"action_type":"identify_issue","issue_category":"syntax","issue_description":"test"}' | python -m json.tool
27
+ docker stop $(docker ps -q)
28
+ ```
29
+
30
+ ## Level 4: Validation Tests (Before Submit)
31
+ - `openenv validate` — must pass
32
+ - `validate-submission.sh <url> .` — all 3 checks must pass
33
+ - Run `inference.py` 3 times → verify scores are consistent
34
+ - Verify stdout format matches `[START]`, `[STEP]`, `[END]` exactly
35
+ - Check memory usage stays under 8GB
36
+ - Check runtime stays under 20 minutes
37
+
38
+ ## Level 5: Score Variance Check
39
+ - Run inference on all 3 tasks → verify different scores
40
+ - Confirm no grader returns the same score for different inputs
41
+ - Verify easy > medium > hard in terms of baseline agent performance
42
+
43
+ ## DQ Prevention Checklist
44
+ - [ ] HF Space returns 200 on POST /reset
45
+ - [ ] openenv.yaml is valid
46
+ - [ ] Typed models work
47
+ - [ ] Dockerfile builds
48
+ - [ ] 3+ tasks with graders returning 0.0-1.0
49
+ - [ ] Graders DON'T always return the same score
50
+ - [ ] inference.py exists in root
51
+ - [ ] Baseline produces reproducible scores
52
+ - [ ] Not plagiarized from existing environments
files/architecture-diagram.md ADDED
@@ -0,0 +1,61 @@
1
+ # Architecture Diagram
2
+
3
+ ## High-Level Flow
4
+
5
+ ```
6
+ ┌──────────────┐ ┌───────────────────────────────────┐
7
+ │ │ │ HF Space (Docker) │
8
+ │ inference.py│ │ │
9
+ │ (Agent) │ │ ┌──────────────────────────┐ │
10
+ │ │ WS │ │ FastAPI Server │ │
11
+ │ ┌────────┐ ├────►│ │ (app.py) │ │
12
+ │ │ OpenAI │ │ │ │ │ │
13
+ │ │ Client │ │ │ │ /reset → load task │ │
14
+ │ │ ↕ │ │◄────┤ │ /step → grade action │ │
15
+ │ │ LLM │ │ │ │ /state → return state │ │
16
+ │ └────────┘ │ │ └──────────┬───────────────┘ │
17
+ │ │ │ │ │
18
+ │ stdout: │ │ ┌──────────▼───────────────┐ │
19
+ │ [START] │ │ │ SQLReviewEnvironment │ │
20
+ │ [STEP] │ │ │ - task_bank (JSON) │ │
21
+ │ [END] │ │ │ - fuzzy_matcher │ │
22
+ │ │ │ │ - reward_fn │ │
23
+ └──────────────┘ │ │ - grader │ │
24
+ │ └──────────────────────────┘ │
25
+ └───────────────────────────────────┘
26
+ ```
27
+
28
+ ## Episode Sequence
29
+
30
+ ```
31
+ Agent Environment
32
+ │ │
33
+ │──── reset(task_id) ──────────►│ Load task from JSON
34
+ │◄─── observation ──────────────│ Return query + schema + context
35
+ │ │
36
+ │──── step(identify_issue) ────►│ Fuzzy match vs ground truth
37
+ │◄─── obs + reward + done ──────│ Return feedback + reward
38
+ │ │
39
+ │──── step(suggest_fix) ───────►│ Validate fix
40
+ │◄─── obs + reward + done ──────│ Return feedback + reward
41
+ │ │
42
+ │──── step(approve) ───────────►│ Check remaining issues
43
+ │◄─── obs + reward + done=true──│ Episode ends
44
+ │ │
45
+ │──── close() ─────────────────►│ Run grader → final score
46
+ │◄─── final_score ──────────────│
47
+ │ │
48
+ ```
49
+
50
+ ## Evaluation Pipeline (Hackathon Judges)
51
+
52
+ ```
53
+ Phase 1: Automated Validation
54
+ └─ HF Space responds? → openenv validate? → Docker builds? → inference.py runs? → 3+ tasks?
55
+
56
+ Phase 2: Agentic Evaluation
57
+ └─ Run Nemotron 3 Super against all envs → check score variance
58
+
59
+ Phase 3: Human Review
60
+ └─ Meta + HF engineers review for utility, creativity, exploit checks
61
+ ```
files/project-design.md ADDED
@@ -0,0 +1,40 @@
1
+ # Project Design
2
+
3
+ ## Design Principles
4
+
5
+ 1. **Spec compliance first, creativity second.** Most teams will fail on automated validation. Perfect adherence to the OpenEnv spec is the highest-ROI activity.
6
+
7
+ 2. **Reward shaping is the differentiator.** Binary end-of-episode rewards are common. Per-step, severity-weighted, partial-credit rewards are what separate top submissions.
8
+
9
+ 3. **Score variance is mandatory.** The environment must produce different scores for different agent capabilities. Our design inherently ensures this: different queries have different issues, so no two episodes produce identical scores.
10
+
11
+ 4. **Domain authenticity wins the 30%.** Real-world utility is the highest-weighted criterion. SQL review is a task every Meta engineer knows and values. The task bank should contain queries that feel like real code review findings, not synthetic puzzles.
12
+
13
+ ## Key Design Decisions
14
+
15
+ | Decision | Choice | Rationale |
16
+ |---|---|---|
17
+ | Domain | SQL Query Review | Universal relevance, clear grading, natural difficulty progression |
18
+ | Task count | 15 queries (5/5/5) | Well above minimum 3, shows depth |
19
+ | Matching | Fuzzy keyword matching | Robust to LLM phrasing variation while staying deterministic |
20
+ | Reward | Per-step partial credit | Provides learning signal throughout trajectory |
21
+ | Episode length | 3-8 steps | Short enough for 20-min inference limit across all tasks |
22
+ | Grader | Severity-weighted coverage | Rewards finding critical issues more than trivial ones |
23
+
24
+ ## Risk Mitigation
25
+
26
+ | Risk | Mitigation |
27
+ |---|---|
28
+ | Fuzzy matching too loose → inflated scores | Require 30% keyword overlap threshold + category match |
29
+ | Fuzzy matching too strict → no agent can score | Include broad keywords list, test with actual LLM output |
30
+ | Inference timeout | 15 queries × 5-8 steps × ~3s per LLM call = ~6 min. Well under 20 min. |
31
+ | Docker build fails on HF | Use minimal dependencies, test Dockerfile locally first |
32
+ | Grader returns same score | Impossible with varied queries — but verify during testing |
33
+
34
+ ## What Judges Will See
35
+
36
+ 1. **README** — Clear, compelling, explains why SQL review matters and how the env works
37
+ 2. **HF Space** — Live, responds instantly to `/reset`
38
+ 3. **Code** — Clean, well-structured, typed models, deterministic graders
39
+ 4. **Scores** — Meaningful variance: easy ~0.8, medium ~0.5, hard ~0.3
40
+ 5. **Novelty** — No existing SQL review env in OpenEnv ecosystem
files/project-readme.md ADDED
@@ -0,0 +1,91 @@
1
+ # SQL Query Reviewer — OpenEnv Environment
2
+
3
+ An AI agent environment for reviewing SQL queries for correctness, performance, and security issues.
4
+
5
+ ## Why This Matters
6
+
7
+ Every engineering team reviews SQL queries daily — in code reviews, migration scripts, ETL pipelines, and security audits. This environment lets you train and evaluate AI agents on a task that directly maps to real engineering workflows. Unlike toy benchmarks, the queries here reflect genuine patterns found in production codebases: misspelled keywords, N+1 anti-patterns, missing indexes, SQL injection vectors, and schema-aware optimization opportunities.
8
+
9
+ ## Environment Overview
10
+
11
+ The agent receives a SQL query (plus optional schema context) and must identify issues through a multi-step review process. It earns rewards for correctly flagging problems and suggesting fixes, while being penalized for false positives or approving buggy queries.
12
+
13
+ ## Action Space
14
+
15
+ | Action Type | Description |
16
+ |---|---|
17
+ | `identify_issue` | Flag a specific issue with category and description |
18
+ | `suggest_fix` | Propose corrected SQL for a previously identified issue |
19
+ | `approve` | Mark the query as acceptable (ends episode) |
20
+ | `request_more_context` | Ask for additional schema information |
21
+
22
+ **Fields:** `action_type`, `issue_category` (syntax/performance/security/logic/style), `issue_description`, `suggested_fix`, `confidence` (0.0-1.0)
23
+
24
+ ## Observation Space
25
+
26
+ | Field | Type | Description |
27
+ |---|---|---|
28
+ | `query` | str | The SQL query under review |
29
+ | `schema_info` | dict | Table/column definitions (richer for harder tasks) |
30
+ | `context` | str | What the query is supposed to do |
31
+ | `issues_found_so_far` | list | Previously identified issues this episode |
32
+ | `remaining_actions` | int | Steps left before episode ends |
33
+ | `difficulty` | str | easy, medium, or hard |
34
+ | `feedback` | str | Result of last action |
35
+
36
+ ## Tasks
37
+
38
+ ### Task 1: Syntax Error Detection (Easy)
39
+ Queries with obvious typos, missing keywords, wrong column names. A baseline agent should score **0.7-0.9**.
40
+
41
+ ### Task 2: Performance Anti-Pattern Review (Medium)
42
+ Queries with SELECT *, missing indexes, correlated subqueries, unbounded queries. Requires schema awareness. Expected score: **0.4-0.6**.
43
+
44
+ ### Task 3: Security & Optimization Audit (Hard)
45
+ SQL injection vectors, privilege escalation, data leakage, complex optimization. Requires multi-step reasoning. Expected score: **0.2-0.4**.
46
+
47
+ ## Reward Design
48
+ - Per-step partial credit (not binary end-of-episode)
49
+ - Correct issue identification: +0.1 to +0.4 (scaled by severity)
50
+ - Valid fix suggestion: +0.1 bonus
51
+ - False positive: -0.1 penalty
52
+ - Approving a query with unfound issues: -0.15 per missed issue
53
+ - Correct approval of clean query: +0.2
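A worked example of how these rules accumulate over a short episode (severity values assumed, confidence bonus omitted):

```python
# Worked example: one medium-severity find with a valid fix, one false positive,
# then a correct approval once everything is covered. Severities are assumed.
rewards = [
    0.3 + 0.1,   # identify_issue hit (severity 0.3) + valid-fix bonus
    -0.1,        # false positive
    0.2,         # correct approval, nothing left unfound
]
total = sum(rewards)
print(round(total, 2))  # 0.5
```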
54
+
55
+ ## Setup
56
+
57
+ ```bash
+ # Install
+ pip install openenv-core
+ pip install git+https://huggingface.co/spaces/ravi/sql-query-reviewer
+ ```
+
+ ```python
+ # Use
+ from sql_query_reviewer import SQLReviewEnv, SQLReviewAction
+
+ with SQLReviewEnv(base_url="https://ravi-sql-query-reviewer.hf.space").sync() as env:
+     result = env.reset()
+     result = env.step(SQLReviewAction(
+         action_type="identify_issue",
+         issue_category="syntax",
+         issue_description="SELCT should be SELECT",
+     ))
+     print(result.observation.feedback)
+ ```
74
+
75
+ ## Docker
76
+
77
+ ```bash
78
+ docker build -f server/Dockerfile -t sql-query-reviewer .
79
+ docker run -p 8000:8000 sql-query-reviewer
80
+ ```
81
+
82
+ ## Baseline Scores
83
+
84
+ | Task | Difficulty | Baseline Score |
85
+ |---|---|---|
86
+ | Syntax Error Detection | Easy | ~0.82 |
87
+ | Performance Anti-Pattern Review | Medium | ~0.51 |
88
+ | Security & Optimization Audit | Hard | ~0.29 |
89
+
90
+ ## Author
91
+ **Ravi** — Solo participant, Meta PyTorch OpenEnv Hackathon 2026
inference.py ADDED
@@ -0,0 +1,131 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from typing import Any
6
+
7
+ from openai import OpenAI
8
+
9
+ from sql_query_reviewer.client import SyncSQLReviewEnv
10
+ from sql_query_reviewer.models import SQLReviewAction, SQLReviewObservation
11
+
12
+ DEFAULT_TASK_IDS = ("easy_001", "medium_001", "hard_001")
13
+
14
+ SYSTEM_PROMPT = """You are reviewing a SQL query for correctness, performance, and security.
15
+ Return exactly one JSON object with these keys:
16
+ - action_type: identify_issue, suggest_fix, approve, or request_more_context
17
+ - issue_category: syntax, performance, security, logic, or style when relevant
18
+ - issue_description: concise issue statement when relevant
19
+ - suggested_fix: corrected SQL or corrected fragment when relevant
20
+ - confidence: float between 0.0 and 1.0
21
+
22
+ Guidelines:
23
+ - Prefer identify_issue until you have high confidence all important issues are covered.
24
+ - Use approve only when the query looks acceptable or all issues have already been identified.
25
+ - Keep the JSON valid and do not wrap it in prose.
26
+ """
27
+
28
+
29
+ def print_event(prefix: str, payload: dict[str, Any]) -> None:
30
+ print(f"[{prefix}] {json.dumps(payload, sort_keys=True)}")
31
+
32
+
33
+ def build_user_prompt(observation: SQLReviewObservation) -> str:
34
+ payload = {
35
+ "query": observation.query,
36
+ "schema_info": observation.schema_info,
37
+ "context": observation.context,
38
+ "issues_found_so_far": [issue.model_dump() for issue in observation.issues_found_so_far],
39
+ "remaining_actions": observation.remaining_actions,
40
+ "difficulty": observation.difficulty,
41
+ "feedback": observation.feedback,
42
+ }
43
+ return json.dumps(payload, indent=2)
44
+
45
+
46
+ def extract_json(content: str) -> dict[str, Any]:
47
+ stripped = content.strip()
48
+ if stripped.startswith("```"):
49
+ lines = [line for line in stripped.splitlines() if not line.startswith("```")]
50
+ stripped = "\n".join(lines).strip()
51
+
52
+ start = stripped.find("{")
53
+ end = stripped.rfind("}")
54
+ if start == -1 or end == -1 or end <= start:
55
+ raise ValueError(f"Could not find JSON object in model response: {content!r}")
56
+ return json.loads(stripped[start : end + 1])
57
+
58
+
59
+ def choose_action(llm_client: Any, model_name: str, observation: SQLReviewObservation) -> SQLReviewAction:
60
+ response = llm_client.chat.completions.create(
61
+ model=model_name,
62
+ temperature=0,
63
+ messages=[
64
+ {"role": "system", "content": SYSTEM_PROMPT},
65
+ {"role": "user", "content": build_user_prompt(observation)},
66
+ ],
67
+ )
68
+ content = response.choices[0].message.content or ""
69
+ return SQLReviewAction.model_validate(extract_json(content))
70
+
71
+
72
+ def run_episode(env: Any, llm_client: Any, model_name: str, task_id: str) -> dict[str, Any]:
73
+ result = env.reset(task_id=task_id)
74
+ print_event(
75
+ "START",
76
+ {
77
+ "difficulty": result.observation.difficulty,
78
+ "remaining_actions": result.observation.remaining_actions,
79
+ "task_id": task_id,
80
+ },
81
+ )
82
+
83
+ while True:
84
+ action = choose_action(llm_client=llm_client, model_name=model_name, observation=result.observation)
85
+ result = env.step(action)
86
+ print_event(
87
+ "STEP",
88
+ {
89
+ "action": action.model_dump(exclude_none=True),
90
+ "done": result.done,
91
+ "feedback": result.observation.feedback,
92
+ "reward": result.reward,
93
+ "task_id": task_id,
94
+ },
95
+ )
96
+ if result.done:
97
+ state = env.state()
98
+ summary = {
99
+ "final_score": state.final_score,
100
+ "steps": state.step_count,
101
+ "task_id": task_id,
102
+ "total_reward": state.total_reward,
103
+ }
104
+ print_event("END", summary)
105
+ return summary
106
+
107
+
108
+ def main() -> int:
109
+ env_base_url = os.getenv("ENV_BASE_URL", "http://localhost:8000")
110
+ api_base_url = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
111
+ model_name = os.getenv("MODEL_NAME", "gpt-4o-mini")
112
+ api_key = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY")
113
+ if not api_key:
114
+ raise SystemExit("Set HF_TOKEN or OPENAI_API_KEY before running inference.py")
115
+
116
+ task_ids = tuple(
117
+ task_id.strip()
118
+ for task_id in os.getenv("TASK_IDS", ",".join(DEFAULT_TASK_IDS)).split(",")
119
+ if task_id.strip()
120
+ )
121
+
122
+ llm_client = OpenAI(api_key=api_key, base_url=api_base_url)
123
+ with SyncSQLReviewEnv(base_url=env_base_url) as env:
124
+ for task_id in task_ids:
125
+ run_episode(env=env, llm_client=llm_client, model_name=model_name, task_id=task_id)
126
+ return 0
127
+
128
+
129
+ if __name__ == "__main__":
130
+ raise SystemExit(main())
131
+
models.py ADDED
@@ -0,0 +1,22 @@
1
+ from sql_query_reviewer.models import (
2
+ GroundTruthIssue,
3
+ IdentifiedIssue,
4
+ ResetRequest,
5
+ SQLReviewAction,
6
+ SQLReviewObservation,
7
+ SQLReviewState,
8
+ StepResult,
9
+ TaskRecord,
10
+ )
11
+
12
+ __all__ = [
13
+ "GroundTruthIssue",
14
+ "IdentifiedIssue",
15
+ "ResetRequest",
16
+ "SQLReviewAction",
17
+ "SQLReviewObservation",
18
+ "SQLReviewState",
19
+ "StepResult",
20
+ "TaskRecord",
21
+ ]
22
+
openenv.yaml ADDED
@@ -0,0 +1,23 @@
1
+ name: sql-query-reviewer
2
+ description: "AI agent reviews SQL queries for correctness, performance, and security."
3
+ author: Hellinferno
4
+ version: "0.1.0"
5
+ tags:
6
+ - openenv
7
+ - sql
8
+ - code-review
9
+ - security
10
+ tasks:
11
+ - id: easy_syntax
12
+ name: Syntax Error Detection
13
+ difficulty: easy
14
+ description: Find obvious SQL syntax and logic defects.
15
+ - id: medium_performance
16
+ name: Performance Anti-Pattern Review
17
+ difficulty: medium
18
+ description: Identify schema-aware performance problems.
19
+ - id: hard_security
20
+ name: Security and Optimization Audit
21
+ difficulty: hard
22
+ description: Detect injection, data exposure, and advanced optimization issues.
23
+
pyproject.toml ADDED
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sql-query-reviewer"
7
+ version = "0.1.0"
8
+ description = "An OpenEnv-style SQL review environment for correctness, performance, and security auditing."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "Hellinferno" }
14
+ ]
15
+ dependencies = [
16
+ "fastapi>=0.115,<1.0",
17
+ "httpx>=0.27,<1.0",
18
+ "openenv-core>=0.2.0",
19
+ "openai>=2.7.2,<3.0",
20
+ "pydantic>=2.8,<3.0",
21
+ "uvicorn>=0.30,<1.0",
22
+ ]
23
+
24
+ [project.scripts]
25
+ server = "server.app:main"
26
+
27
+ [project.optional-dependencies]
28
+ dev = [
29
+ "pytest>=8.3,<9.0",
30
+ ]
31
+
32
+ [tool.setuptools.packages.find]
33
+ include = ["sql_query_reviewer*"]
34
+
35
+ [tool.fastapi]
36
+ entrypoint = "server.app:app"
37
+
38
+ [tool.pytest.ini_options]
39
+ testpaths = ["tests"]
server/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from server.app import app, create_app
+ from server.environment import SQLReviewEnvironment
+
+ __all__ = ["SQLReviewEnvironment", "app", "create_app"]
+
server/app.py ADDED
@@ -0,0 +1,59 @@
+ from __future__ import annotations
+
+ import os
+ from typing import Annotated
+
+ from fastapi import Body, FastAPI, HTTPException
+ import uvicorn
+
+ from sql_query_reviewer.models import ResetRequest, SQLReviewAction, SQLReviewState, StepResult
+ from server.environment import SQLReviewEnvironment
+
+
+ def create_app(environment: SQLReviewEnvironment | None = None) -> FastAPI:
+     app = FastAPI(
+         title="SQL Query Reviewer",
+         description="OpenEnv-style SQL review environment served over FastAPI.",
+         version="0.1.0",
+     )
+     env = environment or SQLReviewEnvironment()
+
+     @app.get("/health")
+     async def health() -> dict[str, str]:
+         return {"status": "ok"}
+
+     @app.post("/reset", response_model=StepResult)
+     async def reset_environment(request: Annotated[ResetRequest | None, Body()] = None) -> StepResult:
+         try:
+             return env.reset(task_id=request.task_id if request else None)
+         except ValueError as exc:
+             raise HTTPException(status_code=404, detail=str(exc)) from exc
+
+     @app.post("/step", response_model=StepResult)
+     async def step_environment(action: SQLReviewAction) -> StepResult:
+         try:
+             return env.step(action)
+         except RuntimeError as exc:
+             raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+     @app.get("/state", response_model=SQLReviewState)
+     async def get_state() -> SQLReviewState:
+         try:
+             return env.state()
+         except RuntimeError as exc:
+             raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+     return app
+
+
+ app = create_app()
+
+
+ def main() -> None:
+     port = int(os.getenv("PORT", "8000"))
+     host = os.getenv("HOST", "0.0.0.0")
+     uvicorn.run("server.app:app", host=host, port=port)
+
+
+ if __name__ == "__main__":
+     main()
server/environment.py ADDED
@@ -0,0 +1,185 @@
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+
+ from sql_query_reviewer.models import (
+     IdentifiedIssue,
+     SQLReviewAction,
+     SQLReviewObservation,
+     SQLReviewState,
+     StepResult,
+     TaskRecord,
+ )
+ from server.grader import grade_episode, match_issue, validate_fix
+ from server.reward import compute_reward
+
+
+ class SQLReviewEnvironment:
+     def __init__(self, task_directory: Path | None = None) -> None:
+         self.task_directory = task_directory or Path(__file__).resolve().parent.parent / "tasks"
+         self.tasks = self._load_tasks()
+         self.task_order = sorted(self.tasks)
+         self.current_task: TaskRecord | None = None
+         self.current_state: SQLReviewState | None = None
+         self._reset_index = 0
+
+     def available_task_ids(self) -> list[str]:
+         return list(self.task_order)
+
+     def reset(self, task_id: str | None = None) -> StepResult:
+         selected_task_id = task_id or self._next_task_id()
+         if selected_task_id not in self.tasks:
+             raise ValueError(f"Unknown task_id: {selected_task_id}")
+
+         self.current_task = self.tasks[selected_task_id]
+         self.current_state = SQLReviewState(task_id=self.current_task.task_id)
+         observation = self._build_observation(
+             feedback="Review this SQL query and identify correctness, performance, or security issues."
+         )
+         return StepResult(observation=observation, reward=0.0, done=False, info={})
+
+     def step(self, action: SQLReviewAction) -> StepResult:
+         task = self._require_task()
+         state = self._require_state()
+         if state.done:
+             raise RuntimeError("Episode already finished. Call reset() before taking more steps.")
+
+         found_ids = {issue.issue_id for issue in state.issues_identified}
+         reward = 0.0
+         info: dict[str, object] = {}
+         feedback = "No-op."
+         state.step_count += 1
+
+         if action.action_type == "identify_issue":
+             duplicate_issue, duplicate_score = match_issue(action, task.ground_truth_issues, set())
+             if duplicate_issue is not None and duplicate_issue.id in found_ids:
+                 reward = compute_reward(action, duplicate_issue, duplicate_issue=True)
+                 feedback = f"Issue '{duplicate_issue.id}' was already identified earlier in the episode."
+                 info = {"match_score": round(duplicate_score, 3), "match_type": "duplicate", "issue_id": duplicate_issue.id}
+             else:
+                 matched_issue, score = match_issue(action, task.ground_truth_issues, found_ids)
+                 if matched_issue is None:
+                     state.false_positive_count += 1
+                     reward = compute_reward(action, None)
+                     feedback = "No matching issue found for that description."
+                     info = {"match_score": round(score, 3), "match_type": "none"}
+                 else:
+                     fix_valid = validate_fix(action.suggested_fix, matched_issue)
+                     state.issues_identified.append(
+                         IdentifiedIssue(
+                             issue_id=matched_issue.id,
+                             category=matched_issue.category,
+                             description=matched_issue.description,
+                         )
+                     )
+                     reward = compute_reward(action, matched_issue, fix_valid=fix_valid)
+                     remaining = len(task.ground_truth_issues) - len(state.issues_identified)
+                     feedback = f"Matched {matched_issue.category} issue '{matched_issue.id}'. {remaining} issue(s) remaining."
+                     info = {
+                         "match_score": round(score, 3),
+                         "match_type": "fuzzy",
+                         "severity": matched_issue.severity,
+                         "issue_id": matched_issue.id,
+                         "all_issues_found": remaining == 0,
+                     }
+                     if fix_valid and action.suggested_fix:
+                         state.fixes_suggested.append(action.suggested_fix)
+
+         elif action.action_type == "suggest_fix":
+             if not state.issues_identified:
+                 reward = compute_reward(action, None, has_previous_issue=False)
+                 feedback = "Identify an issue before suggesting a fix."
+             else:
+                 last_issue_id = state.issues_identified[-1].issue_id
+                 last_issue = next(issue for issue in task.ground_truth_issues if issue.id == last_issue_id)
+                 fix_valid = validate_fix(action.suggested_fix, last_issue)
+                 reward = compute_reward(action, last_issue, fix_valid=fix_valid, has_previous_issue=True)
+                 feedback = "Fix accepted for the last identified issue." if fix_valid else "Suggested fix did not match the expected remediation."
+                 info = {"issue_id": last_issue.id, "fix_valid": fix_valid}
+                 if fix_valid and action.suggested_fix:
+                     state.fixes_suggested.append(action.suggested_fix)
+
+         elif action.action_type == "approve":
+             remaining_unfound = len(task.ground_truth_issues) - len(found_ids)
+             reward = compute_reward(action, None, remaining_unfound=remaining_unfound)
+             state.approved = True
+             state.done = True
+             feedback = (
+                 "Query approved with full issue coverage."
+                 if remaining_unfound == 0
+                 else f"Query approved too early. {remaining_unfound} issue(s) were missed."
+             )
+             info = {"remaining_unfound": remaining_unfound}
+
+         else:
+             feedback = self._schema_feedback(task)
+             info = {"context_shared": bool(task.schema_info)}
+
+         state.total_reward += reward
+
+         if state.step_count >= task.max_steps and not state.done:
+             state.done = True
+             feedback = f"{feedback} Maximum step count reached."
+
+         if state.done:
+             state.final_score = grade_episode(
+                 found_issue_ids={issue.issue_id for issue in state.issues_identified},
+                 ground_truth_issues=task.ground_truth_issues,
+                 total_steps=state.step_count,
+                 max_steps=task.max_steps,
+                 false_positive_count=state.false_positive_count,
+             )
+             info["final_score"] = state.final_score
+
+         observation = self._build_observation(feedback=feedback)
+         return StepResult(observation=observation, reward=reward, done=state.done, info=info)
+
+     def state(self) -> SQLReviewState:
+         return self._require_state().model_copy(deep=True)
+
+     def _load_tasks(self) -> dict[str, TaskRecord]:
+         tasks: dict[str, TaskRecord] = {}
+         for file_path in sorted(self.task_directory.glob("*_tasks.json")):
+             with file_path.open("r", encoding="utf-8") as handle:
+                 for raw_task in json.load(handle):
+                     task = TaskRecord.model_validate(raw_task)
+                     tasks[task.task_id] = task
+         if not tasks:
+             raise RuntimeError(f"No task files found in {self.task_directory}")
+         return tasks
+
+     def _next_task_id(self) -> str:
+         task_id = self.task_order[self._reset_index % len(self.task_order)]
+         self._reset_index += 1
+         return task_id
+
+     def _build_observation(self, feedback: str) -> SQLReviewObservation:
+         task = self._require_task()
+         state = self._require_state()
+         remaining_actions = max(task.max_steps - state.step_count, 0)
+         return SQLReviewObservation(
+             query=task.query,
+             schema_info=task.schema_info,
+             context=task.context,
+             issues_found_so_far=state.issues_identified,
+             remaining_actions=remaining_actions,
+             difficulty=task.difficulty,
+             feedback=feedback,
+         )
+
+     def _schema_feedback(self, task: TaskRecord) -> str:
+         if not task.schema_info:
+             return "No additional schema context is available for this task."
+         tables = ", ".join(sorted(task.schema_info))
+         return f"Schema context available for: {tables}."
+
+     def _require_task(self) -> TaskRecord:
+         if self.current_task is None:
+             raise RuntimeError("Environment has no active task. Call reset() first.")
+         return self.current_task
+
+     def _require_state(self) -> SQLReviewState:
+         if self.current_state is None:
+             raise RuntimeError("Environment has no active state. Call reset() first.")
+         return self.current_state
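When no `task_id` is supplied, `reset()` rotates through the sorted task ids via `_next_task_id`. A minimal stdlib-only sketch of that round-robin (the task ids here are illustrative, not the repo's real ones):

```python
# Sketch of the reset-time round-robin in _next_task_id:
# tasks are served in sorted order, wrapping via modulo.
task_order = sorted(["medium_001", "easy_001", "hard_001"])
reset_index = 0


def next_task_id() -> str:
    global reset_index
    task_id = task_order[reset_index % len(task_order)]
    reset_index += 1
    return task_id


picks = [next_task_id() for _ in range(4)]
print(picks)  # wraps back to the first id on the fourth reset
```

This keeps repeated `reset()` calls deterministic across an evaluation run instead of sampling tasks at random.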
server/grader.py ADDED
@@ -0,0 +1,91 @@
+ from __future__ import annotations
+
+ import re
+
+ from sql_query_reviewer.models import GroundTruthIssue, IssueCategory, SQLReviewAction
+
+ TOKEN_RE = re.compile(r"[a-zA-Z0-9_]+")
+
+
+ def clamp(value: float, minimum: float, maximum: float) -> float:
+     return max(minimum, min(maximum, value))
+
+
+ def normalize_text(value: str) -> str:
+     return " ".join(TOKEN_RE.findall(value.lower()))
+
+
+ def tokenize(value: str) -> set[str]:
+     return set(TOKEN_RE.findall(value.lower()))
+
+
+ def _set_overlap(candidate: set[str], target: set[str]) -> float:
+     if not candidate or not target:
+         return 0.0
+     return len(candidate & target) / max(len(target), 1)
+
+
+ def score_issue_match(description: str, category: IssueCategory | None, issue: GroundTruthIssue) -> float:
+     candidate_tokens = tokenize(description)
+     keyword_tokens = set(issue.keywords)
+     description_tokens = tokenize(issue.description)
+     keyword_score = _set_overlap(candidate_tokens, keyword_tokens)
+     description_score = _set_overlap(candidate_tokens, description_tokens)
+     category_bonus = 0.2 if category == issue.category else 0.0
+     score = (keyword_score * 0.6) + (description_score * 0.25) + category_bonus
+     return clamp(score, 0.0, 1.0)
+
+
+ def match_issue(
+     action: SQLReviewAction,
+     ground_truth_issues: list[GroundTruthIssue],
+     already_found_ids: set[str],
+ ) -> tuple[GroundTruthIssue | None, float]:
+     if not action.issue_description:
+         return None, 0.0
+
+     best_issue: GroundTruthIssue | None = None
+     best_score = 0.0
+     for issue in ground_truth_issues:
+         if issue.id in already_found_ids:
+             continue
+         score = score_issue_match(action.issue_description, action.issue_category, issue)
+         if score > best_score:
+             best_score = score
+             best_issue = issue
+
+     if best_issue is None or best_score < 0.35:
+         return None, best_score
+     return best_issue, best_score
+
+
+ def validate_fix(suggested_fix: str | None, issue: GroundTruthIssue) -> bool:
+     if not suggested_fix:
+         return False
+     suggestion_tokens = tokenize(suggested_fix)
+     canonical_tokens = tokenize(issue.fix)
+     if not suggestion_tokens or not canonical_tokens:
+         return False
+     overlap = _set_overlap(suggestion_tokens, canonical_tokens)
+     description_overlap = _set_overlap(suggestion_tokens, tokenize(issue.description))
+     return overlap >= 0.5 or description_overlap >= 0.6
+
+
+ def grade_episode(
+     found_issue_ids: set[str],
+     ground_truth_issues: list[GroundTruthIssue],
+     total_steps: int,
+     max_steps: int,
+     false_positive_count: int,
+ ) -> float:
+     if not ground_truth_issues:
+         return 1.0 if false_positive_count == 0 else clamp(1.0 - (0.1 * false_positive_count), 0.0, 1.0)
+
+     total_severity = sum(issue.severity for issue in ground_truth_issues)
+     found_severity = sum(issue.severity for issue in ground_truth_issues if issue.id in found_issue_ids)
+     coverage_score = found_severity / total_severity if total_severity else 0.0
+     efficiency_bonus = max(0.0, 0.1 * (1 - (total_steps / max(max_steps, 1))))
+     false_positive_penalty = 0.05 * false_positive_count
+     final_score = coverage_score + efficiency_bonus - false_positive_penalty
+     return clamp(final_score, 0.0, 1.0)
+
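The fuzzy matcher in `server/grader.py` scores a free-text issue description by token-set overlap against each ground-truth issue's keyword list. A stdlib-only sketch of that core rule, using the same token regex (the description and keyword strings below are illustrative):

```python
import re

# Same word-token pattern as the grader's TOKEN_RE.
TOKEN_RE = re.compile(r"[a-zA-Z0-9_]+")


def tokenize(value: str) -> set[str]:
    # Lowercase and split into word tokens.
    return set(TOKEN_RE.findall(value.lower()))


def overlap(candidate: set[str], target: set[str]) -> float:
    # Fraction of target tokens covered by the candidate description.
    if not candidate or not target:
        return 0.0
    return len(candidate & target) / len(target)


description = tokenize("The FROM keyword is misspelled as FORM")
keywords = {"form", "from", "misspelled", "keyword"}
score = overlap(description, keywords)
print(round(score, 2))
```

Because the denominator is the keyword set, a verbose but on-target description still scores high, while a description sharing no keywords scores zero, which is what the 0.35 acceptance threshold in `match_issue` relies on.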
server/reward.py ADDED
@@ -0,0 +1,36 @@
+ from __future__ import annotations
+
+ from sql_query_reviewer.models import GroundTruthIssue, SQLReviewAction
+
+
+ def compute_reward(
+     action: SQLReviewAction,
+     matched_issue: GroundTruthIssue | None,
+     *,
+     fix_valid: bool = False,
+     duplicate_issue: bool = False,
+     remaining_unfound: int = 0,
+     has_previous_issue: bool = False,
+ ) -> float:
+     if action.action_type == "identify_issue":
+         if duplicate_issue:
+             return -0.02
+         if matched_issue is None:
+             return -0.1
+         base_reward = min(matched_issue.severity, 0.35)
+         fix_bonus = 0.08 if fix_valid else 0.0
+         confidence_bonus = min(0.05, action.confidence * 0.05)
+         return min(base_reward + fix_bonus + confidence_bonus, 0.4)
+
+     if action.action_type == "suggest_fix":
+         if not has_previous_issue:
+             return -0.05
+         return 0.1 if fix_valid else 0.0
+
+     if action.action_type == "approve":
+         if remaining_unfound == 0:
+             return 0.2
+         return max(-1.0, -0.15 * remaining_unfound)
+
+     return 0.0
+
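The `approve` branch above pays +0.2 for approving with full coverage and otherwise charges 0.15 per missed issue, floored at -1.0. A quick standalone check of that shape (constants copied from the diff; the helper name is illustrative):

```python
def approve_reward(remaining_unfound: int) -> float:
    # Mirrors the approve branch of compute_reward:
    # +0.2 on full coverage, else -0.15 per missed issue, floored at -1.0.
    if remaining_unfound == 0:
        return 0.2
    return max(-1.0, -0.15 * remaining_unfound)


rewards = {n: approve_reward(n) for n in (0, 1, 7)}
print(rewards)
```

The floor keeps a premature approval on an issue-heavy task from dominating the episode's cumulative reward.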
sql_query_reviewer/__init__.py ADDED
@@ -0,0 +1,25 @@
+ from sql_query_reviewer.client import SQLReviewEnv, SyncSQLReviewEnv
+ from sql_query_reviewer.models import (
+     GroundTruthIssue,
+     IdentifiedIssue,
+     ResetRequest,
+     SQLReviewAction,
+     SQLReviewObservation,
+     SQLReviewState,
+     StepResult,
+     TaskRecord,
+ )
+
+ __all__ = [
+     "GroundTruthIssue",
+     "IdentifiedIssue",
+     "ResetRequest",
+     "SQLReviewAction",
+     "SQLReviewEnv",
+     "SQLReviewObservation",
+     "SQLReviewState",
+     "StepResult",
+     "SyncSQLReviewEnv",
+     "TaskRecord",
+ ]
+
sql_query_reviewer/client.py ADDED
@@ -0,0 +1,95 @@
+ from __future__ import annotations
+
+ from typing import Any
+
+ import httpx
+
+ from sql_query_reviewer.models import ResetRequest, SQLReviewAction, SQLReviewState, StepResult
+
+
+ class SQLReviewEnv:
+     def __init__(self, base_url: str, timeout: float = 30.0) -> None:
+         self.base_url = base_url.rstrip("/")
+         self.timeout = timeout
+         self._client: httpx.AsyncClient | None = None
+
+     async def __aenter__(self) -> "SQLReviewEnv":
+         self._client = httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout)
+         return self
+
+     async def __aexit__(self, *_: Any) -> None:
+         await self.close()
+
+     async def close(self) -> None:
+         if self._client is not None:
+             await self._client.aclose()
+             self._client = None
+
+     def sync(self) -> "SyncSQLReviewEnv":
+         return SyncSQLReviewEnv(base_url=self.base_url, timeout=self.timeout)
+
+     async def reset(self, task_id: str | None = None) -> StepResult:
+         client = self._require_client()
+         response = await client.post("/reset", json=ResetRequest(task_id=task_id).model_dump(exclude_none=True))
+         response.raise_for_status()
+         return StepResult.model_validate(response.json())
+
+     async def step(self, action: SQLReviewAction) -> StepResult:
+         client = self._require_client()
+         response = await client.post("/step", json=action.model_dump(exclude_none=True))
+         response.raise_for_status()
+         return StepResult.model_validate(response.json())
+
+     async def state(self) -> SQLReviewState:
+         client = self._require_client()
+         response = await client.get("/state")
+         response.raise_for_status()
+         return SQLReviewState.model_validate(response.json())
+
+     def _require_client(self) -> httpx.AsyncClient:
+         if self._client is None:
+             raise RuntimeError("Use SQLReviewEnv as an async context manager before calling it.")
+         return self._client
+
+
+ class SyncSQLReviewEnv:
+     def __init__(self, base_url: str, timeout: float = 30.0) -> None:
+         self.base_url = base_url.rstrip("/")
+         self.timeout = timeout
+         self._client: httpx.Client | None = None
+
+     def __enter__(self) -> "SyncSQLReviewEnv":
+         self._client = httpx.Client(base_url=self.base_url, timeout=self.timeout)
+         return self
+
+     def __exit__(self, *_: Any) -> None:
+         self.close()
+
+     def close(self) -> None:
+         if self._client is not None:
+             self._client.close()
+             self._client = None
+
+     def reset(self, task_id: str | None = None) -> StepResult:
+         client = self._require_client()
+         response = client.post("/reset", json=ResetRequest(task_id=task_id).model_dump(exclude_none=True))
+         response.raise_for_status()
+         return StepResult.model_validate(response.json())
+
+     def step(self, action: SQLReviewAction) -> StepResult:
+         client = self._require_client()
+         response = client.post("/step", json=action.model_dump(exclude_none=True))
+         response.raise_for_status()
+         return StepResult.model_validate(response.json())
+
+     def state(self) -> SQLReviewState:
+         client = self._require_client()
+         response = client.get("/state")
+         response.raise_for_status()
+         return SQLReviewState.model_validate(response.json())
+
+     def _require_client(self) -> httpx.Client:
+         if self._client is None:
+             raise RuntimeError("Use SyncSQLReviewEnv as a context manager before calling it.")
+         return self._client
+
sql_query_reviewer/models.py ADDED
@@ -0,0 +1,99 @@
+ from __future__ import annotations
+
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+
+ Difficulty = Literal["easy", "medium", "hard"]
+ ActionType = Literal["identify_issue", "suggest_fix", "approve", "request_more_context"]
+ IssueCategory = Literal["syntax", "performance", "security", "logic", "style"]
+
+
+ class StrictModel(BaseModel):
+     model_config = ConfigDict(extra="forbid", populate_by_name=True)
+
+
+ class GroundTruthIssue(StrictModel):
+     id: str = Field(min_length=1)
+     category: IssueCategory
+     description: str = Field(min_length=1)
+     severity: float = Field(gt=0.0, le=1.0)
+     fix: str = Field(min_length=1)
+     keywords: list[str] = Field(default_factory=list)
+
+     @field_validator("keywords")
+     @classmethod
+     def normalize_keywords(cls, value: list[str]) -> list[str]:
+         deduped: list[str] = []
+         for keyword in value:
+             normalized = keyword.strip().lower()
+             if normalized and normalized not in deduped:
+                 deduped.append(normalized)
+         return deduped
+
+
+ class TaskRecord(StrictModel):
+     task_id: str = Field(min_length=1)
+     difficulty: Difficulty
+     query: str = Field(min_length=1)
+     schema_info: dict[str, dict[str, str]] = Field(default_factory=dict, alias="schema")
+     context: str = Field(min_length=1)
+     ground_truth_issues: list[GroundTruthIssue] = Field(default_factory=list)
+     max_steps: int = Field(ge=1, le=12)
+
+
+ class IdentifiedIssue(StrictModel):
+     issue_id: str = Field(min_length=1)
+     category: IssueCategory
+     description: str = Field(min_length=1)
+
+
+ class SQLReviewAction(StrictModel):
+     action_type: ActionType
+     issue_category: IssueCategory | None = None
+     issue_description: str | None = None
+     suggested_fix: str | None = None
+     confidence: float = Field(default=0.5, ge=0.0, le=1.0)
+
+     @model_validator(mode="after")
+     def validate_action(self) -> "SQLReviewAction":
+         if self.action_type == "identify_issue":
+             if not self.issue_category or not self.issue_description:
+                 raise ValueError("identify_issue requires issue_category and issue_description")
+         elif self.action_type == "suggest_fix":
+             if not self.suggested_fix:
+                 raise ValueError("suggest_fix requires suggested_fix")
+         return self
+
+
+ class SQLReviewObservation(StrictModel):
+     query: str
+     schema_info: dict[str, dict[str, str]] = Field(default_factory=dict)
+     context: str
+     issues_found_so_far: list[IdentifiedIssue] = Field(default_factory=list)
+     remaining_actions: int = Field(ge=0)
+     difficulty: Difficulty
+     feedback: str
+
+
+ class SQLReviewState(StrictModel):
+     task_id: str
+     step_count: int = Field(default=0, ge=0)
+     issues_identified: list[IdentifiedIssue] = Field(default_factory=list)
+     total_reward: float = 0.0
+     done: bool = False
+     approved: bool = False
+     fixes_suggested: list[str] = Field(default_factory=list)
+     false_positive_count: int = Field(default=0, ge=0)
+     final_score: float | None = Field(default=None, ge=0.0, le=1.0)
+
+
+ class StepResult(StrictModel):
+     observation: SQLReviewObservation
+     reward: float
+     done: bool
+     info: dict[str, Any] = Field(default_factory=dict)
+
+
+ class ResetRequest(StrictModel):
+     task_id: str | None = None
tasks/easy_tasks.json ADDED
@@ -0,0 +1,148 @@
+ [
+   {
+     "task_id": "easy_001",
+     "difficulty": "easy",
+     "query": "SELCT * FORM users WEHRE id = 1;",
+     "schema": {
+       "users": {
+         "id": "INT PRIMARY KEY",
+         "name": "VARCHAR(255)",
+         "email": "VARCHAR(255)"
+       }
+     },
+     "context": "Fetch a user profile by id for the account page.",
+     "ground_truth_issues": [
+       {
+         "id": "easy_001_select",
+         "category": "syntax",
+         "description": "SELCT should be SELECT.",
+         "severity": 0.35,
+         "fix": "SELECT * FROM users WHERE id = 1;",
+         "keywords": ["selct", "select", "misspelled keyword", "syntax"]
+       },
+       {
+         "id": "easy_001_from",
+         "category": "syntax",
+         "description": "FORM should be FROM.",
+         "severity": 0.35,
+         "fix": "SELECT * FROM users WHERE id = 1;",
+         "keywords": ["form", "from", "misspelled keyword", "syntax"]
+       },
+       {
+         "id": "easy_001_where",
+         "category": "syntax",
+         "description": "WEHRE should be WHERE.",
+         "severity": 0.25,
+         "fix": "SELECT * FROM users WHERE id = 1;",
+         "keywords": ["wehre", "where", "misspelled keyword", "syntax"]
+       },
+       {
+         "id": "easy_001_projection",
+         "category": "performance",
+         "description": "SELECT * fetches unnecessary columns for a profile lookup.",
+         "severity": 0.15,
+         "fix": "SELECT id, name, email FROM users WHERE id = 1;",
+         "keywords": ["select *", "unnecessary columns", "projection", "performance"]
+       }
+     ],
+     "max_steps": 5
+   },
+   {
+     "task_id": "easy_002",
+     "difficulty": "easy",
+     "query": "SELECT id, email users WHERE active = 1;",
+     "schema": {
+       "users": {
+         "id": "INT PRIMARY KEY",
+         "email": "VARCHAR(255)",
+         "active": "BOOLEAN"
+       }
+     },
+     "context": "List active user emails for a notification job.",
+     "ground_truth_issues": [
+       {
+         "id": "easy_002_missing_from",
+         "category": "syntax",
+         "description": "The query is missing the FROM clause before users.",
+         "severity": 0.6,
+         "fix": "SELECT id, email FROM users WHERE active = 1;",
+         "keywords": ["missing from", "from clause", "syntax", "users"]
+       }
+     ],
+     "max_steps": 4
+   },
+   {
+     "task_id": "easy_003",
+     "difficulty": "easy",
+     "query": "SELECT order_id, total FROM orders WHERE shipped_at = NULL;",
+     "schema": {
+       "orders": {
+         "order_id": "INT PRIMARY KEY",
+         "total": "DECIMAL(10,2)",
+         "shipped_at": "TIMESTAMP NULL"
+       }
+     },
+     "context": "Find orders that still need to ship.",
+     "ground_truth_issues": [
+       {
+         "id": "easy_003_null_check",
+         "category": "logic",
+         "description": "NULL must be compared with IS NULL instead of = NULL.",
+         "severity": 0.7,
+         "fix": "SELECT order_id, total FROM orders WHERE shipped_at IS NULL;",
+         "keywords": ["is null", "= null", "null comparison", "logic"]
+       }
+     ],
+     "max_steps": 4
+   },
+   {
+     "task_id": "easy_004",
+     "difficulty": "easy",
+     "query": "SELECT name FROM customers WHERE city = 'Boston;",
+     "schema": {
+       "customers": {
+         "id": "INT PRIMARY KEY",
+         "name": "VARCHAR(255)",
+         "city": "VARCHAR(128)"
+       }
+     },
+     "context": "Filter customers who live in Boston.",
+     "ground_truth_issues": [
+       {
+         "id": "easy_004_unclosed_quote",
+         "category": "syntax",
+         "description": "The string literal is not terminated with a closing quote.",
+         "severity": 0.75,
+         "fix": "SELECT name FROM customers WHERE city = 'Boston';",
+         "keywords": ["unclosed quote", "unterminated string", "syntax", "quote"]
+       }
+     ],
+     "max_steps": 4
+   },
+   {
+     "task_id": "easy_005",
+     "difficulty": "easy",
+     "query": "SELECT id, statuz FROM orders WHERE status = 'paid';",
+     "schema": {
+       "orders": {
+         "id": "INT PRIMARY KEY",
+         "status": "VARCHAR(32)",
+         "total": "DECIMAL(10,2)",
+         "created_at": "TIMESTAMP"
+       }
+     },
+     "context": "List paid orders for revenue accounting.",
+     "ground_truth_issues": [
+       {
+         "id": "easy_005_bad_column",
+         "category": "logic",
+         "description": "Column statuz does not exist; the intended column is status.",
+         "severity": 0.65,
+         "fix": "SELECT id, status FROM orders WHERE status = 'paid';",
+         "keywords": ["unknown column", "statuz", "status", "column name"]
+       }
+     ],
+     "max_steps": 4
+   }
+ ]
+
tasks/hard_tasks.json ADDED
@@ -0,0 +1,158 @@
+ [
+   {
+     "task_id": "hard_001",
+     "difficulty": "hard",
+     "query": "SELECT * FROM users WHERE email = '${user_email}' AND password = '${password}';",
+     "schema": {
+       "users": {
+         "id": "INT PRIMARY KEY",
+         "email": "VARCHAR(255) UNIQUE",
+         "password_hash": "VARCHAR(255)",
+         "role": "VARCHAR(32)",
+         "created_at": "TIMESTAMP"
+       }
+     },
+     "context": "Authenticate a user during login.",
+     "ground_truth_issues": [
+       {
+         "id": "hard_001_sql_injection",
+         "category": "security",
+         "description": "Interpolating user_email and password directly into the SQL creates a SQL injection vulnerability.",
+         "severity": 1.0,
+         "fix": "SELECT id, email, role FROM users WHERE email = ? AND password_hash = ?;",
+         "keywords": ["sql injection", "interpolation", "user input", "parameterized", "security"]
+       },
+       {
+         "id": "hard_001_select_star_sensitive",
+         "category": "security",
+         "description": "SELECT * returns sensitive columns such as password hashes that the login flow does not need.",
+         "severity": 0.4,
+         "fix": "SELECT id, email, role FROM users WHERE email = ? AND password_hash = ?;",
+         "keywords": ["select *", "sensitive columns", "password hash", "least privilege", "security"]
+       }
+     ],
+     "max_steps": 6
+   },
+   {
+     "task_id": "hard_002",
+     "difficulty": "hard",
+     "query": "SELECT id, email FROM customers UNION SELECT id, secret_value FROM admin_secrets;",
+     "schema": {
+       "customers": {
+         "id": "INT PRIMARY KEY",
+         "email": "VARCHAR(255)"
+       },
+       "admin_secrets": {
+         "id": "INT PRIMARY KEY",
+         "secret_value": "TEXT"
+       }
+     },
+     "context": "Prepare a data export for a customer marketing campaign.",
+     "ground_truth_issues": [
+       {
+         "id": "hard_002_secret_exfiltration",
+         "category": "security",
+         "description": "The UNION includes admin_secrets and leaks privileged data into a customer-facing export.",
+         "severity": 0.95,
+         "fix": "SELECT id, email FROM customers;",
+         "keywords": ["union", "admin_secrets", "secret_value", "data leakage", "security"]
+       },
+       {
+         "id": "hard_002_mixed_data_domains",
+         "category": "logic",
+         "description": "The query mixes unrelated datasets with incompatible semantics, producing an invalid export.",
+         "severity": 0.45,
+         "fix": "SELECT id, email FROM customers;",
+         "keywords": ["union", "invalid export", "mixed dataset", "logic"]
+       }
+     ],
+     "max_steps": 6
+   },
+   {
+     "task_id": "hard_003",
+     "difficulty": "hard",
+     "query": "SELECT c.id, c.full_name, c.ssn, c.email, t.subject FROM customers c JOIN support_tickets t ON t.customer_id = c.id WHERE t.status = 'open';",
+     "schema": {
+       "customers": {
+         "id": "INT PRIMARY KEY",
+         "full_name": "VARCHAR(255)",
+         "ssn": "VARCHAR(32)",
+         "email": "VARCHAR(255)"
+       },
+       "support_tickets": {
+         "id": "INT PRIMARY KEY",
+         "customer_id": "INT INDEX",
+         "subject": "VARCHAR(255)",
+         "status": "VARCHAR(32)"
+       }
+     },
+     "context": "Show open support tickets to an agent dashboard.",
+     "ground_truth_issues": [
+       {
+         "id": "hard_003_pii_leak",
+         "category": "security",
+         "description": "The dashboard query exposes SSNs even though the ticket workflow only needs identity and ticket context.",
+         "severity": 0.9,
+         "fix": "SELECT c.id, c.full_name, c.email, t.subject FROM customers c JOIN support_tickets t ON t.customer_id = c.id WHERE t.status = 'open';",
+         "keywords": ["ssn", "pii", "sensitive data", "least privilege", "security"]
+       }
+     ],
+     "max_steps": 6
+   },
+   {
+     "task_id": "hard_004",
+     "difficulty": "hard",
+     "query": "SELECT e1.department_id, e1.id, COUNT(e2.salary) + 1 AS salary_rank FROM employees e1 LEFT JOIN employees e2 ON e1.department_id = e2.department_id AND e2.salary > e1.salary GROUP BY e1.department_id, e1.id;",
+     "schema": {
+       "employees": {
+         "id": "INT PRIMARY KEY",
+         "department_id": "INT INDEX",
+         "salary": "DECIMAL(10,2)"
+       }
+     },
+     "context": "Rank employees by salary within each department.",
+     "ground_truth_issues": [
+       {
+         "id": "hard_004_self_join_ranking",
+         "category": "performance",
+         "description": "The self-join ranking pattern is expensive and should use a window function such as DENSE_RANK().",
+         "severity": 0.8,
+         "fix": "SELECT department_id, id, DENSE_RANK() OVER (PARTITION BY department_id ORDER BY salary DESC) AS salary_rank FROM employees;",
+         "keywords": ["self join", "window function", "dense_rank", "ranking", "performance"]
+       }
+     ],
+     "max_steps": 7
+   },
+   {
+     "task_id": "hard_005",
+     "difficulty": "hard",
+     "query": "UPDATE accounts SET balance = balance - 100 WHERE user_id = 10; UPDATE accounts SET balance = balance + 100 WHERE user_id = 11;",
130
+ "schema": {
131
+ "accounts": {
132
+ "user_id": "INT PRIMARY KEY",
133
+ "balance": "DECIMAL(10,2)"
134
+ }
135
+ },
136
+ "context": "Transfer money between two account balances.",
137
+ "ground_truth_issues": [
138
+ {
139
+ "id": "hard_005_missing_transaction",
140
+ "category": "security",
141
+ "description": "The transfer uses two updates without a transaction, so a partial failure can corrupt balances.",
142
+ "severity": 0.9,
143
+ "fix": "BEGIN; UPDATE accounts SET balance = balance - 100 WHERE user_id = 10 AND balance >= 100; UPDATE accounts SET balance = balance + 100 WHERE user_id = 11; COMMIT;",
144
+ "keywords": ["transaction", "partial failure", "atomic", "commit", "security"]
145
+ },
146
+ {
147
+ "id": "hard_005_no_balance_guard",
148
+ "category": "logic",
149
+ "description": "The debit statement does not verify sufficient funds before subtracting the balance.",
150
+ "severity": 0.55,
151
+ "fix": "BEGIN; UPDATE accounts SET balance = balance - 100 WHERE user_id = 10 AND balance >= 100; UPDATE accounts SET balance = balance + 100 WHERE user_id = 11; COMMIT;",
152
+ "keywords": ["balance guard", "insufficient funds", "where balance >=", "logic"]
153
+ }
154
+ ],
155
+ "max_steps": 7
156
+ }
157
+ ]
158
+
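The task records above all share one record shape. As a sanity check for dataset edits, a minimal standalone sketch of a validator for that shape — the field names and the 0–1 severity range are taken from the JSON above, but the helper name and inline sample are illustrative, not part of this repo:

```python
import json

# Field names observed in the task files in this commit.
REQUIRED_TASK_FIELDS = {"task_id", "difficulty", "query", "schema", "context", "ground_truth_issues", "max_steps"}
REQUIRED_ISSUE_FIELDS = {"id", "category", "description", "severity", "fix", "keywords"}


def validate_task(task: dict) -> list[str]:
    """Return a list of problems found in one task record; empty means valid."""
    problems = []
    missing = REQUIRED_TASK_FIELDS - task.keys()
    if missing:
        problems.append(f"missing fields: {sorted(missing)}")
    for issue in task.get("ground_truth_issues", []):
        if REQUIRED_ISSUE_FIELDS - issue.keys():
            problems.append(f"issue {issue.get('id')} missing fields")
        elif not 0.0 <= issue["severity"] <= 1.0:
            problems.append(f"issue {issue['id']} severity out of range")
    return problems


# Trimmed-down record in the same shape as hard_002 above.
sample = json.loads('''{"task_id": "hard_002", "difficulty": "hard",
  "query": "SELECT id, email FROM customers;", "schema": {}, "context": "export",
  "ground_truth_issues": [{"id": "x", "category": "security", "description": "d",
    "severity": 0.95, "fix": "f", "keywords": ["k"]}], "max_steps": 6}''')
print(validate_task(sample))  # → []
```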
tasks/medium_tasks.json ADDED
@@ -0,0 +1,152 @@
+[
+  {
+    "task_id": "medium_001",
+    "difficulty": "medium",
+    "query": "SELECT * FROM events ORDER BY created_at DESC;",
+    "schema": {
+      "events": {
+        "id": "BIGINT PRIMARY KEY",
+        "event_name": "VARCHAR(255)",
+        "payload": "JSON",
+        "created_at": "TIMESTAMP INDEX",
+        "actor_id": "BIGINT",
+        "metadata": "JSON"
+      }
+    },
+    "context": "Show the most recent events on an admin dashboard.",
+    "ground_truth_issues": [
+      {
+        "id": "medium_001_select_star",
+        "category": "performance",
+        "description": "SELECT * pulls a wide payload when the dashboard only needs a few columns.",
+        "severity": 0.3,
+        "fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
+        "keywords": ["select *", "wide table", "projection", "performance"]
+      },
+      {
+        "id": "medium_001_missing_limit",
+        "category": "performance",
+        "description": "The dashboard query is missing a LIMIT and can scan far more rows than necessary.",
+        "severity": 0.3,
+        "fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
+        "keywords": ["limit", "unbounded query", "dashboard", "performance"]
+      }
+    ],
+    "max_steps": 5
+  },
+  {
+    "task_id": "medium_002",
+    "difficulty": "medium",
+    "query": "SELECT c.id, c.name, (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.id) AS order_count FROM customers c;",
+    "schema": {
+      "customers": {
+        "id": "INT PRIMARY KEY",
+        "name": "VARCHAR(255)"
+      },
+      "orders": {
+        "id": "INT PRIMARY KEY",
+        "customer_id": "INT INDEX",
+        "total": "DECIMAL(10,2)"
+      }
+    },
+    "context": "Show each customer with the number of orders they have placed.",
+    "ground_truth_issues": [
+      {
+        "id": "medium_002_correlated_subquery",
+        "category": "performance",
+        "description": "The correlated subquery re-counts orders per row and should be rewritten as a join with GROUP BY.",
+        "severity": 0.6,
+        "fix": "SELECT c.id, c.name, COUNT(o.id) AS order_count FROM customers c LEFT JOIN orders o ON o.customer_id = c.id GROUP BY c.id, c.name;",
+        "keywords": ["correlated subquery", "group by", "join", "count", "performance"]
+      }
+    ],
+    "max_steps": 6
+  },
+  {
+    "task_id": "medium_003",
+    "difficulty": "medium",
+    "query": "SELECT DISTINCT email FROM users WHERE email IS NOT NULL;",
+    "schema": {
+      "users": {
+        "id": "INT PRIMARY KEY",
+        "email": "VARCHAR(255) UNIQUE",
+        "last_login_at": "TIMESTAMP NULL"
+      }
+    },
+    "context": "Export non-null user emails for a CRM sync.",
+    "ground_truth_issues": [
+      {
+        "id": "medium_003_redundant_distinct",
+        "category": "performance",
+        "description": "DISTINCT is redundant because users.email is already unique.",
+        "severity": 0.45,
+        "fix": "SELECT email FROM users WHERE email IS NOT NULL;",
+        "keywords": ["distinct", "unique", "redundant", "email", "performance"]
+      }
+    ],
+    "max_steps": 5
+  },
+  {
+    "task_id": "medium_004",
+    "difficulty": "medium",
+    "query": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE DATE(o.created_at) = '2026-04-10';",
+    "schema": {
+      "orders": {
+        "id": "INT PRIMARY KEY",
+        "user_id": "INT INDEX",
+        "created_at": "TIMESTAMP INDEX",
+        "total": "DECIMAL(10,2)"
+      },
+      "users": {
+        "id": "INT PRIMARY KEY",
+        "name": "VARCHAR(255)"
+      }
+    },
+    "context": "List orders placed on a specific date with the user name attached.",
+    "ground_truth_issues": [
+      {
+        "id": "medium_004_function_on_indexed_column",
+        "category": "performance",
+        "description": "Wrapping created_at with DATE() prevents efficient use of the created_at index.",
+        "severity": 0.6,
+        "fix": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE o.created_at >= '2026-04-10' AND o.created_at < '2026-04-11';",
+        "keywords": ["date()", "function on column", "index", "range predicate", "performance"]
+      }
+    ],
+    "max_steps": 6
+  },
+  {
+    "task_id": "medium_005",
+    "difficulty": "medium",
+    "query": "SELECT id, name FROM products WHERE LOWER(name) LIKE '%pro%';",
+    "schema": {
+      "products": {
+        "id": "INT PRIMARY KEY",
+        "name": "VARCHAR(255) INDEX",
+        "category_id": "INT",
+        "price": "DECIMAL(10,2)"
+      }
+    },
+    "context": "Search products whose names contain the text pro.",
+    "ground_truth_issues": [
+      {
+        "id": "medium_005_lower_blocks_index",
+        "category": "performance",
+        "description": "Applying LOWER(name) on every row prevents the index on name from being used efficiently.",
+        "severity": 0.35,
+        "fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
+        "keywords": ["lower", "function on column", "index", "performance"]
+      },
+      {
+        "id": "medium_005_leading_wildcard",
+        "category": "performance",
+        "description": "The leading wildcard in LIKE '%pro%' forces a full scan instead of an index-friendly prefix lookup.",
+        "severity": 0.35,
+        "fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
+        "keywords": ["leading wildcard", "%pro%", "full scan", "prefix lookup", "performance"]
+      }
+    ],
+    "max_steps": 6
+  }
+]
+
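medium_004's fix rewrites `DATE(o.created_at) = '2026-04-10'` as a half-open range so the created_at index stays usable. A tiny standalone sketch of building those bounds for a parameterized query — the helper name is illustrative, not part of this repo:

```python
from datetime import date, timedelta


def day_range(day: date) -> tuple[str, str]:
    """Half-open [start, end) ISO-date bounds covering one calendar day."""
    return day.isoformat(), (day + timedelta(days=1)).isoformat()


# Sargable predicate: the index on created_at can seek to the range start,
# which DATE(o.created_at) = ? would prevent.
sql = "SELECT o.id, o.total FROM orders o WHERE o.created_at >= ? AND o.created_at < ?"
params = day_range(date(2026, 4, 10))
print(params)  # → ('2026-04-10', '2026-04-11')
```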
tests/test_api.py ADDED
@@ -0,0 +1,93 @@
+from fastapi.testclient import TestClient
+
+from server.app import create_app
+from server.environment import SQLReviewEnvironment
+
+
+def build_client() -> TestClient:
+    return TestClient(create_app(SQLReviewEnvironment()))
+
+
+def test_reset_returns_initial_observation() -> None:
+    client = build_client()
+
+    response = client.post("/reset", json={"task_id": "easy_001"})
+
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["observation"]["difficulty"] == "easy"
+    assert payload["reward"] == 0.0
+    assert payload["done"] is False
+
+
+def test_identify_issue_returns_positive_reward_for_match() -> None:
+    client = build_client()
+    client.post("/reset", json={"task_id": "easy_002"})
+
+    response = client.post(
+        "/step",
+        json={
+            "action_type": "identify_issue",
+            "issue_category": "syntax",
+            "issue_description": "The query is missing the FROM clause before users.",
+            "confidence": 0.95,
+        },
+    )
+
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["reward"] > 0
+    assert payload["info"]["issue_id"] == "easy_002_missing_from"
+
+
+def test_suggest_fix_without_identifying_issue_is_penalized() -> None:
+    client = build_client()
+    client.post("/reset", json={"task_id": "easy_002"})
+
+    response = client.post(
+        "/step",
+        json={
+            "action_type": "suggest_fix",
+            "suggested_fix": "SELECT id, email FROM users WHERE active = 1;",
+            "confidence": 0.8,
+        },
+    )
+
+    assert response.status_code == 200
+    assert response.json()["reward"] < 0
+
+
+def test_approve_with_missed_issues_ends_episode_with_penalty() -> None:
+    client = build_client()
+    client.post("/reset", json={"task_id": "easy_001"})
+
+    response = client.post("/step", json={"action_type": "approve", "confidence": 0.8})
+
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["done"] is True
+    assert payload["reward"] < 0
+    assert payload["info"]["final_score"] is not None
+
+
+def test_identify_then_approve_can_finish_successfully() -> None:
+    client = build_client()
+    client.post("/reset", json={"task_id": "easy_002"})
+    client.post(
+        "/step",
+        json={
+            "action_type": "identify_issue",
+            "issue_category": "syntax",
+            "issue_description": "The query is missing the FROM clause before users.",
+            "confidence": 0.95,
+        },
+    )
+
+    response = client.post("/step", json={"action_type": "approve", "confidence": 0.9})
+
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["done"] is True
+    assert payload["reward"] > 0
+    assert payload["info"]["final_score"] is not None
+
tests/test_grader.py ADDED
@@ -0,0 +1,38 @@
+from sql_query_reviewer.models import SQLReviewAction, TaskRecord
+from server.grader import grade_episode, match_issue, validate_fix
+from server.environment import SQLReviewEnvironment
+
+
+def test_match_issue_finds_expected_easy_issue() -> None:
+    environment = SQLReviewEnvironment()
+    task = environment.tasks["easy_002"]
+    action = SQLReviewAction(
+        action_type="identify_issue",
+        issue_category="syntax",
+        issue_description="The query is missing the FROM clause before users.",
+        confidence=0.95,
+    )
+
+    match, score = match_issue(action, task.ground_truth_issues, set())
+
+    assert match is not None
+    assert match.id == "easy_002_missing_from"
+    assert score >= 0.35
+
+
+def test_validate_fix_accepts_expected_remediation() -> None:
+    environment = SQLReviewEnvironment()
+    task = environment.tasks["easy_003"]
+    assert validate_fix("SELECT order_id, total FROM orders WHERE shipped_at IS NULL;", task.ground_truth_issues[0])
+
+
+def test_grade_episode_is_deterministic_and_bounded() -> None:
+    environment = SQLReviewEnvironment()
+    task = environment.tasks["medium_001"]
+
+    first = grade_episode({"medium_001_select_star"}, task.ground_truth_issues, total_steps=2, max_steps=5, false_positive_count=1)
+    second = grade_episode({"medium_001_select_star"}, task.ground_truth_issues, total_steps=2, max_steps=5, false_positive_count=1)
+
+    assert first == second
+    assert 0.0 <= first <= 1.0
+
tests/test_inference.py ADDED
@@ -0,0 +1,82 @@
+from types import SimpleNamespace
+
+import inference
+from sql_query_reviewer.models import SQLReviewObservation, SQLReviewState, StepResult
+
+
+def test_extract_json_handles_code_fence() -> None:
+    payload = inference.extract_json(
+        """```json
+{"action_type":"approve","confidence":0.8}
+```"""
+    )
+    assert payload["action_type"] == "approve"
+
+
+def test_run_episode_emits_start_step_end_logs(capsys) -> None:
+    class DummyEnv:
+        def reset(self, task_id: str) -> StepResult:
+            return StepResult(
+                observation=SQLReviewObservation(
+                    query="SELECT 1;",
+                    schema_info={},
+                    context="Health check query.",
+                    issues_found_so_far=[],
+                    remaining_actions=1,
+                    difficulty="easy",
+                    feedback="Review this query.",
+                ),
+                reward=0.0,
+                done=False,
+                info={},
+            )
+
+        def step(self, action):
+            assert action.action_type == "approve"
+            return StepResult(
+                observation=SQLReviewObservation(
+                    query="SELECT 1;",
+                    schema_info={},
+                    context="Health check query.",
+                    issues_found_so_far=[],
+                    remaining_actions=0,
+                    difficulty="easy",
+                    feedback="Query approved with full issue coverage.",
+                ),
+                reward=0.2,
+                done=True,
+                info={},
+            )
+
+        def state(self) -> SQLReviewState:
+            return SQLReviewState(
+                task_id="easy_999",
+                step_count=1,
+                total_reward=0.2,
+                done=True,
+                approved=True,
+                final_score=1.0,
+            )
+
+    class DummyCompletions:
+        def create(self, **_kwargs):
+            return SimpleNamespace(
+                choices=[
+                    SimpleNamespace(
+                        message=SimpleNamespace(content='{"action_type":"approve","confidence":0.9}')
+                    )
+                ]
+            )
+
+    class DummyClient:
+        def __init__(self) -> None:
+            self.chat = SimpleNamespace(completions=DummyCompletions())
+
+    summary = inference.run_episode(DummyEnv(), DummyClient(), "dummy-model", "easy_999")
+    captured = capsys.readouterr().out
+
+    assert "[START]" in captured
+    assert "[STEP]" in captured
+    assert "[END]" in captured
+    assert summary["final_score"] == 1.0
+
tests/test_models.py ADDED
@@ -0,0 +1,21 @@
+import pytest
+from pydantic import ValidationError
+
+from sql_query_reviewer.models import SQLReviewAction
+
+
+def test_identify_issue_requires_category_and_description() -> None:
+    with pytest.raises(ValidationError):
+        SQLReviewAction(action_type="identify_issue", confidence=0.8)
+
+
+def test_suggest_fix_requires_fix_text() -> None:
+    with pytest.raises(ValidationError):
+        SQLReviewAction(action_type="suggest_fix")
+
+
+def test_approve_action_is_valid_without_optional_fields() -> None:
+    action = SQLReviewAction(action_type="approve", confidence=0.9)
+    assert action.action_type == "approve"
+    assert action.issue_description is None
+
uv.lock ADDED
The diff for this file is too large to render. See raw diff
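The graded issue matching exercised by the tests above lives in server/grader.py. The standalone sketch below only illustrates the keyword-overlap idea using the hard_001 keywords from this commit — the function name, its exact scoring formula, and any threshold are assumptions, not the repo's implementation:

```python
def keyword_match_score(description: str, keywords: list[str]) -> float:
    """Fraction of ground-truth keywords that appear in the reviewer's description."""
    text = description.lower()
    hits = sum(1 for kw in keywords if kw.lower() in text)
    return hits / len(keywords) if keywords else 0.0


# Keywords copied from hard_001_select_star_sensitive above.
issue_keywords = ["select *", "sensitive columns", "password hash", "least privilege", "security"]
desc = "SELECT * leaks the password hash column; project only needed columns (security issue)."
print(round(keyword_match_score(desc, issue_keywords), 2))  # → 0.6
```

A grader built this way would reward descriptions that name the concrete problem rather than vague complaints, which is consistent with the positive-reward assertions in tests/test_api.py.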