Upload folder using huggingface_hub
Browse files- .pytest_cache/.gitignore +2 -0
- .pytest_cache/CACHEDIR.TAG +4 -0
- .pytest_cache/README.md +8 -0
- .pytest_cache/v/cache/nodeids +14 -0
- Dockerfile +19 -0
- README.md +29 -10
- inference.py +225 -0
- openenv.yaml +104 -0
- pyproject.toml +31 -0
- requirements.txt +7 -0
- src/__init__.py +0 -0
- src/sql_arena/__init__.py +24 -0
- src/sql_arena/database.py +156 -0
- src/sql_arena/environment.py +200 -0
- src/sql_arena/graders.py +220 -0
- src/sql_arena/models.py +104 -0
- src/sql_arena/server.py +265 -0
- src/sql_arena/tasks.py +593 -0
- tests/__init__.py +0 -0
- tests/test_env.py +125 -0
.pytest_cache/.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Created by pytest automatically.
|
| 2 |
+
*
|
.pytest_cache/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
.pytest_cache/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from the pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
.pytest_cache/v/cache/nodeids
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
"tests/test_env.py::TestAllDifficulties::test_easy",
|
| 3 |
+
"tests/test_env.py::TestAllDifficulties::test_hard",
|
| 4 |
+
"tests/test_env.py::TestAllDifficulties::test_medium",
|
| 5 |
+
"tests/test_env.py::TestEnvironmentBasics::test_episode_terminates",
|
| 6 |
+
"tests/test_env.py::TestEnvironmentBasics::test_reset_returns_observation",
|
| 7 |
+
"tests/test_env.py::TestEnvironmentBasics::test_state_tracking",
|
| 8 |
+
"tests/test_env.py::TestEnvironmentBasics::test_step_with_correct_query",
|
| 9 |
+
"tests/test_env.py::TestEnvironmentBasics::test_step_with_invalid_query",
|
| 10 |
+
"tests/test_env.py::TestGrading::test_scores_in_range",
|
| 11 |
+
"tests/test_env.py::TestGrading::test_varying_scores",
|
| 12 |
+
"tests/test_env.py::TestTaskRegistry::test_list_tasks",
|
| 13 |
+
"tests/test_env.py::TestTaskRegistry::test_minimum_3_tasks"
|
| 14 |
+
]
|
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 6 |
+
build-essential \
|
| 7 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 8 |
+
|
| 9 |
+
COPY requirements.txt .
|
| 10 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
+
|
| 12 |
+
COPY . .
|
| 13 |
+
|
| 14 |
+
EXPOSE 7860
|
| 15 |
+
|
| 16 |
+
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s \
|
| 17 |
+
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health')" || exit 1
|
| 18 |
+
|
| 19 |
+
CMD ["uvicorn", "src.sql_arena.server:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
README.md
CHANGED
|
@@ -1,10 +1,29 @@
|
|
| 1 |
-
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
--
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SQL Arena - OpenEnv Environment
|
| 2 |
+
|
| 3 |
+
An interactive SQL query challenge environment where AI agents learn to write SQL
|
| 4 |
+
by iteratively querying databases and receiving execution feedback with partial credit scoring.
|
| 5 |
+
|
| 6 |
+
## Real-World Utility
|
| 7 |
+
|
| 8 |
+
Text-to-SQL is one of the most valuable capabilities for AI agents:
|
| 9 |
+
- Used by data analysts, business users, and developers daily
|
| 10 |
+
- Evaluates reasoning, schema understanding, and query composition
|
| 11 |
+
- Directly applicable to production AI assistants and copilots
|
| 12 |
+
- SQL Arena provides interactive iterative feedback (not just static benchmarks)
|
| 13 |
+
|
| 14 |
+
## Tasks
|
| 15 |
+
|
| 16 |
+
| Task | Difficulty | Description | Max Steps |
|
| 17 |
+
|------|-----------|-------------|-----------|
|
| 18 |
+
| basic_select | Easy | SELECT, WHERE, ORDER BY on single table | 5 |
|
| 19 |
+
| join_aggregate | Medium | Multi-table JOINs, GROUP BY, HAVING | 7 |
|
| 20 |
+
| complex_analysis | Hard | CTEs, window functions, subqueries | 10 |
|
| 21 |
+
|
| 22 |
+
Each difficulty has 3+ unique problems with deterministic, reproducible grading.
|
| 23 |
+
|
| 24 |
+
## Action Space
|
| 25 |
+
|
| 26 |
+
```json
|
| 27 |
+
{
|
| 28 |
+
"sql_query": "SELECT name, salary FROM employees WHERE salary > 80000 ORDER BY salary DESC"
|
| 29 |
+
}
|
inference.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference Script - SQL Arena OpenEnv Environment
|
| 3 |
+
Baseline agent that uses an LLM to solve SQL challenges.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import textwrap
|
| 9 |
+
from typing import List, Optional
|
| 10 |
+
|
| 11 |
+
from openai import OpenAI
|
| 12 |
+
|
| 13 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 14 |
+
from src.sql_arena.environment import SQLArenaEnvironment
|
| 15 |
+
from src.sql_arena.models import SQLArenaAction
|
| 16 |
+
|
| 17 |
+
# =====================================================
|
| 18 |
+
# Configuration
|
| 19 |
+
# =====================================================
|
| 20 |
+
|
| 21 |
+
# Credentials/endpoint for an OpenAI-compatible inference API.
# HF_TOKEN takes precedence; API_KEY is the generic fallback.
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"

# Benchmark identifier reported in the [START] log line.
BENCHMARK = "sql_arena"
# Low temperature keeps SQL generation mostly deterministic.
TEMPERATURE = 0.3
MAX_TOKENS = 500

# One representative task per difficulty tier; max_steps matches the
# per-task limits declared in openenv.yaml.
TASKS = [
    {"difficulty": "basic_select", "task_id": "easy_001", "name": "basic_select", "max_steps": 5},
    {"difficulty": "join_aggregate", "task_id": "medium_001", "name": "join_aggregate", "max_steps": 7},
    {"difficulty": "complex_analysis", "task_id": "hard_001", "name": "complex_analysis", "max_steps": 10},
]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# =====================================================
|
| 37 |
+
# Logging (MANDATORY format)
|
| 38 |
+
# =====================================================
|
| 39 |
+
|
| 40 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Emit the mandatory [START] log line announcing a task run."""
    message = f"[START] task={task} env={env} model={model}"
    print(message, flush=True)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit the mandatory [STEP] log line for one environment step."""
    # Newlines would break the one-line log format; flatten and cap at 100 chars.
    compact_action = action.replace('\n', ' ').strip()[:100]
    done_flag = "true" if done else "false"
    err_field = error or "null"
    message = (
        f"[STEP] step={step} action={compact_action} "
        f"reward={reward:.2f} done={done_flag} error={err_field}"
    )
    print(message, flush=True)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the mandatory [END] summary log line for a finished task."""
    formatted_rewards = ",".join("%.2f" % r for r in rewards)
    success_flag = "true" if success else "false"
    print(
        f"[END] success={success_flag} steps={steps} score={score:.2f} rewards={formatted_rewards}",
        flush=True,
    )
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# =====================================================
|
| 63 |
+
# LLM Agent
|
| 64 |
+
# =====================================================
|
| 65 |
+
|
| 66 |
+
SYSTEM_PROMPT = textwrap.dedent("""
|
| 67 |
+
You are an expert SQL query writer. You are interacting with a SQL challenge environment.
|
| 68 |
+
|
| 69 |
+
Each turn you receive: database schema, a question, previous query results, and feedback.
|
| 70 |
+
Your goal: Write a SQL query that correctly answers the question.
|
| 71 |
+
|
| 72 |
+
Rules:
|
| 73 |
+
- Output ONLY the SQL query, nothing else
|
| 74 |
+
- No explanations, no markdown, no code fences
|
| 75 |
+
- Use standard SQLite syntax
|
| 76 |
+
- Be precise with column names and table names
|
| 77 |
+
- If your previous query had errors, fix them based on the feedback
|
| 78 |
+
""").strip()
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def build_user_prompt(observation: dict, step: int, history: List[str]) -> str:
    """Assemble the user-turn prompt shown to the LLM for one step.

    Always includes the step header, difficulty, schema, question, attempt
    count, and a closing instruction. Result/error/feedback sections are
    included only when present in the observation, and only the last three
    history entries are shown so the prompt stays bounded.
    """
    sections: List[str] = [f"=== SQL Challenge (Step {step}) ==="]
    sections.append(f"\nDifficulty: {observation.get('difficulty', 'unknown')}")
    sections.append(f"\n--- Database Schema ---\n{observation.get('schema_description', '')}")
    sections.append(f"\n--- Question ---\n{observation.get('question', '')}")

    # Optional sections, in fixed display order; skipped when falsy/absent.
    optional_fields = [
        ("expected_columns", "Expected Columns"),
        ("query_result", "Previous Query Result"),
        ("error_message", "Error"),
        ("feedback", "Feedback"),
    ]
    for key, heading in optional_fields:
        value = observation.get(key)
        if value:
            sections.append(f"\n--- {heading} ---\n{value}")

    sections.append(f"\nAttempts remaining: {observation.get('attempts_remaining', 0)}")

    if history:
        sections.append("\n--- Previous Attempts ---")
        sections.extend(history[-3:])

    sections.append("\nWrite your SQL query now:")
    return "\n".join(sections)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def get_sql_from_llm(client: OpenAI, observation: dict, step: int, history: List[str]) -> str:
    """Ask the LLM for a SQL query; fall back to "SELECT 1" on any failure.

    Also strips markdown code fences (```sql ... ``` or ``` ... ```) that a
    model may wrap around its answer despite the system-prompt instructions.
    """
    prompt = build_user_prompt(observation, step, history)
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
            stream=False,
        )
        text = (response.choices[0].message.content or "").strip()
        # Peel off leading/trailing fences, then re-strip surrounding whitespace.
        if text.startswith("```sql"):
            text = text[len("```sql"):]
        if text.startswith("```"):
            text = text[len("```"):]
        if text.endswith("```"):
            text = text[:-len("```")]
        text = text.strip()
        return text or "SELECT 1"
    except Exception as exc:
        print(f"[DEBUG] LLM request failed: {exc}", flush=True)
        return "SELECT 1"
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# =====================================================
|
| 137 |
+
# Main Inference Loop
|
| 138 |
+
# =====================================================
|
| 139 |
+
|
| 140 |
+
def run_task(client: OpenAI, env: SQLArenaEnvironment, task_config: dict) -> float:
    """
    Run one task episode: reset the env, then loop LLM query -> env feedback.

    Args:
        client: OpenAI-compatible client used to generate SQL queries.
        env: SQL Arena environment instance (reused across tasks).
        task_config: Dict with "difficulty", "task_id", "name", "max_steps".

    Returns:
        Final score, clamped to [0.0, 1.0] — the best per-step score seen
        across all attempts (0.0 if the episode raised an exception).
    """
    difficulty = task_config["difficulty"]
    task_id = task_config["task_id"]
    task_name = task_config["name"]
    max_steps = task_config["max_steps"]

    history: List[str] = []
    rewards: List[float] = []
    steps_taken = 0
    best_score = 0.0

    log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

    try:
        result = env.reset(difficulty=difficulty, task_id=task_id)
        obs_dict = result.observation.model_dump()

        for step in range(1, max_steps + 1):
            if result.done:
                break

            sql_query = get_sql_from_llm(client, obs_dict, step, history)

            action = SQLArenaAction(sql_query=sql_query)
            result = env.step(action)

            obs_dict = result.observation.model_dump()
            reward = result.reward
            done = result.done
            error = obs_dict.get("error_message")

            rewards.append(reward)
            steps_taken = step
            # The task's final score is the best attempt, not the last one.
            # NOTE(review): assumes env.step() puts a "score" key in info — confirm.
            best_score = max(best_score, result.info.get("score", 0.0))

            log_step(step=step, action=sql_query, reward=reward, done=done, error=error)

            # Short attempt trail fed back into the LLM prompt (query truncated to 80 chars).
            history.append(
                f"Step {step}: {sql_query[:80]}... -> reward={reward:.2f}"
            )

            if done:
                break

        # Clamp to the documented [0.0, 1.0] score range.
        final_score = min(max(best_score, 0.0), 1.0)
        success = final_score >= 0.5

    except Exception as e:
        # Any failure (env, action construction, model_dump) zeroes this task
        # instead of aborting the whole benchmark run.
        print(f"[DEBUG] Task {task_name} error: {e}", flush=True)
        final_score = 0.0
        success = False

    finally:
        # [END] is emitted on both paths; success/final_score are always
        # assigned by either the try or the except branch before this runs.
        log_end(success=success, steps=steps_taken, score=final_score, rewards=rewards)

    return final_score
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def main() -> None:
    """Run every configured task once and print a per-task score summary."""
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    env = SQLArenaEnvironment()

    scores: List[float] = []
    banner = "=" * 60

    for cfg in TASKS:
        print(f"\n{banner}", flush=True)
        print(f"Running task: {cfg['name']} ({cfg['difficulty']})", flush=True)
        print(f"{banner}", flush=True)

        task_score = run_task(client, env, cfg)
        scores.append(task_score)
        print(f"\nTask {cfg['name']} final score: {task_score:.2f}\n", flush=True)

    average = sum(scores) / len(scores) if scores else 0.0
    print(f"\n{banner}", flush=True)
    print("SUMMARY", flush=True)
    print(f"{banner}", flush=True)
    for cfg, task_score in zip(TASKS, scores):
        print(f"  {cfg['name']:20s}: {task_score:.2f}", flush=True)
    print(f"  {'Average':20s}: {average:.2f}", flush=True)

    env.close()
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
if __name__ == "__main__":
|
| 225 |
+
main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: sql_arena
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
Interactive SQL query challenge environment where AI agents learn to write SQL
|
| 5 |
+
by iteratively querying databases and receiving execution feedback with partial credit.
|
| 6 |
+
|
| 7 |
+
author: "Vudumula Naga Sai Rahul"
|
| 8 |
+
license: "MIT"
|
| 9 |
+
|
| 10 |
+
interface:
|
| 11 |
+
action:
|
| 12 |
+
type: object
|
| 13 |
+
model: sql_arena.models.SQLArenaAction
|
| 14 |
+
properties:
|
| 15 |
+
sql_query:
|
| 16 |
+
type: string
|
| 17 |
+
description: "SQL query to execute against the database"
|
| 18 |
+
|
| 19 |
+
observation:
|
| 20 |
+
type: object
|
| 21 |
+
model: sql_arena.models.SQLArenaObservation
|
| 22 |
+
properties:
|
| 23 |
+
schema_description:
|
| 24 |
+
type: string
|
| 25 |
+
question:
|
| 26 |
+
type: string
|
| 27 |
+
query_result:
|
| 28 |
+
type: string
|
| 29 |
+
nullable: true
|
| 30 |
+
error_message:
|
| 31 |
+
type: string
|
| 32 |
+
nullable: true
|
| 33 |
+
feedback:
|
| 34 |
+
type: string
|
| 35 |
+
nullable: true
|
| 36 |
+
expected_columns:
|
| 37 |
+
type: array
|
| 38 |
+
nullable: true
|
| 39 |
+
attempts_remaining:
|
| 40 |
+
type: integer
|
| 41 |
+
difficulty:
|
| 42 |
+
type: string
|
| 43 |
+
task_id:
|
| 44 |
+
type: string
|
| 45 |
+
|
| 46 |
+
state:
|
| 47 |
+
type: object
|
| 48 |
+
model: sql_arena.models.SQLArenaState
|
| 49 |
+
|
| 50 |
+
tasks:
|
| 51 |
+
- id: basic_select
|
| 52 |
+
name: "Basic SELECT Queries"
|
| 53 |
+
description: "Simple SELECT, WHERE, ORDER BY queries"
|
| 54 |
+
difficulty: easy
|
| 55 |
+
max_steps: 5
|
| 56 |
+
subtasks:
|
| 57 |
+
- easy_001
|
| 58 |
+
- easy_002
|
| 59 |
+
- easy_003
|
| 60 |
+
|
| 61 |
+
- id: join_aggregate
|
| 62 |
+
name: "JOIN and Aggregate Queries"
|
| 63 |
+
description: "Multi-table JOINs with GROUP BY, HAVING"
|
| 64 |
+
difficulty: medium
|
| 65 |
+
max_steps: 7
|
| 66 |
+
subtasks:
|
| 67 |
+
- medium_001
|
| 68 |
+
- medium_002
|
| 69 |
+
- medium_003
|
| 70 |
+
|
| 71 |
+
- id: complex_analysis
|
| 72 |
+
name: "Complex Analysis Queries"
|
| 73 |
+
description: "CTEs, window functions, subqueries"
|
| 74 |
+
difficulty: hard
|
| 75 |
+
max_steps: 10
|
| 76 |
+
subtasks:
|
| 77 |
+
- hard_001
|
| 78 |
+
- hard_002
|
| 79 |
+
- hard_003
|
| 80 |
+
|
| 81 |
+
grading:
|
| 82 |
+
score_range: [0.0, 1.0]
|
| 83 |
+
components:
|
| 84 |
+
- name: execution
|
| 85 |
+
weight: 0.10
|
| 86 |
+
description: "Query executes without errors"
|
| 87 |
+
- name: columns
|
| 88 |
+
weight: 0.20
|
| 89 |
+
description: "Correct column names"
|
| 90 |
+
- name: row_count
|
| 91 |
+
weight: 0.20
|
| 92 |
+
description: "Correct number of rows"
|
| 93 |
+
- name: values
|
| 94 |
+
weight: 0.50
|
| 95 |
+
description: "Correct data values"
|
| 96 |
+
|
| 97 |
+
server:
|
| 98 |
+
framework: fastapi
|
| 99 |
+
entrypoint: src.sql_arena.server:app
|
| 100 |
+
port: 7860
|
| 101 |
+
|
| 102 |
+
deployment:
|
| 103 |
+
platform: huggingface-spaces
|
| 104 |
+
docker: true
|
pyproject.toml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=68.0", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "sql-arena"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "Interactive SQL query challenge OpenEnv environment"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
license = {text = "MIT"}
|
| 11 |
+
requires-python = ">=3.10"
|
| 12 |
+
authors = [
|
| 13 |
+
{name = "Vudumula Naga Sai Rahul", email = "nagasairahulvudumula@gmail.com"}
|
| 14 |
+
]
|
| 15 |
+
dependencies = [
|
| 16 |
+
"fastapi>=0.104.0",
|
| 17 |
+
"uvicorn[standard]>=0.24.0",
|
| 18 |
+
"pydantic>=2.5.0",
|
| 19 |
+
"websockets>=12.0",
|
| 20 |
+
"openai>=1.0.0",
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
[project.optional-dependencies]
|
| 24 |
+
dev = [
|
| 25 |
+
"pytest>=7.0",
|
| 26 |
+
"httpx>=0.25.0",
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
[tool.setuptools.packages.find]
|
| 30 |
+
where = ["."]
|
| 31 |
+
include = ["src*"]
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.104.0
|
| 2 |
+
uvicorn[standard]>=0.24.0
|
| 3 |
+
pydantic>=2.5.0
|
| 4 |
+
websockets>=12.0
|
| 5 |
+
openai>=1.0.0
|
| 6 |
+
pytest>=7.0
|
| 7 |
+
httpx>=0.25.0
|
src/__init__.py
ADDED
|
File without changes
|
src/sql_arena/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SQL Arena - Interactive SQL Query Challenge Environment for OpenEnv.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from .models import SQLArenaAction, SQLArenaObservation, SQLArenaState
|
| 6 |
+
from .environment import SQLArenaEnvironment, StepResult
|
| 7 |
+
from .tasks import get_task, list_tasks, SQLTask, ALL_TASKS, TASK_BY_ID
|
| 8 |
+
from .graders import grade_result
|
| 9 |
+
|
| 10 |
+
__all__ = [
|
| 11 |
+
"SQLArenaAction",
|
| 12 |
+
"SQLArenaObservation",
|
| 13 |
+
"SQLArenaState",
|
| 14 |
+
"SQLArenaEnvironment",
|
| 15 |
+
"StepResult",
|
| 16 |
+
"get_task",
|
| 17 |
+
"list_tasks",
|
| 18 |
+
"SQLTask",
|
| 19 |
+
"ALL_TASKS",
|
| 20 |
+
"TASK_BY_ID",
|
| 21 |
+
"grade_result",
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
__version__ = "1.0.0"
|
src/sql_arena/database.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SQLite Database Manager for SQL Arena.
|
| 3 |
+
|
| 4 |
+
Creates in-memory SQLite databases for each task.
|
| 5 |
+
Executes agent queries safely and formats results.
|
| 6 |
+
|
| 7 |
+
Key design decisions:
|
| 8 |
+
- In-memory databases (fast, no disk I/O, no cleanup needed)
|
| 9 |
+
- Each reset() creates a fresh database
|
| 10 |
+
- Query execution is sandboxed (read-only would be ideal but SQLite
|
| 11 |
+
in-memory is ephemeral anyway)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import sqlite3
|
| 15 |
+
from typing import Tuple, Optional, Any, Dict
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DatabaseManager:
    """
    Manages SQLite in-memory databases for SQL challenges.

    Each task gets its own fresh database with schema and sample data;
    the agent's queries run against that database.
    """

    def __init__(self):
        # Active connection, or None before create_database() / after close().
        self.conn: Optional[sqlite3.Connection] = None

    def create_database(self, setup_sql: str) -> None:
        """
        Create a new in-memory database with the given schema and data.

        Args:
            setup_sql: SQL string containing CREATE TABLE and INSERT statements
        """
        # Drop any previous task's database before building the new one.
        self.close()

        self.conn = sqlite3.connect(":memory:")
        # Foreign-key enforcement is off by default in SQLite; turn it on.
        self.conn.execute("PRAGMA foreign_keys = ON")
        # executescript handles the multi-statement setup string.
        self.conn.executescript(setup_sql)
        self.conn.commit()

    def execute_query(self, sql: str) -> Tuple[bool, Optional[Dict], Optional[str]]:
        """
        Execute a SQL query and return results.

        Catches all exceptions so a bad agent query never crashes the env.

        Args:
            sql: The SQL query string to execute

        Returns:
            Tuple of (success, result_dict, error_message):
            - success: True if the query executed without error
            - result_dict: {"columns": [...], "rows": [...]} on success
            - error_message: error string on failure, None on success
        """
        if self.conn is None:
            return False, None, "No database connection. Call create_database() first."

        try:
            cursor = self.conn.execute(sql)
            # cursor.description is None for statements that return no rows.
            description = cursor.description
            column_names = [entry[0] for entry in description] if description else []
            return True, {"columns": column_names, "rows": cursor.fetchall()}, None
        except sqlite3.Error as exc:
            return False, None, f"SQL Error: {str(exc)}"
        except Exception as exc:
            return False, None, f"Execution Error: {str(exc)}"

    def format_result(self, result: Dict, max_rows: int = 20) -> str:
        """
        Format a query result as a human-readable table string.

        This is what the agent sees in its observation, so it can inspect
        what its query returned.

        Args:
            result: Dict with "columns" and "rows" keys
            max_rows: Maximum number of rows to display

        Returns:
            Formatted table string
        """
        if not result or not result.get("columns"):
            return "(empty result set)"

        columns = result["columns"]
        rows = result["rows"]

        if not rows:
            return f"Columns: {', '.join(columns)}\n(0 rows returned)"

        visible = rows[:max_rows]

        # Column widths: at least the header width, widened by displayed data.
        widths = [len(str(name)) for name in columns]
        for row in visible:
            for idx, cell in enumerate(row):
                if idx < len(widths):
                    widths[idx] = max(widths[idx], len(str(cell)))

        header = " | ".join(str(name).ljust(width) for name, width in zip(columns, widths))
        divider = "-+-".join("-" * width for width in widths)
        body = "\n".join(
            " | ".join(str(cell).ljust(width) for cell, width in zip(row, widths))
            for row in visible
        )

        table = f"{header}\n{divider}\n{body}"

        # Note truncation when the result exceeds max_rows.
        hidden = len(rows) - max_rows
        if hidden > 0:
            table += f"\n... ({hidden} more rows not shown)"

        table += f"\n\n({len(rows)} row{'s' if len(rows) != 1 else ''} returned)"

        return table

    def close(self) -> None:
        """Close the database connection and free resources."""
        if self.conn:
            try:
                self.conn.close()
            except Exception:
                pass  # closing is best-effort
            self.conn = None
|
src/sql_arena/environment.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Core SQL Arena Environment.
|
| 3 |
+
Implements the OpenEnv step()/reset()/state() interface.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Optional, Dict, Any, List
|
| 7 |
+
from .models import SQLArenaAction, SQLArenaObservation, SQLArenaState
|
| 8 |
+
from .database import DatabaseManager
|
| 9 |
+
from .tasks import SQLTask, get_task, list_tasks, TASK_BY_ID
|
| 10 |
+
from .graders import grade_result, generate_hint
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class StepResult:
    """Container for one environment transition: observation, reward, done flag, info."""

    def __init__(
        self,
        observation: SQLArenaObservation,
        reward: float,
        done: bool,
        info: Optional[Dict[str, Any]] = None,
    ):
        self.observation = observation
        self.reward = reward
        self.done = done
        # Normalize a missing/empty info mapping to a fresh empty dict.
        self.info = info if info else {}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class SQLArenaEnvironment:
|
| 30 |
+
"""
|
| 31 |
+
SQL Arena: An interactive SQL query challenge environment.
|
| 32 |
+
|
| 33 |
+
The agent receives a database schema and a natural language question,
|
| 34 |
+
then iteratively writes SQL queries. The environment provides
|
| 35 |
+
execution results, feedback, and partial credit scoring.
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
    def __init__(self):
        # Database manager owning the per-task in-memory SQLite connection.
        self.db = DatabaseManager()
        # Task currently being attempted; None until reset() is called.
        self.current_task: Optional[SQLTask] = None
        # Mutable episode bookkeeping (steps, scores); None until reset().
        self._state: Optional[SQLArenaState] = None
        # Most recent observation handed to the agent; None until reset().
        self._last_observation: Optional[SQLArenaObservation] = None
|
| 43 |
+
|
| 44 |
+
def reset(
|
| 45 |
+
self,
|
| 46 |
+
difficulty: str = "basic_select",
|
| 47 |
+
task_id: Optional[str] = None,
|
| 48 |
+
) -> StepResult:
|
| 49 |
+
"""
|
| 50 |
+
Reset the environment with a new task.
|
| 51 |
+
|
| 52 |
+
Args:
|
| 53 |
+
difficulty: 'basic_select', 'join_aggregate', or 'complex_analysis'
|
| 54 |
+
task_id: Optional specific task ID
|
| 55 |
+
|
| 56 |
+
Returns:
|
| 57 |
+
StepResult with initial observation
|
| 58 |
+
"""
|
| 59 |
+
# Get the task
|
| 60 |
+
self.current_task = get_task(difficulty, task_id)
|
| 61 |
+
task = self.current_task
|
| 62 |
+
|
| 63 |
+
# Setup database
|
| 64 |
+
self.db.create_database(task.setup_sql)
|
| 65 |
+
|
| 66 |
+
# Initialize state
|
| 67 |
+
self._state = SQLArenaState(
|
| 68 |
+
task_id=task.task_id,
|
| 69 |
+
difficulty=task.difficulty,
|
| 70 |
+
current_step=0,
|
| 71 |
+
max_steps=task.max_steps,
|
| 72 |
+
best_score=0.0,
|
| 73 |
+
total_reward=0.0,
|
| 74 |
+
rewards_history=[],
|
| 75 |
+
done=False,
|
| 76 |
+
last_action_error=None,
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
# Create initial observation
|
| 80 |
+
self._last_observation = SQLArenaObservation(
|
| 81 |
+
schema_description=task.schema_description,
|
| 82 |
+
question=task.question,
|
| 83 |
+
query_result=None,
|
| 84 |
+
error_message=None,
|
| 85 |
+
feedback="Welcome to SQL Arena! Write a SQL query to answer the question above.",
|
| 86 |
+
expected_columns=task.expected_columns,
|
| 87 |
+
attempts_remaining=task.max_steps,
|
| 88 |
+
difficulty=task.difficulty,
|
| 89 |
+
task_id=task.task_id,
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
return StepResult(
|
| 93 |
+
observation=self._last_observation,
|
| 94 |
+
reward=0.0,
|
| 95 |
+
done=False,
|
| 96 |
+
info={"task_title": task.title},
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
def step(self, action: SQLArenaAction) -> StepResult:
|
| 100 |
+
"""
|
| 101 |
+
Execute the agent's SQL query and return feedback.
|
| 102 |
+
|
| 103 |
+
Args:
|
| 104 |
+
action: SQLArenaAction containing the SQL query
|
| 105 |
+
|
| 106 |
+
Returns:
|
| 107 |
+
StepResult with observation, reward, and done flag
|
| 108 |
+
"""
|
| 109 |
+
if self._state is None or self.current_task is None:
|
| 110 |
+
raise RuntimeError("Environment not initialized. Call reset() first.")
|
| 111 |
+
|
| 112 |
+
if self._state.done:
|
| 113 |
+
raise RuntimeError("Episode is done. Call reset() to start a new episode.")
|
| 114 |
+
|
| 115 |
+
task = self.current_task
|
| 116 |
+
state = self._state
|
| 117 |
+
|
| 118 |
+
# Increment step counter
|
| 119 |
+
state.current_step += 1
|
| 120 |
+
|
| 121 |
+
# Execute the query
|
| 122 |
+
success, result, error = self.db.execute_query(action.sql_query)
|
| 123 |
+
|
| 124 |
+
# Grade the result
|
| 125 |
+
score, feedback = grade_result(task, success, result, error)
|
| 126 |
+
|
| 127 |
+
# Track best score
|
| 128 |
+
state.best_score = max(state.best_score, score)
|
| 129 |
+
|
| 130 |
+
# Calculate step reward
|
| 131 |
+
if len(state.rewards_history) == 0:
|
| 132 |
+
reward = score
|
| 133 |
+
else:
|
| 134 |
+
prev_best = max(state.rewards_history) if state.rewards_history else 0.0
|
| 135 |
+
improvement = max(0, score - prev_best)
|
| 136 |
+
reward = score * 0.5 + improvement * 0.5
|
| 137 |
+
|
| 138 |
+
reward = round(min(max(reward, 0.0), 1.0), 4)
|
| 139 |
+
state.rewards_history.append(reward)
|
| 140 |
+
state.total_reward += reward
|
| 141 |
+
|
| 142 |
+
# Add progressive hints
|
| 143 |
+
hint = generate_hint(task, state.current_step, score)
|
| 144 |
+
if hint and score < 1.0:
|
| 145 |
+
feedback += f"\n\n{hint}"
|
| 146 |
+
|
| 147 |
+
# Check if done
|
| 148 |
+
attempts_remaining = task.max_steps - state.current_step
|
| 149 |
+
is_perfect = score >= 1.0
|
| 150 |
+
is_out_of_steps = attempts_remaining <= 0
|
| 151 |
+
|
| 152 |
+
state.done = is_perfect or is_out_of_steps
|
| 153 |
+
state.last_action_error = error
|
| 154 |
+
|
| 155 |
+
# Format query result for observation
|
| 156 |
+
query_result_str = None
|
| 157 |
+
if success and result:
|
| 158 |
+
query_result_str = self.db.format_result(result)
|
| 159 |
+
|
| 160 |
+
# Build observation
|
| 161 |
+
self._last_observation = SQLArenaObservation(
|
| 162 |
+
schema_description=task.schema_description,
|
| 163 |
+
question=task.question,
|
| 164 |
+
query_result=query_result_str,
|
| 165 |
+
error_message=error,
|
| 166 |
+
feedback=feedback,
|
| 167 |
+
expected_columns=task.expected_columns,
|
| 168 |
+
attempts_remaining=attempts_remaining,
|
| 169 |
+
difficulty=task.difficulty,
|
| 170 |
+
task_id=task.task_id,
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
return StepResult(
|
| 174 |
+
observation=self._last_observation,
|
| 175 |
+
reward=reward,
|
| 176 |
+
done=state.done,
|
| 177 |
+
info={
|
| 178 |
+
"score": score,
|
| 179 |
+
"best_score": state.best_score,
|
| 180 |
+
"step": state.current_step,
|
| 181 |
+
"is_perfect": is_perfect,
|
| 182 |
+
},
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
+
def state(self) -> SQLArenaState:
|
| 186 |
+
"""Return the current environment state."""
|
| 187 |
+
if self._state is None:
|
| 188 |
+
raise RuntimeError("Environment not initialized. Call reset() first.")
|
| 189 |
+
return self._state
|
| 190 |
+
|
| 191 |
+
def close(self) -> None:
|
| 192 |
+
"""Clean up resources."""
|
| 193 |
+
self.db.close()
|
| 194 |
+
self.current_task = None
|
| 195 |
+
self._state = None
|
| 196 |
+
self._last_observation = None
|
| 197 |
+
|
| 198 |
+
def get_available_tasks(self) -> Dict:
|
| 199 |
+
"""Return all available tasks."""
|
| 200 |
+
return list_tasks()
|
src/sql_arena/graders.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Grading logic for SQL Arena.
|
| 3 |
+
Provides partial credit scoring (0.0 to 1.0) based on:
|
| 4 |
+
- Query execution success (0.10)
|
| 5 |
+
- Column correctness (0.20)
|
| 6 |
+
- Row count correctness (0.20)
|
| 7 |
+
- Value correctness (0.50)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from typing import List, Tuple, Optional, Dict, Any
|
| 11 |
+
from .tasks import SQLTask
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def normalize_value(val: Any) -> Any:
    """Canonicalize one cell value so two query results compare fairly.

    Floats are rounded to 2 decimal places, strings are trimmed and
    lower-cased; None and every other type pass through unchanged.
    """
    if isinstance(val, float):
        return round(val, 2)
    if isinstance(val, str):
        return val.strip().lower()
    # None, ints, and any other type compare as-is.
    return val
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def normalize_row(row: tuple) -> tuple:
    """Return *row* with every cell passed through normalize_value."""
    return tuple(map(normalize_value, row))
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def grade_result(
    task: SQLTask,
    success: bool,
    result: Optional[Dict],
    error: Optional[str],
) -> Tuple[float, str]:
    """
    Grade a SQL query result against expected output.

    Args:
        task: Task definition supplying expected_columns, expected_row_count,
            and expected_rows.
        success: Whether the query executed without error.
        result: Execution result; on the success path this is read as a
            mapping with "columns" (list[str]) and "rows" (list[tuple]).
        error: Error message when success is False.

    Returns:
        (score, feedback) where score is in [0.0, 1.0]

    Scoring breakdown:
        - 0.10: Query executes without error
        - 0.20: Correct column names
        - 0.20: Correct number of rows
        - 0.50: Correct values (proportional to matching rows)
    """
    feedback_parts = []
    score = 0.0

    # ---- Component 1: Execution Success (0.10) ----
    # A failed query short-circuits: no partial credit for anything else.
    if not success:
        feedback_parts.append(f"X Query failed: {error}")
        feedback_parts.append("Hint: Fix the syntax error and try again.")
        return 0.0, "\n".join(feedback_parts)

    score += 0.10
    feedback_parts.append("OK: Query executed successfully (+0.10)")

    # ---- Component 2: Column Correctness (0.20) ----
    # Case/whitespace-insensitive, but order-sensitive for full credit.
    actual_columns = [c.lower().strip() for c in result.get("columns", [])]
    expected_columns = [c.lower().strip() for c in task.expected_columns]

    if actual_columns == expected_columns:
        score += 0.20
        feedback_parts.append(f"OK: Correct columns: {actual_columns} (+0.20)")
    else:
        # Partial credit for overlapping columns (set overlap ignores
        # order and duplicates).
        matching_cols = set(actual_columns) & set(expected_columns)
        if matching_cols:
            partial = 0.20 * (len(matching_cols) / len(expected_columns))
            score += partial
            feedback_parts.append(
                f"PARTIAL: Column match: got {actual_columns}, "
                f"expected {expected_columns} (+{partial:.2f})"
            )
            missing = set(expected_columns) - set(actual_columns)
            if missing:
                feedback_parts.append(f"Hint: Missing columns: {missing}")
        else:
            feedback_parts.append(
                f"WRONG: Columns: got {actual_columns}, expected {expected_columns}"
            )

    # ---- Component 3: Row Count (0.20) ----
    actual_rows = result.get("rows", [])
    expected_row_count = task.expected_row_count

    if len(actual_rows) == expected_row_count:
        score += 0.20
        feedback_parts.append(f"OK: Correct row count: {len(actual_rows)} (+0.20)")
    else:
        # Partial credit: closer counts get more credit (1 - relative error).
        if expected_row_count > 0:
            ratio = 1.0 - abs(len(actual_rows) - expected_row_count) / max(
                expected_row_count, len(actual_rows)
            )
            partial = max(0.0, 0.20 * ratio)
            score += partial
            feedback_parts.append(
                f"PARTIAL: Row count: got {len(actual_rows)}, "
                f"expected {expected_row_count} (+{partial:.2f})"
            )
        else:
            # expected_row_count == 0: all-or-nothing.
            if len(actual_rows) == 0:
                score += 0.20
                feedback_parts.append("OK: Correct empty result set (+0.20)")
            else:
                feedback_parts.append(
                    f"WRONG: Expected empty result, got {len(actual_rows)} rows"
                )

    # ---- Component 4: Value Correctness (0.50) ----
    if task.expected_rows:
        normalized_expected = [normalize_row(r) for r in task.expected_rows]
        normalized_actual = [normalize_row(r) for r in actual_rows]

        # Try exact order match first. zip truncates to the shorter list,
        # so the length equality check below is what enforces same size.
        exact_matches = 0
        for exp_row, act_row in zip(normalized_expected, normalized_actual):
            if exp_row == act_row:
                exact_matches += 1

        if (
            exact_matches == len(normalized_expected)
            and len(normalized_actual) == len(normalized_expected)
        ):
            score += 0.50
            feedback_parts.append("OK: All values correct with correct ordering (+0.50)")
        else:
            # Try unordered match: greedy multiset matching so duplicate
            # rows are only consumed once.
            matched_rows = 0
            remaining_actual = list(normalized_actual)

            for exp_row in normalized_expected:
                for i, act_row in enumerate(remaining_actual):
                    if exp_row == act_row:
                        matched_rows += 1
                        remaining_actual.pop(i)
                        break

            if (
                matched_rows == len(normalized_expected)
                and len(normalized_actual) == len(normalized_expected)
            ):
                # All rows match but wrong order: 0.40 instead of 0.50.
                partial = 0.40
                score += partial
                feedback_parts.append(
                    f"PARTIAL: All values correct but wrong ordering (+{partial:.2f})"
                )
                feedback_parts.append("Hint: Check your ORDER BY clause")
            elif matched_rows > 0:
                # Some rows match: credit proportional to matched fraction.
                partial = 0.50 * (matched_rows / len(normalized_expected))
                score += partial
                feedback_parts.append(
                    f"PARTIAL: {matched_rows}/{len(normalized_expected)} rows match (+{partial:.2f})"
                )
                if matched_rows < len(normalized_expected):
                    feedback_parts.append(
                        "Hint: Some values are incorrect. Check WHERE/JOIN conditions."
                    )
            else:
                feedback_parts.append("WRONG: No matching rows found")
                feedback_parts.append(
                    "Hint: Review your query logic - values don't match expected output."
                )

                # Tiny credit if some expected values appear anywhere in
                # the output (encourages partially-right projections).
                all_expected_vals = set()
                for row in normalized_expected:
                    all_expected_vals.update(row)
                all_actual_vals = set()
                for row in normalized_actual:
                    all_actual_vals.update(row)

                overlap = all_expected_vals & all_actual_vals
                if overlap:
                    tiny_credit = 0.05
                    score += tiny_credit
                    feedback_parts.append(
                        f" (Some expected values found in output: +{tiny_credit:.2f})"
                    )
    else:
        # Expected empty result: the full value component is all-or-nothing.
        if len(actual_rows) == 0:
            score += 0.50
            feedback_parts.append("OK: Correctly returned empty result (+0.50)")
        else:
            feedback_parts.append(
                f"WRONG: Expected empty result, got {len(actual_rows)} rows"
            )

    # ---- Final score ----
    # Clamp to [0, 1] and round for stable serialization.
    score = round(min(max(score, 0.0), 1.0), 4)
    feedback_parts.append(f"\nTotal Score: {score:.2f}/1.00")

    return score, "\n".join(feedback_parts)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def generate_hint(task: SQLTask, step: int, current_score: float) -> Optional[str]:
    """Pick a progressive hint for the given attempt number.

    Agents scoring >= 0.8 get no hint. Early attempts consume the
    task-specific hint list in order; later attempts fall back to a
    short list of generic hints derived from the expected output,
    clamping at the last one.
    """
    # A nearly-correct agent needs no help.
    if current_score >= 0.8:
        return None

    task_hints = task.hints
    if step <= len(task_hints):
        return f"Hint {step}: {task_hints[step - 1]}"

    fallback = [
        f"Expected columns are: {task.expected_columns}",
        f"Expected {task.expected_row_count} rows in the result",
        "Check the schema description carefully for table and column names",
    ]

    # How far past the task-specific hints we are; negative means no hint.
    idx = step - len(task_hints) - 1
    if idx < 0:
        return None
    return fallback[min(idx, len(fallback) - 1)]
|
src/sql_arena/models.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Typed Pydantic models for SQL Arena OpenEnv environment.
|
| 3 |
+
|
| 4 |
+
These models define the contract between the agent and environment:
|
| 5 |
+
- SQLArenaAction: What the agent sends (a SQL query)
|
| 6 |
+
- SQLArenaObservation: What the agent receives (schema, results, feedback)
|
| 7 |
+
- SQLArenaState: Internal environment state tracking
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from pydantic import BaseModel, Field
|
| 11 |
+
from typing import Optional, List
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SQLArenaAction(BaseModel):
    """
    Action model — the agent submits a SQL query.

    This is what the agent sends to the environment each step.
    The environment will execute this query against the SQLite database
    and return results + feedback.
    """
    # Required field; the description/examples surface in the OpenAPI schema.
    sql_query: str = Field(
        ...,
        description="SQL query to execute against the database",
        examples=[
            "SELECT name, salary FROM employees WHERE salary > 50000",
            "SELECT department, COUNT(*) FROM employees GROUP BY department",
        ]
    )
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class SQLArenaObservation(BaseModel):
    """
    Observation model — what the agent sees after each step.

    Contains the database schema, the question to answer,
    results from the last query, error messages, and feedback
    with partial credit information.
    """
    # Always present (set by both reset() and step())
    schema_description: str = Field(
        ...,
        description="Human-readable database schema (CREATE TABLE statements)"
    )
    question: str = Field(
        ...,
        description="Natural language question the agent must answer with SQL"
    )
    difficulty: str = Field(
        ...,
        description="Task difficulty level: basic_select, join_aggregate, or complex_analysis"
    )
    task_id: str = Field(
        ...,
        description="Unique identifier for this specific problem"
    )
    attempts_remaining: int = Field(
        ...,
        description="Number of query attempts the agent has left"
    )

    # Present after step() calls; None in the initial reset() observation
    query_result: Optional[str] = Field(
        None,
        description="Formatted result table from the last executed query"
    )
    error_message: Optional[str] = Field(
        None,
        description="SQL error message if the query failed to execute"
    )
    feedback: Optional[str] = Field(
        None,
        description="Detailed feedback on query correctness with partial credit breakdown"
    )

    # Hints to help the agent converge on the expected projection
    expected_columns: Optional[List[str]] = Field(
        None,
        description="Expected column names in the correct result (hint)"
    )
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class SQLArenaState(BaseModel):
    """
    Internal state model — tracks the episode progress.

    This is returned by the state() endpoint and contains
    all information about the current episode.
    """
    task_id: str = Field(..., description="Current task identifier")
    difficulty: str = Field(..., description="Current difficulty level")
    current_step: int = Field(0, description="Number of steps taken so far")
    max_steps: int = Field(5, description="Maximum steps allowed for this task")
    best_score: float = Field(0.0, description="Best score achieved so far in this episode")
    total_reward: float = Field(0.0, description="Sum of all rewards received")
    # default_factory avoids sharing one list across model instances.
    rewards_history: List[float] = Field(
        default_factory=list,
        description="List of rewards received at each step"
    )
    done: bool = Field(False, description="Whether the episode has ended")
    last_action_error: Optional[str] = Field(
        None,
        description="Error from the last action, if any"
    )
|
src/sql_arena/server.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI server for SQL Arena - OpenEnv compatible.
|
| 3 |
+
Exposes /reset, /step, /state endpoints via HTTP and WebSocket.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import uuid
|
| 8 |
+
import asyncio
|
| 9 |
+
from typing import Dict, Optional
|
| 10 |
+
from contextlib import asynccontextmanager
|
| 11 |
+
|
| 12 |
+
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, HTTPException
|
| 13 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 14 |
+
from pydantic import BaseModel
|
| 15 |
+
|
| 16 |
+
from .environment import SQLArenaEnvironment, StepResult
|
| 17 |
+
from .models import SQLArenaAction, SQLArenaObservation, SQLArenaState
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# =====================================================
|
| 21 |
+
# Request / Response Models
|
| 22 |
+
# =====================================================
|
| 23 |
+
|
| 24 |
+
class ResetRequest(BaseModel):
    # Difficulty tier to pick a task from; defaults to the easiest tier.
    difficulty: str = "basic_select"
    # When provided, selects a specific task within the tier.
    task_id: Optional[str] = None


class StepRequest(BaseModel):
    # Raw SQL text to execute against the current episode's database.
    sql_query: str


class ResetResponse(BaseModel):
    observation: SQLArenaObservation
    reward: float
    done: bool
    # NOTE: pydantic copies mutable field defaults per instance, so this
    # `{}` default is not shared across responses.
    info: dict = {}


class StepResponse(BaseModel):
    observation: SQLArenaObservation
    reward: float
    done: bool
    info: dict = {}


class StateResponse(BaseModel):
    state: SQLArenaState


class TaskListResponse(BaseModel):
    # Mapping of available tasks as returned by tasks.list_tasks().
    tasks: Dict
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# =====================================================
|
| 56 |
+
# Session Manager
|
| 57 |
+
# =====================================================
|
| 58 |
+
|
| 59 |
+
class SessionManager:
    """Manages multiple concurrent environment instances.

    Each WebSocket client gets its own SQLArenaEnvironment keyed by a
    UUID session id. All mutations of the session map happen under an
    asyncio.Lock; when the capacity cap is hit, the oldest session
    (dict insertion order) is evicted.
    """

    def __init__(self, max_sessions: int = 100):
        self.sessions: Dict[str, SQLArenaEnvironment] = {}
        self.max_sessions = max_sessions
        # Serializes create/remove/cleanup within this event loop.
        self._lock = asyncio.Lock()

    async def create_session(self):
        """Create a fresh environment; returns (session_id, env)."""
        async with self._lock:
            if len(self.sessions) >= self.max_sessions:
                # Evict the oldest session (first-inserted dict key).
                # NOTE(review): the evicted env may still be in use by a
                # connected client — confirm this is acceptable.
                oldest_key = next(iter(self.sessions))
                self.sessions[oldest_key].close()
                del self.sessions[oldest_key]
            session_id = str(uuid.uuid4())
            env = SQLArenaEnvironment()
            self.sessions[session_id] = env
            return session_id, env

    async def get_session(self, session_id: str):
        """Return the environment for *session_id*, or None if unknown."""
        return self.sessions.get(session_id)

    async def remove_session(self, session_id: str):
        """Close and forget one session; a no-op for unknown ids."""
        async with self._lock:
            if session_id in self.sessions:
                self.sessions[session_id].close()
                del self.sessions[session_id]

    async def cleanup_all(self):
        """Close every environment and empty the session map (shutdown)."""
        async with self._lock:
            for env in self.sessions.values():
                env.close()
            self.sessions.clear()
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# =====================================================
|
| 95 |
+
# App Setup
|
| 96 |
+
# =====================================================
|
| 97 |
+
|
| 98 |
+
# Shared singletons: WebSocket clients get isolated per-connection
# sessions via session_manager; plain HTTP endpoints all share the one
# _default_env instance.
session_manager = SessionManager()
_default_env = SQLArenaEnvironment()


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Nothing to do at startup; release all environments at shutdown.
    yield
    await session_manager.cleanup_all()
    _default_env.close()


app = FastAPI(
    title="SQL Arena - OpenEnv Environment",
    description="Interactive SQL query challenge environment for AI agents",
    version="1.0.0",
    lifespan=lifespan,
)

# Fully permissive CORS so browser-based clients can reach the service.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# =====================================================
|
| 126 |
+
# HTTP Endpoints
|
| 127 |
+
# =====================================================
|
| 128 |
+
|
| 129 |
+
@app.get("/")
async def root():
    """Service metadata plus a directory of the available endpoints."""
    endpoints = ["/reset", "/step", "/state", "/tasks", "/ws"]
    return {
        "name": "SQL Arena",
        "version": "1.0.0",
        "description": "Interactive SQL query challenge environment",
        "endpoints": endpoints,
    }
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
@app.get("/health")
async def health():
    """Liveness probe; returns a static payload whenever the process is up."""
    return {"status": "healthy"}
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@app.post("/reset", response_model=ResetResponse)
async def reset(request: ResetRequest = ResetRequest()):
    """Start a new episode on the shared default environment.

    NOTE(review): all plain-HTTP endpoints share one global environment
    (_default_env), so concurrent HTTP clients overwrite each other's
    episodes; per-client isolation is only provided by /ws.
    """
    try:
        result = _default_env.reset(
            difficulty=request.difficulty,
            task_id=request.task_id,
        )
        return ResetResponse(
            observation=result.observation,
            reward=result.reward,
            done=result.done,
            info=result.info,
        )
    except Exception as e:
        # Surface environment errors (e.g. unknown difficulty) as HTTP 400.
        raise HTTPException(status_code=400, detail=str(e))
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
@app.post("/step", response_model=StepResponse)
async def step(request: StepRequest):
    """Execute one SQL attempt against the shared default environment."""
    try:
        action = SQLArenaAction(sql_query=request.sql_query)
        result = _default_env.step(action)
        return StepResponse(
            observation=result.observation,
            reward=result.reward,
            done=result.done,
            info=result.info,
        )
    except Exception as e:
        # RuntimeError ("not initialized" / "episode done") maps to HTTP 400.
        raise HTTPException(status_code=400, detail=str(e))
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
@app.get("/state", response_model=StateResponse)
async def state():
    """Return the shared default environment's current episode state."""
    try:
        return StateResponse(state=_default_env.state())
    except Exception as e:
        # state() raises RuntimeError before the first reset(); map to 400.
        raise HTTPException(status_code=400, detail=str(e))
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
@app.get("/tasks", response_model=TaskListResponse)
async def tasks():
    """List every available task, grouped as provided by the task registry."""
    return TaskListResponse(tasks=_default_env.get_available_tasks())
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# =====================================================
|
| 190 |
+
# WebSocket Endpoint
|
| 191 |
+
# =====================================================
|
| 192 |
+
|
| 193 |
+
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """JSON-RPC-style WebSocket loop with a dedicated per-connection session.

    Each text frame is a JSON object {"method", "params", "id"}; supported
    methods are reset / step / state / close. Replies echo the request id
    under "id" with either a "result" or an "error" key. The session's
    environment is always released when the loop exits.
    """
    await websocket.accept()
    # One isolated environment per connection.
    session_id, env = await session_manager.create_session()

    try:
        while True:
            data = await websocket.receive_text()
            message = json.loads(data)

            method = message.get("method", "")
            params = message.get("params", {})
            msg_id = message.get("id", None)

            try:
                if method == "reset":
                    result = env.reset(
                        difficulty=params.get("difficulty", "basic_select"),
                        task_id=params.get("task_id"),
                    )
                    response = {
                        "id": msg_id,
                        "result": {
                            "observation": result.observation.model_dump(),
                            "reward": result.reward,
                            "done": result.done,
                            "info": result.info,
                        },
                    }
                elif method == "step":
                    action = SQLArenaAction(sql_query=params.get("sql_query", ""))
                    result = env.step(action)
                    response = {
                        "id": msg_id,
                        "result": {
                            "observation": result.observation.model_dump(),
                            "reward": result.reward,
                            "done": result.done,
                            "info": result.info,
                        },
                    }
                elif method == "state":
                    env_state = env.state()
                    response = {
                        "id": msg_id,
                        "result": {"state": env_state.model_dump()},
                    }
                elif method == "close":
                    # Send the ack and break out; the break skips the
                    # common send below, so the reply is sent exactly once.
                    response = {"id": msg_id, "result": {"status": "closed"}}
                    await websocket.send_text(json.dumps(response))
                    break
                else:
                    response = {"id": msg_id, "error": f"Unknown method: {method}"}

                await websocket.send_text(json.dumps(response))

            except Exception as e:
                # Per-message errors keep the connection alive.
                error_response = {"id": msg_id, "error": str(e)}
                await websocket.send_text(json.dumps(error_response))

    except WebSocketDisconnect:
        # Client went away; fall through to cleanup.
        pass
    finally:
        await session_manager.remove_session(session_id)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# =====================================================
|
| 260 |
+
# Entry point
|
| 261 |
+
# =====================================================
|
| 262 |
+
|
| 263 |
+
if __name__ == "__main__":
    # Local/dev entry point; port 7860 is the Hugging Face Spaces default.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
src/sql_arena/tasks.py
ADDED
|
@@ -0,0 +1,593 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task Bank for SQL Arena.
|
| 3 |
+
|
| 4 |
+
Contains 9 SQL challenges across 3 difficulty levels:
|
| 5 |
+
- basic_select (Easy): 3 tasks — simple SELECT/WHERE/ORDER BY
|
| 6 |
+
- join_aggregate (Medium): 3 tasks — JOINs, GROUP BY, HAVING
|
| 7 |
+
- complex_analysis (Hard): 3 tasks — CTEs, window functions, subqueries
|
| 8 |
+
|
| 9 |
+
Each task defines:
|
| 10 |
+
- Database schema and sample data (setup_sql)
|
| 11 |
+
- Natural language question
|
| 12 |
+
- Expected SQL solution
|
| 13 |
+
- Expected result for grading
|
| 14 |
+
- Progressive hints
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from dataclasses import dataclass, field
|
| 18 |
+
from typing import List, Dict, Optional
|
| 19 |
+
import random
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class SQLTask:
    """A single SQL challenge problem.

    Bundles everything one episode needs: the database to build
    (setup_sql), the natural-language question posed to the agent,
    and the reference answer (expected_sql / expected_columns /
    expected_rows) used for grading.
    """
    task_id: str  # unique identifier, e.g. 'easy_001'
    difficulty: str  # basic_select, join_aggregate, complex_analysis
    title: str  # short human-readable task name
    setup_sql: str  # CREATE TABLE + INSERT statements
    question: str  # Natural language question
    expected_sql: str  # Reference solution
    expected_columns: List[str]  # Expected column names in result
    expected_row_count: int  # Expected number of result rows
    expected_rows: List[tuple]  # Expected result rows for grading
    hints: List[str] = field(default_factory=list)  # progressive hints
    # Per-episode step budget; the environment forces termination once
    # this many steps have been taken (see tests/test_env.py).
    max_steps: int = 5
    schema_description: str = ""  # Human-readable schema description
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# =============================================================
|
| 40 |
+
# DATABASE SCHEMAS
|
| 41 |
+
# =============================================================
|
| 42 |
+
|
| 43 |
+
# Schema 1: Employee database (used by Easy tasks)
|
| 44 |
+
EMPLOYEES_SCHEMA = """
|
| 45 |
+
CREATE TABLE employees (
|
| 46 |
+
id INTEGER PRIMARY KEY,
|
| 47 |
+
name TEXT NOT NULL,
|
| 48 |
+
department TEXT NOT NULL,
|
| 49 |
+
salary REAL NOT NULL,
|
| 50 |
+
hire_date TEXT NOT NULL,
|
| 51 |
+
is_active INTEGER DEFAULT 1
|
| 52 |
+
);
|
| 53 |
+
|
| 54 |
+
INSERT INTO employees VALUES (1, 'Alice Johnson', 'Engineering', 95000, '2020-01-15', 1);
|
| 55 |
+
INSERT INTO employees VALUES (2, 'Bob Smith', 'Marketing', 65000, '2019-06-01', 1);
|
| 56 |
+
INSERT INTO employees VALUES (3, 'Carol Williams', 'Engineering', 110000, '2018-03-20', 1);
|
| 57 |
+
INSERT INTO employees VALUES (4, 'David Brown', 'Sales', 72000, '2021-09-10', 1);
|
| 58 |
+
INSERT INTO employees VALUES (5, 'Eve Davis', 'Engineering', 88000, '2022-02-28', 1);
|
| 59 |
+
INSERT INTO employees VALUES (6, 'Frank Miller', 'Marketing', 58000, '2020-11-15', 0);
|
| 60 |
+
INSERT INTO employees VALUES (7, 'Grace Wilson', 'Sales', 81000, '2019-04-22', 1);
|
| 61 |
+
INSERT INTO employees VALUES (8, 'Henry Taylor', 'Engineering', 125000, '2017-08-01', 1);
|
| 62 |
+
INSERT INTO employees VALUES (9, 'Ivy Anderson', 'HR', 70000, '2021-01-10', 1);
|
| 63 |
+
INSERT INTO employees VALUES (10, 'Jack Thomas', 'HR', 75000, '2020-07-15', 1);
|
| 64 |
+
"""
|
| 65 |
+
|
| 66 |
+
EMPLOYEES_SCHEMA_DESC = """Table: employees
|
| 67 |
+
Columns:
|
| 68 |
+
- id: INTEGER PRIMARY KEY (auto-increment identifier)
|
| 69 |
+
- name: TEXT (employee full name, e.g. 'Alice Johnson')
|
| 70 |
+
- department: TEXT (one of: Engineering, Marketing, Sales, HR)
|
| 71 |
+
- salary: REAL (annual salary in USD, e.g. 95000.0)
|
| 72 |
+
- hire_date: TEXT (date in YYYY-MM-DD format, e.g. '2020-01-15')
|
| 73 |
+
- is_active: INTEGER (1 = currently active, 0 = inactive/left)
|
| 74 |
+
|
| 75 |
+
Data: 10 employees across 4 departments.
|
| 76 |
+
- 4 in Engineering, 2 in Marketing (1 inactive), 2 in Sales, 2 in HR
|
| 77 |
+
- Salaries range from 58,000 to 125,000
|
| 78 |
+
- Hire dates range from 2017 to 2022
|
| 79 |
+
"""
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# Schema 2: E-commerce database (used by Medium and Hard tasks)
|
| 83 |
+
ECOMMERCE_SCHEMA = """
|
| 84 |
+
CREATE TABLE customers (
|
| 85 |
+
id INTEGER PRIMARY KEY,
|
| 86 |
+
name TEXT NOT NULL,
|
| 87 |
+
email TEXT NOT NULL,
|
| 88 |
+
city TEXT NOT NULL,
|
| 89 |
+
signup_date TEXT NOT NULL
|
| 90 |
+
);
|
| 91 |
+
|
| 92 |
+
CREATE TABLE products (
|
| 93 |
+
id INTEGER PRIMARY KEY,
|
| 94 |
+
name TEXT NOT NULL,
|
| 95 |
+
category TEXT NOT NULL,
|
| 96 |
+
price REAL NOT NULL,
|
| 97 |
+
stock INTEGER NOT NULL
|
| 98 |
+
);
|
| 99 |
+
|
| 100 |
+
CREATE TABLE orders (
|
| 101 |
+
id INTEGER PRIMARY KEY,
|
| 102 |
+
customer_id INTEGER NOT NULL,
|
| 103 |
+
order_date TEXT NOT NULL,
|
| 104 |
+
status TEXT NOT NULL,
|
| 105 |
+
FOREIGN KEY (customer_id) REFERENCES customers(id)
|
| 106 |
+
);
|
| 107 |
+
|
| 108 |
+
CREATE TABLE order_items (
|
| 109 |
+
id INTEGER PRIMARY KEY,
|
| 110 |
+
order_id INTEGER NOT NULL,
|
| 111 |
+
product_id INTEGER NOT NULL,
|
| 112 |
+
quantity INTEGER NOT NULL,
|
| 113 |
+
unit_price REAL NOT NULL,
|
| 114 |
+
FOREIGN KEY (order_id) REFERENCES orders(id),
|
| 115 |
+
FOREIGN KEY (product_id) REFERENCES products(id)
|
| 116 |
+
);
|
| 117 |
+
|
| 118 |
+
-- Customers
|
| 119 |
+
INSERT INTO customers VALUES (1, 'Alice', 'alice@email.com', 'New York', '2023-01-15');
|
| 120 |
+
INSERT INTO customers VALUES (2, 'Bob', 'bob@email.com', 'Los Angeles', '2023-02-20');
|
| 121 |
+
INSERT INTO customers VALUES (3, 'Carol', 'carol@email.com', 'Chicago', '2023-03-10');
|
| 122 |
+
INSERT INTO customers VALUES (4, 'David', 'david@email.com', 'New York', '2023-04-05');
|
| 123 |
+
INSERT INTO customers VALUES (5, 'Eve', 'eve@email.com', 'Boston', '2023-05-12');
|
| 124 |
+
|
| 125 |
+
-- Products
|
| 126 |
+
INSERT INTO products VALUES (1, 'Laptop', 'Electronics', 999.99, 50);
|
| 127 |
+
INSERT INTO products VALUES (2, 'Headphones', 'Electronics', 149.99, 200);
|
| 128 |
+
INSERT INTO products VALUES (3, 'Python Book', 'Books', 39.99, 100);
|
| 129 |
+
INSERT INTO products VALUES (4, 'Desk Lamp', 'Home', 29.99, 150);
|
| 130 |
+
INSERT INTO products VALUES (5, 'Keyboard', 'Electronics', 79.99, 120);
|
| 131 |
+
INSERT INTO products VALUES (6, 'SQL Book', 'Books', 44.99, 80);
|
| 132 |
+
|
| 133 |
+
-- Orders (10 orders, various statuses)
|
| 134 |
+
INSERT INTO orders VALUES (1, 1, '2023-06-01', 'completed');
|
| 135 |
+
INSERT INTO orders VALUES (2, 1, '2023-07-15', 'completed');
|
| 136 |
+
INSERT INTO orders VALUES (3, 2, '2023-06-20', 'completed');
|
| 137 |
+
INSERT INTO orders VALUES (4, 3, '2023-08-01', 'completed');
|
| 138 |
+
INSERT INTO orders VALUES (5, 3, '2023-08-15', 'completed');
|
| 139 |
+
INSERT INTO orders VALUES (6, 3, '2023-09-01', 'completed');
|
| 140 |
+
INSERT INTO orders VALUES (7, 4, '2023-07-10', 'cancelled');
|
| 141 |
+
INSERT INTO orders VALUES (8, 5, '2023-09-20', 'completed');
|
| 142 |
+
INSERT INTO orders VALUES (9, 1, '2023-10-01', 'completed');
|
| 143 |
+
INSERT INTO orders VALUES (10, 2, '2023-10-15', 'pending');
|
| 144 |
+
|
| 145 |
+
-- Order Items (17 line items)
|
| 146 |
+
INSERT INTO order_items VALUES (1, 1, 1, 1, 999.99);
|
| 147 |
+
INSERT INTO order_items VALUES (2, 1, 2, 2, 149.99);
|
| 148 |
+
INSERT INTO order_items VALUES (3, 2, 3, 1, 39.99);
|
| 149 |
+
INSERT INTO order_items VALUES (4, 2, 5, 1, 79.99);
|
| 150 |
+
INSERT INTO order_items VALUES (5, 3, 1, 1, 999.99);
|
| 151 |
+
INSERT INTO order_items VALUES (6, 3, 4, 3, 29.99);
|
| 152 |
+
INSERT INTO order_items VALUES (7, 4, 2, 1, 149.99);
|
| 153 |
+
INSERT INTO order_items VALUES (8, 4, 6, 2, 44.99);
|
| 154 |
+
INSERT INTO order_items VALUES (9, 5, 3, 1, 39.99);
|
| 155 |
+
INSERT INTO order_items VALUES (10, 5, 5, 2, 79.99);
|
| 156 |
+
INSERT INTO order_items VALUES (11, 6, 1, 1, 999.99);
|
| 157 |
+
INSERT INTO order_items VALUES (12, 6, 2, 1, 149.99);
|
| 158 |
+
INSERT INTO order_items VALUES (13, 8, 6, 1, 44.99);
|
| 159 |
+
INSERT INTO order_items VALUES (14, 8, 4, 1, 29.99);
|
| 160 |
+
INSERT INTO order_items VALUES (15, 9, 2, 3, 149.99);
|
| 161 |
+
INSERT INTO order_items VALUES (16, 9, 3, 2, 39.99);
|
| 162 |
+
INSERT INTO order_items VALUES (17, 10, 1, 1, 999.99);
|
| 163 |
+
"""
|
| 164 |
+
|
| 165 |
+
ECOMMERCE_SCHEMA_DESC = """Tables:
|
| 166 |
+
|
| 167 |
+
1. customers (5 rows)
|
| 168 |
+
- id: INTEGER PRIMARY KEY
|
| 169 |
+
- name: TEXT (customer first name)
|
| 170 |
+
- email: TEXT
|
| 171 |
+
- city: TEXT (New York, Los Angeles, Chicago, Boston)
|
| 172 |
+
- signup_date: TEXT (YYYY-MM-DD)
|
| 173 |
+
|
| 174 |
+
2. products (6 rows)
|
| 175 |
+
- id: INTEGER PRIMARY KEY
|
| 176 |
+
- name: TEXT (product name)
|
| 177 |
+
- category: TEXT (Electronics, Books, Home)
|
| 178 |
+
- price: REAL (unit price in USD)
|
| 179 |
+
- stock: INTEGER (units in stock)
|
| 180 |
+
|
| 181 |
+
3. orders (10 rows)
|
| 182 |
+
- id: INTEGER PRIMARY KEY
|
| 183 |
+
- customer_id: INTEGER → customers.id
|
| 184 |
+
- order_date: TEXT (YYYY-MM-DD, range: 2023-06 to 2023-10)
|
| 185 |
+
- status: TEXT (completed, cancelled, pending)
|
| 186 |
+
|
| 187 |
+
4. order_items (17 rows)
|
| 188 |
+
- id: INTEGER PRIMARY KEY
|
| 189 |
+
- order_id: INTEGER → orders.id
|
| 190 |
+
- product_id: INTEGER → products.id
|
| 191 |
+
- quantity: INTEGER
|
| 192 |
+
- unit_price: REAL (price at time of order)
|
| 193 |
+
|
| 194 |
+
Relationships:
|
| 195 |
+
orders.customer_id → customers.id
|
| 196 |
+
order_items.order_id → orders.id
|
| 197 |
+
order_items.product_id → products.id
|
| 198 |
+
"""
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# =============================================================
|
| 202 |
+
# EASY TASKS: basic_select (3 tasks)
|
| 203 |
+
# =============================================================
|
| 204 |
+
|
| 205 |
+
# Easy pool: single-table SELECT / WHERE / GROUP BY / ORDER BY problems
# over the `employees` schema.
EASY_TASKS = [
    SQLTask(
        task_id="easy_001",
        difficulty="basic_select",
        title="High Salary Employees",
        setup_sql=EMPLOYEES_SCHEMA,
        # Fixed: the prompt previously read "\$80,000" — "\$" is an
        # invalid escape sequence in a plain string (SyntaxWarning on
        # Python 3.12+) and leaked a literal backslash into the question.
        question="Find the names and salaries of all ACTIVE employees who earn more than $80,000. Order the results by salary from highest to lowest.",
        expected_sql="SELECT name, salary FROM employees WHERE is_active = 1 AND salary > 80000 ORDER BY salary DESC",
        expected_columns=["name", "salary"],
        # Fixed: Grace Wilson (Sales, salary 81000, is_active = 1) satisfies
        # salary > 80000 in the seed data but was missing from the expected
        # result, so even the reference solution could not grade as correct.
        expected_row_count=5,
        expected_rows=[
            ("Henry Taylor", 125000.0),
            ("Carol Williams", 110000.0),
            ("Alice Johnson", 95000.0),
            ("Eve Davis", 88000.0),
            ("Grace Wilson", 81000.0),
        ],
        hints=[
            "Use SELECT with specific column names, not SELECT *",
            "Use WHERE with AND to combine conditions: is_active = 1 AND salary > 80000",
            "Add ORDER BY salary DESC for descending order",
        ],
        schema_description=EMPLOYEES_SCHEMA_DESC,
        max_steps=5,
    ),

    SQLTask(
        task_id="easy_002",
        difficulty="basic_select",
        title="Department Employee Count",
        setup_sql=EMPLOYEES_SCHEMA,
        question="Count the number of ACTIVE employees in each department. Show the department name and the count. Order by count from highest to lowest.",
        expected_sql="SELECT department, COUNT(*) as employee_count FROM employees WHERE is_active = 1 GROUP BY department ORDER BY employee_count DESC",
        expected_columns=["department", "employee_count"],
        expected_row_count=4,
        # NOTE(review): HR and Sales tie at 2; ORDER BY employee_count DESC
        # does not fix their relative order — confirm the grader tolerates
        # either ordering of tied rows.
        expected_rows=[
            ("Engineering", 4),
            ("HR", 2),
            ("Sales", 2),
            ("Marketing", 1),
        ],
        hints=[
            "Use COUNT(*) to count rows in each group",
            "GROUP BY department groups rows by department",
            "Use an alias: COUNT(*) as employee_count",
        ],
        schema_description=EMPLOYEES_SCHEMA_DESC,
        max_steps=5,
    ),

    SQLTask(
        task_id="easy_003",
        difficulty="basic_select",
        title="Recent Hires",
        setup_sql=EMPLOYEES_SCHEMA,
        question="List the names and hire dates of employees hired on or after January 1, 2021. Order by hire date from earliest to latest.",
        expected_sql="SELECT name, hire_date FROM employees WHERE hire_date >= '2021-01-01' ORDER BY hire_date",
        expected_columns=["name", "hire_date"],
        expected_row_count=3,
        expected_rows=[
            ("Ivy Anderson", "2021-01-10"),
            ("David Brown", "2021-09-10"),
            ("Eve Davis", "2022-02-28"),
        ],
        hints=[
            "Dates in SQLite can be compared as strings when in YYYY-MM-DD format",
            "Use WHERE hire_date >= '2021-01-01'",
            "ORDER BY hire_date gives ascending order by default",
        ],
        schema_description=EMPLOYEES_SCHEMA_DESC,
        max_steps=5,
    ),
]
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# =============================================================
|
| 280 |
+
# MEDIUM TASKS: join_aggregate (3 tasks)
|
| 281 |
+
# =============================================================
|
| 282 |
+
|
| 283 |
+
# Medium pool: multi-table JOIN / GROUP BY / HAVING problems over the
# e-commerce schema.
MEDIUM_TASKS = [
    SQLTask(
        task_id="medium_001",
        difficulty="join_aggregate",
        title="Customer Total Spending",
        setup_sql=ECOMMERCE_SCHEMA,
        # Fixed: the prompt previously read "\$200" — "\$" is an invalid
        # escape sequence in a plain string (SyntaxWarning on Python 3.12+)
        # and leaked a literal backslash into the question.
        question="Find the total amount spent by each customer on COMPLETED orders only. Show the customer name and their total spending. Only include customers who spent more than $200. Order by total spending from highest to lowest.",
        expected_sql="""
        SELECT c.name, ROUND(SUM(oi.quantity * oi.unit_price), 2) as total_spent
        FROM customers c
        JOIN orders o ON c.id = o.customer_id
        JOIN order_items oi ON o.id = oi.order_id
        WHERE o.status = 'completed'
        GROUP BY c.id, c.name
        HAVING SUM(oi.quantity * oi.unit_price) > 200
        ORDER BY total_spent DESC
        """,
        expected_columns=["name", "total_spent"],
        # Fixed: expected_row_count was 4 and the list included
        # ("Eve", 74.98), but 74.98 can never satisfy the task's own
        # HAVING ... > 200 filter, so the reference query cannot return
        # that row.
        # NOTE(review): the Alice/Carol totals look inconsistent with the
        # seed data (a hand recalculation gives 1949.90 / 1589.92) —
        # verify by executing expected_sql against setup_sql.
        expected_row_count=3,
        expected_rows=[
            ("Alice", 1919.91),
            ("Carol", 1464.94),
            ("Bob", 1089.96),
        ],
        hints=[
            "You need to JOIN three tables: customers → orders → order_items",
            "Total per item = quantity * unit_price, then SUM for total per customer",
            "Filter completed orders with WHERE o.status = 'completed'",
            "Use HAVING (not WHERE) to filter after GROUP BY",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=7,
    ),

    SQLTask(
        task_id="medium_002",
        difficulty="join_aggregate",
        title="Category Revenue",
        setup_sql=ECOMMERCE_SCHEMA,
        question="Calculate the total revenue for each product category from COMPLETED orders. Show the category name and total revenue. Order by total revenue from highest to lowest.",
        expected_sql="""
        SELECT p.category, ROUND(SUM(oi.quantity * oi.unit_price), 2) as total_revenue
        FROM products p
        JOIN order_items oi ON p.id = oi.product_id
        JOIN orders o ON oi.order_id = o.id
        WHERE o.status = 'completed'
        GROUP BY p.category
        ORDER BY total_revenue DESC
        """,
        expected_columns=["category", "total_revenue"],
        expected_row_count=3,
        # NOTE(review): a hand recalculation from the seed data gives
        # Electronics 4289.87 and Books 294.93 — verify these values by
        # executing expected_sql against setup_sql.
        expected_rows=[
            ("Electronics", 4459.83),
            ("Books", 254.93),
            ("Home", 119.96),
        ],
        hints=[
            "JOIN products → order_items → orders",
            "Revenue per item = quantity * unit_price",
            "Filter only completed orders",
            "GROUP BY p.category to get per-category totals",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=7,
    ),

    SQLTask(
        task_id="medium_003",
        difficulty="join_aggregate",
        title="Customers with Multiple Orders",
        setup_sql=ECOMMERCE_SCHEMA,
        question="Find customers who have placed more than one COMPLETED order. Show the customer name and the number of completed orders they placed. Order by order count descending, then by name ascending.",
        expected_sql="""
        SELECT c.name, COUNT(o.id) as order_count
        FROM customers c
        JOIN orders o ON c.id = o.customer_id
        WHERE o.status = 'completed'
        GROUP BY c.id, c.name
        HAVING COUNT(o.id) > 1
        ORDER BY order_count DESC, c.name ASC
        """,
        expected_columns=["name", "order_count"],
        expected_row_count=2,
        expected_rows=[
            ("Alice", 3),
            ("Carol", 3),
        ],
        hints=[
            "JOIN customers with orders",
            "Filter for completed orders in WHERE clause",
            "GROUP BY customer, then HAVING COUNT > 1",
            "ORDER BY count DESC, then name ASC for ties",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=7,
    ),
]
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
# =============================================================
|
| 384 |
+
# HARD TASKS: complex_analysis (3 tasks)
|
| 385 |
+
# =============================================================
|
| 386 |
+
|
| 387 |
+
# Hard pool: CTEs, window functions (LAG / ROW_NUMBER) and date arithmetic
# over the e-commerce schema.
# NOTE(review): the monetary values in expected_rows below were not
# re-derived in this review; verify them by executing each expected_sql
# against ECOMMERCE_SCHEMA before relying on exact-match grading.
HARD_TASKS = [
    # Month-over-month revenue growth via LAG() over a monthly CTE.
    SQLTask(
        task_id="hard_001",
        difficulty="complex_analysis",
        title="Monthly Revenue with Growth Rate",
        setup_sql=ECOMMERCE_SCHEMA,
        question="Calculate monthly revenue from COMPLETED orders, and for each month show the month (YYYY-MM format), the total revenue, and the percentage change from the previous month. For the first month, the percentage change should be NULL. Round revenue to 2 decimal places and percentage to 2 decimal places. Order by month ascending.",
        expected_sql="""
        WITH monthly AS (
            SELECT
                strftime('%Y-%m', o.order_date) as month,
                ROUND(SUM(oi.quantity * oi.unit_price), 2) as revenue
            FROM orders o
            JOIN order_items oi ON o.id = oi.order_id
            WHERE o.status = 'completed'
            GROUP BY strftime('%Y-%m', o.order_date)
        ),
        with_prev AS (
            SELECT
                month,
                revenue,
                LAG(revenue) OVER (ORDER BY month) as prev_revenue
            FROM monthly
        )
        SELECT
            month,
            revenue,
            CASE
                WHEN prev_revenue IS NULL THEN NULL
                ELSE ROUND(((revenue - prev_revenue) * 100.0 / prev_revenue), 2)
            END as pct_change
        FROM with_prev
        ORDER BY month
        """,
        expected_columns=["month", "revenue", "pct_change"],
        expected_row_count=5,
        # First month has no predecessor, hence pct_change is None.
        expected_rows=[
            ("2023-06", 2289.93, None),
            ("2023-07", 119.98, -94.76),
            ("2023-08", 1429.93, 1091.81),
            ("2023-09", 1224.97, -14.34),
            ("2023-10", 529.95, -56.74),
        ],
        hints=[
            "Use a CTE (WITH clause) to first calculate monthly revenue",
            "strftime('%Y-%m', date) extracts year-month from a date string",
            "LAG(revenue) OVER (ORDER BY month) gets the previous month's revenue",
            "Percentage change = ((new - old) / old) * 100",
            "Use CASE WHEN prev IS NULL THEN NULL ELSE ... END for first month",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=10,
    ),

    # Top-1 per group via ROW_NUMBER() PARTITION BY, with a revenue
    # tie-breaker.
    SQLTask(
        task_id="hard_002",
        difficulty="complex_analysis",
        title="Top Product Per Category",
        setup_sql=ECOMMERCE_SCHEMA,
        question="For each product category, find the single best-selling product (by total quantity sold across COMPLETED orders). Show the category, product name, and total quantity sold. If there are ties, pick the one with the higher total revenue. Order by category name ascending.",
        expected_sql="""
        WITH product_sales AS (
            SELECT
                p.category,
                p.name as product_name,
                SUM(oi.quantity) as total_qty,
                SUM(oi.quantity * oi.unit_price) as total_revenue,
                ROW_NUMBER() OVER (
                    PARTITION BY p.category
                    ORDER BY SUM(oi.quantity) DESC, SUM(oi.quantity * oi.unit_price) DESC
                ) as rn
            FROM products p
            JOIN order_items oi ON p.id = oi.product_id
            JOIN orders o ON oi.order_id = o.id
            WHERE o.status = 'completed'
            GROUP BY p.category, p.name
        )
        SELECT category, product_name, total_qty
        FROM product_sales
        WHERE rn = 1
        ORDER BY category ASC
        """,
        expected_columns=["category", "product_name", "total_qty"],
        expected_row_count=3,
        expected_rows=[
            ("Books", "Python Book", 4),
            ("Electronics", "Headphones", 7),
            ("Home", "Desk Lamp", 4),
        ],
        hints=[
            "First calculate total quantity sold per product (SUM of quantity)",
            "Use ROW_NUMBER() OVER (PARTITION BY category ORDER BY qty DESC) to rank within category",
            "Filter WHERE rn = 1 to get only the top product per category",
            "A CTE makes this much cleaner than nested subqueries",
            "Don't forget to filter for completed orders only",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=10,
    ),

    # Two-level aggregation: per-order totals in a CTE, then per-customer
    # statistics (count / sum / avg / date span via julianday()).
    SQLTask(
        task_id="hard_003",
        difficulty="complex_analysis",
        title="Customer Lifetime Value Analysis",
        setup_sql=ECOMMERCE_SCHEMA,
        question="For customers with at least 2 completed orders, calculate: their name, number of completed orders, total lifetime spending (rounded to 2 decimals), average order value (rounded to 2 decimals), and the number of days between their first and last completed order. Order by total spending descending.",
        expected_sql="""
        WITH customer_order_totals AS (
            SELECT
                c.id as customer_id,
                c.name,
                o.id as order_id,
                o.order_date,
                SUM(oi.quantity * oi.unit_price) as order_total
            FROM customers c
            JOIN orders o ON c.id = o.customer_id
            JOIN order_items oi ON o.id = oi.order_id
            WHERE o.status = 'completed'
            GROUP BY c.id, c.name, o.id, o.order_date
        )
        SELECT
            name,
            COUNT(*) as num_orders,
            ROUND(SUM(order_total), 2) as total_spending,
            ROUND(AVG(order_total), 2) as avg_order_value,
            CAST(julianday(MAX(order_date)) - julianday(MIN(order_date)) AS INTEGER) as days_span
        FROM customer_order_totals
        GROUP BY customer_id, name
        HAVING COUNT(*) >= 2
        ORDER BY total_spending DESC
        """,
        expected_columns=["name", "num_orders", "total_spending", "avg_order_value", "days_span"],
        expected_row_count=2,
        expected_rows=[
            ("Alice", 3, 1919.91, 639.97, 122),
            ("Carol", 3, 1464.94, 488.31, 31),
        ],
        hints=[
            "Use a CTE to first calculate the total for each individual order",
            "In the CTE: JOIN customers → orders → order_items, GROUP BY order",
            "In the outer query: GROUP BY customer, HAVING COUNT >= 2",
            "julianday() converts date strings to Julian day numbers for arithmetic",
            "days_span = julianday(MAX(order_date)) - julianday(MIN(order_date))",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=10,
    ),
]
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
# =============================================================
|
| 538 |
+
# TASK REGISTRY — Maps task IDs and difficulty levels
|
| 539 |
+
# =============================================================
|
| 540 |
+
|
| 541 |
+
# Difficulty name -> pool of tasks at that level.
ALL_TASKS: Dict[str, List[SQLTask]] = {
    "basic_select": EASY_TASKS,
    "join_aggregate": MEDIUM_TASKS,
    "complex_analysis": HARD_TASKS,
}

# Flat index over every pool: task_id -> SQLTask.
TASK_BY_ID: Dict[str, SQLTask] = {
    task.task_id: task
    for pool in ALL_TASKS.values()
    for task in pool
}
|
| 552 |
+
|
| 553 |
+
|
| 554 |
+
def get_task(difficulty: str, task_id: Optional[str] = None) -> SQLTask:
    """
    Get a task by difficulty level, optionally by specific ID.

    Args:
        difficulty: One of 'basic_select', 'join_aggregate', 'complex_analysis'
        task_id: Optional specific task ID (e.g., 'easy_001')

    Returns:
        SQLTask instance

    Raises:
        ValueError: If difficulty is unknown, or if an explicit task_id
            does not exist.
    """
    # An explicit task_id takes precedence over the difficulty pool.
    if task_id:
        if task_id not in TASK_BY_ID:
            # Fixed: an unknown task_id previously fell through to a
            # random pick from the difficulty pool, silently masking
            # typos; fail loudly instead.
            raise ValueError(
                f"Unknown task_id: '{task_id}'. "
                f"Available: {sorted(TASK_BY_ID)}"
            )
        return TASK_BY_ID[task_id]

    # Otherwise pick randomly from the difficulty pool.
    if difficulty not in ALL_TASKS:
        raise ValueError(
            f"Unknown difficulty: '{difficulty}'. "
            f"Choose from: {list(ALL_TASKS.keys())}"
        )

    return random.choice(ALL_TASKS[difficulty])
|
| 581 |
+
|
| 582 |
+
|
| 583 |
+
def list_tasks() -> Dict[str, List[str]]:
    """
    List all available tasks grouped by difficulty.

    Returns:
        Dict mapping difficulty name to list of task IDs
    """
    catalog: Dict[str, List[str]] = {}
    for level, pool in ALL_TASKS.items():
        catalog[level] = [task.task_id for task in pool]
    return catalog
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for SQL Arena environment."""
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
from src.sql_arena.environment import SQLArenaEnvironment
|
| 5 |
+
from src.sql_arena.models import SQLArenaAction
|
| 6 |
+
from src.sql_arena.tasks import list_tasks, TASK_BY_ID
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class TestEnvironmentBasics:
    """Smoke tests for the core reset / step / state / close loop."""

    def setup_method(self):
        # Fresh environment per test to avoid cross-test state leakage.
        self.env = SQLArenaEnvironment()

    def teardown_method(self):
        self.env.close()

    def test_reset_returns_observation(self):
        # A fresh episode starts with zero reward and is not done.
        result = self.env.reset(difficulty="basic_select", task_id="easy_001")
        assert result.observation is not None
        assert result.reward == 0.0
        assert result.done is False

    def test_step_with_correct_query(self):
        # Submitting the task's own reference solution must earn a
        # positive reward and a high grading score.
        self.env.reset(difficulty="basic_select", task_id="easy_001")
        task = self.env.current_task
        action = SQLArenaAction(sql_query=task.expected_sql)
        result = self.env.step(action)
        assert result.reward > 0.0
        assert result.info.get("score", 0) >= 0.8

    def test_step_with_invalid_query(self):
        # Malformed SQL yields zero reward and surfaces an error message
        # in the observation instead of raising.
        self.env.reset(difficulty="basic_select", task_id="easy_001")
        action = SQLArenaAction(sql_query="INVALID SQL QUERY")
        result = self.env.step(action)
        assert result.reward == 0.0
        assert result.observation.error_message is not None

    def test_state_tracking(self):
        # current_step increments by one per step() call.
        self.env.reset(difficulty="basic_select", task_id="easy_001")
        state = self.env.state()
        assert state.current_step == 0

        self.env.step(SQLArenaAction(sql_query="SELECT 1"))
        state = self.env.state()
        assert state.current_step == 1

    def test_episode_terminates(self):
        # Stepping past the task's max_steps budget must flip done=True.
        self.env.reset(difficulty="basic_select", task_id="easy_001")
        task = self.env.current_task
        for _ in range(task.max_steps + 1):
            if self.env.state().done:
                break
            self.env.step(SQLArenaAction(sql_query="SELECT 1"))
        assert self.env.state().done is True
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class TestAllDifficulties:
    """Every difficulty tier can be reset into and is echoed back."""

    def setup_method(self):
        self.env = SQLArenaEnvironment()

    def teardown_method(self):
        self.env.close()

    def _assert_reset_reports(self, difficulty):
        # Shared check: reset into the tier, confirm the observation reports it.
        outcome = self.env.reset(difficulty=difficulty)
        assert outcome.observation.difficulty == difficulty

    def test_easy(self):
        self._assert_reset_reports("basic_select")

    def test_medium(self):
        self._assert_reset_reports("join_aggregate")

    def test_hard(self):
        self._assert_reset_reports("complex_analysis")
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class TestGrading:
    """Grading behaviour: scores stay bounded and actually discriminate."""

    def setup_method(self):
        self.env = SQLArenaEnvironment()

    def teardown_method(self):
        self.env.close()

    def test_scores_in_range(self):
        """Each task's reference SQL yields reward and score within [0, 1]."""
        for task_id, task in TASK_BY_ID.items():
            self.env.reset(difficulty=task.difficulty, task_id=task_id)
            action = SQLArenaAction(sql_query=task.expected_sql)
            result = self.env.step(action)
            assert 0.0 <= result.reward <= 1.0
            assert 0.0 <= result.info.get("score", 0) <= 1.0
            # No trailing reset needed: each iteration begins with its own
            # reset, so a second reset here was redundant work (removed).

    def test_varying_scores(self):
        """Queries of different quality must not all collapse to one score."""
        scores = set()
        queries = [
            "SELECT name, salary FROM employees WHERE is_active = 1 AND salary > 80000 ORDER BY salary DESC",
            "SELECT * FROM employees",
            "INVALID",
            "SELECT name FROM employees",
        ]
        for q in queries:
            self.env.reset(difficulty="basic_select", task_id="easy_001")
            result = self.env.step(SQLArenaAction(sql_query=q))
            scores.add(round(result.info.get("score", 0), 2))
        assert len(scores) > 1, "Grader always returns the same score!"
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class TestTaskRegistry:
    """The task registry exposes every tier, each with enough tasks."""

    def test_list_tasks(self):
        catalog = list_tasks()
        for tier in ("basic_select", "join_aggregate", "complex_analysis"):
            assert tier in catalog

    def test_minimum_3_tasks(self):
        for difficulty, task_ids in list_tasks().items():
            assert len(task_ids) >= 3, f"{difficulty} has fewer than 3 tasks"
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
if __name__ == "__main__":
    # Allow running this module directly (python tests/test_env.py) without
    # invoking pytest from the command line; -v gives per-test output.
    pytest.main([__file__, "-v"])
|