# nl2sql-bench/openenv.yaml # OpenEnv environment manifest — validated by `openenv validate` name: nl2sql-bench version: "0.1.0" description: > Natural Language to SQL query generation environment for RL training. An agent iteratively writes and refines SQLite queries against a synthetic e-commerce database to answer business questions. Multi-turn episodes with dense, shaped rewards. Three difficulty tasks: easy (single-table), medium (JOIN + GROUP BY), hard (window functions + CTEs). author: "nl2sql-bench team" license: MIT tags: - openenv - nl2sql - sql - analytics - rl-training - deterministic - multi-turn # ── Task definitions ──────────────────────────────────────────────────────── tasks: - name: simple-filter difficulty: easy description: > Single-table SELECT with WHERE, ORDER BY, and LIMIT. Tests basic SQL fluency. Expected solve rate: high. max_steps: 5 reward_range: [0.0, 1.0] - name: join-aggregation difficulty: medium description: > Multi-table JOINs with GROUP BY, HAVING, and aggregation functions (COUNT, SUM, AVG, ROUND). Tests relational reasoning. max_steps: 5 reward_range: [0.0, 1.0] - name: analytics-window difficulty: hard description: > Advanced analytics using CTEs, window functions (DENSE_RANK, ROW_NUMBER, running SUM), and nested subqueries. Tests multi-step planning and SQLite-specific syntax. max_steps: 5 reward_range: [0.0, 1.0] # ── Action / Observation space ────────────────────────────────────────────── action_space: type: object properties: query: type: string description: "A SQLite SELECT query string." observation_space: type: object properties: question: type: string description: "Natural-language question the agent must answer." schema_context: type: string description: "Compact database schema description for the agent." task_name: type: string description: "Active task identifier." last_query: type: string description: "The SQL query submitted on the previous step." last_result: type: array description: "Up to 10 rows returned by the last query (list of dicts)." last_error: type: string nullable: true description: "SQLite error string if last query failed, else null." result_columns: type: array description: "Column names of last_result." step: type: integer description: "Current step number (1-indexed; 0 after reset)." max_steps: type: integer description: "Maximum steps per episode." done: type: boolean description: "True when episode ends (exact match or step limit reached)." reward: type: number nullable: true description: "Reward for the most recent step [0.0, 1.0]." score: type: number description: "Normalised cumulative episode score [0.0, 1.0]." # ── Reward function description ───────────────────────────────────────────── reward: type: shaped range: [0.0, 1.0] components: - name: syntax_ok weight: 0.10 description: "Query executes without SQLite error." - name: columns_match weight: 0.20 description: "Returned column names match ground truth exactly." - name: row_count_match weight: 0.20 description: "Number of returned rows matches ground truth." - name: exact_match weight: 0.50 description: "Full result set matches ground truth (order-aware for ORDER BY)." - name: step_penalty weight: -0.05 description: "Deducted per step beyond the first (encourages efficiency)." # ── Deployment ────────────────────────────────────────────────────────────── server: port: 7860 dockerfile: Dockerfile healthcheck: /health # ── Baseline ──────────────────────────────────────────────────────────────── baseline: script: inference.py model: Qwen/Qwen2.5-72B-Instruct expected_scores: simple-filter: 0.70 join-aggregation: 0.45 analytics-window: 0.25