Spaces:
Sleeping
Sleeping
| # nl2sql-bench/openenv.yaml | |
| # OpenEnv environment manifest β validated by `openenv validate` | |
| name: nl2sql-bench | |
| version: "0.1.0" | |
| description: > | |
| Natural Language to SQL query generation environment for RL training. | |
| An agent iteratively writes and refines SQLite queries against a synthetic | |
| e-commerce database to answer business questions. Multi-turn episodes with | |
| dense, shaped rewards. Three difficulty tasks: easy (single-table), | |
| medium (JOIN + GROUP BY), hard (window functions + CTEs). | |
| author: "nl2sql-bench team" | |
| license: MIT | |
| tags: | |
| - openenv | |
| - nl2sql | |
| - sql | |
| - analytics | |
| - rl-training | |
| - deterministic | |
| - multi-turn | |
| # ββ Task definitions ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| tasks: | |
| - name: simple-filter | |
| difficulty: easy | |
| description: > | |
| Single-table SELECT with WHERE, ORDER BY, and LIMIT. | |
| Tests basic SQL fluency. Expected solve rate: high. | |
| max_steps: 5 | |
| reward_range: [0.0, 1.0] | |
| - name: join-aggregation | |
| difficulty: medium | |
| description: > | |
| Multi-table JOINs with GROUP BY, HAVING, and aggregation functions | |
| (COUNT, SUM, AVG, ROUND). Tests relational reasoning. | |
| max_steps: 5 | |
| reward_range: [0.0, 1.0] | |
| - name: analytics-window | |
| difficulty: hard | |
| description: > | |
| Advanced analytics using CTEs, window functions (DENSE_RANK, | |
| ROW_NUMBER, running SUM), and nested subqueries. Tests multi-step | |
| planning and SQLite-specific syntax. | |
| max_steps: 5 | |
| reward_range: [0.0, 1.0] | |
| # ββ Action / Observation space ββββββββββββββββββββββββββββββββββββββββββββββ | |
| action_space: | |
| type: object | |
| properties: | |
| query: | |
| type: string | |
| description: "A SQLite SELECT query string." | |
| observation_space: | |
| type: object | |
| properties: | |
| question: | |
| type: string | |
| description: "Natural-language question the agent must answer." | |
| schema_context: | |
| type: string | |
| description: "Compact database schema description for the agent." | |
| task_name: | |
| type: string | |
| description: "Active task identifier." | |
| last_query: | |
| type: string | |
| description: "The SQL query submitted on the previous step." | |
| last_result: | |
| type: array | |
| description: "Up to 10 rows returned by the last query (list of dicts)." | |
| last_error: | |
| type: string | |
| nullable: true | |
| description: "SQLite error string if last query failed, else null." | |
| result_columns: | |
| type: array | |
| description: "Column names of last_result." | |
| step: | |
| type: integer | |
| description: "Current step number (1-indexed; 0 after reset)." | |
| max_steps: | |
| type: integer | |
| description: "Maximum steps per episode." | |
| done: | |
| type: boolean | |
| description: "True when episode ends (exact match or step limit reached)." | |
| reward: | |
| type: number | |
| nullable: true | |
| description: "Reward for the most recent step [0.0, 1.0]." | |
| score: | |
| type: number | |
| description: "Normalised cumulative episode score [0.0, 1.0]." | |
| # ββ Reward function description βββββββββββββββββββββββββββββββββββββββββββββ | |
| reward: | |
| type: shaped | |
| range: [0.0, 1.0] | |
| components: | |
| - name: syntax_ok | |
| weight: 0.10 | |
| description: "Query executes without SQLite error." | |
| - name: columns_match | |
| weight: 0.20 | |
| description: "Returned column names match ground truth exactly." | |
| - name: row_count_match | |
| weight: 0.20 | |
| description: "Number of returned rows matches ground truth." | |
| - name: exact_match | |
| weight: 0.50 | |
| description: "Full result set matches ground truth (order-aware for ORDER BY)." | |
| - name: step_penalty | |
| weight: -0.05 | |
| description: "Deducted per step beyond the first (encourages efficiency)." | |
| # ββ Deployment ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| server: | |
| port: 7860 | |
| dockerfile: Dockerfile | |
| healthcheck: /health | |
| # ββ Baseline ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| baseline: | |
| script: inference.py | |
| model: Qwen/Qwen2.5-72B-Instruct | |
| expected_scores: | |
| simple-filter: 0.70 | |
| join-aggregation: 0.45 | |
| analytics-window: 0.25 | |