nl2sql-bench / openenv.yaml
ritvik360's picture
Upload folder using huggingface_hub
a39d8ef verified
# nl2sql-bench/openenv.yaml
# OpenEnv environment manifest β€” validated by `openenv validate`
name: nl2sql-bench
version: "0.1.0"
description: >
Natural Language to SQL query generation environment for RL training.
An agent iteratively writes and refines SQLite queries against a synthetic
e-commerce database to answer business questions. Multi-turn episodes with
dense, shaped rewards. Three difficulty tasks: easy (single-table),
medium (JOIN + GROUP BY), hard (window functions + CTEs).
author: "nl2sql-bench team"
license: MIT
tags:
- openenv
- nl2sql
- sql
- analytics
- rl-training
- deterministic
- multi-turn
# ── Task definitions ────────────────────────────────────────────────────────
tasks:
- name: simple-filter
difficulty: easy
description: >
Single-table SELECT with WHERE, ORDER BY, and LIMIT.
Tests basic SQL fluency. Expected solve rate: high.
max_steps: 5
reward_range: [0.0, 1.0]
- name: join-aggregation
difficulty: medium
description: >
Multi-table JOINs with GROUP BY, HAVING, and aggregation functions
(COUNT, SUM, AVG, ROUND). Tests relational reasoning.
max_steps: 5
reward_range: [0.0, 1.0]
- name: analytics-window
difficulty: hard
description: >
Advanced analytics using CTEs, window functions (DENSE_RANK,
ROW_NUMBER, running SUM), and nested subqueries. Tests multi-step
planning and SQLite-specific syntax.
max_steps: 5
reward_range: [0.0, 1.0]
# ── Action / Observation space ──────────────────────────────────────────────
action_space:
type: object
properties:
query:
type: string
description: "A SQLite SELECT query string."
observation_space:
type: object
properties:
question:
type: string
description: "Natural-language question the agent must answer."
schema_context:
type: string
description: "Compact database schema description for the agent."
task_name:
type: string
description: "Active task identifier."
last_query:
type: string
description: "The SQL query submitted on the previous step."
last_result:
type: array
description: "Up to 10 rows returned by the last query (list of dicts)."
last_error:
type: string
nullable: true
description: "SQLite error string if last query failed, else null."
result_columns:
type: array
description: "Column names of last_result."
step:
type: integer
description: "Current step number (1-indexed; 0 after reset)."
max_steps:
type: integer
description: "Maximum steps per episode."
done:
type: boolean
description: "True when episode ends (exact match or step limit reached)."
reward:
type: number
nullable: true
description: "Reward for the most recent step [0.0, 1.0]."
score:
type: number
description: "Normalised cumulative episode score [0.0, 1.0]."
# ── Reward function description ─────────────────────────────────────────────
reward:
type: shaped
range: [0.0, 1.0]
components:
- name: syntax_ok
weight: 0.10
description: "Query executes without SQLite error."
- name: columns_match
weight: 0.20
description: "Returned column names match ground truth exactly."
- name: row_count_match
weight: 0.20
description: "Number of returned rows matches ground truth."
- name: exact_match
weight: 0.50
description: "Full result set matches ground truth (order-aware for ORDER BY)."
- name: step_penalty
weight: -0.05
description: "Deducted per step beyond the first (encourages efficiency)."
# ── Deployment ──────────────────────────────────────────────────────────────
server:
port: 7860
dockerfile: Dockerfile
healthcheck: /health
# ── Baseline ────────────────────────────────────────────────────────────────
baseline:
script: inference.py
model: Qwen/Qwen2.5-72B-Instruct
expected_scores:
simple-filter: 0.70
join-aggregation: 0.45
analytics-window: 0.25