Spaces:

ritvik360
/

nl2sql-bench

Sleeping

App Files Files Community

nl2sql-bench / openenv.yaml

ritvik360

Upload folder using huggingface_hub

a39d8ef verified 10 days ago

raw

history blame contribute delete

4.68 kB

	# nl2sql-bench/openenv.yaml
	# OpenEnv environment manifest — validated by `openenv validate`

	name: nl2sql-bench
	version: "0.1.0"
	description: >
	Natural Language to SQL query generation environment for RL training.
	An agent iteratively writes and refines SQLite queries against a synthetic
	e-commerce database to answer business questions. Multi-turn episodes with
	dense, shaped rewards. Three difficulty tasks: easy (single-table),
	medium (JOIN + GROUP BY), hard (window functions + CTEs).

	author: "nl2sql-bench team"
	license: MIT
	tags:
	- openenv
	- nl2sql
	- sql
	- analytics
	- rl-training
	- deterministic
	- multi-turn

	# ── Task definitions ────────────────────────────────────────────────────────
	tasks:
	- name: simple-filter
	difficulty: easy
	description: >
	Single-table SELECT with WHERE, ORDER BY, and LIMIT.
	Tests basic SQL fluency. Expected solve rate: high.
	max_steps: 5
	reward_range: [0.0, 1.0]

	- name: join-aggregation
	difficulty: medium
	description: >
	Multi-table JOINs with GROUP BY, HAVING, and aggregation functions
	(COUNT, SUM, AVG, ROUND). Tests relational reasoning.
	max_steps: 5
	reward_range: [0.0, 1.0]

	- name: analytics-window
	difficulty: hard
	description: >
	Advanced analytics using CTEs, window functions (DENSE_RANK,
	ROW_NUMBER, running SUM), and nested subqueries. Tests multi-step
	planning and SQLite-specific syntax.
	max_steps: 5
	reward_range: [0.0, 1.0]

	# ── Action / Observation space ──────────────────────────────────────────────
	action_space:
	type: object
	properties:
	query:
	type: string
	description: "A SQLite SELECT query string."

	observation_space:
	type: object
	properties:
	question:
	type: string
	description: "Natural-language question the agent must answer."
	schema_context:
	type: string
	description: "Compact database schema description for the agent."
	task_name:
	type: string
	description: "Active task identifier."
	last_query:
	type: string
	description: "The SQL query submitted on the previous step."
	last_result:
	type: array
	description: "Up to 10 rows returned by the last query (list of dicts)."
	last_error:
	type: string
	nullable: true
	description: "SQLite error string if last query failed, else null."
	result_columns:
	type: array
	description: "Column names of last_result."
	step:
	type: integer
	description: "Current step number (1-indexed; 0 after reset)."
	max_steps:
	type: integer
	description: "Maximum steps per episode."
	done:
	type: boolean
	description: "True when episode ends (exact match or step limit reached)."
	reward:
	type: number
	nullable: true
	description: "Reward for the most recent step [0.0, 1.0]."
	score:
	type: number
	description: "Normalised cumulative episode score [0.0, 1.0]."

	# ── Reward function description ─────────────────────────────────────────────
	reward:
	type: shaped
	range: [0.0, 1.0]
	components:
	- name: syntax_ok
	weight: 0.10
	description: "Query executes without SQLite error."
	- name: columns_match
	weight: 0.20
	description: "Returned column names match ground truth exactly."
	- name: row_count_match
	weight: 0.20
	description: "Number of returned rows matches ground truth."
	- name: exact_match
	weight: 0.50
	description: "Full result set matches ground truth (order-aware for ORDER BY)."
	- name: step_penalty
	weight: -0.05
	description: "Deducted per step beyond the first (encourages efficiency)."

	# ── Deployment ──────────────────────────────────────────────────────────────
	server:
	port: 7860
	dockerfile: Dockerfile
	healthcheck: /health

	# ── Baseline ────────────────────────────────────────────────────────────────
	baseline:
	script: inference.py
	model: Qwen/Qwen2.5-72B-Instruct
	expected_scores:
	simple-filter: 0.70
	join-aggregation: 0.45
	analytics-window: 0.25