{ "$schema": "autocode-verification-input-v1", "feature_id": "F001", "spec_path": "specs/F001-IMPLEMENTATION_SPEC.md", "generated": "2026-03-24T12:00:00Z", "verification_mode": "mvp", "overview": { "summary": "Complete the step/reset lifecycle so the SQL environment actually executes SQL queries against real Spider SQLite databases. Replace the non-functional Ollama-based action interpretation with structured actions (DESCRIBE, SAMPLE, QUERY, ANSWER) that the agent provides directly. Implement sandboxed SQL execution (read-only, SELECT-only, 5s timeout, 20-row truncation), question loading from Spider JSON, per-episode state management via EpisodeContext, and a 15-step budget.", "goal": "Enable agents to play complete RL episodes: reset with a random question, explore a hidden schema via DESCRIBE/SAMPLE, run SQL queries, and submit answers against real databases." }, "interfaces": { "types": [ { "name": "SQLAction", "fields": [ {"name": "action_type", "type": "str", "description": "One of: DESCRIBE, SAMPLE, QUERY, ANSWER"}, {"name": "argument", "type": "str", "description": "Table name (DESCRIBE/SAMPLE), SQL string (QUERY), or answer value (ANSWER)"} ], "description": "Structured action from agent to environment. Extends openenv Action base." 
}, { "name": "SQLObservation", "fields": [ {"name": "done", "type": "bool", "description": "Whether the episode has ended"}, {"name": "reward", "type": "float | None", "description": "Reward signal (set on terminal step)"}, {"name": "question", "type": "str", "description": "The NL question to answer"}, {"name": "schema_info", "type": "str", "description": "Known schema info (table names initially, columns added after DESCRIBE)"}, {"name": "result", "type": "str", "description": "Result of last action (truncated to 20 rows)"}, {"name": "error", "type": "str", "description": "Error message if action failed, empty string otherwise"}, {"name": "step_count", "type": "int", "description": "Current step number (0-indexed)"}, {"name": "budget_remaining", "type": "int", "description": "Steps left before forced termination"}, {"name": "action_history", "type": "list[str]", "description": "Summary of previous actions taken"} ], "description": "Rich observation from environment to agent. Extends openenv Observation base." }, { "name": "QuestionRecord", "fields": [ {"name": "question_id", "type": "str", "description": "Unique identifier for the question"}, {"name": "question_text", "type": "str", "description": "Natural language question"}, {"name": "database_name", "type": "str", "description": "Which SQLite database to load (matches db_id)"}, {"name": "gold_sql", "type": "str", "description": "Reference SQL query (hidden from agent)"}, {"name": "gold_answer", "type": "str", "description": "Expected answer (hidden from agent)"}, {"name": "answer_type", "type": "str", "description": "One of: integer, float, string, list"}, {"name": "difficulty", "type": "str", "description": "One of: easy, medium, hard"}, {"name": "tables_involved", "type": "list[str]", "description": "Tables referenced by gold query"} ], "description": "Metadata for a single question from the Spider dataset. Server-side only." 
}, { "name": "EpisodeContext", "fields": [ {"name": "episode_id", "type": "str", "description": "Unique episode identifier"}, {"name": "db_connection", "type": "sqlite3.Connection", "description": "Read-only connection to episode database"}, {"name": "question_record", "type": "QuestionRecord", "description": "The selected question for this episode"}, {"name": "step_count", "type": "int", "description": "Current step number"}, {"name": "budget", "type": "int", "description": "Steps remaining (default 15)"}, {"name": "described_tables", "type": "set[str]", "description": "Tables the agent has DESCRIBEd"}, {"name": "action_log", "type": "list[str]", "description": "Human-readable action summaries"}, {"name": "done", "type": "bool", "description": "Whether the episode has ended"}, {"name": "gold_answer", "type": "str | None", "description": "Computed at reset by running gold_sql"} ], "description": "Per-episode server-side state. Never sent to agent." } ], "functions": [ { "name": "SQLEnvironment.__init__", "params": [ {"name": "questions_path", "type": "str", "description": "Path to Spider questions JSON file"}, {"name": "db_dir", "type": "str", "description": "Directory containing Spider SQLite database files"}, {"name": "tokenizer", "type": "ModelTokenizer", "description": "OpenEnv tokenizer for compatibility"}, {"name": "step_budget", "type": "int", "default": "15", "description": "Maximum steps per episode"} ], "returns": "None", "raises": ["FileNotFoundError", "ValueError"], "description": "Initialize environment with question dataset and database directory. Loads questions at init time." 
}, { "name": "SQLEnvironment.reset", "params": [ {"name": "seed", "type": "int | None", "default": "None", "description": "Random seed for question selection"}, {"name": "episode_id", "type": "str | None", "default": "None", "description": "Optional episode identifier"} ], "returns": "SQLObservation", "raises": ["FileNotFoundError"], "description": "Pick random question, open read-only SQLite, compute gold answer, return initial observation with question text and table names." }, { "name": "SQLEnvironment.step", "params": [ {"name": "action", "type": "SQLAction", "description": "Structured action with action_type and argument"}, {"name": "timeout_s", "type": "float", "default": "30", "description": "Overall step timeout"} ], "returns": "SQLObservation", "raises": [], "description": "Dispatch action to handler, update episode context, enforce budget, return observation. Never raises -- errors are in observation.error field." }, { "name": "SQLEnvironment._execute_sql", "params": [ {"name": "sql", "type": "str", "description": "SQL query to execute"}, {"name": "timeout_s", "type": "float", "default": "5.0", "description": "Maximum execution time"} ], "returns": "list[tuple]", "raises": ["ValueError", "sqlite3.OperationalError"], "description": "Sandboxed SQL execution with SELECT-only validation, read-only connection, timeout via progress_handler, and result truncation." }, { "name": "SQLEnvironment._handle_describe", "params": [ {"name": "table_name", "type": "str", "description": "Name of table to describe"} ], "returns": "str", "description": "Return column names, types, and row count for a table. Returns error string if table not found, listing available tables." 
}, { "name": "SQLEnvironment._handle_sample", "params": [ {"name": "table_name", "type": "str", "description": "Name of table to sample"}, {"name": "limit", "type": "int", "default": "5", "description": "Number of rows to return"} ], "returns": "str", "description": "Execute SELECT * FROM table LIMIT N via _execute_sql, return formatted rows." }, { "name": "SQLEnvironment._handle_query", "params": [ {"name": "sql", "type": "str", "description": "SQL SELECT query to execute"} ], "returns": "str", "description": "Validate SELECT-only, execute with 5s timeout, format results, truncate to 20 rows with indicator." }, { "name": "SQLEnvironment._handle_answer", "params": [ {"name": "value", "type": "str", "description": "Agent's answer string"} ], "returns": "tuple[bool, float]", "description": "Compare to gold answer (case-insensitive string comparison for MVP). Returns (is_correct, reward). Sets episode done=True." }, { "name": "SQLEnvironment._build_observation", "params": [], "returns": "SQLObservation", "description": "Construct rich SQLObservation from current EpisodeContext state." }, { "name": "SQLEnvironment._load_questions", "params": [ {"name": "path", "type": "str", "description": "Path to questions JSON file"} ], "returns": "list[QuestionRecord]", "raises": ["FileNotFoundError", "ValueError"], "description": "Load Spider question JSON and parse into QuestionRecord list." }, { "name": "SQLEnvironment._open_db", "params": [ {"name": "db_name", "type": "str", "description": "Database name (matches db_id in questions)"} ], "returns": "sqlite3.Connection", "raises": ["FileNotFoundError"], "description": "Open read-only SQLite connection using URI file:{path}?mode=ro." 
} ], "api_endpoints": [ { "method": "POST", "path": "/reset", "request_body": { "type": "object", "fields": ["seed: int | null", "episode_id: str | null"] }, "response_body": { "type": "SQLObservation" }, "errors": [ {"status": 500, "when": "Database file not found or questions file missing"} ] }, { "method": "POST", "path": "/step", "request_body": { "type": "SQLAction", "fields": ["action_type: str", "argument: str"] }, "response_body": { "type": "SQLObservation" }, "errors": [ {"status": 422, "when": "Invalid action schema (missing action_type or argument)"} ] } ] }, "data_flow": { "primary_flow": [ "Agent calls POST /reset to start a new episode", "Environment picks a random QuestionRecord from loaded questions", "Environment opens read-only SQLite connection for the question's database", "Environment executes gold_sql to compute gold_answer (stored server-side)", "Environment creates EpisodeContext with step_count=0, budget=15", "Environment returns SQLObservation with question text and table names (columns hidden)", "Agent calls POST /step with SQLAction (DESCRIBE/SAMPLE/QUERY/ANSWER)", "Environment dispatches to appropriate handler based on action_type", "Handler executes against SQLite (DESCRIBE/SAMPLE/QUERY) or compares answer (ANSWER)", "Environment updates EpisodeContext: step_count++, budget-- (except ANSWER)", "Environment checks budget exhaustion and sets done=True if budget==0", "Environment returns SQLObservation with result/error, updated budget, action_history" ], "alternative_flows": [ { "name": "ANSWER submission", "trigger": "Agent sends action_type=ANSWER", "steps": [ "Compare argument to gold_answer (case-insensitive, stripped)", "Set done=True, reward=1.0 (correct) or 0.0 (incorrect)", "Do NOT decrement budget", "Return terminal observation" ] }, { "name": "Budget exhaustion", "trigger": "Budget reaches 0 after a DESCRIBE/SAMPLE/QUERY step", "steps": [ "Set done=True, reward=0.0", "Return terminal observation with done=True" ] }, { "name": 
"Invalid SQL", "trigger": "Agent sends non-SELECT query or malformed SQL", "steps": [ "Reject at SELECT-only validation or catch sqlite3 error", "Set observation.error with descriptive message", "Step still counts against budget", "Return observation with error field populated" ] }, { "name": "Query timeout", "trigger": "SQL execution exceeds 5 seconds", "steps": [ "Interrupt query via sqlite3 progress_handler", "Set observation.error to timeout message", "Step counts against budget" ] }, { "name": "Table not found", "trigger": "DESCRIBE/SAMPLE with nonexistent table name", "steps": [ "Return error listing available table names", "Step counts against budget" ] } ] }, "error_handling": { "error_types": [ { "name": "InvalidActionType", "when": "action_type not in {DESCRIBE, SAMPLE, QUERY, ANSWER}", "message_template": "Unknown action type '{action_type}'. Valid types: DESCRIBE, SAMPLE, QUERY, ANSWER" }, { "name": "TableNotFound", "when": "DESCRIBE or SAMPLE with table name not in database", "message_template": "Table '{table_name}' not found. Available tables: {table_list}" }, { "name": "NonSelectQuery", "when": "QUERY action with SQL that is not a SELECT statement", "message_template": "Only SELECT queries are allowed. 
Got: {first_keyword}" }, { "name": "SQLSyntaxError", "when": "SELECT query with invalid syntax", "message_template": "SQL error: {sqlite3_error_message}" }, { "name": "QueryTimeout", "when": "SQL execution exceeds the 5-second timeout", "message_template": "Query timed out after 5.0 seconds" }, { "name": "EmptyArgument", "when": "argument field is empty or whitespace-only", "message_template": "Argument cannot be empty for {action_type}" }, { "name": "DatabaseNotFound", "when": "SQLite file not found during reset", "message_template": "Database '{db_name}' not found in {db_dir}" } ], "retry_strategy": null }, "dependencies": { "external": [ "sqlite3 (stdlib)", "pydantic", "openenv (core.env_server)", "torch" ], "internal": [ "models.py", "server/sql_environment.py", "server/app.py", "client.py", "data/databases/models.py", "data/questions/student_assessment.json" ] } }