{ "$schema": "autocode-verification-input-v1", "feature_id": "F004", "spec_path": "specs/F004-IMPLEMENTATION_SPEC.md", "generated": "2026-03-24T12:00:00Z", "verification_mode": "mvp", "overview": { "summary": "Expand the question dataset from 53 single-database questions to 100+ curated questions across 10 Spider databases. Each question is enriched with difficulty, answer_type, gold_answer, and tables_involved metadata. The dataset is split into train (70%) and eval (30%) partitions, with each question's partition taken from Spider's own train/validation split. A standalone curation script produces the output JSON files; SQLite database files are downloaded on demand and gitignored.", "goal": "Enable training on diverse databases and question types to prevent overfitting to one schema, with pre-computed gold answers to improve training throughput." }, "interfaces": { "types": [ { "name": "EnrichedQuestionRecord", "fields": [ {"name": "question_id", "type": "str", "description": "Unique ID in format {db_id}_{split}_{index:03d}"}, {"name": "question_text", "type": "str", "description": "Natural language question"}, {"name": "database_name", "type": "str", "description": "Spider db_id matching directory in data/databases/"}, {"name": "gold_sql", "type": "str", "description": "Reference SQL query"}, {"name": "gold_answer", "type": "Any", "description": "Pre-computed result of executing gold_sql"}, {"name": "answer_type", "type": "str", "description": "One of: integer, float, string, list, table"}, {"name": "difficulty", "type": "str", "description": "One of: easy, medium, hard"}, {"name": "tables_involved", "type": "list[str]", "description": "Table names referenced in gold_sql"}, {"name": "split", "type": "str", "description": "One of: train, eval"} ], "description": "A single enriched question record in the output JSON files. Field names match the QuestionRecord conceptual design in models.py." 
} ], "functions": [ { "name": "download_spider_databases", "params": [ {"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"}, {"name": "output_dir", "type": "Path", "description": "Base directory for database files"} ], "returns": "dict[str, Path]", "raises": ["FileNotFoundError"], "description": "Download Spider SQLite database files for specified db_ids. Skips existing files." }, { "name": "load_spider_questions", "params": [ {"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"} ], "returns": "list[dict]", "raises": ["ConnectionError"], "description": "Load raw Spider questions from HuggingFace for specified databases, both train and validation splits." }, { "name": "compute_gold_answer", "params": [ {"name": "gold_sql", "type": "str", "description": "Reference SQL query"}, {"name": "db_path", "type": "Path", "description": "Path to SQLite database file"} ], "returns": "Any", "raises": ["sqlite3.Error"], "description": "Execute gold SQL against SQLite database and return the result." }, { "name": "classify_answer_type", "params": [ {"name": "gold_answer", "type": "Any", "description": "Pre-computed answer value"} ], "returns": "str", "description": "Classify answer as integer, float, string, list, or table based on shape and type." }, { "name": "extract_tables_involved", "params": [ {"name": "gold_sql", "type": "str", "description": "Reference SQL query"} ], "returns": "list[str]", "description": "Extract sorted unique table names from SQL query using regex parsing." }, { "name": "classify_difficulty", "params": [ {"name": "tables_involved", "type": "list[str]", "description": "Tables referenced in query"} ], "returns": "str", "description": "Assign difficulty (easy/medium/hard) based on table count: 1-2=easy, 3=medium, 4+=hard." 
}, { "name": "assign_splits", "params": [ {"name": "questions", "type": "list[dict]", "description": "Enriched questions with spider_split key"} ], "returns": "list[dict]", "description": "Assign train/eval splits based on Spider's own train/validation split." }, { "name": "validate_dataset", "params": [ {"name": "questions", "type": "list[dict]", "description": "Full enriched dataset"}, {"name": "db_paths", "type": "dict[str, Path]", "description": "Mapping of db_id to SQLite path"} ], "returns": "list[str]", "raises": ["sqlite3.Error"], "description": "Validate dataset: all fields present, gold_sql executes, gold_answer matches, no duplicate IDs, clean splits, difficulty distribution ~40/40/20." } ], "api_endpoints": [] }, "data_flow": { "primary_flow": [ "Read db_list.json for target database IDs", "Download Spider SQLite databases to data/databases/{db_id}/{db_id}.sqlite", "Load raw Spider questions from HuggingFace for target db_ids (train + validation splits)", "For each question: execute gold_sql against SQLite to compute gold_answer", "Classify answer_type from gold_answer shape and type", "Extract tables_involved from gold_sql via regex", "Classify difficulty from tables_involved count", "Assign train/eval split from Spider's own split", "Generate question_id in format {db_id}_{split}_{index:03d}", "Validate full dataset (fields, execution, deduplication, distribution)", "Write questions_train.json and questions_eval.json" ], "alternative_flows": [ { "name": "Gold SQL execution failure", "trigger": "gold_sql raises sqlite3.Error against its database", "steps": [ "Log warning with db_id and error", "Skip the question (exclude from dataset)", "Continue processing remaining questions" ] }, { "name": "Validate-only mode", "trigger": "Script invoked with --validate flag", "steps": [ "Load existing questions_train.json and questions_eval.json", "Locate SQLite databases in data/databases/", "Run validate_dataset() on loaded data", "Print validation results and exit with 0 (valid) or 1 (invalid)"
] } ] }, "error_handling": { "error_types": [ { "name": "FileNotFoundError", "when": "SQLite database file cannot be downloaded for a given db_id", "message_template": "Failed to download database: {db_id}" }, { "name": "sqlite3.OperationalError", "when": "Gold SQL uses an unsupported SQLite feature", "message_template": "SQL execution failed for {db_id}: {error}" }, { "name": "ConnectionError", "when": "HuggingFace dataset download fails", "message_template": "Failed to download Spider dataset: {error}" }, { "name": "ValidationError", "when": "Dataset fails one or more validation checks", "message_template": "Validation failed with {count} errors" } ], "retry_strategy": { "enabled": true, "max_attempts": 2, "backoff": "linear" } }, "dependencies": { "external": [ "datasets (HuggingFace)", "sqlite3 (stdlib)" ], "internal": [ "models.py (QuestionRecord conceptual design for field names)", "data/questions/db_list.json (database configuration)" ] } }