{
  "$schema": "autocode-verification-input-v1",
  "feature_id": "F004",
  "spec_path": "specs/F004-IMPLEMENTATION_SPEC.md",
  "generated": "2026-03-24T12:00:00Z",
  "verification_mode": "mvp",

  "overview": {
    "summary": "Expand the question dataset from 53 single-database questions to 100+ curated questions across 10 Spider databases. Each question is enriched with difficulty, answer_type, gold_answer, and tables_involved metadata. The dataset is split into train (70%) and eval (30%) partitions. A standalone curation script produces the output JSON files; SQLite database files are downloaded on-demand and gitignored.",
    "goal": "Enable training on diverse databases and question types to prevent overfitting to one schema, with pre-computed gold answers to improve training throughput."
  },

  "interfaces": {
    "types": [
      {
        "name": "EnrichedQuestionRecord",
        "fields": [
          {"name": "question_id", "type": "str", "description": "Unique ID in format {db_id}_{split}_{index:03d}"},
          {"name": "question_text", "type": "str", "description": "Natural language question"},
          {"name": "database_name", "type": "str", "description": "Spider db_id matching directory in data/databases/"},
          {"name": "gold_sql", "type": "str", "description": "Reference SQL query"},
          {"name": "gold_answer", "type": "Any", "description": "Pre-computed result of executing gold_sql"},
          {"name": "answer_type", "type": "str", "description": "One of: integer, float, string, list, table"},
          {"name": "difficulty", "type": "str", "description": "One of: easy, medium, hard"},
          {"name": "tables_involved", "type": "list[str]", "description": "Table names referenced in gold_sql"},
          {"name": "split", "type": "str", "description": "One of: train, eval"}
        ],
        "description": "A single enriched question record in the output JSON files. Field names match QuestionRecord conceptual design in models.py."
      }
    ],
    "functions": [
      {
        "name": "download_spider_databases",
        "params": [
          {"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"},
          {"name": "output_dir", "type": "Path", "description": "Base directory for database files"}
        ],
        "returns": "dict[str, Path]",
        "raises": ["FileNotFoundError"],
        "description": "Download Spider SQLite database files for specified db_ids. Skips existing files."
      },
      {
        "name": "load_spider_questions",
        "params": [
          {"name": "db_ids", "type": "list[str]", "description": "List of Spider database identifiers"}
        ],
        "returns": "list[dict]",
        "raises": ["ConnectionError"],
        "description": "Load raw Spider questions from HuggingFace for specified databases, both train and validation splits."
      },
      {
        "name": "compute_gold_answer",
        "params": [
          {"name": "gold_sql", "type": "str", "description": "Reference SQL query"},
          {"name": "db_path", "type": "Path", "description": "Path to SQLite database file"}
        ],
        "returns": "Any",
        "raises": ["sqlite3.Error"],
        "description": "Execute gold SQL against SQLite database and return the result."
      },
      {
        "name": "classify_answer_type",
        "params": [
          {"name": "gold_answer", "type": "Any", "description": "Pre-computed answer value"}
        ],
        "returns": "str",
        "description": "Classify answer as integer, float, string, list, or table based on shape and type."
      },
      {
        "name": "extract_tables_involved",
        "params": [
          {"name": "gold_sql", "type": "str", "description": "Reference SQL query"}
        ],
        "returns": "list[str]",
        "description": "Extract sorted unique table names from SQL query using regex parsing."
      },
      {
        "name": "classify_difficulty",
        "params": [
          {"name": "tables_involved", "type": "list[str]", "description": "Tables referenced in query"}
        ],
        "returns": "str",
        "description": "Assign difficulty (easy/medium/hard) based on table count: 1-2=easy, 3=medium, 4+=hard."
      },
      {
        "name": "assign_splits",
        "params": [
          {"name": "questions", "type": "list[dict]", "description": "Enriched questions with spider_split key"}
        ],
        "returns": "list[dict]",
        "description": "Assign train/eval splits based on Spider's own train/validation split."
      },
      {
        "name": "validate_dataset",
        "params": [
          {"name": "questions", "type": "list[dict]", "description": "Full enriched dataset"},
          {"name": "db_paths", "type": "dict[str, Path]", "description": "Mapping of db_id to SQLite path"}
        ],
        "returns": "list[str]",
        "raises": ["sqlite3.Error"],
        "description": "Validate dataset: all fields present, gold_sql executes, gold_answer matches, no duplicate IDs, clean splits, difficulty distribution ~40/40/20."
      }
    ],
    "api_endpoints": []
  },

  "data_flow": {
    "primary_flow": [
      "Read db_list.json for target database IDs",
      "Download Spider SQLite databases to data/databases/{db_id}/{db_id}.sqlite",
      "Load raw Spider questions from HuggingFace for target db_ids (train + validation splits)",
      "For each question: execute gold_sql against SQLite to compute gold_answer",
      "Classify answer_type from gold_answer shape and type",
      "Extract tables_involved from gold_sql via regex",
      "Classify difficulty from tables_involved count",
      "Assign train/eval split from Spider's own split",
      "Generate question_id in format {db_id}_{split}_{index:03d}",
      "Validate full dataset (fields, execution, deduplication, distribution)",
      "Write questions_train.json and questions_eval.json"
    ],
    "alternative_flows": [
      {
        "name": "Gold SQL execution failure",
        "trigger": "gold_sql raises sqlite3.Error against its database",
        "steps": [
          "Log warning with db_id and error",
          "Skip the question (exclude from dataset)",
          "Continue processing remaining questions"
        ]
      },
      {
        "name": "Validate-only mode",
        "trigger": "Script invoked with --validate flag",
        "steps": [
          "Load existing questions_train.json and questions_eval.json",
          "Locate SQLite databases in data/databases/",
          "Run validate_dataset() on loaded data",
          "Print validation results and exit with 0 (valid) or 1 (invalid)"
        ]
      }
    ]
  },

  "error_handling": {
    "error_types": [
      {
        "name": "FileNotFoundError",
        "when": "SQLite database file cannot be downloaded for a given db_id",
        "message_template": "Failed to download database: {db_id}"
      },
      {
        "name": "sqlite3.OperationalError",
        "when": "Gold SQL uses an unsupported SQLite feature",
        "message_template": "SQL execution failed for {db_id}: {error}"
      },
      {
        "name": "ConnectionError",
        "when": "HuggingFace dataset download fails",
        "message_template": "Failed to download Spider dataset: {error}"
      },
      {
        "name": "ValidationError",
        "when": "Dataset fails one or more validation checks",
        "message_template": "Validation failed with {count} errors"
      }
    ],
    "retry_strategy": {
      "enabled": true,
      "max_attempts": 2,
      "backoff": "linear"
    }
  },

  "dependencies": {
    "external": [
      "datasets (HuggingFace)",
      "sqlite3 (stdlib)"
    ],
    "internal": [
      "models.py (QuestionRecord conceptual design for field names)",
      "data/questions/db_list.json (database configuration)"
    ]
  }
}