{ "$schema": "autocode-verification-input-v1", "feature_id": "F002", "spec_path": "specs/F002-IMPLEMENTATION_SPEC.md", "generated": "2026-03-27T12:00:00Z", "verification_mode": "mvp", "overview": { "summary": "Type-aware answer verification for SQLEnv that replaces naive string comparison with dispatched comparers for integer (exact), float (1% tolerance), string (case-insensitive, whitespace-normalized), and list (order-insensitive) answer types. Falls back to string comparison when answer_type is missing or unrecognized.", "goal": "Ensure correct agent answers are not rejected due to trivial formatting, type coercion, or ordering differences." }, "interfaces": { "types": [ { "name": "EpisodeContext", "fields": [ {"name": "gold_rows", "type": "list[tuple] | None", "optional": true, "description": "Raw SQL result rows for accurate list comparison by verifier"} ], "description": "Per-episode server-side state. Modified to add gold_rows field alongside existing gold_answer." } ], "functions": [ { "name": "verify_answer", "params": [ {"name": "predicted", "type": "str", "description": "Agent's submitted answer string"}, {"name": "gold", "type": "str", "description": "Gold answer as formatted string"}, {"name": "answer_type", "type": "str | None", "default": "None", "description": "One of 'integer', 'float', 'string', 'list', or None"}, {"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw SQL result rows for list comparison"} ], "returns": "bool", "raises": [], "description": "Compare agent answer against gold answer using type-specific comparison. Dispatches by answer_type; falls back to string comparison for None/unknown types." }, { "name": "_compare_integer", "params": [ {"name": "predicted", "type": "str", "description": "Agent value"}, {"name": "gold", "type": "str", "description": "Gold value"} ], "returns": "bool", "description": "Exact integer match after coercing both sides via int(float(x)). Returns False on ValueError." 
}, { "name": "_compare_float", "params": [ {"name": "predicted", "type": "str", "description": "Agent value"}, {"name": "gold", "type": "str", "description": "Gold value"}, {"name": "tolerance", "type": "float", "default": "0.01", "description": "Relative tolerance (1% default)"} ], "returns": "bool", "description": "Float comparison with relative tolerance. Uses abs(pred - gold) <= tolerance * abs(gold). For gold==0, uses absolute tolerance 1e-9. Returns False on ValueError." }, { "name": "_compare_string", "params": [ {"name": "predicted", "type": "str", "description": "Agent value"}, {"name": "gold", "type": "str", "description": "Gold value"} ], "returns": "bool", "description": "Case-insensitive, whitespace-normalized string comparison." }, { "name": "_compare_list", "params": [ {"name": "predicted", "type": "str", "description": "Agent value"}, {"name": "gold", "type": "str", "description": "Gold value as formatted string"}, {"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw rows for accurate comparison"} ], "returns": "bool", "description": "Order-insensitive set comparison. Parses both sides into normalized string sets and compares them for equality." 
} ], "api_endpoints": [] }, "data_flow": { "primary_flow": [ "Agent sends ANSWER action with value string", "step() dispatches to _handle_answer(value)", "_handle_answer() calls verify_answer(predicted, gold, answer_type, gold_rows)", "verify_answer() dispatches to type-specific comparer based on answer_type", "Comparer returns bool; _handle_answer returns (bool, float reward)" ], "alternative_flows": [ { "name": "Unknown or missing answer_type", "trigger": "answer_type is None or not in known set", "steps": [ "verify_answer receives answer_type=None or an unrecognized value", "Falls back to _compare_string(predicted, gold)", "Returns bool" ] }, { "name": "Type coercion failure", "trigger": "predicted cannot be parsed as int or float", "steps": [ "_compare_integer or _compare_float catches ValueError", "Returns False (answer treated as incorrect)" ] }, { "name": "Empty or None input", "trigger": "predicted is None or an empty string after strip", "steps": [ "verify_answer returns False immediately" ] } ] }, "error_handling": { "error_types": [ { "name": "ValueError", "when": "Predicted value cannot be coerced to int/float during comparison; caught inside the comparer, which returns False (never propagated to the caller)" }, { "name": "RuntimeError", "when": "_handle_answer called with no active episode (existing behavior, unchanged)" } ], "retry_strategy": null }, "dependencies": { "external": [], "internal": [ {"name": "models.EpisodeContext", "usage": "gold_rows field added for verifier input"}, {"name": "models.QuestionRecord", "usage": "answer_type field read to determine comparison strategy"}, {"name": "server.sql_environment._handle_answer", "usage": "Modified to call verify_answer instead of inline comparison"} ] } }