| { |
| "$schema": "autocode-verification-input-v1", |
| "feature_id": "F002", |
| "spec_path": "specs/F002-IMPLEMENTATION_SPEC.md", |
| "generated": "2026-03-27T12:00:00Z", |
| "verification_mode": "mvp", |
|
|
| "overview": { |
| "summary": "Type-aware answer verification for SQLEnv that replaces naive string comparison with dispatched comparers for integer (exact), float (1% tolerance), string (case-insensitive), and list (order-insensitive) answer types. Falls back to string comparison when answer_type is missing or unrecognized.", |
| "goal": "Ensure correct agent answers are not rejected due to trivial formatting, type coercion, or ordering differences." |
| }, |
|
|
| "interfaces": { |
| "types": [ |
| { |
| "name": "EpisodeContext", |
| "fields": [ |
| {"name": "gold_rows", "type": "list[tuple] | None", "optional": true, "description": "Raw SQL result rows for accurate list comparison by verifier"} |
| ], |
| "description": "Per-episode server-side state. Modified to add gold_rows field alongside existing gold_answer." |
| } |
| ], |
| "functions": [ |
| { |
| "name": "verify_answer", |
| "params": [ |
| {"name": "predicted", "type": "str", "description": "Agent's submitted answer string"}, |
| {"name": "gold", "type": "str", "description": "Gold answer as formatted string"}, |
| {"name": "answer_type", "type": "str | None", "default": "None", "description": "One of 'integer', 'float', 'string', 'list', or None"}, |
| {"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw SQL result rows for list comparison"} |
| ], |
| "returns": "bool", |
| "raises": [], |
| "description": "Compare agent answer against gold answer using type-specific comparison. Dispatches by answer_type; falls back to string comparison for None/unknown types." |
| }, |
| { |
| "name": "_compare_integer", |
| "params": [ |
| {"name": "predicted", "type": "str", "description": "Agent value"}, |
| {"name": "gold", "type": "str", "description": "Gold value"} |
| ], |
| "returns": "bool", |
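| "description": "Exact integer match after coercing both sides via int(float(x)). Returns False on ValueError. Note: int(float(x)) raises OverflowError for infinite inputs (e.g. 'inf'); this must also yield False so that verify_answer never raises." |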
| }, |
| { |
| "name": "_compare_float", |
| "params": [ |
| {"name": "predicted", "type": "str", "description": "Agent value"}, |
| {"name": "gold", "type": "str", "description": "Gold value"}, |
| {"name": "tolerance", "type": "float", "default": "0.01", "description": "Relative tolerance (1% default)"} |
| ], |
| "returns": "bool", |
| "description": "Float comparison with relative tolerance. Uses abs(pred - gold) <= tolerance * abs(gold). For gold==0, uses absolute tolerance 1e-9. Returns False on ValueError." |
| }, |
| { |
| "name": "_compare_string", |
| "params": [ |
| {"name": "predicted", "type": "str", "description": "Agent value"}, |
| {"name": "gold", "type": "str", "description": "Gold value"} |
| ], |
| "returns": "bool", |
| "description": "Case-insensitive, whitespace-normalized string comparison." |
| }, |
| { |
| "name": "_compare_list", |
| "params": [ |
| {"name": "predicted", "type": "str", "description": "Agent value"}, |
| {"name": "gold", "type": "str", "description": "Gold value as formatted string"}, |
| {"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw rows for accurate comparison"} |
| ], |
| "returns": "bool", |
| "description": "Order-insensitive set comparison. Parses both sides into normalized string sets and compares equality." |
| } |
| ], |
| "api_endpoints": [] |
| }, |
|
|
| "data_flow": { |
| "primary_flow": [ |
| "Agent sends ANSWER action with value string", |
| "step() dispatches to _handle_answer(value)", |
| "_handle_answer() calls verify_answer(predicted, gold, answer_type, gold_rows)", |
| "verify_answer() dispatches to type-specific comparer based on answer_type", |
| "Comparer returns bool; _handle_answer returns (bool, float reward)" |
| ], |
| "alternative_flows": [ |
| { |
| "name": "Unknown or missing answer_type", |
| "trigger": "answer_type is None or not in known set", |
| "steps": [ |
| "verify_answer receives an answer_type that is None or not in the known set", |
| "Falls back to _compare_string(predicted, gold)", |
| "Returns bool" |
| ] |
| }, |
| { |
| "name": "Type coercion failure", |
| "trigger": "predicted cannot be parsed as int or float", |
| "steps": [ |
| "_compare_integer or _compare_float catches ValueError", |
| "Returns False (answer treated as incorrect)" |
| ] |
| }, |
| { |
| "name": "Empty or None input", |
| "trigger": "predicted is None or is an empty string after strip", |
| "steps": [ |
| "verify_answer returns False immediately" |
| ] |
| } |
| ] |
| }, |
|
|
| "error_handling": { |
| "error_types": [ |
| { |
| "name": "ValueError", |
| "when": "Predicted value cannot be coerced to int/float during comparison; caught inside the comparer, which returns False (not propagated to callers of verify_answer)" |
| }, |
| { |
| "name": "RuntimeError", |
| "when": "_handle_answer called with no active episode (existing behavior, unchanged)" |
| } |
| ], |
| "retry_strategy": null |
| }, |
|
|
| "dependencies": { |
| "external": [], |
| "internal": [ |
| {"name": "models.EpisodeContext", "usage": "gold_rows field added for verifier input"}, |
| {"name": "models.QuestionRecord", "usage": "answer_type field read to determine comparison strategy"}, |
| {"name": "server.sql_environment._handle_answer", "usage": "Modified to call verify_answer instead of inline comparison"} |
| ] |
| } |
| } |
|
|