Spaces:

hjerpe
/

sql_env

Running

App Files Files Community

sql_env / specs /F002-VERIFICATION_INPUT.json

hjerpe

Upload folder using huggingface_hub

5dd1bb4 verified 22 days ago

raw

history blame contribute delete

5.64 kB

	{
	"$schema": "autocode-verification-input-v1",
	"feature_id": "F002",
	"spec_path": "specs/F002-IMPLEMENTATION_SPEC.md",
	"generated": "2026-03-27T12:00:00Z",
	"verification_mode": "mvp",

	"overview": {
	"summary": "Type-aware answer verification for SQLEnv that replaces naive string comparison with dispatched comparers for integer (exact), float (1% tolerance), string (case-insensitive), and list (order-insensitive) answer types. Falls back to string comparison when answer_type is missing.",
	"goal": "Ensure correct agent answers are not rejected due to trivial formatting, type coercion, or ordering differences."
	},

	"interfaces": {
	"types": [
	{
	"name": "EpisodeContext",
	"fields": [
	{"name": "gold_rows", "type": "list[tuple] | None", "optional": true, "description": "Raw SQL result rows for accurate list comparison by verifier"}
	],
	"description": "Per-episode server-side state. Modified to add gold_rows field alongside existing gold_answer."
	}
	],
	"functions": [
	{
	"name": "verify_answer",
	"params": [
	{"name": "predicted", "type": "str", "description": "Agent's submitted answer string"},
	{"name": "gold", "type": "str", "description": "Gold answer as formatted string"},
	{"name": "answer_type", "type": "str | None", "default": "None", "description": "One of 'integer', 'float', 'string', 'list', or None"},
	{"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw SQL result rows for list comparison"}
	],
	"returns": "bool",
	"raises": [],
	"description": "Compare agent answer against gold answer using type-specific comparison. Dispatches by answer_type; falls back to string comparison for None/unknown types."
	},
	{
	"name": "_compare_integer",
	"params": [
	{"name": "predicted", "type": "str", "description": "Agent value"},
	{"name": "gold", "type": "str", "description": "Gold value"}
	],
	"returns": "bool",
	"description": "Exact integer match after coercing both sides via int(float(x)). Returns False on ValueError."
	},
	{
	"name": "_compare_float",
	"params": [
	{"name": "predicted", "type": "str", "description": "Agent value"},
	{"name": "gold", "type": "str", "description": "Gold value"},
	{"name": "tolerance", "type": "float", "default": "0.01", "description": "Relative tolerance (1% default)"}
	],
	"returns": "bool",
	"description": "Float comparison with relative tolerance. Uses abs(pred - gold) <= tolerance * abs(gold). For gold==0, uses absolute tolerance 1e-9. Returns False on ValueError."
	},
	{
	"name": "_compare_string",
	"params": [
	{"name": "predicted", "type": "str", "description": "Agent value"},
	{"name": "gold", "type": "str", "description": "Gold value"}
	],
	"returns": "bool",
	"description": "Case-insensitive, whitespace-normalized string comparison."
	},
	{
	"name": "_compare_list",
	"params": [
	{"name": "predicted", "type": "str", "description": "Agent value"},
	{"name": "gold", "type": "str", "description": "Gold value as formatted string"},
	{"name": "gold_rows", "type": "list[tuple] | None", "default": "None", "description": "Raw rows for accurate comparison"}
	],
	"returns": "bool",
	"description": "Order-insensitive set comparison. Parses both sides into normalized string sets and compares equality."
	}
	],
	"api_endpoints": []
	},

	"data_flow": {
	"primary_flow": [
	"Agent sends ANSWER action with value string",
	"step() dispatches to _handle_answer(value)",
	"_handle_answer() calls verify_answer(predicted, gold, answer_type, gold_rows)",
	"verify_answer() dispatches to type-specific comparer based on answer_type",
	"Comparer returns bool; _handle_answer returns (bool, float reward)"
	],
	"alternative_flows": [
	{
	"name": "Unknown or missing answer_type",
	"trigger": "answer_type is None or not in known set",
	"steps": [
	"verify_answer receives answer_type=None",
	"Falls back to _compare_string(predicted, gold)",
	"Returns bool"
	]
	},
	{
	"name": "Type coercion failure",
	"trigger": "predicted cannot be parsed as int or float",
	"steps": [
	"_compare_integer or _compare_float catches ValueError",
	"Returns False (answer treated as incorrect)"
	]
	},
	{
	"name": "Empty or None input",
	"trigger": "predicted is empty string after strip",
	"steps": [
	"verify_answer returns False immediately"
	]
	}
	]
	},

	"error_handling": {
	"error_types": [
	{
	"name": "ValueError",
	"when": "Predicted value cannot be coerced to int/float during comparison; caught inside _compare_integer/_compare_float, which return False instead of propagating"
	},
	{
	"name": "RuntimeError",
	"when": "_handle_answer called with no active episode (existing behavior, unchanged)"
	}
	],
	"retry_strategy": null
	},

	"dependencies": {
	"external": [],
	"internal": [
	{"name": "models.EpisodeContext", "usage": "gold_rows field added for verifier input"},
	{"name": "models.QuestionRecord", "usage": "answer_type field read to determine comparison strategy"},
	{"name": "server.sql_environment._handle_answer", "usage": "Modified to call verify_answer instead of inline comparison"}
	]
	}
	}