{ "$schema": "autocode-verification-input-v1", "feature_id": "F005", "spec_path": "specs/F005-IMPLEMENTATION_SPEC.md", "generated": "2026-03-27T12:00:00Z", "verification_mode": "mvp", "overview": { "summary": "Automated evaluation wrapper that runs N episodes with a given policy against SQLEnvironment and returns structured metrics (success_rate, avg_reward, avg_steps). Includes a built-in RandomPolicy for instant baseline comparison. Results are collected incrementally so partial failures do not lose completed episode data.", "goal": "Enable single-command evaluation: 'How does policy X perform over 100 episodes?' with structured output for training comparison (random vs trained)." }, "interfaces": { "types": [ { "name": "Policy", "description": "Protocol (structural subtype) for any evaluation policy. Any object with a matching select_action method satisfies this interface.", "fields": [ {"name": "select_action", "type": "(observation: SQLObservation) -> SQLAction", "description": "Choose an action given the current observation"} ] }, { "name": "EpisodeResult", "description": "Per-episode evaluation metrics. Frozen dataclass.", "fields": [ {"name": "episode_index", "type": "int", "description": "0-based episode number"}, {"name": "correct", "type": "bool", "description": "Whether the ANSWER action matched the gold answer"}, {"name": "total_reward", "type": "float", "description": "Cumulative reward for the episode"}, {"name": "steps", "type": "int", "description": "Number of steps taken in the episode"}, {"name": "error", "type": "str | None", "optional": true, "description": "Error message if episode failed, None otherwise"} ] }, { "name": "EvaluationResult",
"description": "Aggregate evaluation metrics with per-episode breakdown. Frozen dataclass.", "fields": [ {"name": "success_rate", "type": "float", "description": "Fraction of correct episodes in [0.0, 1.0]"}, {"name": "avg_reward", "type": "float", "description": "Mean total_reward across completed episodes"}, {"name": "avg_steps", "type": "float", "description": "Mean steps across completed episodes"}, {"name": "n_episodes", "type": "int", "description": "Total number of episodes attempted"}, {"name": "n_completed", "type": "int", "description": "Episodes that completed without error"}, {"name": "episodes", "type": "list[EpisodeResult]", "description": "Per-episode breakdown for analysis"} ] } ], "functions": [ { "name": "RandomPolicy.__init__", "params": [ {"name": "seed", "type": "int | None", "default": "None", "description": "Random seed for reproducibility"} ], "returns": "None", "description": "Initialize random baseline policy. Deterministic given a seed." }, { "name": "RandomPolicy.select_action", "params": [ {"name": "observation", "type": "SQLObservation", "description": "Current environment observation"} ], "returns": "SQLAction", "description": "Pick a random action. If budget_remaining > 1: randomly choose DESCRIBE, SAMPLE, or QUERY. If budget_remaining == 1: ANSWER with a random guess."
}, { "name": "evaluate", "params": [ {"name": "env", "type": "SQLEnvironment", "description": "The environment to evaluate against"}, {"name": "policy", "type": "Policy", "description": "Any object satisfying the Policy protocol"}, {"name": "n_episodes", "type": "int", "default": "100", "description": "Number of episodes to run"}, {"name": "seed", "type": "int | None", "default": "None", "description": "Base seed for reproducibility; episode i uses seed+i"}, {"name": "progress_callback", "type": "Callable[[int, int], None] | None", "default": "None", "description": "Optional callback(current, total) for progress reporting"} ], "returns": "EvaluationResult", "raises": ["ValueError"], "description": "Run automated evaluation of a policy over multiple episodes. Collects results incrementally -- failed episodes are recorded and evaluation continues." } ], "api_endpoints": [] }, "data_flow": { "primary_flow": [ "evaluate() called with env, policy, n_episodes, optional seed", "For each episode: env.reset(seed=base_seed+i) returns initial SQLObservation", "Loop: policy.select_action(obs) -> SQLAction, then env.step(action) -> SQLObservation, accumulate reward", "Episode ends when obs.done is True; record EpisodeResult with correct/reward/steps", "Aggregate all EpisodeResults into EvaluationResult with success_rate, avg_reward, avg_steps" ], "alternative_flows": [ { "condition": "n_episodes is 0", "steps": ["Return EvaluationResult with all zeros and empty episodes list"] }, { "condition": "Exception during episode (reset, select_action, or step fails)", "steps": [ "Catch exception", "Record EpisodeResult with correct=False, total_reward=0.0, steps=0, error=str(exc)", "Continue to next episode" ] } ] }, "error_handling": { "error_types": [ { "name": "ValueError", "when": "n_episodes < 0", "handling": "Raise immediately before starting evaluation" }, { "name": "Exception (per-episode)", "when": "Any exception during env.reset(), policy.select_action(), or env.step()", 
"handling": "Catch, record as failed EpisodeResult with error field, continue to next episode" } ], "retry_strategy": null }, "dependencies": { "external": [], "internal": [ {"name": "models.SQLAction", "usage": "Action type returned by policies"}, {"name": "models.SQLObservation", "usage": "Observation type passed to policies"}, {"name": "server.sql_environment.SQLEnvironment", "usage": "Environment with reset() and step() methods"} ] } }