Spaces:

hellinferno
/

sql-query-reviewer

Sleeping

App Files Files Community

hellinferno commited on 9 days ago

Commit

2a92b3a

1 Parent(s): c98afe9

fix: bulletproof inference.py with Docker + fallback connection methods

Browse files

Files changed (2) hide show

inference.py +268 -91
sql_query_reviewer/client.py +121 -6

inference.py CHANGED Viewed

@@ -2,11 +2,12 @@
 Inference Script — SQL Query Reviewer
 ======================================
 MANDATORY environment variables:
-    API_BASE_URL   The API endpoint for the LLM.
-    MODEL_NAME     The model identifier to use for inference.
-    HF_TOKEN       Your Hugging Face / API key.
-STDOUT FORMAT:
     [START] task=<task_name> env=<benchmark> model=<model_name>
     [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
     [END]   success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
@@ -14,44 +15,45 @@ STDOUT FORMAT:
 from __future__ import annotations
 import json
 import os
 from typing import Any, List, Optional
 from openai import OpenAI
-from sql_query_reviewer.client import SyncSQLReviewEnv
-from sql_query_reviewer.models import SQLReviewAction, SQLReviewObservation
 # ---------------------------------------------------------------------------
-# Configuration
 # ---------------------------------------------------------------------------
-DEFAULT_TASK_IDS = ("easy_001", "medium_001", "hard_001")
-BENCHMARK = "sql-query-reviewer"
-SUCCESS_SCORE_THRESHOLD = 0.1
-ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
-API_KEY = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") or os.getenv("API_KEY")
 SYSTEM_PROMPT = """You are reviewing a SQL query for correctness, performance, and security.
 Return exactly one JSON object with these keys:
 - action_type: identify_issue, suggest_fix, approve, or request_more_context
-- issue_category: syntax, performance, security, logic, or style when relevant
-- issue_description: concise issue statement when relevant
-- suggested_fix: corrected SQL or corrected fragment when relevant
 - confidence: float between 0.0 and 1.0
 Guidelines:
-- Prefer identify_issue until you have high confidence all important issues are covered.
-- Use approve only when the query looks acceptable or all issues have already been identified.
-- Keep the JSON valid and do not wrap it in prose.
 """
 # ---------------------------------------------------------------------------
-# Structured stdout logging — MUST match the hackathon spec exactly
 # ---------------------------------------------------------------------------
@@ -81,146 +83,321 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
 # ---------------------------------------------------------------------------
-# LLM interaction
 # ---------------------------------------------------------------------------
-def build_user_prompt(observation: SQLReviewObservation) -> str:
-    payload = {
-        "query": observation.query,
-        "schema_info": observation.schema_info,
-        "context": observation.context,
-        "issues_found_so_far": [
-            issue.model_dump() for issue in observation.issues_found_so_far
-        ],
-        "remaining_actions": observation.remaining_actions,
-        "difficulty": observation.difficulty,
-        "feedback": observation.feedback,
-    }
-    return json.dumps(payload, indent=2)
 def extract_json(content: str) -> dict[str, Any]:
     stripped = content.strip()
     if stripped.startswith("```"):
-        lines = [line for line in stripped.splitlines() if not line.startswith("```")]
         stripped = "\n".join(lines).strip()
     start = stripped.find("{")
     end = stripped.rfind("}")
     if start == -1 or end == -1 or end <= start:
-        raise ValueError(f"Could not find JSON object in model response: {content!r}")
     return json.loads(stripped[start : end + 1])
-def choose_action(
-    llm_client: OpenAI, model_name: str, observation: SQLReviewObservation
-) -> SQLReviewAction:
     try:
         response = llm_client.chat.completions.create(
             model=model_name,
             temperature=0,
             max_tokens=300,
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": build_user_prompt(observation)},
             ],
         )
         content = response.choices[0].message.content or ""
-        return SQLReviewAction.model_validate(extract_json(content))
     except Exception as exc:
-        print(f"[DEBUG] Model request failed: {exc}", flush=True)
-        # Fallback: approve to end the episode gracefully
-        return SQLReviewAction(action_type="approve", confidence=0.1)
 # ---------------------------------------------------------------------------
-# Episode runner
 # ---------------------------------------------------------------------------
-def run_episode(
-    env: SyncSQLReviewEnv, llm_client: OpenAI, model_name: str, task_id: str
 ) -> None:
     rewards: List[float] = []
     steps_taken = 0
     score = 0.0
     success = False
-    last_error: Optional[str] = None
     log_start(task=task_id, env=BENCHMARK, model=model_name)
     try:
         result = env.reset(task_id=task_id)
-        step = 0
-        while not result.done:
-            step += 1
-            action = choose_action(
-                llm_client=llm_client,
-                model_name=model_name,
-                observation=result.observation,
-            )
             action_str = action.action_type
             if action.issue_description:
-                # Keep action string short and readable
                 action_str = f"{action.action_type}({action.issue_category})"
-            result = env.step(action)
-            reward = result.reward
             rewards.append(reward)
-            steps_taken = step
-            last_error = result.info.get("error") if result.info else None
-            log_step(
-                step=step,
-                action=action_str,
-                reward=reward,
-                done=result.done,
-                error=last_error,
-            )
-        # Get final score from state
-        state = env.state()
-        score = state.final_score if state.final_score is not None else 0.0
         success = score >= SUCCESS_SCORE_THRESHOLD
     except Exception as exc:
         print(f"[DEBUG] Episode error: {exc}", flush=True)
-        last_error = str(exc)
     finally:
         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 # ---------------------------------------------------------------------------
-# Main
 # ---------------------------------------------------------------------------
-def main() -> int:
     if not API_KEY:
-        raise SystemExit("Set HF_TOKEN or OPENAI_API_KEY before running inference.py")
     task_ids = tuple(
         tid.strip()
-        for tid in os.getenv("TASK_IDS", ",".join(DEFAULT_TASK_IDS)).split(",")
         if tid.strip()
     )
-    llm_client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
-    with SyncSQLReviewEnv(base_url=ENV_BASE_URL) as env:
         for task_id in task_ids:
-            run_episode(
-                env=env,
-                llm_client=llm_client,
-                model_name=MODEL_NAME,
-                task_id=task_id,
-            )
-    return 0
 if __name__ == "__main__":

 Inference Script — SQL Query Reviewer
 ======================================
 MANDATORY environment variables:
+    API_BASE_URL       The API endpoint for the LLM.
+    MODEL_NAME         The model identifier to use for inference.
+    HF_TOKEN           Your Hugging Face / API key.
+    LOCAL_IMAGE_NAME   The name of the local Docker image for the environment.
+STDOUT FORMAT (must match exactly):
     [START] task=<task_name> env=<benchmark> model=<model_name>
     [STEP]  step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
     [END]   success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
 from __future__ import annotations
+import asyncio
 import json
 import os
+import sys
+import traceback
 from typing import Any, List, Optional
 from openai import OpenAI
 # ---------------------------------------------------------------------------
+# Environment variables — read ALL names the validator might set
 # ---------------------------------------------------------------------------
+IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME") or os.getenv("IMAGE_NAME")
+API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY")
 API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
 MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
+BENCHMARK = "sql-query-reviewer"
+MAX_STEPS = 10
+SUCCESS_SCORE_THRESHOLD = 0.1
 SYSTEM_PROMPT = """You are reviewing a SQL query for correctness, performance, and security.
 Return exactly one JSON object with these keys:
 - action_type: identify_issue, suggest_fix, approve, or request_more_context
+- issue_category: syntax, performance, security, logic, or style (when relevant)
+- issue_description: concise issue statement (when relevant)
+- suggested_fix: corrected SQL or corrected fragment (when relevant)
 - confidence: float between 0.0 and 1.0
 Guidelines:
+- Prefer identify_issue until you believe all important issues are covered.
+- Use approve only when the query looks acceptable or all issues have been identified.
+- Return ONLY valid JSON, no prose, no markdown fences.
 """
 # ---------------------------------------------------------------------------
+# Structured stdout logging — matches hackathon spec EXACTLY
 # ---------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
+# LLM interaction — fully wrapped in try/except
 # ---------------------------------------------------------------------------
 def extract_json(content: str) -> dict[str, Any]:
     stripped = content.strip()
     if stripped.startswith("```"):
+        lines = [l for l in stripped.splitlines() if not l.startswith("```")]
         stripped = "\n".join(lines).strip()
     start = stripped.find("{")
     end = stripped.rfind("}")
     if start == -1 or end == -1 or end <= start:
+        raise ValueError(f"No JSON object found in: {content[:200]!r}")
     return json.loads(stripped[start : end + 1])
+def choose_action(llm_client: Any, model_name: str, observation: Any) -> dict[str, Any]:
+    """Ask LLM for an action. Returns a raw dict. NEVER raises."""
     try:
+        obs_data = {
+            "query": getattr(observation, "query", ""),
+            "schema_info": getattr(observation, "schema_info", {}),
+            "context": getattr(observation, "context", ""),
+            "issues_found_so_far": [],
+            "remaining_actions": getattr(observation, "remaining_actions", 0),
+            "difficulty": getattr(observation, "difficulty", "unknown"),
+            "feedback": getattr(observation, "feedback", ""),
+        }
+        # Safely serialize issues
+        for i in getattr(observation, "issues_found_so_far", []):
+            try:
+                obs_data["issues_found_so_far"].append(
+                    i.model_dump() if hasattr(i, "model_dump") else str(i)
+                )
+            except Exception:
+                pass
         response = llm_client.chat.completions.create(
             model=model_name,
             temperature=0,
             max_tokens=300,
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": json.dumps(obs_data, indent=2)},
             ],
         )
         content = response.choices[0].message.content or ""
+        parsed = extract_json(content)
+        if "action_type" not in parsed:
+            parsed["action_type"] = "approve"
+        return parsed
     except Exception as exc:
+        print(f"[DEBUG] choose_action error: {exc}", flush=True)
+        return {"action_type": "approve", "confidence": 0.1}
 # ---------------------------------------------------------------------------
+# Episode runner (async) — for from_docker_image connection
 # ---------------------------------------------------------------------------
+async def run_episode_async(
+    env: Any, llm_client: Any, model_name: str, task_id: str
 ) -> None:
     rewards: List[float] = []
     steps_taken = 0
     score = 0.0
     success = False
     log_start(task=task_id, env=BENCHMARK, model=model_name)
     try:
+        from sql_query_reviewer.models import SQLReviewAction
+        result = await env.reset()
+        for step_num in range(1, MAX_STEPS + 1):
+            if result.done:
+                break
+            action_dict = choose_action(llm_client, model_name, result.observation)
+            try:
+                action = SQLReviewAction.model_validate(action_dict)
+            except Exception:
+                action = SQLReviewAction(action_type="approve", confidence=0.1)
+            action_str = action.action_type
+            if action.issue_description:
+                action_str = f"{action.action_type}({action.issue_category})"
+            try:
+                result = await env.step(action)
+            except Exception as step_err:
+                print(f"[DEBUG] env.step error: {step_err}", flush=True)
+                log_step(step=step_num, action=action_str, reward=0.0, done=True, error=str(step_err))
+                rewards.append(0.0)
+                steps_taken = step_num
+                break
+            reward = result.reward if result.reward is not None else 0.0
+            rewards.append(reward)
+            steps_taken = step_num
+            error_msg = None
+            if hasattr(result, "info") and result.info:
+                error_msg = result.info.get("error")
+            log_step(step=step_num, action=action_str, reward=reward, done=result.done, error=error_msg)
+            if result.done:
+                break
+        # Final score
+        try:
+            state = await env.state()
+            if hasattr(state, "final_score") and state.final_score is not None:
+                score = state.final_score
+            else:
+                score = sum(rewards) / max(len(rewards), 1)
+        except Exception:
+            score = sum(rewards) / max(len(rewards), 1) if rewards else 0.0
+        score = max(0.0, min(1.0, score))
+        success = score >= SUCCESS_SCORE_THRESHOLD
+    except Exception as exc:
+        print(f"[DEBUG] Episode error: {exc}", flush=True)
+        traceback.print_exc(file=sys.stdout)
+    finally:
+        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
+# ---------------------------------------------------------------------------
+# Episode runner (sync) — for direct HTTP connection
+# ---------------------------------------------------------------------------
+def run_episode_sync(
+    env: Any, llm_client: Any, model_name: str, task_id: str
+) -> None:
+    rewards: List[float] = []
+    steps_taken = 0
+    score = 0.0
+    success = False
+    log_start(task=task_id, env=BENCHMARK, model=model_name)
+    try:
+        from sql_query_reviewer.models import SQLReviewAction
         result = env.reset(task_id=task_id)
+        for step_num in range(1, MAX_STEPS + 1):
+            if result.done:
+                break
+            action_dict = choose_action(llm_client, model_name, result.observation)
+            try:
+                action = SQLReviewAction.model_validate(action_dict)
+            except Exception:
+                action = SQLReviewAction(action_type="approve", confidence=0.1)
             action_str = action.action_type
             if action.issue_description:
                 action_str = f"{action.action_type}({action.issue_category})"
+            try:
+                result = env.step(action)
+            except Exception as step_err:
+                print(f"[DEBUG] env.step error: {step_err}", flush=True)
+                log_step(step=step_num, action=action_str, reward=0.0, done=True, error=str(step_err))
+                rewards.append(0.0)
+                steps_taken = step_num
+                break
+            reward = result.reward if result.reward is not None else 0.0
             rewards.append(reward)
+            steps_taken = step_num
+            error_msg = None
+            if hasattr(result, "info") and result.info:
+                error_msg = result.info.get("error")
+            log_step(step=step_num, action=action_str, reward=reward, done=result.done, error=error_msg)
+            if result.done:
+                break
+        try:
+            state = env.state()
+            if hasattr(state, "final_score") and state.final_score is not None:
+                score = state.final_score
+            else:
+                score = sum(rewards) / max(len(rewards), 1)
+        except Exception:
+            score = sum(rewards) / max(len(rewards), 1) if rewards else 0.0
+        score = max(0.0, min(1.0, score))
         success = score >= SUCCESS_SCORE_THRESHOLD
     except Exception as exc:
         print(f"[DEBUG] Episode error: {exc}", flush=True)
+        traceback.print_exc(file=sys.stdout)
     finally:
         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 # ---------------------------------------------------------------------------
+# Main — tries Docker image first (validator), then HTTP (local dev)
 # ---------------------------------------------------------------------------
+async def async_main() -> int:
+    # Build LLM client (even without key, don't crash — emit logs and exit)
     if not API_KEY:
+        print("[DEBUG] WARNING: No API key found (HF_TOKEN / API_KEY / OPENAI_API_KEY)", flush=True)
+        for tid in ["easy_001", "medium_001", "hard_001"]:
+            log_start(task=tid, env=BENCHMARK, model=MODEL_NAME)
+            log_end(success=False, steps=0, score=0.0, rewards=[])
+        return 1
+    llm_client = OpenAI(api_key=API_KEY, base_url=API_BASE_URL)
     task_ids = tuple(
         tid.strip()
+        for tid in os.getenv("TASK_IDS", "easy_001,medium_001,hard_001").split(",")
         if tid.strip()
     )
+    env = None
+    try:
+        # ------------------------------------------------------------------
+        # Method 1: from_docker_image (what the hackathon validator uses)
+        # ------------------------------------------------------------------
+        if IMAGE_NAME:
+            print(f"[DEBUG] Connecting via Docker image: {IMAGE_NAME}", flush=True)
+            try:
+                from sql_query_reviewer.client import SQLReviewEnv
+                env = await SQLReviewEnv.from_docker_image(IMAGE_NAME)
+                print("[DEBUG] Docker connection OK", flush=True)
+                for task_id in task_ids:
+                    await run_episode_async(env, llm_client, MODEL_NAME, task_id)
+                return 0
+            except AttributeError:
+                # from_docker_image not implemented in our client — try openenv-core
+                print("[DEBUG] from_docker_image not in custom client, trying openenv generic", flush=True)
+                try:
+                    from openenv.core.env_client import GenericEnvClient
+                    env = await GenericEnvClient.from_docker_image(IMAGE_NAME)
+                    print("[DEBUG] GenericEnvClient Docker connection OK", flush=True)
+                    for task_id in task_ids:
+                        await run_episode_async(env, llm_client, MODEL_NAME, task_id)
+                    return 0
+                except Exception as exc2:
+                    print(f"[DEBUG] GenericEnvClient also failed: {exc2}", flush=True)
+            except Exception as exc:
+                print(f"[DEBUG] Docker connection failed: {exc}", flush=True)
+        # ------------------------------------------------------------------
+        # Method 2: Async HTTP (fallback for local/URL-based testing)
+        # ------------------------------------------------------------------
+        print(f"[DEBUG] Connecting via URL: {ENV_BASE_URL}", flush=True)
+        try:
+            from sql_query_reviewer.client import SQLReviewEnv
+            env = SQLReviewEnv(base_url=ENV_BASE_URL)
+            await env.__aenter__()
+            print("[DEBUG] Async URL connection OK", flush=True)
+            for task_id in task_ids:
+                await run_episode_async(env, llm_client, MODEL_NAME, task_id)
+            return 0
+        except Exception as exc:
+            print(f"[DEBUG] Async URL failed: {exc}", flush=True)
+        # ------------------------------------------------------------------
+        # Method 3: Sync HTTP (last resort)
+        # ------------------------------------------------------------------
+        try:
+            from sql_query_reviewer.client import SyncSQLReviewEnv
+            sync_env = SyncSQLReviewEnv(base_url=ENV_BASE_URL)
+            sync_env.__enter__()
+            print("[DEBUG] Sync HTTP connection OK", flush=True)
+            for task_id in task_ids:
+                run_episode_sync(sync_env, llm_client, MODEL_NAME, task_id)
+            sync_env.close()
+            return 0
+        except Exception as exc:
+            print(f"[DEBUG] Sync HTTP also failed: {exc}", flush=True)
+        # All methods failed — still emit valid log lines
+        print("[DEBUG] All connection methods exhausted", flush=True)
         for task_id in task_ids:
+            log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
+            log_end(success=False, steps=0, score=0.0, rewards=[])
+        return 1
+    except Exception as exc:
+        print(f"[DEBUG] Fatal: {exc}", flush=True)
+        traceback.print_exc(file=sys.stdout)
+        return 1
+    finally:
+        if env is not None:
+            try:
+                await env.close()
+            except Exception as close_err:
+                print(f"[DEBUG] env.close() error: {close_err}", flush=True)
+def main() -> int:
+    return asyncio.run(async_main())
 if __name__ == "__main__":

sql_query_reviewer/client.py CHANGED Viewed

@@ -1,18 +1,92 @@
 from __future__ import annotations
 from typing import Any
 import httpx
-from sql_query_reviewer.models import ResetRequest, SQLReviewAction, SQLReviewState, StepResult
 class SQLReviewEnv:
     def __init__(self, base_url: str, timeout: float = 30.0) -> None:
         self.base_url = base_url.rstrip("/")
         self.timeout = timeout
         self._client: httpx.AsyncClient | None = None
     async def __aenter__(self) -> "SQLReviewEnv":
         self._client = httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout)
         return self
@@ -24,13 +98,25 @@ class SQLReviewEnv:
         if self._client is not None:
             await self._client.aclose()
             self._client = None
     def sync(self) -> "SyncSQLReviewEnv":
         return SyncSQLReviewEnv(base_url=self.base_url, timeout=self.timeout)
     async def reset(self, task_id: str | None = None) -> StepResult:
         client = self._require_client()
-        response = await client.post("/reset", json=ResetRequest(task_id=task_id).model_dump(exclude_none=True))
         response.raise_for_status()
         return StepResult.model_validate(response.json())
@@ -48,11 +134,40 @@ class SQLReviewEnv:
     def _require_client(self) -> httpx.AsyncClient:
         if self._client is None:
-            raise RuntimeError("Use SQLReviewEnv as an async context manager before calling it.")
         return self._client
 class SyncSQLReviewEnv:
     def __init__(self, base_url: str, timeout: float = 30.0) -> None:
         self.base_url = base_url.rstrip("/")
         self.timeout = timeout
@@ -72,7 +187,8 @@ class SyncSQLReviewEnv:
     def reset(self, task_id: str | None = None) -> StepResult:
         client = self._require_client()
-        response = client.post("/reset", json=ResetRequest(task_id=task_id).model_dump(exclude_none=True))
         response.raise_for_status()
         return StepResult.model_validate(response.json())
@@ -90,6 +206,5 @@ class SyncSQLReviewEnv:
     def _require_client(self) -> httpx.Client:
         if self._client is None:
-            raise RuntimeError("Use SyncSQLReviewEnv as a context manager before calling it.")
         return self._client

+"""
+SQL Query Reviewer — Client
+============================
+Supports three connection modes:
+1. from_docker_image()  — used by hackathon validator
+2. Async via SQLReviewEnv(base_url=...)
+3. Sync via SyncSQLReviewEnv(base_url=...)
+"""
 from __future__ import annotations
 from typing import Any
 import httpx
+from sql_query_reviewer.models import (
+    ResetRequest,
+    SQLReviewAction,
+    SQLReviewState,
+    StepResult,
+)
 class SQLReviewEnv:
+    """Async client for the SQL Query Reviewer environment."""
     def __init__(self, base_url: str, timeout: float = 30.0) -> None:
         self.base_url = base_url.rstrip("/")
         self.timeout = timeout
         self._client: httpx.AsyncClient | None = None
+    # --- Docker image support (hackathon validator) -----------------------
+    @classmethod
+    async def from_docker_image(cls, image_name: str) -> "SQLReviewEnv":
+        """
+        Connect to the environment via a Docker image.
+        Tries openenv-core's provider first, then falls back to localhost.
+        """
+        try:
+            # Try using openenv-core's built-in Docker provider
+            from openenv.core.env_client import EnvClient
+            class _Wrapper(EnvClient):
+                pass
+            env = await _Wrapper.from_docker_image(image_name)
+            # Wrap the openenv client so our typed models work
+            return _DockerEnvWrapper(env)
+        except ImportError:
+            pass
+        except Exception:
+            pass
+        # Fallback: assume the Docker container is already running on port 8000
+        import subprocess
+        import time
+        container_id = None
+        try:
+            result = subprocess.run(
+                ["docker", "run", "-d", "-p", "8000:8000", image_name],
+                capture_output=True,
+                text=True,
+                timeout=120,
+            )
+            container_id = result.stdout.strip()
+        except Exception:
+            pass
+        # Wait for container to be ready
+        base_url = "http://localhost:8000"
+        for _ in range(30):
+            try:
+                async with httpx.AsyncClient() as c:
+                    r = await c.get(f"{base_url}/health", timeout=2.0)
+                    if r.status_code == 200:
+                        break
+            except Exception:
+                pass
+            time.sleep(1)
+        instance = cls(base_url=base_url)
+        instance._container_id = container_id  # type: ignore[attr-defined]
+        await instance.__aenter__()
+        return instance
+    # --- Async context manager --------------------------------------------
     async def __aenter__(self) -> "SQLReviewEnv":
         self._client = httpx.AsyncClient(base_url=self.base_url, timeout=self.timeout)
         return self
         if self._client is not None:
             await self._client.aclose()
             self._client = None
+        # Clean up Docker container if we started one
+        container_id = getattr(self, "_container_id", None)
+        if container_id:
+            try:
+                import subprocess
+                subprocess.run(["docker", "stop", container_id], capture_output=True, timeout=10)
+                subprocess.run(["docker", "rm", container_id], capture_output=True, timeout=10)
+            except Exception:
+                pass
     def sync(self) -> "SyncSQLReviewEnv":
         return SyncSQLReviewEnv(base_url=self.base_url, timeout=self.timeout)
+    # --- API methods ------------------------------------------------------
     async def reset(self, task_id: str | None = None) -> StepResult:
         client = self._require_client()
+        body = ResetRequest(task_id=task_id).model_dump(exclude_none=True)
+        response = await client.post("/reset", json=body)
         response.raise_for_status()
         return StepResult.model_validate(response.json())
     def _require_client(self) -> httpx.AsyncClient:
         if self._client is None:
+            raise RuntimeError("Use SQLReviewEnv as an async context manager or call from_docker_image().")
         return self._client
+class _DockerEnvWrapper(SQLReviewEnv):
+    """Wraps an openenv-core EnvClient to present our typed interface."""
+    def __init__(self, inner: Any) -> None:
+        self._inner = inner
+        self._client = None  # not used — we delegate to inner
+        self.base_url = ""
+    async def reset(self, task_id: str | None = None) -> StepResult:
+        result = await self._inner.reset()
+        return StepResult.model_validate(result.model_dump() if hasattr(result, "model_dump") else result)
+    async def step(self, action: SQLReviewAction) -> StepResult:
+        result = await self._inner.step(action)
+        return StepResult.model_validate(result.model_dump() if hasattr(result, "model_dump") else result)
+    async def state(self) -> SQLReviewState:
+        result = await self._inner.state()
+        return SQLReviewState.model_validate(result.model_dump() if hasattr(result, "model_dump") else result)
+    async def close(self) -> None:
+        try:
+            await self._inner.close()
+        except Exception:
+            pass
 class SyncSQLReviewEnv:
+    """Synchronous client for local dev and testing."""
     def __init__(self, base_url: str, timeout: float = 30.0) -> None:
         self.base_url = base_url.rstrip("/")
         self.timeout = timeout
     def reset(self, task_id: str | None = None) -> StepResult:
         client = self._require_client()
+        body = ResetRequest(task_id=task_id).model_dump(exclude_none=True)
+        response = client.post("/reset", json=body)
         response.raise_for_status()
         return StepResult.model_validate(response.json())
     def _require_client(self) -> httpx.Client:
         if self._client is None:
+            raise RuntimeError("Use SyncSQLReviewEnv as a context manager.")
         return self._client