Upload folder using huggingface_hub
Files changed:
- README.md +4 -2
- client.py +23 -35
- graders/__init__.py +0 -0
- graders/grader_easy.py +110 -0
- graders/grader_hard.py +140 -0
- graders/grader_medium.py +135 -0
- inference.py +157 -92
- models.py +2 -19
- openenv_sql_debug.egg-info/PKG-INFO +1 -0
- openenv_sql_debug.egg-info/SOURCES.txt +6 -1
- openenv_sql_debug.egg-info/requires.txt +1 -0
- pyproject.toml +1 -0
- server/app.py +1 -1
- server/requirements.txt +2 -1
- server/sql_debug_environment.py +52 -52
- tasks/task_hard.py +2 -2
- uv.lock +2 -0
README.md
CHANGED

@@ -195,7 +195,6 @@ The inference script defaults to `syntax_fix_001`, logs each step, and stops whe
 ```text
 sql_exp/
 ├── client.py            # OpenEnv client wrapper
-├── grader.py            # Reward computation
 ├── inference.py         # LLM-driven inference loop
 ├── models.py            # Action and observation models
 ├── openenv.yaml         # OpenEnv manifest
@@ -209,7 +208,10 @@ sql_exp/
 │   ├── task_easy.py      # Syntax-fix task
 │   ├── task_medium.py    # Join logic task
 │   └── task_hard.py      # Query optimization task
-├──
+├── graders/
+│   ├── grader_easy.py    # Syntax-fix task
+│   ├── grader_medium.py  # Join logic task
+│   └── grader_hard.py    # Query optimization task
 └── README.md             # Project overview
 ```
client.py
CHANGED

@@ -1,16 +1,6 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
 # client.py
-
-"""
-This is what inference.py uses to talk to the running server.
-"""
-
-from typing import Dict
+from typing import Dict, Optional
+import httpx
 
 from openenv.core import EnvClient
 from openenv.core.client_types import StepResult
@@ -20,35 +10,33 @@ from models import SQLDebugAction, SQLDebugObservation
 
 
 class SQLDebugEnv(EnvClient[SQLDebugAction, SQLDebugObservation, State]):
-    """
-    Maintains a persistent WebSocket connection to the server.
-    Each instance gets its own dedicated environment session.
-
-        print(result.reward)
-    """
+    def __init__(self, base_url: str = "http://localhost:8000", **kwargs):
+        super().__init__(base_url=base_url, **kwargs)
+        self._base_url = base_url.rstrip("/")
 
+    # ── Override reset to send task_id in body ────────────────────────────────
+    async def reset(self, task_id: Optional[str] = None, **kwargs) -> StepResult:
+        payload = {}
+        if task_id:
+            payload["task_id"] = task_id
 
+        async with httpx.AsyncClient(timeout=30) as http:
+            response = await http.post(
+                f"{self._base_url}/reset",
+                json=payload,
+            )
+            response.raise_for_status()
+            return self._parse_result(response.json())
 
+    # ── step payload ──────────────────────────────────────────────────────────
     def _step_payload(self, action: SQLDebugAction) -> Dict:
-        """Convert SQLDebugAction to JSON payload."""
         return {"query": action.query}
 
+    # ← update _parse_result only
+
     def _parse_result(self, payload: Dict) -> StepResult[SQLDebugObservation]:
-        """Parse server JSON response into a typed StepResult."""
         obs_data = payload.get("observation", {})
+        meta = obs_data.get("metadata", {})  # ← feedback lives here now
 
         observation = SQLDebugObservation(
             task_id=obs_data.get("task_id", ""),
@@ -63,6 +51,7 @@ class SQLDebugEnv(EnvClient[SQLDebugAction, SQLDebugObservation, State]):
             available_tasks=obs_data.get("available_tasks", []),
             done=payload.get("done", False),
             reward=payload.get("reward", 0.0),
+            metadata=meta,
         )
 
         return StepResult(
@@ -72,8 +61,7 @@ class SQLDebugEnv(EnvClient[SQLDebugAction, SQLDebugObservation, State]):
         )
 
     def _parse_state(self, payload: Dict) -> State:
-        """Parse server JSON response into a State object."""
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
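Not part of this commit, but for orientation: a minimal sketch of how inference.py drives the updated client (it assumes the environment server from server/app.py is already running on localhost:8000 and that the default task id from inference.py is used).

# client_usage_sketch.py - illustrative only, not included in this commit
import asyncio

from client import SQLDebugEnv, SQLDebugAction


async def run_once() -> None:
    env = SQLDebugEnv(base_url="http://localhost:8000")

    # reset() now POSTs {"task_id": ...} to /reset via httpx
    result = await env.reset(task_id="syntax_fix_001")
    print("starting query:", result.observation.current_query)

    # step() goes through EnvClient as before; reward is the grader's delta
    result = await env.step(SQLDebugAction(query="SELECT 1"))
    meta = result.observation.metadata or {}
    print("delta:", result.reward, "| feedback:", meta.get("feedback"))


asyncio.run(run_once())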
graders/__init__.py
ADDED
File without changes
graders/grader_easy.py
ADDED

@@ -0,0 +1,110 @@
# tasks/grader_easy.py
"""
Grader for syntax_fix_001 – fix typos in SQL keywords.
Reward is shaped on: syntax correctness + F1 row match + step efficiency.
"""


def grade(
    task: dict,
    agent_query: str,
    run_result: dict,
    prev_absolute_score: float = 0.0,
    step_count: int = 1,
    max_steps: int = 5,
) -> dict:
    """
    Easy task grader. Pure row-match scoring – no plan check needed.

    Reward components:
        syntax_score    : 0.0 or 1.0 – did the query run at all?
        result_score    : 0.0–1.0    – F1 of returned vs expected rows
        efficiency_bonus: 0.0–0.05   – small bonus for solving early
        delta           : absolute_score - prev_absolute_score
    """

    syntax_ok = run_result["error"] is None

    # ── Syntax ────────────────────────────────────────────────────────────────
    if not syntax_ok:
        absolute_score = 0.05  # tiny gradient so agent knows to fix syntax first
        delta = absolute_score - prev_absolute_score
        delta = max(-0.3, min(0.5, delta))
        return {
            "value": delta,
            "absolute_score": absolute_score,
            "syntax_ok": False,
            "result_score": 0.0,
            "plan_score": 0.0,
            "delta": delta,
            "status": "syntax_error",
            "feedback": f"syntax_error: {run_result['error'][:100]}",
            "message": f"syntax_error | abs=0.050 | delta={delta:+.3f}",
        }

    # ── Row matching (F1) ─────────────────────────────────────────────────────
    expected = task["expected_rows"]
    got = run_result["rows"]

    if not got:
        result_score = 0.0
    else:
        correct_returned = sum(1 for row in got if row in expected)
        correct_expected = sum(1 for row in expected if row in got)

        precision = correct_returned / max(len(got), 1)
        recall = correct_expected / max(len(expected), 1)

        if precision + recall > 0:
            result_score = 2 * precision * recall / (precision + recall)
        else:
            result_score = 0.0

    # ── Efficiency bonus ──────────────────────────────────────────────────────
    steps_remaining = max_steps - step_count
    efficiency_bonus = 0.0
    if result_score >= 0.99:
        efficiency_bonus = round(0.05 * (steps_remaining / max_steps), 4)

    # ── Absolute score – easy: syntax 15% + correctness 80% + bonus 5% ────────
    absolute_score = round(
        min(0.99, 0.15 * 1.0 + 0.80 * result_score + efficiency_bonus), 4
    )

    # ── Delta reward – the RL signal ──────────────────────────────────────────
    delta = absolute_score - prev_absolute_score
    if abs(delta) < 0.001 and step_count > 1:
        delta -= 0.02  # stall penalty – discourages repeating same query
    delta = round(max(-0.3, min(0.5, delta)), 4)

    # ── Feedback for agent ────────────────────────────────────────────────────
    issues = []
    if result_score < 0.5:
        issues.append("result_rows: returned rows do not match expected – check your WHERE clause")
    elif result_score < 0.99:
        issues.append(f"result_rows: partial match ({result_score:.0%}) – some rows still wrong")
    if len(got) > len(expected):
        issues.append(f"extra_rows: returned {len(got)} rows but expected {len(expected)}")
    feedback = "; ".join(issues) if issues else "rows match – looking good"

    status = (
        "solved" if absolute_score >= 0.99
        else "improving" if delta > 0.01
        else "regression" if delta < -0.01
        else "stalled"
    )

    return {
        "value": delta,
        "absolute_score": absolute_score,
        "syntax_ok": True,
        "result_score": result_score,
        "plan_score": 0.0,
        "delta": delta,
        "status": status,
        "feedback": feedback,
        "message": (
            f"{status} | abs={absolute_score:.3f} | delta={delta:+.3f} | "
            f"result={result_score:.0%}"
        ),
    }
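Not part of the commit, but to make the shaping concrete, here is how the easy grader behaves on two hand-made run results (the task dict, query strings, and rows are invented purely for illustration).

# grade_easy_walkthrough.py - illustrative inputs only
from graders.grader_easy import grade

task = {"expected_rows": [(1, "alice"), (2, "bob")]}             # hypothetical expected rows
bad_run = {"error": 'near "SELEC": syntax error', "rows": []}    # attempt 1: still broken
good_run = {"error": None, "rows": [(1, "alice"), (2, "bob")]}   # attempt 2: fixed

r1 = grade(task, "SELEC * FORM users", bad_run, prev_absolute_score=0.0, step_count=1)
print(r1["status"], r1["absolute_score"], r1["delta"])   # syntax_error 0.05 0.05

r2 = grade(task, "SELECT * FROM users", good_run,
           prev_absolute_score=r1["absolute_score"], step_count=2)
print(r2["status"], r2["absolute_score"], r2["delta"])   # improving 0.98 0.5 (delta clipped at +0.5)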
graders/grader_hard.py
ADDED

@@ -0,0 +1,140 @@
# tasks/grader_hard.py
"""
Grader for optimize_001 – replace correlated subquery with CTE.

Unlike easy/medium, there are no fixed expected_rows.
Score is entirely driven by query plan quality:
  - uses WITH (CTE)
  - uses GROUP BY
  - uses AVG(
  - does NOT use correlated subquery pattern
  - executes without error
"""


def grade(
    task: dict,
    agent_query: str,
    run_result: dict,
    prev_absolute_score: float = 0.0,
    step_count: int = 1,
    max_steps: int = 10,
) -> dict:

    syntax_ok = run_result["error"] is None

    # ── Syntax ────────────────────────────────────────────────────────────────
    if not syntax_ok:
        absolute_score = 0.05
        delta = round(
            max(-0.3, min(0.5, absolute_score - prev_absolute_score)), 4
        )
        return {
            "value": delta,
            "absolute_score": absolute_score,
            "syntax_ok": False,
            "result_score": 0.0,
            "plan_score": 0.0,
            "delta": delta,
            "status": "syntax_error",
            "feedback": f"syntax_error: {run_result['error'][:100]}",
            "message": f"syntax_error | abs=0.050 | delta={delta:+.3f}",
        }

    query_upper = agent_query.upper()
    # good_patterns = task.get("good_patterns", ["WITH", "GROUP BY", "AVG("])

    # ── Plan component scores ─────────────────────────────────────────────────

    # 1. Uses CTE (WITH keyword) – most important signal
    has_cte = "WITH" in query_upper
    cte_score = 1.0 if has_cte else 0.0

    # 2. Uses GROUP BY – required for computing per-user average
    has_group_by = "GROUP BY" in query_upper
    group_score = 1.0 if has_group_by else 0.0

    # 3. Uses AVG – must be aggregating correctly
    has_avg = "AVG(" in query_upper
    avg_score = 1.0 if has_avg else 0.0

    # 4. Correlated subquery penalty – still using the slow pattern
    still_correlated = (
        "SELECT AVG" in query_upper
        and "WHERE" in query_upper
        and not has_cte  # WITH overrides this penalty
    )
    correlation_penalty = 0.4 if still_correlated else 0.0

    # 5. Execution quality – did the query actually return rows?
    rows_returned = len(run_result["rows"])
    execution_score = 1.0 if rows_returned > 0 else 0.3
    # 0.3 credit for running without error even if empty result

    # ── Plan score weighted combination ───────────────────────────────────────
    # CTE 40% + GROUP BY 25% + AVG 20% + execution 15%
    plan_score = round(
        max(
            0.0,
            0.40 * cte_score
            + 0.25 * group_score
            + 0.20 * avg_score
            + 0.15 * execution_score
            - correlation_penalty,
        ),
        4,
    )

    # ── Efficiency bonus ──────────────────────────────────────────────────────
    steps_remaining = max_steps - step_count
    efficiency_bonus = 0.0
    if plan_score >= 0.85:
        efficiency_bonus = round(0.05 * (steps_remaining / max_steps), 4)

    # ── Absolute score – hard: syntax 10% + plan 85% + bonus 5% ───────────────
    absolute_score = round(
        min(0.99, 0.10 * 1.0 + 0.85 * plan_score + efficiency_bonus), 4
    )
    absolute_score = max(0.05, absolute_score)

    # ── Delta ─────────────────────────────────────────────────────────────────
    delta = absolute_score - prev_absolute_score
    if abs(delta) < 0.001 and step_count > 1:
        delta -= 0.02
    delta = round(max(-0.3, min(0.5, delta)), 4)

    # ── Feedback ──────────────────────────────────────────────────────────────
    issues = []
    if not has_cte:
        issues.append("missing_cte: query needs WITH clause to precompute averages")
    if not has_group_by:
        issues.append("missing_group_by: need GROUP BY user_id to compute per-user avg")
    if not has_avg:
        issues.append("missing_avg: need AVG(amount) in the CTE")
    if still_correlated:
        issues.append("still_correlated: subquery in WHERE runs per-row – move to CTE")
    if rows_returned == 0 and syntax_ok:
        issues.append("empty_result: query runs but returns no rows – check JOIN and WHERE")
    feedback = "; ".join(issues) if issues else "plan looks optimized"

    status = (
        "solved" if absolute_score >= 0.99
        else "improving" if delta > 0.01
        else "regression" if delta < -0.01
        else "stalled"
    )

    return {
        "value": delta,
        "absolute_score": absolute_score,
        "syntax_ok": True,
        "result_score": execution_score,
        "plan_score": plan_score,
        "delta": delta,
        "status": status,
        "feedback": feedback,
        "message": (
            f"{status} | abs={absolute_score:.3f} | delta={delta:+.3f} | "
            f"plan={plan_score:.0%} | cte={has_cte} | group={has_group_by}"
        ),
    }
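Again not part of the commit: a quick check of how the hard grader's keyword heuristics score a CTE rewrite (the query text and the returned row are invented for the example; the empty task dict works because this grader only reads run_result and the query string).

# grade_hard_walkthrough.py - illustrative inputs only
from graders.grader_hard import grade

cte_query = """
WITH user_avg AS (
    SELECT user_id, AVG(amount) AS avg_amt
    FROM transactions
    GROUP BY user_id
)
SELECT t.* FROM transactions t
JOIN user_avg u ON t.user_id = u.user_id
WHERE t.amount > u.avg_amt
"""
run_result = {"error": None, "rows": [(1, 7, 812.5, None, "completed")]}  # made-up row

out = grade({}, cte_query, run_result, prev_absolute_score=0.05, step_count=2)
# plan_score = 0.40 (CTE) + 0.25 (GROUP BY) + 0.20 (AVG) + 0.15 (rows returned) = 1.0
# absolute   = min(0.99, 0.10 + 0.85 * 1.0 + 0.04 bonus) = 0.99, so status is "solved"
print(out["status"], out["plan_score"], out["absolute_score"])  # solved 1.0 0.99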
graders/grader_medium.py
ADDED

@@ -0,0 +1,135 @@
# tasks/grader_medium.py
"""
Grader for logic_fix_001 – fix wrong JOIN type / WHERE logic.

Harder than easy: agent must get BOTH precision and recall right.
Extra penalty for wrong row count (catches SELECT * with no WHERE).
"""


def grade(
    task: dict,
    agent_query: str,
    run_result: dict,
    prev_absolute_score: float = 0.0,
    step_count: int = 1,
    max_steps: int = 8,
) -> dict:

    syntax_ok = run_result["error"] is None

    # ── Syntax ────────────────────────────────────────────────────────────────
    if not syntax_ok:
        absolute_score = 0.05
        delta = round(
            max(-0.3, min(0.5, absolute_score - prev_absolute_score)), 4
        )
        return {
            "value": delta,
            "absolute_score": absolute_score,
            "syntax_ok": False,
            "result_score": 0.0,
            "plan_score": 0.0,
            "delta": delta,
            "status": "syntax_error",
            "feedback": f"syntax_error: {run_result['error'][:100]}",
            "message": f"syntax_error | abs=0.050 | delta={delta:+.3f}",
        }

    expected = task["expected_rows"]
    got = run_result["rows"]

    # ── F1 row score ──────────────────────────────────────────────────────────
    if not got:
        result_score = 0.0
    else:
        correct_returned = sum(1 for row in got if row in expected)
        correct_expected = sum(1 for row in expected if row in got)

        precision = correct_returned / max(len(got), 1)
        recall = correct_expected / max(len(expected), 1)

        if precision + recall > 0:
            result_score = 2 * precision * recall / (precision + recall)
        else:
            result_score = 0.0

    # ── Extra penalty for wrong row count ─────────────────────────────────────
    # Logic bugs typically show up as too many rows (LEFT JOIN returns NULLs)
    # Penalize harder than easy task to encourage precise reasoning
    row_count_penalty = 0.0
    if len(got) > len(expected):
        extra = len(got) - len(expected)
        row_count_penalty = min(0.25, extra * 0.08)

    # ── JOIN type hint score ──────────────────────────────────────────────────
    # Gives partial credit for using the right JOIN type even if rows are off
    # Avoids zero-reward cliff for agents that fix JOIN but have minor issues
    query_upper = agent_query.upper()
    join_score = 0.0
    if "INNER JOIN" in query_upper:
        join_score = 0.15  # using INNER JOIN is the right direction
    elif "LEFT JOIN" in query_upper:
        join_score = 0.0   # LEFT JOIN is the bug – no credit
    elif "JOIN" in query_upper:
        join_score = 0.05  # some join exists – small credit

    # ── Efficiency bonus ──────────────────────────────────────────────────────
    steps_remaining = max_steps - step_count
    efficiency_bonus = 0.0
    if result_score >= 0.99:
        efficiency_bonus = round(0.05 * (steps_remaining / max_steps), 4)

    # ── Absolute score – medium: syntax 10% + correctness 70% + join 15% + bonus 5% ──
    absolute_score = round(
        min(
            0.99,
            0.10 * 1.0
            + 0.70 * result_score
            + 0.15 * join_score
            + efficiency_bonus
            - row_count_penalty,
        ),
        4,
    )
    absolute_score = max(0.05, absolute_score)  # floor at 0.05

    # ── Delta ─────────────────────────────────────────────────────────────────
    delta = absolute_score - prev_absolute_score
    if abs(delta) < 0.001 and step_count > 1:
        delta -= 0.02
    delta = round(max(-0.3, min(0.5, delta)), 4)

    # ── Feedback ──────────────────────────────────────────────────────────────
    issues = []
    if "LEFT JOIN" in query_upper:
        issues.append("join_type: using LEFT JOIN includes rows with no matching department")
    if len(got) > len(expected):
        issues.append(f"extra_rows: got {len(got)} rows, expected {len(expected)} – filter too loose")
    if len(got) < len(expected) and len(got) > 0:
        issues.append(f"missing_rows: got {len(got)} rows, expected {len(expected)} – filter too strict")
    if result_score < 0.5:
        issues.append("result_rows: output does not match expected – check JOIN and WHERE")
    feedback = "; ".join(issues) if issues else "rows and join look correct"

    status = (
        "solved" if absolute_score >= 0.99
        else "improving" if delta > 0.01
        else "regression" if delta < -0.01
        else "stalled"
    )

    return {
        "value": delta,
        "absolute_score": absolute_score,
        "syntax_ok": True,
        "result_score": result_score,
        "plan_score": join_score,
        "delta": delta,
        "status": status,
        "feedback": feedback,
        "message": (
            f"{status} | abs={absolute_score:.3f} | delta={delta:+.3f} | "
            f"result={result_score:.0%} | join={join_score:.2f}"
        ),
    }
inference.py
CHANGED

@@ -1,50 +1,38 @@
 # inference.py
-"""
-SQL Debug & Optimizer – OpenEnv Inference Script
-
-Mandatory stdout format:
-  [START] task=<task_name> env=<benchmark> model=<model_name>
-  [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
-  [END] success=<true|false> steps=<n> score=<0.00> rewards=<r1,r2,...>
-"""
-
 import asyncio
 import os
 import textwrap
-from typing import List, Optional
+from typing import List, Optional, Dict
 
 from openai import OpenAI
 from client import SQLDebugEnv, SQLDebugAction
 
-# ──
+# ── Env vars ──────────────────────────────────────────────────────────────────
 IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
 API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
 API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
-MODEL_NAME = os.getenv("MODEL_NAME",
-
-
-
-
-
-
-SUCCESS_THRESHOLD = 0.5  # reward >= 0.5 = success
+MODEL_NAME = os.getenv("MODEL_NAME", "llama-3.3-70b-versatile")
+SERVER_URL = os.getenv("SERVER_URL", "http://localhost:8000")
+
+TASK_NAME = os.getenv("SQL_ENV_TASK", "syntax_fix_001")
+BENCHMARK = "sql-debug-optimizer"
+MAX_STEPS = 8
+TEMPERATURE = 0.3
+MAX_TOKENS = 400
+SUCCESS_THRESHOLD = 0.5
 
 
-# ──
+# ── Stdout loggers ────────────────────────────────────────────────────────────
 
 def log_start(task: str, env: str, model: str) -> None:
     print(f"[START] task={task} env={env} model={model}", flush=True)
 
 
 def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
-    # action must be single-line – newlines break log parsing
     action_clean = action.replace("\n", " ").replace("\r", "").strip()
-    error_val = error if error else "null"
-    done_val = str(done).lower()
     print(
         f"[STEP] step={step} action={action_clean} reward={reward:.2f} "
-        f"done={
+        f"done={str(done).lower()} error={error or 'null'}",
         flush=True,
     )
 
@@ -58,124 +46,201 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
     )
 
 
-# ──
-
-SYSTEM_PROMPT =
-
-"""
-
+# ── Prompts ───────────────────────────────────────────────────────────────────
+
+SYSTEM_PROMPT = """You are an expert SQL engineer fixing and optimizing SQL queries.
+
+STRICT OUTPUT RULES:
+- Output ONLY raw SQL. No markdown. No backticks. No explanation. No comments.
+- Your output is executed directly against a SQLite database.
+- If your previous attempt got negative reward, you made things worse – try differently.
+- If reward is stalled (same score 2+ steps), change strategy significantly."""
+
+TASK_CONTEXT = {
+    "syntax_fix_001": "The query has typographical errors in SQL keywords.",
+    "logic_fix_001": "The query runs but returns incorrect rows due to a logic error.",
+    "optimize_001": "The query is correct but slow. Rewrite it to be faster.",
+}
+
+GRADUATED_HINTS = {
+    "syntax_fix_001": [
+        "",
+        "Check the spelling of SQL keywords like SELECT, FROM, WHERE.",
+        "Compare each word: SELECT FROM WHERE ORDER BY GROUP BY – fix any typos.",
+        "The typos are: SELEC → SELECT, FORM → FROM, WERE → WHERE.",
+    ],
+    "logic_fix_001": [
+        "",
+        "The query returns more rows than expected. Check your JOIN type.",
+        "LEFT JOIN includes rows even when no match exists. Consider INNER JOIN.",
+        "Change LEFT JOIN to INNER JOIN to exclude employees with no matching department.",
+    ],
+    "optimize_001": [
+        "",
+        "The query uses a subquery that runs once per row – this is slow.",
+        "Compute the per-user average once using GROUP BY, then JOIN the result.",
+        "Use: WITH user_avg AS (SELECT user_id, AVG(amount) AS avg FROM transactions GROUP BY user_id) SELECT t.* FROM transactions t JOIN user_avg u ON t.user_id = u.user_id WHERE t.amount > u.avg AND t.status = 'completed'",
+    ],
+}
+
+
+def get_hint_level(step: int, stall_count: int) -> int:
+    if step <= 2 and stall_count < 2:
+        return 0
+    if step <= 4 and stall_count < 4:
+        return 1
+    if step <= 6:
+        return 2
+    return 3
+
 
-def build_prompt(obs) -> str:
-    """Build the user prompt from the current observation."""
-    result_preview = str(obs.query_result[:3]) if obs.query_result else "empty / error"
+def build_prompt(obs, step: int, stall_count: int, prev_delta: float) -> str:
+    context = TASK_CONTEXT.get(obs.task_id, "Fix the SQL query.")
+    hint_level = get_hint_level(step, stall_count)
+    hint = GRADUATED_HINTS.get(obs.task_id, [""] * 4)[hint_level]
+    result_preview = str(obs.query_result[:3]) if obs.query_result else "none"
+
+    # ← read feedback from metadata dict, not obs.feedback
+    meta = obs.metadata or {}
+    feedback = meta.get("feedback", "analyse the result yourself")
+
+    reward_context = ""
+    if step > 1:
+        if prev_delta > 0.01:
+            reward_context = f"Last change IMPROVED score (+{prev_delta:.2f}). Keep going."
+        elif prev_delta < -0.01:
+            reward_context = f"Last change WORSENED score ({prev_delta:.2f}). Revert and try differently."
+        else:
+            reward_context = f"Last change had NO EFFECT (delta={prev_delta:.2f}). Try a completely different approach."
+
+    hint_block = f"\nHINT: {hint}" if hint else ""
 
     return textwrap.dedent(f"""
-    TASK: {
+    TASK: {context}
+    {reward_context}{hint_block}
 
-    {obs.schema_sql.strip()[:
+    SCHEMA:
+    {obs.schema_sql.strip()[:600]}
 
-    CURRENT QUERY
+    CURRENT QUERY:
     {obs.current_query.strip()}
 
     ERROR: {obs.error_message or "none"}
+    RESULT (first 3 rows): {result_preview}
+    FEEDBACK: {feedback}
+    BEST SCORE SO FAR: {obs.reward_so_far:.3f}
+    STEP: {step} of {MAX_STEPS}
 
-    Write the corrected SQL
+    Write the corrected SQL:
     """).strip()
 
 
-def call_llm(
-
+def call_llm(
+    client: OpenAI,
+    obs,
+    history: List[Dict],
+    step: int,
+    stall_count: int,
+    prev_delta: float,
+) -> str:
+    user_content = build_prompt(obs, step, stall_count, prev_delta)
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
+    messages.extend(history[-6:])
+    messages.append({"role": "user", "content": user_content})
+
     try:
         completion = client.chat.completions.create(
             model=MODEL_NAME,
-            messages=
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": build_prompt(obs)},
-            ],
+            messages=messages,
             temperature=TEMPERATURE,
             max_tokens=MAX_TOKENS,
             stream=False,
         )
        raw = (completion.choices[0].message.content or "").strip()
-
-        # Strip markdown code fences if model adds them despite instructions
        if "```" in raw:
-            lines = raw.split("\n")
            raw = "\n".join(
+                l for l in raw.split("\n")
+                if not l.strip().startswith("```")
            ).strip()
 
+        result = raw if raw else "SELECT 1"
+        history.append({"role": "user", "content": user_content})
+        history.append({"role": "assistant", "content": result})
+        return result
 
     except Exception as exc:
         print(f"[DEBUG] LLM call failed: {exc}", flush=True)
         return "SELECT 1"
 
 
-# ── Main
+# ── Main ──────────────────────────────────────────────────────────────────────
 
 async def main() -> None:
     client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+    env = SQLDebugEnv(base_url=SERVER_URL)
 
-    steps_taken
-    score
-    success
+    delta_rewards: List[float] = []  # per-step delta – logged in [STEP]
+    abs_scores: List[float] = []     # per-step absolute – used for final score
+    history: List[Dict] = []
+    stall_count = 0
+    prev_delta = 0.0
+    steps_taken = 0
+    score = 0.0
+    success = False
 
     log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
 
     try:
-        # Reset
+        # ── Reset ─────────────────────────────────────────────────────────────
+        try:
+            result = await env.reset(task_id=TASK_NAME)
+        except Exception as e:
+            print(f"[DEBUG] reset() failed: {e}", flush=True)
+            raise
+
         obs = result.observation
 
+        # ── Episode loop ──────────────────────────────────────────────────────
         for step in range(1, MAX_STEPS + 1):
             if result.done:
                 break
 
+            sql_query = call_llm(
+                client, obs, history,
+                step=step,
+                stall_count=stall_count,
+                prev_delta=prev_delta,
+            )
 
-            # Submit to environment
             result = await env.step(SQLDebugAction(query=sql_query))
-            obs
+            obs = result.observation
+
+            # delta reward from grader (can be negative)
+            delta = result.reward or 0.0
+            # absolute score tracked via reward_so_far on observation
+            abs_s = obs.reward_so_far
+            done = result.done
+            error = obs.error_message if obs.error_message else None
+
+            # Stall detection – reset on any meaningful change
+            if abs(delta) < 0.01:
+                stall_count += 1
+            else:
+                stall_count = 0
+
+            prev_delta = delta
+            delta_rewards.append(delta)
+            abs_scores.append(abs_s)
             steps_taken = step
 
-            log_step(
-                step=step,
-                action=sql_query,
-                reward=reward,
-                done=done,
-                error=error,
-            )
+            log_step(step=step, action=sql_query, reward=delta, done=done, error=error)
 
             if done:
                 break
 
-        #
-        score
-        score
+        # Final score = best absolute score reached this episode
+        score = max(abs_scores) if abs_scores else 0.0
+        score = min(max(score, 0.0), 1.0)
         success = score >= SUCCESS_THRESHOLD
 
     except Exception as exc:
@@ -187,7 +252,7 @@ async def main() -> None:
     except Exception as e:
         print(f"[DEBUG] env.close() error: {e}", flush=True)
 
-    log_end(success=success, steps=steps_taken, score=score, rewards=
+    log_end(success=success, steps=steps_taken, score=score, rewards=delta_rewards)
 
 
 if __name__ == "__main__":
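A small aside on the new graduated-hint schedule: the level depends only on the step number and the stall counter, so it can be sanity-checked in isolation (importing inference also pulls in openai and client, so this sketch assumes the project dependencies are installed).

# hint_schedule_check.py - exercises get_hint_level from inference.py
from inference import get_hint_level

print(get_hint_level(step=1, stall_count=0))  # 0 - no hint on early, non-stalled steps
print(get_hint_level(step=2, stall_count=2))  # 1 - an early stall already earns a nudge
print(get_hint_level(step=5, stall_count=0))  # 2 - concrete hint by mid-episode
print(get_hint_level(step=7, stall_count=0))  # 3 - near-solution hint on the final steps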
models.py
CHANGED

@@ -1,31 +1,14 @@
-#
-
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Data models for the SQL Debug & Optimizer Environment.
-"""
-
-from typing import Any, Dict, List
+# models.py
+from typing import Any, Dict, List, Optional
 from pydantic import Field
 from openenv.core.env_server.types import Action, Observation
 
 
 class SQLDebugAction(Action):
-    """
-    What the agent submits each step – just a SQL query string.
-    The environment will run it, grade it, and return a new observation.
-    """
     query: str = Field(..., description="The SQL query the agent wants to try")
 
 
 class SQLDebugObservation(Observation):
-    """
-    What the agent sees after each step.
-    Contains everything it needs to improve its next query.
-    """
     task_id: str = Field(default="", description="Which task is active")
     schema_sql: str = Field(default="", description="CREATE TABLE statements for this task")
     current_query: str = Field(default="", description="Last query that was run")
openenv_sql_debug.egg-info/PKG-INFO
CHANGED

@@ -6,6 +6,7 @@ Requires-Python: >=3.10
 Requires-Dist: openenv-core[core]>=0.2.2
 Requires-Dist: openai>=2.30.0
 Requires-Dist: uvicorn>=0.43.0
+Requires-Dist: httpx>=0.28.1
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
openenv_sql_debug.egg-info/SOURCES.txt
CHANGED

@@ -1,8 +1,13 @@
 README.md
+__init__.py
+client.py
+inference.py
+models.py
 pyproject.toml
+runner.py
+test.py
 ./__init__.py
 ./client.py
-./grader.py
 ./inference.py
 ./models.py
 ./runner.py
openenv_sql_debug.egg-info/requires.txt
CHANGED

@@ -1,6 +1,7 @@
 openenv-core[core]>=0.2.2
 openai>=2.30.0
 uvicorn>=0.43.0
+httpx>=0.28.1
 
 [dev]
 pytest>=8.0.0
pyproject.toml
CHANGED

@@ -20,6 +20,7 @@ dependencies = [
     "openenv-core[core]>=0.2.2",
     "openai>=2.30.0",
     "uvicorn>=0.43.0",
+    "httpx>=0.28.1",
 ]
 
 [project.optional-dependencies]
server/app.py
CHANGED

@@ -37,7 +37,7 @@ try:
     from .sql_debug_environment import SQLDebugEnvironment
 except ModuleNotFoundError:
     from models import SQLDebugAction, SQLDebugObservation
-    from
+    from server.sql_debug_environment import SQLDebugEnvironment
 
 
 app = create_app(
server/requirements.txt
CHANGED

@@ -1,3 +1,4 @@
 openenv[core]>=0.2.0
 fastapi>=0.115.0
-uvicorn>=0.24.0
+uvicorn>=0.24.0
+httpx>=0.28.1
server/sql_debug_environment.py
CHANGED

@@ -10,7 +10,6 @@ SQL Debug & Optimizer Environment – server-side implementation.
 The server runs this. The agent never touches this file directly.
 It loads tasks, runs queries in SQLite, grades them, and returns observations.
 """
-
 from uuid import uuid4
 from openenv.core.env_server.interfaces import Environment
 from openenv.core.env_server.types import State
@@ -21,35 +20,34 @@ except ImportError:
     from models import SQLDebugAction, SQLDebugObservation
 
 from runner import run_query
+
+# Import each task's dedicated grader
+from graders.grader_easy import grade as grade_easy
+from graders.grader_medium import grade as grade_medium
+from graders.grader_hard import grade as grade_hard
 
 
 def _load_all_tasks() -> dict:
-    """Load every task from the tasks/ folder into a dict keyed by task_id."""
     from tasks.task_easy import TASK as EASY
     from tasks.task_medium import TASK as MEDIUM
     from tasks.task_hard import TASK as HARD
+
     return {
-        EASY["task_id"]:
+        EASY["task_id"]: EASY,
         MEDIUM["task_id"]: MEDIUM,
-        HARD["task_id"]:
+        HARD["task_id"]: HARD,
     }
 
 
-
-    SQLite, grades it (0.0–1.0), and returns the result as an observation.
-
-    Three tasks:
-      syntax_fix_001 (easy)   – fix typos in SQL keywords
-      logic_fix_001  (medium) – fix wrong JOIN type causing bad results
-    # optimize_001   (hard)   – rewrite correlated subquery as a CTE
-    """
+# Maps each task_id to its dedicated grader function
+TASK_GRADERS = {
+    "syntax_fix_001": grade_easy,
+    "logic_fix_001": grade_medium,
+    "optimize_001": grade_hard,
+}
 
+
+class SQLDebugEnvironment(Environment):
     SUPPORTS_CONCURRENT_SESSIONS: bool = True
 
     def __init__(self):
@@ -57,32 +55,29 @@ class SQLDebugEnvironment(Environment):
         self._current_task = None
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._best_reward = 0.0
+        self._prev_absolute_score = 0.0  # used for delta computation
         self._current_query = ""
 
-    #
+    # sql_debug_environment.py – replace reset() return and step() return only
 
-    def reset(self, task_id: str = None) -> SQLDebugObservation:
-        """
-        Start a new episode.
-        Pass task_id to pick a specific task, or leave None for the default (easy).
-        """
+    def reset(self, task_id: str = None, **kwargs) -> SQLDebugObservation:
         if task_id is None:
             task_id = list(self._all_tasks.keys())[0]
 
         if task_id not in self._all_tasks:
-            # Unknown task – return error observation instead of crashing
             return SQLDebugObservation(
                 task_id=task_id,
-                error_message=f"Unknown
+                error_message=f"Unknown task '{task_id}'. Available: {list(self._all_tasks.keys())}",
                 available_tasks=list(self._all_tasks.keys()),
+                metadata={},
             )
 
         self._current_task = self._all_tasks[task_id]
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self._best_reward = 0.0
+        self._prev_absolute_score = 0.0
         self._current_query = self._current_task["broken_query"]
 
-        # Run the broken query so the agent sees the starting error
         run_result = run_query(
             self._current_task["schema_sql"],
             self._current_query,
@@ -101,45 +96,44 @@ class SQLDebugEnvironment(Environment):
             available_tasks=list(self._all_tasks.keys()),
             done=False,
             reward=0.0,
+            metadata={"feedback": "", "status": "ready"},  # ← feedback in metadata
         )
 
-    # ── step ──────────────────────────────────────────────────────────────────
-
     def step(self, action: SQLDebugAction) -> SQLDebugObservation:
-        """
-        Agent submits a query.
-        We run it, grade it, and return the new observation + reward.
-        """
+        # Auto-reset if not already initialized (handles session management issues)
         if self._current_task is None:
-            error_message="Call reset() before step()",
-            available_tasks=list(self._all_tasks.keys()),
-            done=True,
-            reward=0.0,
-            )
+            self.reset()
 
         self._state.step_count += 1
         self._current_query = action.query
 
-        # Run the query in SQLite
         run_result = run_query(
             self._current_task["schema_sql"],
             action.query,
         )
 
-        self._best_reward = max(self._best_reward,
+        task_id = self._current_task["task_id"]
+        grader_fn = TASK_GRADERS.get(task_id, grade_easy)
+
+        reward_dict = grader_fn(
+            task=self._current_task,
+            agent_query=action.query,
+            run_result=run_result,
+            prev_absolute_score=self._prev_absolute_score,
+            step_count=self._state.step_count,
+            max_steps=self._current_task.get("max_steps", 8),
+        )
+
+        self._prev_absolute_score = reward_dict["absolute_score"]
+        self._best_reward = max(self._best_reward, reward_dict["absolute_score"])
 
-        # Episode ends on perfect score or max steps
         max_steps = self._current_task.get("max_steps", 8)
-        done = (
+        done = (
+            reward_dict["absolute_score"] >= 0.99 or self._state.step_count >= max_steps
+        )
 
         return SQLDebugObservation(
-            task_id=
+            task_id=task_id,
             schema_sql=self._current_task["schema_sql"],
             current_query=action.query,
             error_message=run_result["error"] or "",
@@ -150,11 +144,17 @@ class SQLDebugEnvironment(Environment):
             reward_so_far=self._best_reward,
             available_tasks=list(self._all_tasks.keys()),
             done=done,
-            reward=
+            reward=reward_dict["value"],
+            metadata={  # ← all extra data here
+                "feedback": reward_dict["feedback"],
+                "status": reward_dict["status"],
+                "absolute_score": reward_dict["absolute_score"],
+                "delta": reward_dict["delta"],
+                "result_score": reward_dict["result_score"],
+                "plan_score": reward_dict["plan_score"],
+            },
         )
 
-    # ── state ─────────────────────────────────────────────────────────────────
-
     @property
     def state(self) -> State:
         return self._state
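For completeness, the reworked environment can also be poked at in-process, without the FastAPI layer; this is a sketch, not part of the commit, and it assumes you run it from the repo root so models, runner, tasks/ and graders/ resolve on the import path.

# local_env_check.py - drives SQLDebugEnvironment directly, no HTTP layer
from server.sql_debug_environment import SQLDebugEnvironment
from models import SQLDebugAction

env = SQLDebugEnvironment()
obs = env.reset(task_id="logic_fix_001")
print(obs.task_id, "|", obs.error_message or "broken query ran without error")

# One attempt; the grader's breakdown now travels in observation.metadata
obs = env.step(SQLDebugAction(query="SELECT 1"))
print(obs.reward, obs.metadata["status"], obs.metadata["feedback"])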
tasks/task_hard.py
CHANGED

@@ -4,12 +4,12 @@ import random
 def generate_schema(n_rows=5000, seed=42):
     """Generates schema + INSERT statements for n_rows transactions."""
     rng = random.Random(seed)
-    statuses = ['completed', 'pending', 'failed']
+    # statuses = ['completed', 'pending', 'failed']
     inserts = []
     for i in range(1, n_rows + 1):
         user_id = rng.randint(1, 100)
         amount = round(rng.uniform(10, 1000), 2)
-        status = rng.choice(statuses)
+        # status = rng.choice(statuses)
         inserts.append(f"INSERT INTO transactions VALUES ({i}, {user_id}, {amount}, 'completed');")
     return (
         "CREATE TABLE transactions (id INTEGER, user_id INTEGER, amount REAL, ts TEXT, status TEXT);\n"
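One consequence of commenting out the status sampling is that every generated row now carries status 'completed'. A quick peek at the string generate_schema returns (values depend on the seed, so only the shape is shown here; sketch, not part of the commit).

# schema_preview.py - prints the head of the SQL generated by task_hard
from tasks.task_hard import generate_schema

sql = generate_schema(n_rows=3, seed=42)
print(sql[:200])
# First line: the CREATE TABLE transactions (...) statement.
# Following lines: INSERT INTO transactions VALUES (<id>, <user_id>, <amount>, 'completed');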
uv.lock
CHANGED

@@ -1603,6 +1603,7 @@ name = "openenv-sql-debug"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [
+    { name = "httpx" },
     { name = "openai" },
     { name = "openenv-core", extra = ["core"] },
     { name = "uvicorn" },
@@ -1616,6 +1617,7 @@ dev = [
 
 [package.metadata]
 requires-dist = [
+    { name = "httpx", specifier = ">=0.28.1" },
     { name = "openai", specifier = ">=2.30.0" },
     { name = "openenv-core", extras = ["core"], specifier = ">=0.2.2" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" },