Prithvigg committed on
Commit
3867c62
Β·
verified Β·
1 Parent(s): a8a3c90

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. client.py +4 -1
  2. inference.py +241 -0
  3. judge.py +27 -15
  4. playbook.py +98 -99
  5. server/queryforge_environment.py +24 -0
  6. tasks.py +290 -6
client.py CHANGED
@@ -7,7 +7,10 @@ from openenv.core import EnvClient
7
  from openenv.core.client_types import StepResult
8
  from openenv.core.env_server.types import State
9
 
10
- from .models import SQLAction, SQLObservation, TaskSpec
 
 
 
11
 
12
 
13
  class QueryforgeEnv(EnvClient[SQLAction, SQLObservation, State]):
 
7
  from openenv.core.client_types import StepResult
8
  from openenv.core.env_server.types import State
9
 
10
+ try:
11
+ from .models import SQLAction, SQLObservation, TaskSpec
12
+ except ImportError:
13
+ from models import SQLAction, SQLObservation, TaskSpec
14
 
15
 
16
  class QueryforgeEnv(EnvClient[SQLAction, SQLObservation, State]):
inference.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ QueryForge Inference Script
3
+ ===================================
4
+ MANDATORY env vars:
5
+ API_BASE_URL The API endpoint for the LLM (e.g. https://router.huggingface.co/v1)
6
+ MODEL_NAME The model identifier to use for inference
7
+ HF_TOKEN Your Hugging Face / API key
8
+
9
+ Optional env vars:
10
+ ENV_URL QueryForge environment server URL (default: http://localhost:8000)
11
+ ANTHROPIC_API_KEY Enables AI judge for scores up to 1.0 (default: deterministic mode)
12
+ """
13
+
14
+ import os
15
+ import re
16
+ import sys
17
+ import textwrap
18
+ from typing import List, Optional
19
+
20
+ from openai import OpenAI
21
+
22
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
23
+
24
+ from client import QueryforgeEnv
25
+ from models import SQLAction
26
+
27
+ # ── Configuration ─────────────────────────────────────────────────────────────
28
+
29
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
30
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
31
+ MODEL_NAME = os.getenv("MODEL_NAME")
32
+ ENV_URL = os.getenv("ENV_URL", "http://127.0.0.1:8000")
33
+
34
+ MAX_STEPS = 5 # max attempts per task (overridden by task's own max_steps)
35
+ TEMPERATURE = 0.2
36
+ MAX_TOKENS = 512
37
+
38
+ TASK_IDS = [
39
+ "task_easy_syntax",
40
+ "task_medium_join",
41
+ "task_hard_cte",
42
+ "task_expert_rank",
43
+ "task_expert_recursive",
44
+ "task_expert_window",
45
+ ]
46
+
47
+ # ── Prompts ───────────────────────────────────────────────────────────────────
48
+
49
+ SYSTEM_PROMPT = textwrap.dedent("""
50
+ You are an expert SQL engineer tasked with debugging and optimising SQL queries.
51
+ You will be given a SQL challenge that includes a schema, a broken or slow query,
52
+ and a description of what the correct output should be.
53
+
54
+ Rules:
55
+ - Respond with ONLY a single SQL query inside a ```sql ... ``` code block.
56
+ - Do not explain your reasoning outside the code block.
57
+ - Do not include multiple statements separated by semicolons.
58
+ - If you receive grading feedback on a previous attempt, use it to improve.
59
+ """).strip()
60
+
61
+ # ── SQL extraction ─────────────────────────────────────────────────────────────
62
+
63
# Pre-compiled pattern for a fenced code block, optionally tagged ``sql``.
# DOTALL lets ``.`` cross newlines; IGNORECASE also accepts ```SQL fences.
_SQL_BLOCK = re.compile(r"```(?:sql)?\s*(.*?)```", re.DOTALL | re.IGNORECASE)


def extract_sql(text: str) -> str:
    """Return the first fenced code block in *text*.

    Falls back to the whole stripped response when the model answered
    without markdown fences.
    """
    found = _SQL_BLOCK.search(text)
    return found.group(1).strip() if found else text.strip()
72
+
73
+
74
+ # ── Formatting ────────────────────────────────────────────────────────────────
75
+
76
def score_bar(score: float, width: int = 25) -> str:
    """Render *score* (expected 0.0–1.0) as a fixed-width unicode bar."""
    n_filled = int(score * width)
    cells = "β–ˆ" * n_filled + "β–‘" * (width - n_filled)
    return f"[{cells}] {score:.3f}"
79
+
80
+
81
def hr(char="═", width=70):
    """Print a horizontal rule made of *width* repetitions of *char*."""
    rule = char * width
    print(rule)
83
+
84
+
85
+ # ── Per-task agent loop ────────────────────────────────────────────────────────
86
+
87
def run_task(task_id: str, llm: OpenAI, env_client) -> dict:
    """
    Run one episode for a single task.

    Resets the environment to *task_id*, then loops: ask the LLM for a SQL
    query, submit it via env_client.step, print the grading outcome, and
    feed the feedback back to the model until the episode reports done or
    the LLM call fails.

    Returns dict with task_id, task_title, task_level, best_score,
    attempts, done.
    """
    result = env_client.reset(task_id=task_id)
    obs = result.observation

    if result.done:
        # Episode ended immediately — the task could not be loaded.
        print(f" ERROR loading task: {obs.feedback}")
        return {"task_id": task_id, "best_score": 0.0, "attempts": 0, "done": False}

    print(f"\n Task : {obs.task_title} [{obs.task_level}]")

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": (
                f"Here is your SQL challenge:\n\n{obs.task_description}\n\n"
                "Provide your fixed SQL query."
            ),
        },
    ]

    step = 0
    while not result.done:
        step += 1

        try:
            completion = llm.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=TEMPERATURE,
                max_tokens=MAX_TOKENS,
                stream=False,
            )
            response_text = completion.choices[0].message.content or ""
        except Exception as exc:
            # Network/auth failures abort the episode rather than loop forever.
            print(f" LLM call failed at step {step}: {exc}")
            break

        sql = extract_sql(response_text)

        # ── Print generated SQL ───────────────────────────────────────────────
        print(f"\n β”Œβ”€ Step {step} Β· SQL submitted {'─' * (50 - len(str(step)))}")
        for line in sql.splitlines():
            print(f" β”‚ {line}")
        print(f" β””{'─' * 56}")

        result = env_client.step(SQLAction(sql=sql))
        obs = result.observation

        # result.reward may be None β€” normalise once and reuse below.
        score = result.reward or 0.0
        done_marker = " βœ“ DONE" if result.done else ""
        print(f" Score : {score_bar(score)}{done_marker}")

        if not obs.syntax_valid:
            print(f" βœ— Syntax error β€” query could not be parsed")
        elif not obs.execution_success:
            print(f" βœ— Execution failed β€” {(obs.execution_error or '')[:80]}")
        else:
            print(f" βœ“ Executed Β· rows returned: {obs.rows_returned}")

        if result.done:
            break

        # ── Why are we going to the next step? ───────────────────────────────
        print(f"\n ↻ Retrying β€” score {score:.3f} below threshold")
        if obs.feedback:
            # Split the feedback into its tagged sections for readable multi-line output.
            # NOTE(review): splitting on a single space prints one word per line;
            # presumably the real section delimiter is wider β€” confirm the
            # feedback format emitted by the judge.
            for part in obs.feedback.split(" "):
                part = part.strip()
                if part:
                    print(f" {part}")
        if obs.hint:
            print(f" Hint : {obs.hint[:120]}")

        # Feed grading result back to the model for the next attempt.
        messages.append({"role": "assistant", "content": response_text})
        messages.append({
            "role": "user",
            "content": (
                # Fix: use the normalised `score` β€” formatting a None
                # result.reward with :.3f raised TypeError.
                f"Your query scored {score:.3f}.\n\n"
                f"Feedback: {obs.feedback}\n\n"
                f"Hint: {obs.hint}\n\n"
                "Please submit an improved SQL query."
            ),
        })

    return {
        "task_id": task_id,
        "task_title": obs.task_title,
        "task_level": obs.task_level,
        "best_score": obs.best_score,
        "attempts": obs.attempt,
        "done": result.done,
    }
185
+
186
+
187
+ # ── Main ───────────────────────────────────────────────────────────────────────
188
+
189
def main() -> None:
    """Entry point: validate configuration, run every task, print results.

    Exits with status 1 when mandatory configuration is missing.
    """
    # ── Validate required config ──────────────────────────────────────────────
    # Fix: API_BASE_URL has a module-level default, so only MODEL_NAME is
    # genuinely mandatory; the old check rejected runs relying on the default.
    missing = [v for v in ("MODEL_NAME",) if not os.getenv(v)]
    if missing:
        print(f"ERROR: missing required env vars: {', '.join(missing)}")
        sys.exit(1)

    if not API_KEY:
        print("ERROR: HF_TOKEN (or API_KEY) is not set.")
        sys.exit(1)

    llm = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    hr()
    print(" QueryForge β€” Inference")
    print(f" Model : {MODEL_NAME}")
    print(f" Env : {ENV_URL}")
    print(f" Tasks : {', '.join(TASK_IDS)}")
    hr()

    results = []

    # One episode per task id, all through a single client session.
    with QueryforgeEnv(base_url=ENV_URL).sync() as env_client:
        for task_id in TASK_IDS:
            print(f"\n{'─' * 70}")
            result = run_task(task_id, llm, env_client)
            results.append(result)

    # ── Results table ─────────────────────────────────────────────────────────
    print(f"\n{'═' * 70}")
    print(" RESULTS")
    print(f" Model: {MODEL_NAME}")
    print(f"{'═' * 70}")
    print(f" {'Task':<28} {'Level':<8} {'Steps':>5} {'Best Score'}")
    print(f" {'─' * 28} {'─' * 8} {'─' * 5} {'─' * 30}")

    total = 0.0
    for r in results:
        # Error episodes lack title/level/attempts β€” fall back gracefully.
        title = r.get("task_title", r["task_id"])[:27]
        level = r.get("task_level", "?")
        steps = r.get("attempts", "?")
        score = r["best_score"]
        total += score
        print(f" {title:<28} {level:<8} {steps:>5} {score_bar(score)}")

    avg = total / len(results) if results else 0.0
    print(f"{'─' * 70}")
    print(f" {'AVERAGE':<28} {'':8} {'':5} {score_bar(avg)}")
    print(f"{'═' * 70}\n")
238
+
239
+
240
+ if __name__ == "__main__":
241
+ main()
judge.py CHANGED
@@ -282,18 +282,22 @@ Respond with ONLY valid JSON (no markdown fences):
282
 
283
  try:
284
  message = client.messages.create(
285
- model="claude-sonnet-4-6",
286
  max_tokens=512,
287
- messages=[{"role": "user", "content": prompt}],
 
 
 
288
  )
289
- raw = message.content[0].text.strip()
 
 
290
 
291
- # Strip accidental markdown fences
292
- if raw.startswith("```"):
293
- raw = raw.split("```")[1]
294
- if raw.startswith("json"):
295
- raw = raw[4:]
296
- raw = raw.rsplit("```", 1)[0].strip()
297
 
298
  data = json.loads(raw)
299
  score = float(data["score"])
@@ -305,14 +309,13 @@ Respond with ONLY valid JSON (no markdown fences):
305
  except Exception as exc:
306
  # Graceful fallback β€” no API key, network error, or parse failure
307
  msg = str(exc).lower()
308
- reason = (
309
- "no ANTHROPIC_API_KEY set"
310
- if "api_key" in msg or "auth" in msg or "authentication" in msg
311
- else type(exc).__name__
312
- )
313
  return (
314
  deterministic_score,
315
- f"AI judge offline ({reason}). Using deterministic score.",
316
  task.hint,
317
  )
318
 
@@ -378,6 +381,15 @@ def grade(
378
  elif task.level == "medium" and "JOIN " not in query_upper:
379
  structural_penalty = 0.20 # medium task demands explicit JOINs
380
  row_feedback += " (Penalty: no explicit JOIN β€” task requires JOIN … ON syntax.)"
 
 
 
 
 
 
 
 
 
381
 
382
  details["structural_penalty"] = structural_penalty
383
 
 
282
 
283
  try:
284
  message = client.messages.create(
285
+ model=JUDGE_MODEL,
286
  max_tokens=512,
287
+ messages=[
288
+ {"role": "user", "content": prompt},
289
+ {"role": "assistant", "content": "{"}, # prefill forces JSON-only reply
290
+ ],
291
  )
292
+ print("Anthropic judge response:", message.content)
293
+ # Prepend the prefilled "{" back before parsing
294
+ raw = "{" + message.content[0].text.strip()
295
 
296
+ # Belt-and-suspenders: extract the first {...} block in case of any preamble
297
+ brace_start = raw.find("{")
298
+ brace_end = raw.rfind("}") + 1
299
+ if brace_start != -1 and brace_end > brace_start:
300
+ raw = raw[brace_start:brace_end]
 
301
 
302
  data = json.loads(raw)
303
  score = float(data["score"])
 
309
  except Exception as exc:
310
  # Graceful fallback β€” no API key, network error, or parse failure
311
  msg = str(exc).lower()
312
+ if "api_key" in msg or "auth" in msg or "authentication" in msg:
313
+ reason = "ANTHROPIC_API_KEY not set β€” deterministic scoring only (max 0.80)"
314
+ else:
315
+ reason = f"AI judge call failed ({type(exc).__name__}) β€” fell back to deterministic score"
 
316
  return (
317
  deterministic_score,
318
+ f"[AI Judge unavailable] {reason}.",
319
  task.hint,
320
  )
321
 
 
381
  elif task.level == "medium" and "JOIN " not in query_upper:
382
  structural_penalty = 0.20 # medium task demands explicit JOINs
383
  row_feedback += " (Penalty: no explicit JOIN β€” task requires JOIN … ON syntax.)"
384
+ elif task.id == "task_expert_recursive" and "RECURSIVE" not in query_upper:
385
+ structural_penalty = 0.30 # must use recursive CTE, not repeated JOINs
386
+ row_feedback += " (Penalty: WITH RECURSIVE required β€” plain JOIN only fetches one level.)"
387
+ elif task.id == "task_expert_rank" and "ROW_NUMBER" in query_upper:
388
+ structural_penalty = 0.20 # ROW_NUMBER breaks ties β€” must use RANK/DENSE_RANK
389
+ row_feedback += " (Penalty: ROW_NUMBER() drops tied rows β€” use RANK() or DENSE_RANK().)"
390
+ elif task.id == "task_expert_window" and "PARTITION BY" not in query_upper:
391
+ structural_penalty = 0.20 # both window functions need PARTITION BY region
392
+ row_feedback += " (Penalty: missing PARTITION BY β€” both SUM and RANK must be partitioned per region.)"
393
 
394
  details["structural_penalty"] = structural_penalty
395
 
playbook.py CHANGED
@@ -1,10 +1,13 @@
1
  """
2
- QueryForge Local Playbook
3
- ─────────────────────────
4
- Tests the environment directly (no HTTP server needed).
5
 
6
- Run from the queryforge directory:
7
- .venv/bin/python playbook.py
 
 
 
8
 
9
  If ANTHROPIC_API_KEY is set, Stage 4 AI scoring is live.
10
  If not set, the judge falls back to deterministic scoring (capped at 0.80).
@@ -14,13 +17,14 @@ import os
14
  import sys
15
  import textwrap
16
 
17
- # Make imports work whether run directly or as a module
18
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
19
 
20
- from server.queryforge_environment import QueryforgeEnvironment
21
- from models import SQLAction
22
  from tasks import REGISTRY, task_from_dict
23
 
 
 
24
  # ── Formatting helpers ────────────────────────────────────────────────────────
25
 
26
  def _hr(char="═", width=70):
@@ -37,8 +41,9 @@ def _score_bar(score: float, width: int = 30) -> str:
37
  bar = "β–ˆ" * filled + "β–‘" * (width - filled)
38
  return f"[{bar}] {score:.2f}"
39
 
40
- def _print_obs(obs, show_description=False):
41
- if show_description:
 
42
  print()
43
  print(textwrap.indent(obs.task_description, " "))
44
  print()
@@ -48,91 +53,87 @@ def _print_obs(obs, show_description=False):
48
  if obs.execution_error:
49
  print(f" Execution error : {obs.execution_error[:100]}")
50
  print(f" Rows returned : {obs.rows_returned}")
51
- print(f" Score : {_score_bar(obs.reward or 0.0)}")
52
  print(f" Best this ep. : {_score_bar(obs.best_score)}")
53
- # Print just the first 200 chars of feedback to keep output clean
54
  fb = obs.feedback[:250] + ("…" if len(obs.feedback) > 250 else "")
55
  print(f" Feedback : {fb}")
56
  if obs.hint:
57
  print(f" Hint : {obs.hint[:120]}")
58
 
59
- def _attempt(env, label: str, sql: str):
60
  print(f"\n ── Attempt: {label}")
61
  print(f" SQL: {sql[:100]}{'…' if len(sql) > 100 else ''}")
62
- obs = env.step(SQLAction(sql=sql))
63
- _print_obs(obs)
64
- return obs
65
 
66
 
67
  # ── Task runners ──────────────────────────────────────────────────────────────
68
 
69
- def run_easy(env):
70
  _section("TASK 1 Β· EASY β€” Fix Syntax Errors")
71
- env._task_index = 0 # pin to easy
72
- obs = env.reset()
73
  print(f"\n Task : {obs.task_title} [{obs.task_level}]")
74
- print(f" Steps: up to {5}")
75
- _print_obs(obs, show_description=True)
76
 
77
- _attempt(env, "still broken",
78
  "SELEC name, age FORM users WEHRE age > 30")
79
 
80
- _attempt(env, "one keyword fixed",
81
  "SELECT name, age FORM users WEHRE age > 30")
82
 
83
- _attempt(env, "all keywords fixed, no filter",
84
  "SELECT name, age FROM users WHERE age > 30")
85
 
86
- obs = _attempt(env, "correct solution",
87
- "SELECT name, age FROM users "
88
- "WHERE age > 30 AND city = 'New York' "
89
- "ORDER BY name ASC")
90
 
91
- print(f"\n Episode done: {obs.done} | Best score: {obs.best_score:.2f}")
92
 
93
 
94
- def run_medium(env):
95
  _section("TASK 2 Β· MEDIUM β€” Fix the Cartesian JOIN")
96
- env._task_index = 1 # pin to medium
97
- obs = env.reset()
98
  print(f"\n Task : {obs.task_title} [{obs.task_level}]")
99
- print(f" Steps: up to {5}")
100
- _print_obs(obs, show_description=True)
101
 
102
- _attempt(env, "broken verbatim (cartesian product)",
103
  "SELECT u.name, p.title, SUM(o.amount) AS total_spent "
104
  "FROM orders o, users u, products p "
105
  "WHERE o.user_id = u.id "
106
  "GROUP BY u.name, p.title "
107
  "ORDER BY total_spent DESC")
108
 
109
- _attempt(env, "comma-join but missing product condition",
110
  "SELECT u.name, p.title, SUM(o.amount) AS total_spent "
111
  "FROM orders o, users u, products p "
112
  "WHERE o.user_id = u.id AND o.product_id = p.id "
113
  "GROUP BY u.name, p.title "
114
  "ORDER BY total_spent DESC")
115
 
116
- obs = _attempt(env, "correct INNER JOINs",
117
- "SELECT u.name, p.title, SUM(o.amount) AS total_spent\n"
118
- "FROM orders o\n"
119
- "INNER JOIN users u ON o.user_id = u.id\n"
120
- "INNER JOIN products p ON o.product_id = p.id\n"
121
- "GROUP BY u.name, p.title\n"
122
- "ORDER BY total_spent DESC")
123
 
124
- print(f"\n Episode done: {obs.done} | Best score: {obs.best_score:.2f}")
125
 
126
 
127
- def run_hard(env):
128
  _section("TASK 3 Β· HARD β€” Rewrite Correlated Subquery as CTE")
129
- env._task_index = 2 # pin to hard
130
- obs = env.reset()
131
  print(f"\n Task : {obs.task_title} [{obs.task_level}]")
132
- print(f" Steps: up to {6}")
133
- _print_obs(obs, show_description=True)
134
 
135
- _attempt(env, "broken verbatim (no CTE β€” penalised even though rows match)",
136
  "SELECT e.name, e.department_id, e.salary\n"
137
  "FROM employees e\n"
138
  "WHERE e.salary > (\n"
@@ -141,7 +142,7 @@ def run_hard(env):
141
  ")\n"
142
  "ORDER BY e.department_id, e.salary DESC")
143
 
144
- _attempt(env, "halfway β€” CTE defined but wrong join",
145
  "WITH dept_avg AS (\n"
146
  " SELECT department_id, AVG(salary) AS avg_salary\n"
147
  " FROM employees GROUP BY department_id\n"
@@ -151,32 +152,30 @@ def run_hard(env):
151
  "WHERE e.salary > d.avg_salary\n"
152
  "ORDER BY e.department_id, e.salary DESC")
153
 
154
- obs = _attempt(env, "correct CTE with proper JOIN",
155
- "WITH dept_avg AS (\n"
156
- " SELECT department_id, AVG(salary) AS avg_salary\n"
157
- " FROM employees\n"
158
- " GROUP BY department_id\n"
159
- ")\n"
160
- "SELECT e.name, e.department_id, e.salary\n"
161
- "FROM employees e\n"
162
- "JOIN dept_avg d ON e.department_id = d.department_id\n"
163
- "WHERE e.salary > d.avg_salary\n"
164
- "ORDER BY e.department_id, e.salary DESC")
165
-
166
- print(f"\n Episode done: {obs.done} | Best score: {obs.best_score:.2f}")
167
 
 
168
 
169
- # ── Custom task demo ──────────────────────────────────────────────────────────
170
 
171
- def run_custom(env):
172
  _section("TASK 4 Β· CUSTOM β€” NULL Handling in Aggregation")
173
 
174
- # Register a brand-new task at runtime
175
- custom_task = task_from_dict({
176
- "id": "custom_null_avg",
177
- "level": "custom",
178
- "title": "Handle NULLs in Aggregation",
179
- "description": """\
180
  TASK: The query below skips NULL scores, making the class average look higher.
181
  Fix it so NULL scores are treated as 0.
182
 
@@ -190,8 +189,8 @@ ERROR:
190
  NULL values are silently excluded by AVG(), inflating the result.
191
 
192
  GOAL: Return a single row with avg_score that treats NULL as 0.
193
- Expected result: avg_score = 72.5""",
194
- "schema_ddl": """\
195
  CREATE TABLE students (id INTEGER, name VARCHAR, score INTEGER);
196
  INSERT INTO students VALUES
197
  (1, 'Alice', 90),
@@ -201,31 +200,30 @@ INSERT INTO students VALUES
201
  (5, 'Eve', 70),
202
  (6, 'Frank', 50);
203
  """,
204
- "broken_query": "SELECT AVG(score) AS avg_score FROM students",
205
- "error_message": "NULL scores are silently skipped by AVG().",
206
- "hint": "Wrap score with COALESCE(score, 0) before averaging.",
207
- "expected_rows": [{"avg_score": 65.0}],
208
- "solution_query": "SELECT AVG(COALESCE(score, 0)) AS avg_score FROM students",
209
- "test_description": "AVG treats NULL as 0 β†’ 65.0",
210
- "max_steps": 4,
211
- })
212
- REGISTRY.register(custom_task)
213
-
214
- obs = env.reset(task_id="custom_null_avg")
215
  print(f"\n Task : {obs.task_title} [{obs.task_level}]")
216
- print(f" Steps: up to {custom_task.max_steps}")
217
- _print_obs(obs, show_description=True)
218
 
219
- _attempt(env, "broken (NULL excluded)",
220
  "SELECT AVG(score) AS avg_score FROM students")
221
 
222
- obs = _attempt(env, "correct (COALESCE)",
223
- "SELECT AVG(COALESCE(score, 0)) AS avg_score FROM students")
224
 
225
- print(f"\n Episode done: {obs.done} | Best score: {obs.best_score:.2f}")
226
 
227
- # Clean up: remove custom task from registry
228
- REGISTRY.unregister("custom_null_avg")
229
  print(" Custom task unregistered from registry.")
230
 
231
 
@@ -235,15 +233,16 @@ if __name__ == "__main__":
235
  ai_key = os.environ.get("ANTHROPIC_API_KEY")
236
 
237
  _hr("═")
238
- print(" QueryForge β€” Local Playbook")
 
239
  print(f" AI judge : {'LIVE (ANTHROPIC_API_KEY set)' if ai_key else 'OFFLINE (fallback to deterministic, max 0.80)'}")
240
  _hr("═")
241
 
242
- # Create a fresh env for each task so cycling order never matters
243
- run_easy(QueryforgeEnvironment())
244
- run_medium(QueryforgeEnvironment())
245
- run_hard(QueryforgeEnvironment())
246
- run_custom(QueryforgeEnvironment())
247
 
248
  _section("DONE")
249
  print(" All 4 tasks completed.\n")
 
1
  """
2
+ QueryForge Client Playbook
3
+ ──────────────────────────
4
+ Tests the environment through the HTTP server using the QueryforgeEnv client.
5
 
6
+ Requires the server to be running first:
7
+ uvicorn server.app:app --host 0.0.0.0 --port 8000
8
+
9
+ Then run:
10
+ python playbook.py
11
 
12
  If ANTHROPIC_API_KEY is set, Stage 4 AI scoring is live.
13
  If not set, the judge falls back to deterministic scoring (capped at 0.80).
 
17
  import sys
18
  import textwrap
19
 
 
20
  sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
21
 
22
+ from client import QueryforgeEnv
23
+ from models import SQLAction, TaskSpec
24
  from tasks import REGISTRY, task_from_dict
25
 
26
+ BASE_URL = "https://prithvigg-queryforge.hf.space"
27
+
28
  # ── Formatting helpers ────────────────────────────────────────────────────────
29
 
30
  def _hr(char="═", width=70):
 
41
  bar = "β–ˆ" * filled + "β–‘" * (width - filled)
42
  return f"[{bar}] {score:.2f}"
43
 
44
+ def _print_result(result, show_description=False):
45
+ obs = result.observation
46
+ if show_description and obs.task_description:
47
  print()
48
  print(textwrap.indent(obs.task_description, " "))
49
  print()
 
53
  if obs.execution_error:
54
  print(f" Execution error : {obs.execution_error[:100]}")
55
  print(f" Rows returned : {obs.rows_returned}")
56
+ print(f" Score : {_score_bar(result.reward or 0.0)}")
57
  print(f" Best this ep. : {_score_bar(obs.best_score)}")
 
58
  fb = obs.feedback[:250] + ("…" if len(obs.feedback) > 250 else "")
59
  print(f" Feedback : {fb}")
60
  if obs.hint:
61
  print(f" Hint : {obs.hint[:120]}")
62
 
63
+ def _attempt(client, label: str, sql: str):
64
  print(f"\n ── Attempt: {label}")
65
  print(f" SQL: {sql[:100]}{'…' if len(sql) > 100 else ''}")
66
+ result = client.step(SQLAction(sql=sql))
67
+ _print_result(result)
68
+ return result
69
 
70
 
71
  # ── Task runners ──────────────────────────────────────────────────────────────
72
 
73
+ def run_easy(client):
74
  _section("TASK 1 Β· EASY β€” Fix Syntax Errors")
75
+ result = client.reset(task_id="task_easy_syntax")
76
+ obs = result.observation
77
  print(f"\n Task : {obs.task_title} [{obs.task_level}]")
78
+ _print_result(result, show_description=True)
 
79
 
80
+ _attempt(client, "still broken",
81
  "SELEC name, age FORM users WEHRE age > 30")
82
 
83
+ _attempt(client, "one keyword fixed",
84
  "SELECT name, age FORM users WEHRE age > 30")
85
 
86
+ _attempt(client, "all keywords fixed, no filter",
87
  "SELECT name, age FROM users WHERE age > 30")
88
 
89
+ result = _attempt(client, "correct solution",
90
+ "SELECT name, age FROM users "
91
+ "WHERE age > 30 AND city = 'New York' "
92
+ "ORDER BY name ASC")
93
 
94
+ print(f"\n Episode done: {result.done} | Best score: {result.observation.best_score:.2f}")
95
 
96
 
97
+ def run_medium(client):
98
  _section("TASK 2 Β· MEDIUM β€” Fix the Cartesian JOIN")
99
+ result = client.reset(task_id="task_medium_join")
100
+ obs = result.observation
101
  print(f"\n Task : {obs.task_title} [{obs.task_level}]")
102
+ _print_result(result, show_description=True)
 
103
 
104
+ _attempt(client, "broken verbatim (cartesian product)",
105
  "SELECT u.name, p.title, SUM(o.amount) AS total_spent "
106
  "FROM orders o, users u, products p "
107
  "WHERE o.user_id = u.id "
108
  "GROUP BY u.name, p.title "
109
  "ORDER BY total_spent DESC")
110
 
111
+ _attempt(client, "comma-join with product condition (no explicit JOIN)",
112
  "SELECT u.name, p.title, SUM(o.amount) AS total_spent "
113
  "FROM orders o, users u, products p "
114
  "WHERE o.user_id = u.id AND o.product_id = p.id "
115
  "GROUP BY u.name, p.title "
116
  "ORDER BY total_spent DESC")
117
 
118
+ result = _attempt(client, "correct INNER JOINs",
119
+ "SELECT u.name, p.title, SUM(o.amount) AS total_spent\n"
120
+ "FROM orders o\n"
121
+ "INNER JOIN users u ON o.user_id = u.id\n"
122
+ "INNER JOIN products p ON o.product_id = p.id\n"
123
+ "GROUP BY u.name, p.title\n"
124
+ "ORDER BY total_spent DESC")
125
 
126
+ print(f"\n Episode done: {result.done} | Best score: {result.observation.best_score:.2f}")
127
 
128
 
129
+ def run_hard(client):
130
  _section("TASK 3 Β· HARD β€” Rewrite Correlated Subquery as CTE")
131
+ result = client.reset(task_id="task_hard_cte")
132
+ obs = result.observation
133
  print(f"\n Task : {obs.task_title} [{obs.task_level}]")
134
+ _print_result(result, show_description=True)
 
135
 
136
+ _attempt(client, "broken verbatim (no CTE)",
137
  "SELECT e.name, e.department_id, e.salary\n"
138
  "FROM employees e\n"
139
  "WHERE e.salary > (\n"
 
142
  ")\n"
143
  "ORDER BY e.department_id, e.salary DESC")
144
 
145
+ _attempt(client, "halfway β€” CTE defined but wrong join",
146
  "WITH dept_avg AS (\n"
147
  " SELECT department_id, AVG(salary) AS avg_salary\n"
148
  " FROM employees GROUP BY department_id\n"
 
152
  "WHERE e.salary > d.avg_salary\n"
153
  "ORDER BY e.department_id, e.salary DESC")
154
 
155
+ result = _attempt(client, "correct CTE with proper JOIN",
156
+ "WITH dept_avg AS (\n"
157
+ " SELECT department_id, AVG(salary) AS avg_salary\n"
158
+ " FROM employees\n"
159
+ " GROUP BY department_id\n"
160
+ ")\n"
161
+ "SELECT e.name, e.department_id, e.salary\n"
162
+ "FROM employees e\n"
163
+ "JOIN dept_avg d ON e.department_id = d.department_id\n"
164
+ "WHERE e.salary > d.avg_salary\n"
165
+ "ORDER BY e.department_id, e.salary DESC")
 
 
166
 
167
+ print(f"\n Episode done: {result.done} | Best score: {result.observation.best_score:.2f}")
168
 
 
169
 
170
+ def run_custom(client):
171
  _section("TASK 4 Β· CUSTOM β€” NULL Handling in Aggregation")
172
 
173
+ # Register a brand-new task at runtime via the REST API
174
+ client.register_task(TaskSpec(
175
+ id="custom_null_avg",
176
+ level="custom",
177
+ title="Handle NULLs in Aggregation",
178
+ description="""\
179
  TASK: The query below skips NULL scores, making the class average look higher.
180
  Fix it so NULL scores are treated as 0.
181
 
 
189
  NULL values are silently excluded by AVG(), inflating the result.
190
 
191
  GOAL: Return a single row with avg_score that treats NULL as 0.
192
+ Expected result: avg_score = 65.0""",
193
+ schema_ddl="""\
194
  CREATE TABLE students (id INTEGER, name VARCHAR, score INTEGER);
195
  INSERT INTO students VALUES
196
  (1, 'Alice', 90),
 
200
  (5, 'Eve', 70),
201
  (6, 'Frank', 50);
202
  """,
203
+ broken_query="SELECT AVG(score) AS avg_score FROM students",
204
+ error_message="NULL scores are silently skipped by AVG().",
205
+ hint="Wrap score with COALESCE(score, 0) before averaging.",
206
+ expected_rows=[{"avg_score": 65.0}],
207
+ solution_query="SELECT AVG(COALESCE(score, 0)) AS avg_score FROM students",
208
+ test_description="AVG treats NULL as 0 β†’ 65.0",
209
+ max_steps=4,
210
+ ))
211
+
212
+ result = client.reset(task_id="custom_null_avg")
213
+ obs = result.observation
214
  print(f"\n Task : {obs.task_title} [{obs.task_level}]")
215
+ _print_result(result, show_description=True)
 
216
 
217
+ _attempt(client, "broken (NULL excluded)",
218
  "SELECT AVG(score) AS avg_score FROM students")
219
 
220
+ result = _attempt(client, "correct (COALESCE)",
221
+ "SELECT AVG(COALESCE(score, 0)) AS avg_score FROM students")
222
 
223
+ print(f"\n Episode done: {result.done} | Best score: {result.observation.best_score:.2f}")
224
 
225
+ # Clean up
226
+ client.delete_task("custom_null_avg")
227
  print(" Custom task unregistered from registry.")
228
 
229
 
 
233
  ai_key = os.environ.get("ANTHROPIC_API_KEY")
234
 
235
  _hr("═")
236
+ print(" QueryForge β€” Client Playbook")
237
+ print(f" Server : {BASE_URL}")
238
  print(f" AI judge : {'LIVE (ANTHROPIC_API_KEY set)' if ai_key else 'OFFLINE (fallback to deterministic, max 0.80)'}")
239
  _hr("═")
240
 
241
+ with QueryforgeEnv(base_url=BASE_URL).sync() as client:
242
+ # run_easy(client)
243
+ run_medium(client)
244
+ run_hard(client)
245
+ # run_custom(client)
246
 
247
  _section("DONE")
248
  print(" All 4 tasks completed.\n")
server/queryforge_environment.py CHANGED
@@ -20,6 +20,8 @@ Episode ends when:
20
  - max_steps for the task is exhausted
21
  """
22
 
 
 
23
  from typing import Optional
24
  from uuid import uuid4
25
 
@@ -35,6 +37,16 @@ except ImportError:
35
  from tasks import REGISTRY, SQLTask
36
  from judge import grade
37
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  class QueryforgeEnvironment(Environment):
40
  """
@@ -97,6 +109,12 @@ class QueryforgeEnvironment(Environment):
97
  self._attempt = 0
98
  self._stale_steps = 0
99
 
 
 
 
 
 
 
100
  if task_id is not None:
101
  try:
102
  self._current_task = REGISTRY.get(task_id)
@@ -142,6 +160,12 @@ class QueryforgeEnvironment(Environment):
142
  reward=0.0,
143
  )
144
 
 
 
 
 
 
 
145
  score, feedback, details = grade(self._current_task, action.sql)
146
 
147
  # Fix 1 β€” early stopping: track consecutive steps with no improvement
 
20
  - max_steps for the task is exhausted
21
  """
22
 
23
+ import logging
24
+ import os
25
  from typing import Optional
26
  from uuid import uuid4
27
 
 
37
  from tasks import REGISTRY, SQLTask
38
  from judge import grade
39
 
40
+ logger = logging.getLogger(__name__)
41
+ _AI_JUDGE_ACTIVE = bool(os.environ.get("ANTHROPIC_API_KEY"))
42
+
43
+ print("here", os.environ.get("ANTHROPIC_API_KEY"))
44
+ logger.info(
45
+ "QueryForge environment loaded | AI judge: %s | done_threshold: %s",
46
+ "ACTIVE (scores up to 1.0)" if _AI_JUDGE_ACTIVE else "OFFLINE β€” deterministic only (max score 0.80)",
47
+ "0.90" if _AI_JUDGE_ACTIVE else "0.80",
48
+ )
49
+
50
 
51
  class QueryforgeEnvironment(Environment):
52
  """
 
109
  self._attempt = 0
110
  self._stale_steps = 0
111
 
112
+ logger.info(
113
+ "reset() | task_id=%s | AI judge: %s",
114
+ task_id or "round-robin",
115
+ "ACTIVE" if _AI_JUDGE_ACTIVE else "OFFLINE",
116
+ )
117
+
118
  if task_id is not None:
119
  try:
120
  self._current_task = REGISTRY.get(task_id)
 
160
  reward=0.0,
161
  )
162
 
163
+ logger.info(
164
+ "step() | task=%s | attempt=%d | AI judge: %s",
165
+ self._current_task.id,
166
+ self._attempt,
167
+ "ACTIVE" if _AI_JUDGE_ACTIVE else "OFFLINE",
168
+ )
169
  score, feedback, details = grade(self._current_task, action.sql)
170
 
171
  # Fix 1 β€” early stopping: track consecutive steps with no improvement
tasks.py CHANGED
@@ -263,6 +263,283 @@ ORDER BY e.department_id, e.salary DESC""",
263
  )
264
 
265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  # ── Task Registry ─────────────────────────────────────────────────────────────
267
 
268
  class TaskRegistry:
@@ -273,9 +550,10 @@ class TaskRegistry:
273
  Custom tasks can be added via register(), load_from_json(), or POST /tasks.
274
  """
275
 
276
- _BUILTIN_IDS: frozenset = frozenset(
277
- ["task_easy_syntax", "task_medium_join", "task_hard_cte"]
278
- )
 
279
 
280
  def __init__(self, initial_tasks: List[SQLTask]) -> None:
281
  self._lock = Lock()
@@ -408,8 +686,14 @@ def task_from_dict(d: Dict[str, Any]) -> SQLTask:
408
 
409
  # ── Global singleton ──────────────────────────────────────────────────────────
410
 
411
- REGISTRY = TaskRegistry([_TASK_EASY, _TASK_MEDIUM, _TASK_HARD])
 
 
 
412
 
413
- # Backwards-compat: snapshot of the three built-in tasks at import time
414
- TASKS: List[SQLTask] = [_TASK_EASY, _TASK_MEDIUM, _TASK_HARD]
 
 
 
415
  TASK_BY_ID: Dict[str, SQLTask] = {t.id: t for t in TASKS}
 
263
  )
264
 
265
 
266
# ── Expert tasks ──────────────────────────────────────────────────────────────

# Expert task 1: ROW_NUMBER() silently drops rows tied for first place;
# the intended fix is RANK() (or DENSE_RANK()), which keeps all tied rows.
# NOTE(review): string interiors reproduced flush-left as rendered — confirm
# original in-file indentation of the SQL snippets.
_TASK_EXPERT_RANK = SQLTask(
    id="task_expert_rank",
    level="expert",
    title="Fix the Tie-Breaking Window Function",
    # Prompt shown to the agent: schema, broken query, diagnosis, and goal.
    description="""\
TASK: The query below finds the top-earning sales rep per region, but it
silently drops reps who are tied for first place. Fix it so ALL reps
tied at rank 1 are returned.

SCHEMA:
sales_reps(id INTEGER, name VARCHAR, region VARCHAR, revenue DECIMAL)

BROKEN QUERY:
SELECT name, region, revenue
FROM (
SELECT name, region, revenue,
ROW_NUMBER() OVER (PARTITION BY region ORDER BY revenue DESC) AS rn
FROM sales_reps
) ranked
WHERE rn = 1
ORDER BY region, name

PROBLEM:
ROW_NUMBER() assigns unique sequential numbers even for tied revenue values.
When two reps share the top revenue in a region, ROW_NUMBER arbitrarily
picks one and discards the other.

GOAL: Return ALL reps whose revenue is the highest in their region.
Use RANK() or DENSE_RANK() instead of ROW_NUMBER().
Order by region ASC, name ASC.""",
    # Fixture data: North has Alice/Carol tied at 95000, South has Dave/Eve
    # tied at 88000 — both ties must survive into the result.
    schema_ddl="""\
CREATE TABLE sales_reps (id INTEGER, name VARCHAR, region VARCHAR, revenue DECIMAL);
INSERT INTO sales_reps VALUES
(1, 'Alice', 'North', 95000),
(2, 'Bob', 'North', 87000),
(3, 'Carol', 'North', 95000),
(4, 'Dave', 'South', 88000),
(5, 'Eve', 'South', 88000),
(6, 'Frank', 'South', 75000);
""",
    broken_query="""\
SELECT name, region, revenue
FROM (
SELECT name, region, revenue,
ROW_NUMBER() OVER (PARTITION BY region ORDER BY revenue DESC) AS rn
FROM sales_reps
) ranked
WHERE rn = 1
ORDER BY region, name""",
    error_message=(
        "Query runs but returns only 2 rows — one per region. "
        "Tied reps at the top are silently dropped by ROW_NUMBER()."
    ),
    hint="Replace ROW_NUMBER() with RANK() or DENSE_RANK(). Both include all tied rows.",
    test_cases=[
        TestCase(
            description="All reps tied at rank 1 per region",
            # Four rows: both North ties and both South ties.
            expected_rows=[
                {"name": "Alice", "region": "North", "revenue": 95000.0},
                {"name": "Carol", "region": "North", "revenue": 95000.0},
                {"name": "Dave", "region": "South", "revenue": 88000.0},
                {"name": "Eve", "region": "South", "revenue": 88000.0},
            ],
            order_by="region,name",
        )
    ],
    # Reference answer used by the grader.
    solution_query="""\
SELECT name, region, revenue
FROM (
SELECT name, region, revenue,
RANK() OVER (PARTITION BY region ORDER BY revenue DESC) AS rk
FROM sales_reps
) ranked
WHERE rk = 1
ORDER BY region, name""",
    max_steps=6,  # episode ends after 6 attempts
)
345
+
346
+
347
# Expert task 2: a hardcoded two-level CTE expansion must be rewritten as a
# recursive CTE so subordinates at any depth (8 total under id=3) are found.
# NOTE(review): string interiors reproduced flush-left as rendered — confirm
# original in-file indentation of the SQL snippets.
_TASK_EXPERT_RECURSIVE = SQLTask(
    id="task_expert_recursive",
    level="expert",
    title="Traverse Org Chart with Recursive CTE",
    # Prompt shown to the agent: org-chart data, broken query, and goal.
    description="""\
TASK: The query below attempts to find all subordinates of the VP of Engineering
(id=3) using a two-level CTE expansion. It misses employees more than two levels
deep. Rewrite it using a recursive CTE that traverses all levels.

SCHEMA:
employees(id INTEGER, name VARCHAR, manager_id INTEGER)

DATA (partial):
VP Eng (id=3) → Lead A (id=5), Lead B (id=6)
Lead A (id=5) → Dev 1 (id=8), Dev 2 (id=9)
Lead B (id=6) → Dev 3 (id=10), Dev 4 (id=11)
Dev 1 (id=8) → Junior 1 (id=13), Junior 2 (id=14)

BROKEN QUERY:
WITH direct AS (
SELECT id, name, manager_id FROM employees WHERE manager_id = 3
),
level2 AS (
SELECT e.id, e.name, e.manager_id
FROM employees e
INNER JOIN direct d ON e.manager_id = d.id
)
SELECT id, name, manager_id FROM direct
UNION ALL
SELECT id, name, manager_id FROM level2
ORDER BY id

PROBLEM:
This hardcoded two-level expansion returns 6 rows but misses Junior 1 (id=13)
and Junior 2 (id=14), who report to Dev 1 — three levels below VP Eng.
Adding a level3 CTE would help for now but still break if the tree grows deeper.

GOAL: Use WITH RECURSIVE to return ALL 8 subordinates of VP Eng (id=3)
at any depth. Return id, name, manager_id columns, ordered by id ASC.""",
    # Fixture data: 14 employees; ids 5,6,8,9,10,11,13,14 sit under VP Eng.
    schema_ddl="""\
CREATE TABLE employees (id INTEGER, name VARCHAR, manager_id INTEGER);
INSERT INTO employees VALUES
(1, 'CEO', NULL),
(2, 'CFO', 1),
(3, 'VP Eng', 1),
(4, 'VP Sales', 1),
(5, 'Lead A', 3),
(6, 'Lead B', 3),
(7, 'Sales Mgr',4),
(8, 'Dev 1', 5),
(9, 'Dev 2', 5),
(10, 'Dev 3', 6),
(11, 'Dev 4', 6),
(12, 'Sales Rep',7),
(13, 'Junior 1', 8),
(14, 'Junior 2', 8);
""",
    broken_query="""\
WITH direct AS (
SELECT id, name, manager_id FROM employees WHERE manager_id = 3
),
level2 AS (
SELECT e.id, e.name, e.manager_id
FROM employees e
INNER JOIN direct d ON e.manager_id = d.id
)
SELECT id, name, manager_id FROM direct
UNION ALL
SELECT id, name, manager_id FROM level2
ORDER BY id""",
    error_message=(
        "Query returns only 6 rows — two levels under VP Eng. "
        "Junior 1 (id=13) and Junior 2 (id=14) who report to Dev 1 are missing. "
        "A hardcoded level3 CTE would fix this instance but not scale to deeper trees."
    ),
    hint="Use WITH RECURSIVE. Start from manager_id = 3, then JOIN employees to the CTE itself on manager_id = cte.id.",
    test_cases=[
        TestCase(
            description="All 8 subordinates of VP Eng at any depth",
            # Every employee transitively reporting to id=3, ordered by id.
            expected_rows=[
                {"id": 5, "name": "Lead A", "manager_id": 3},
                {"id": 6, "name": "Lead B", "manager_id": 3},
                {"id": 8, "name": "Dev 1", "manager_id": 5},
                {"id": 9, "name": "Dev 2", "manager_id": 5},
                {"id": 10, "name": "Dev 3", "manager_id": 6},
                {"id": 11, "name": "Dev 4", "manager_id": 6},
                {"id": 13, "name": "Junior 1", "manager_id": 8},
                {"id": 14, "name": "Junior 2", "manager_id": 8},
            ],
            order_by="id",
        )
    ],
    # Reference answer: anchor = direct reports of 3; recursive step joins
    # employees against the CTE to walk down one level per iteration.
    solution_query="""\
WITH RECURSIVE subordinates AS (
SELECT id, name, manager_id
FROM employees
WHERE manager_id = 3
UNION ALL
SELECT e.id, e.name, e.manager_id
FROM employees e
INNER JOIN subordinates s ON e.manager_id = s.id
)
SELECT id, name, manager_id
FROM subordinates
ORDER BY id""",
    max_steps=7,  # one extra attempt vs. the other expert tasks
)
454
+
455
+
456
# Expert task 3: both window functions are missing PARTITION BY region.
#
# FIXES vs. the committed version:
#   1. The description claimed West Q1's broken running_total "shows 65000";
#      with SUM(revenue) OVER (ORDER BY region, quarter) the value is 76000
#      (East's full-year 65000 + West Q1's 11000), matching error_message.
#   2. expected_rows had West Q2/Q4 revenue ranks swapped (14000 was listed
#      rank 3 and 13000 rank 2). Within West, 16000>14000>13000>11000, so
#      14000 must rank 2 and 13000 rank 3 — as written, solution_query could
#      never pass its own test case.
_TASK_EXPERT_WINDOW = SQLTask(
    id="task_expert_window",
    level="expert",
    title="Fix Two Broken Window Functions: Running Total and Revenue Rank",
    # Prompt shown to the agent: both OVER clauses need PARTITION BY region,
    # but with different ORDER BY columns.
    description="""\
TASK: The query below computes a cumulative running total and a
within-region revenue rank for each quarter, but BOTH window functions
are broken — neither has a PARTITION BY, so they treat all rows as one
giant partition instead of computing independently per region.

SCHEMA:
quarterly_sales(region VARCHAR, quarter INTEGER, revenue DECIMAL)

BROKEN QUERY:
SELECT region, quarter, revenue,
SUM(revenue) OVER (ORDER BY region, quarter) AS running_total,
RANK() OVER (ORDER BY revenue DESC) AS revenue_rank
FROM quarterly_sales
ORDER BY region, quarter

PROBLEM:
- running_total accumulates across both regions: West's Q1 shows 76000
(continuing from East's Q4) instead of resetting to 11000.
- revenue_rank ranks revenue across ALL regions globally, so East Q4 (20000)
and West Q3 (16000) compete directly instead of being ranked within their
own region.

GOAL: Fix BOTH window functions so they operate independently per region.
- running_total must reset to 0 at the start of each region (ORDER BY quarter).
- revenue_rank must rank revenue within each region (ORDER BY revenue DESC).
Both OVER clauses need PARTITION BY region, but with different ORDER BY columns.
Final output: ORDER BY region ASC, quarter ASC.""",
    # Fixture data: two regions x four quarters.
    schema_ddl="""\
CREATE TABLE quarterly_sales (region VARCHAR, quarter INTEGER, revenue DECIMAL);
INSERT INTO quarterly_sales VALUES
('East', 1, 15000),
('East', 2, 18000),
('East', 3, 12000),
('East', 4, 20000),
('West', 1, 11000),
('West', 2, 14000),
('West', 3, 16000),
('West', 4, 13000);
""",
    broken_query="""\
SELECT region, quarter, revenue,
SUM(revenue) OVER (ORDER BY region, quarter) AS running_total,
RANK() OVER (ORDER BY revenue DESC) AS revenue_rank
FROM quarterly_sales
ORDER BY region, quarter""",
    error_message=(
        "Query runs but both window functions are wrong. "
        "West Q1 running_total shows 76000 (continuing from East) instead of 11000. "
        "revenue_rank is a global ranking across all 8 rows instead of per-region. "
        "Both SUM and RANK are missing PARTITION BY region."
    ),
    hint=(
        "Add PARTITION BY region to BOTH window functions, but with different ORDER BY: "
        "SUM(revenue) OVER (PARTITION BY region ORDER BY quarter) for running total, "
        "RANK() OVER (PARTITION BY region ORDER BY revenue DESC) for within-region rank."
    ),
    test_cases=[
        TestCase(
            description="Per-region running total and within-region revenue rank",
            # Running totals reset per region; ranks computed per region:
            # East 20000>18000>15000>12000, West 16000>14000>13000>11000.
            expected_rows=[
                {"region": "East", "quarter": 1, "revenue": 15000.0, "running_total": 15000.0, "revenue_rank": 3},
                {"region": "East", "quarter": 2, "revenue": 18000.0, "running_total": 33000.0, "revenue_rank": 2},
                {"region": "East", "quarter": 3, "revenue": 12000.0, "running_total": 45000.0, "revenue_rank": 4},
                {"region": "East", "quarter": 4, "revenue": 20000.0, "running_total": 65000.0, "revenue_rank": 1},
                {"region": "West", "quarter": 1, "revenue": 11000.0, "running_total": 11000.0, "revenue_rank": 4},
                {"region": "West", "quarter": 2, "revenue": 14000.0, "running_total": 25000.0, "revenue_rank": 2},
                {"region": "West", "quarter": 3, "revenue": 16000.0, "running_total": 41000.0, "revenue_rank": 1},
                {"region": "West", "quarter": 4, "revenue": 13000.0, "running_total": 54000.0, "revenue_rank": 3},
            ],
            order_by="region,quarter",
        )
    ],
    # Reference answer used by the grader.
    solution_query="""\
SELECT region, quarter, revenue,
SUM(revenue) OVER (PARTITION BY region ORDER BY quarter) AS running_total,
RANK() OVER (PARTITION BY region ORDER BY revenue DESC) AS revenue_rank
FROM quarterly_sales
ORDER BY region, quarter""",
    max_steps=6,  # episode ends after 6 attempts
)
541
+
542
+
543
  # ── Task Registry ─────────────────────────────────────────────────────────────
544
 
545
  class TaskRegistry:
 
550
  Custom tasks can be added via register(), load_from_json(), or POST /tasks.
551
  """
552
 
553
    # IDs of the tasks that ship with this module (3 core + 3 expert).
    # NOTE(review): presumably consulted to protect built-ins from removal or
    # overwrite via the registry API — confirm against the methods that use it.
    _BUILTIN_IDS: frozenset = frozenset([
        "task_easy_syntax", "task_medium_join", "task_hard_cte",
        "task_expert_rank", "task_expert_recursive", "task_expert_window",
    ])
557
 
558
  def __init__(self, initial_tasks: List[SQLTask]) -> None:
559
  self._lock = Lock()
 
686
 
687
  # ── Global singleton ──────────────────────────────────────────────────────────
688
 
689
# Process-wide registry seeded with every built-in task (3 core + 3 expert).
# Prefer REGISTRY.get()/register() for runtime access — it reflects tasks
# added after import.
REGISTRY = TaskRegistry([
    _TASK_EASY, _TASK_MEDIUM, _TASK_HARD,
    _TASK_EXPERT_RANK, _TASK_EXPERT_RECURSIVE, _TASK_EXPERT_WINDOW,
])

# Backwards-compat: snapshot of all built-in tasks at import time
TASKS: List[SQLTask] = [
    _TASK_EASY, _TASK_MEDIUM, _TASK_HARD,
    _TASK_EXPERT_RANK, _TASK_EXPERT_RECURSIVE, _TASK_EXPERT_WINDOW,
]
# Backwards-compat id → task lookup built once from the snapshot above;
# tasks registered later are NOT reflected here (use REGISTRY.get()).
TASK_BY_ID: Dict[str, SQLTask] = {t.id: t for t in TASKS}