""" QueryForge Client Playbook ────────────────────────── Tests the environment through the HTTP server using the QueryforgeEnv client. Usage: # Against the live HF Space: python playbook.py # Against a local server: ENV_URL=http://localhost:8000 python playbook.py If ANTHROPIC_API_KEY is set on the server, Stage 4 AI scoring is live. If not set, the judge falls back to deterministic scoring (capped at 0.80). """ import os import sys import textwrap sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from client import QueryforgeEnv from models import SQLAction, TaskSpec from tasks import REGISTRY, task_from_dict BASE_URL = os.environ.get("ENV_URL", "https://prithvigg-queryforge.hf.space") # ── Formatting helpers ──────────────────────────────────────────────────────── def _hr(char="═", width=70): print(char * width) def _section(title): print() _hr() print(f" {title}") _hr() def _score_bar(score: float, width: int = 30) -> str: filled = int(score * width) bar = "█" * filled + "░" * (width - filled) return f"[{bar}] {score:.2f}" def _print_result(result, show_description=False): obs = result.observation if show_description and obs.task_description: print() print(textwrap.indent(obs.task_description, " ")) print() if obs.feedback and obs.feedback != "New task loaded. Submit your fixed/optimised SQL query.": print(f" Syntax valid : {obs.syntax_valid}") print(f" Execution OK : {obs.execution_success}") if obs.execution_error: print(f" Execution error : {obs.execution_error[:100]}") print(f" Rows returned : {obs.rows_returned}") print(f" Score : {_score_bar(result.reward or 0.0)}") print(f" Best this ep. : {_score_bar(obs.best_score)}") fb = obs.feedback[:250] + ("…" if len(obs.feedback) > 250 else "") print(f" Feedback : {fb}") if obs.hint: print(f" Hint : {obs.hint[:120]}") def _attempt(client, label: str, sql: str): print(f"\n ── Attempt: {label}") print(f" SQL: {sql[:100]}{'…' if len(sql) > 100 else ''}") result = client.step(SQLAction(sql=sql)) _print_result(result) return result # ── Task runners ────────────────────────────────────────────────────────────── def run_easy(client): _section("TASK 1 · EASY — Fix Syntax Errors") result = client.reset(task_id="task_easy_syntax") obs = result.observation print(f"\n Task : {obs.task_title} [{obs.task_level}]") _print_result(result, show_description=True) _attempt(client, "still broken", "SELEC name, age FORM users WEHRE age > 30") _attempt(client, "one keyword fixed", "SELECT name, age FORM users WEHRE age > 30") _attempt(client, "all keywords fixed, no filter", "SELECT name, age FROM users WHERE age > 30") result = _attempt(client, "correct solution", "SELECT name, age FROM users " "WHERE age > 30 AND city = 'New York' " "ORDER BY name ASC") print(f"\n Episode done: {result.done} | Best score: {result.observation.best_score:.2f}") def run_medium(client): _section("TASK 2 · MEDIUM — Fix the Cartesian JOIN") result = client.reset(task_id="task_medium_join") obs = result.observation print(f"\n Task : {obs.task_title} [{obs.task_level}]") _print_result(result, show_description=True) _attempt(client, "broken verbatim (cartesian product)", "SELECT u.name, p.title, SUM(o.amount) AS total_spent " "FROM orders o, users u, products p " "WHERE o.user_id = u.id " "GROUP BY u.name, p.title " "ORDER BY total_spent DESC") _attempt(client, "comma-join with product condition (no explicit JOIN)", "SELECT u.name, p.title, SUM(o.amount) AS total_spent " "FROM orders o, users u, products p " "WHERE o.user_id = u.id AND o.product_id = p.id " "GROUP BY u.name, p.title " "ORDER BY total_spent DESC") result = _attempt(client, "correct INNER JOINs", "SELECT u.name, p.title, SUM(o.amount) AS total_spent\n" "FROM orders o\n" "INNER JOIN users u ON o.user_id = u.id\n" "INNER JOIN products p ON o.product_id = p.id\n" "GROUP BY u.name, p.title\n" "ORDER BY total_spent DESC") print(f"\n Episode done: {result.done} | Best score: {result.observation.best_score:.2f}") def run_hard(client): _section("TASK 3 · HARD — Rewrite Correlated Subquery as CTE") result = client.reset(task_id="task_hard_cte") obs = result.observation print(f"\n Task : {obs.task_title} [{obs.task_level}]") _print_result(result, show_description=True) _attempt(client, "broken verbatim (no CTE)", "SELECT e.name, e.department_id, e.salary\n" "FROM employees e\n" "WHERE e.salary > (\n" " SELECT AVG(e2.salary) FROM employees e2\n" " WHERE e2.department_id = e.department_id\n" ")\n" "ORDER BY e.department_id, e.salary DESC") _attempt(client, "halfway — CTE defined but wrong join", "WITH dept_avg AS (\n" " SELECT department_id, AVG(salary) AS avg_salary\n" " FROM employees GROUP BY department_id\n" ")\n" "SELECT e.name, e.department_id, e.salary\n" "FROM employees e, dept_avg d\n" "WHERE e.salary > d.avg_salary\n" "ORDER BY e.department_id, e.salary DESC") result = _attempt(client, "correct CTE with proper JOIN", "WITH dept_avg AS (\n" " SELECT department_id, AVG(salary) AS avg_salary\n" " FROM employees\n" " GROUP BY department_id\n" ")\n" "SELECT e.name, e.department_id, e.salary\n" "FROM employees e\n" "JOIN dept_avg d ON e.department_id = d.department_id\n" "WHERE e.salary > d.avg_salary\n" "ORDER BY e.department_id, e.salary DESC") print(f"\n Episode done: {result.done} | Best score: {result.observation.best_score:.2f}") def run_custom(client): _section("TASK 4 · CUSTOM — NULL Handling in Aggregation") # Register a brand-new task at runtime via the REST API client.register_task(TaskSpec( id="custom_null_avg", level="custom", title="Handle NULLs in Aggregation", description="""\ TASK: The query below skips NULL scores, making the class average look higher. Fix it so NULL scores are treated as 0. SCHEMA: students(id INTEGER, name VARCHAR, score INTEGER) BROKEN QUERY: SELECT AVG(score) AS avg_score FROM students ERROR: NULL values are silently excluded by AVG(), inflating the result. GOAL: Return a single row with avg_score that treats NULL as 0. Expected result: avg_score = 65.0""", schema_ddl="""\ CREATE TABLE students (id INTEGER, name VARCHAR, score INTEGER); INSERT INTO students VALUES (1, 'Alice', 90), (2, 'Bob', NULL), (3, 'Carol', 80), (4, 'Dave', NULL), (5, 'Eve', 70), (6, 'Frank', 50); """, broken_query="SELECT AVG(score) AS avg_score FROM students", error_message="NULL scores are silently skipped by AVG().", hint="Wrap score with COALESCE(score, 0) before averaging.", expected_rows=[{"avg_score": 65.0}], solution_query="SELECT AVG(COALESCE(score, 0)) AS avg_score FROM students", test_description="AVG treats NULL as 0 → 65.0", max_steps=4, )) result = client.reset(task_id="custom_null_avg") obs = result.observation print(f"\n Task : {obs.task_title} [{obs.task_level}]") _print_result(result, show_description=True) _attempt(client, "broken (NULL excluded)", "SELECT AVG(score) AS avg_score FROM students") result = _attempt(client, "correct (COALESCE)", "SELECT AVG(COALESCE(score, 0)) AS avg_score FROM students") print(f"\n Episode done: {result.done} | Best score: {result.observation.best_score:.2f}") # Clean up client.delete_task("custom_null_avg") print(" Custom task unregistered from registry.") # ── Main ────────────────────────────────────────────────────────────────────── if __name__ == "__main__": ai_key = os.environ.get("ANTHROPIC_API_KEY") _hr("═") print(" QueryForge — Client Playbook") print(f" Server : {BASE_URL}") print(f" AI judge : {'LIVE (ANTHROPIC_API_KEY set)' if ai_key else 'OFFLINE (fallback to deterministic, max 0.80)'}") _hr("═") with QueryforgeEnv(base_url=BASE_URL).sync() as client: run_easy(client) run_medium(client) run_hard(client) run_custom(client) _section("DONE") print(" All tasks completed.\n")