Spaces:

ritvik360
/

nl2sql-bench

Sleeping

App Files Files Community

ritvik360 committed 11 days ago

Commit

daab6a1

verified ·

1 Parent(s): 46e0615

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

inference.py +48 -52

inference.py CHANGED Viewed

@@ -6,7 +6,7 @@ MANDATORY COMPLIANCE
 --------------------
 - Named `inference.py`, placed in project root.
 - Uses OpenAI client for all LLM calls.
-- Reads: API_BASE_URL, MODEL_NAME, API_KEY (+ HF_TOKEN fallback) from environment.
 - Emits [START] / [STEP] / [END] lines to stdout in the exact format below.
 - Runs all 3 tasks; total runtime < 20 min on 2 vCPU / 8 GB.
@@ -27,38 +27,39 @@ from typing import List, Optional
 from openai import OpenAI
-# ── Configuration ──────────────────────────────────────────────────────────
-# CRITICAL: API_BASE_URL and API_KEY are injected by the competition evaluator.
-# Do NOT hardcode values. The evaluator injects their LiteLLM proxy URL + key.
-API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
-# CRITICAL FIX: Default MODEL_NAME must be a model available on the HF router /
-# the competition's LiteLLM proxy. "ritvik360/qwen-7b-nl2sql-merged_1" is NOT
-# on their proxy — it would silently fail and produce SELECT 1 for all steps.
-# The competition injects MODEL_NAME if they want to override it.
-MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-7B-Instruct")
-# CRITICAL FIX: Read API_KEY first (competition injects this), then fall back
-# to HF_TOKEN. Both variable names must be checked.
-API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN", "") or os.getenv("OPENAI_API_KEY", "")
-SPACE_URL         = os.getenv("SPACE_URL", "https://ritvik360-nl2sql-bench.hf.space")
 IMAGE_NAME        = os.getenv("LOCAL_IMAGE_NAME", "nl2sql-bench:latest")
 BENCHMARK         = "nl2sql-bench"
 MAX_STEPS         = 5
-TEMPERATURE       = 0.2
 MAX_TOKENS        = 512
-SUCCESS_THRESHOLD = 0.7
 TASKS = ["simple-filter", "join-aggregation", "analytics-window"]
-# ── Startup diagnostics (stderr — not scored) ─────────────────────────────
-print(f"[DEBUG] API_BASE_URL = {API_BASE_URL}", file=sys.stderr, flush=True)
-print(f"[DEBUG] MODEL_NAME   = {MODEL_NAME}", file=sys.stderr, flush=True)
-print(f"[DEBUG] API_KEY set  = {bool(API_KEY)} (len={len(API_KEY)})", file=sys.stderr, flush=True)
-print(f"[DEBUG] SPACE_URL    = {SPACE_URL}", file=sys.stderr, flush=True)
 # ── System prompt ──────────────────────────────────────────────────────────
 SYSTEM_PROMPT = textwrap.dedent("""
 You are an expert SQL analyst working with a SQLite e-commerce database.
@@ -94,6 +95,7 @@ def log_start(task: str, model: str) -> None:
 def log_step(
     step: int, action: str, reward: float, done: bool, error: Optional[str]
 ) -> None:
     action_single = " ".join(action.split())
     error_val = error.replace("\n", " ") if error else "null"
     print(
@@ -146,14 +148,7 @@ def build_user_prompt(
 def call_llm(client: OpenAI, user_prompt: str) -> str:
-    # CRITICAL: Do NOT silently swallow exceptions with a bare `except Exception`.
-    # Silent failure means inference.py "succeeds" but makes zero LLM API calls,
-    # which causes the competition's LLM Criteria Check to fail.
     try:
-        print(
-            f"[DEBUG] Calling LLM: model={MODEL_NAME} base_url={API_BASE_URL}",
-            file=sys.stderr, flush=True
-        )
         resp = client.chat.completions.create(
             model=MODEL_NAME,
             messages=[
@@ -165,7 +160,6 @@ def call_llm(client: OpenAI, user_prompt: str) -> str:
             stream=False,
         )
         text = (resp.choices[0].message.content or "").strip()
-        print(f"[DEBUG] LLM raw response (first 120 chars): {text[:120]}", file=sys.stderr, flush=True)
         # Strip markdown code fences if model wraps in ```sql ... ```
         if text.startswith("```"):
             lines = text.split("\n")
@@ -175,11 +169,8 @@ def call_llm(client: OpenAI, user_prompt: str) -> str:
             ).strip()
         return text if text else "SELECT 1"
     except Exception as exc:
-        # Log the full error — this is the signal that tells you what went wrong
-        print(f"[DEBUG] LLM call FAILED: {type(exc).__name__}: {exc}", file=sys.stderr, flush=True)
-        # Re-raise so the episode is marked failed, not silently scored as 0.
-        # A visible failure is better than a silent one that breaks the LLM check.
-        raise
 # ── Single-task episode ────────────────────────────────────────────────────
@@ -194,7 +185,11 @@ async def run_task(client: OpenAI, env, task_name: str) -> dict:
     log_start(task_name, MODEL_NAME)
     try:
-        result = await env.reset()
         obs = result.observation
         for step in range(1, MAX_STEPS + 1):
@@ -213,7 +208,7 @@ async def run_task(client: OpenAI, env, task_name: str) -> dict:
             sql = call_llm(client, user_prompt)
-            from models import NL2SQLAction
             action = NL2SQLAction(query=sql)
             result = await env.step(action)
             obs = result.observation
@@ -230,12 +225,15 @@ async def run_task(client: OpenAI, env, task_name: str) -> dict:
             if done:
                 break
-        score   = sum(rewards) / max(len(rewards), 1)
-        score   = round(min(max(score, 0.0), 1.0), 4)
-        success = score >= SUCCESS_THRESHOLD
     except Exception as exc:
-        print(f"[DEBUG] Episode error for {task_name}: {type(exc).__name__}: {exc}", file=sys.stderr, flush=True)
     finally:
         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
@@ -245,21 +243,17 @@ async def run_task(client: OpenAI, env, task_name: str) -> dict:
 # ── Main ───────────────────────────────────────────────────────────────────
 async def main() -> None:
-    # Validate that API_KEY is present — fail fast with a clear message
-    if not API_KEY:
-        print(
-            "[ERROR] No API key found. Set API_KEY or HF_TOKEN environment variable.",
-            file=sys.stderr, flush=True
-        )
-        sys.exit(1)
     client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
     from client import NL2SQLEnv
     all_results = []
     for task_name in TASKS:
         os.environ["NL2SQL_DEFAULT_TASK"] = task_name
         try:
@@ -268,18 +262,20 @@ async def main() -> None:
                 all_results.append(result)
         except Exception as exc:
             print(
-                f"[DEBUG] Failed to connect for task {task_name}: {type(exc).__name__}: {exc}",
                 file=sys.stderr,
                 flush=True,
             )
             log_end(success=False, steps=0, score=0.0, rewards=[])
             all_results.append({"task": task_name, "success": False, "score": 0.0})
-    # Summary to stderr
     print("\n=== Baseline Summary ===", file=sys.stderr)
     for r in all_results:
         print(
-            f"  {r['task']:20s}  score={r['score']:.3f}  success={r['success']}",
             file=sys.stderr,
         )
     avg = sum(r["score"] for r in all_results) / max(len(all_results), 1)

 --------------------
 - Named `inference.py`, placed in project root.
 - Uses OpenAI client for all LLM calls.
+- Reads: API_BASE_URL, MODEL_NAME, and API_KEY (falling back to HF_TOKEN, then OPENAI_API_KEY) from environment.
 - Emits [START] / [STEP] / [END] lines to stdout in the exact format below.
 - Runs all 3 tasks; total runtime < 20 min on 2 vCPU / 8 GB.
 from openai import OpenAI
+# # ── Configuration ──────────────────────────────────────────────────────────
+# API_BASE_URL   = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+# MODEL_NAME     = os.getenv("MODEL_NAME",   "Qwen/Qwen2.5-7B-Instruct")
+# API_KEY        = os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY", "")
+# IMAGE_NAME     = os.getenv("LOCAL_IMAGE_NAME", "nl2sql-bench:latest")
+# SPACE_URL      = os.getenv("SPACE_URL", "http://localhost:8000")
+# BENCHMARK      = "nl2sql-bench"
+# MAX_STEPS      = 5
+# TEMPERATURE    = 0.2      # Low temp for SQL generation
+# MAX_TOKENS     = 512
+# SUCCESS_THRESHOLD = 0.7   # score >= 0.7 → success
+# TASKS = ["simple-filter", "join-aggregation", "analytics-window"]
+# ── Configuration ──────────────────────────────────────────────────────────
+API_BASE_URL      = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+# Points to your newly uploaded fine-tuned weights!
+MODEL_NAME        = os.getenv("MODEL_NAME", "ritvik360/qwen-7b-nl2sql-merged_1")
+# CRITICAL FIX: Looks for 'API_KEY' first to satisfy the evaluator's LiteLLM proxy
+API_KEY           = os.getenv("API_KEY")  or os.getenv("HF_TOKEN", "") or os.getenv("OPENAI_API_KEY")
 IMAGE_NAME        = os.getenv("LOCAL_IMAGE_NAME", "nl2sql-bench:latest")
+# CRITICAL FIX: Point the default directly to your live HF Space!
+SPACE_URL         = os.getenv("SPACE_URL", "https://ritvik360-nl2sql-bench.hf.space")
 BENCHMARK         = "nl2sql-bench"
 MAX_STEPS         = 5
+TEMPERATURE       = 0.2      # Low temp for SQL generation
 MAX_TOKENS        = 512
+SUCCESS_THRESHOLD = 0.7      # score >= 0.7 → success
 TASKS = ["simple-filter", "join-aggregation", "analytics-window"]
 # ── System prompt ──────────────────────────────────────────────────────────
 SYSTEM_PROMPT = textwrap.dedent("""
 You are an expert SQL analyst working with a SQLite e-commerce database.
 def log_step(
     step: int, action: str, reward: float, done: bool, error: Optional[str]
 ) -> None:
+    # Collapse multi-line SQL to single line for log compliance
     action_single = " ".join(action.split())
     error_val = error.replace("\n", " ") if error else "null"
     print(
 def call_llm(client: OpenAI, user_prompt: str) -> str:
     try:
         resp = client.chat.completions.create(
             model=MODEL_NAME,
             messages=[
             stream=False,
         )
         text = (resp.choices[0].message.content or "").strip()
         # Strip markdown code fences if model wraps in ```sql ... ```
         if text.startswith("```"):
             lines = text.split("\n")
             ).strip()
         return text if text else "SELECT 1"
     except Exception as exc:
+        print(f"[DEBUG] LLM call failed: {exc}", file=sys.stderr, flush=True)
+        return "SELECT 1"
 # ── Single-task episode ────────────────────────────────────────────────────
     log_start(task_name, MODEL_NAME)
     try:
+        # Reset the environment. reset() is called with no arguments because
+        # OpenEnv's reset() may not accept task args over HTTP; instead we rely
+        # on the NL2SQL_DEFAULT_TASK env-var being set before this call. Passing
+        # the task as a reset parameter is an option only if the server supports it.
+        result = await env.reset() # changed
         obs = result.observation
         for step in range(1, MAX_STEPS + 1):
             sql = call_llm(client, user_prompt)
+            from models import NL2SQLAction  # local to avoid circular at module level
             action = NL2SQLAction(query=sql)
             result = await env.step(action)
             obs = result.observation
             if done:
                 break
+        # Compute final score
+        # CRITICAL: Evaluator requires score strictly in (0, 1) — not 0.0, not 1.0.
+        # A perfect solve gives 1.0 → clamp to 0.999. All-fail gives 0.0 → clamp to 0.001.
+        raw_score = sum(rewards) / max(len(rewards), 1)
+        score     = round(min(max(raw_score, 0.001), 0.999), 4)
+        success   = raw_score >= SUCCESS_THRESHOLD
     except Exception as exc:
+        print(f"[DEBUG] Episode error for {task_name}: {exc}", file=sys.stderr, flush=True)
     finally:
         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 # ── Main ───────────────────────────────────────────────────────────────────
 async def main() -> None:
     client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
+    # Import here to avoid import errors if openenv is not installed during linting
     from client import NL2SQLEnv
     all_results = []
     for task_name in TASKS:
+        # Set the default task for the server session via env-var approach.
+        # For the hosted Space, we rely on the task cycling implemented in
+        # the task registry's round-robin iterator.
         os.environ["NL2SQL_DEFAULT_TASK"] = task_name
         try:
                 all_results.append(result)
         except Exception as exc:
             print(
+                f"[DEBUG] Failed to connect for task {task_name}: {exc}",
                 file=sys.stderr,
                 flush=True,
             )
+            # Emit a zero-score END to keep log format valid
             log_end(success=False, steps=0, score=0.0, rewards=[])
             all_results.append({"task": task_name, "success": False, "score": 0.0})
+    # Summary to stderr (not scored, for human readability)
     print("\n=== Baseline Summary ===", file=sys.stderr)
     for r in all_results:
         print(
+            f"  {r['task']:20s}  score={r['score']:.3f}  "
+            f"success={r['success']}",
             file=sys.stderr,
         )
     avg = sum(r["score"] for r in all_results) / max(len(all_results), 1)