Spaces:

ar9av
/

sql-agent-openenv

Sleeping

ar9avg committed 10 days ago

Commit

8ae8e0b

1 Parent(s): f0b682f

Fix chat query failures and benchmark ID mismatches

- Free-form chat: use error-based success check instead of task grader
(grader compared against wrong expected results, causing all queries to fail)
- Add /api/benchmark-questions endpoint to expose real task question IDs
- Benchmark SSE: rename query_id→id, success→pass, overall_score→overallScore
to match frontend field names
- Add queryIds filtering support to BenchmarkRequest
- Frontend: load benchmark questions from API instead of hardcoded IDs
(E1-E5 didn't match backend sq-01–sq-05)
- Remove duplicate difficulty tabs from BenchmarkPanel

Files changed (5) hide show

backend/api/demo.py +33 -10
frontend/src/App.tsx +26 -2
frontend/src/components/BenchmarkPanel.tsx +0 -17
frontend/src/lib/api.ts +8 -0
frontend/src/store/useStore.ts +17 -29

backend/api/demo.py CHANGED Viewed

@@ -179,11 +179,9 @@ async def execute_query_stream(req: ExecuteQueryRequest):
             rows, error = execute_query(generated_sql)
-            from env.tasks import grade_response
-            task_score = grade_response(
-                task_id, question_obj.id, generated_sql, rows, error, attempt
-            )
-            attempt_success = task_score >= 0.8
             current_error_class = None
             error_class_name = None
@@ -320,10 +318,30 @@ async def execute_query_stream(req: ExecuteQueryRequest):
     return EventSourceResponse(event_generator())
 # ─── /api/benchmark ───────────────────────────────────────────────
 class BenchmarkRequest(BaseModel):
     task_id: str = "simple_queries"
 @router.post("/benchmark")
@@ -333,10 +351,14 @@ async def run_benchmark(req: BenchmarkRequest):
         task = get_task(task_id)
         scores: list[float] = []
-        for question_obj in task.questions:
             yield {"data": json.dumps({
                 "type": "query_start",
-                "query_id": question_obj.id,
                 "question": question_obj.question,
             })}
@@ -444,17 +466,18 @@ async def run_benchmark(req: BenchmarkRequest):
             yield {"data": json.dumps({
                 "type": "query_result",
-                "query_id": question_obj.id,
-                "success": success,
                 "score": task_score,
                 "sql": sql,
                 "attempts": attempt,
             })}
         overall_score = sum(scores) / len(scores) if scores else 0.0
         yield {"data": json.dumps({
             "type": "done",
-            "overall_score": overall_score,
             "task_id": task_id,
         })}

             rows, error = execute_query(generated_sql)
+            # For free-form chat, success = no SQL error (not task grader)
+            attempt_success = (error is None)
+            task_score = 1.0 if attempt_success else 0.0
             current_error_class = None
             error_class_name = None
     return EventSourceResponse(event_generator())
+# ─── /api/benchmark-questions ────────────────────────────────────
+@router.get("/benchmark-questions")
+async def get_benchmark_questions(task_id: str = "easy"):
+    """Return the question list of one benchmark task for the frontend.
+
+    ``task_id`` is looked up in ``_DIFFICULTY_MAP``; when it is not a key
+    there, the value is passed to ``get_task`` unchanged (presumably so that
+    both difficulty labels and real task ids are accepted -- verify against
+    the map's definition).
+
+    Returns a dict with a single ``"questions"`` key: one entry per question
+    of the task, each holding the question's ``id``, its ``question`` text,
+    and the task-level ``difficulty``.
+
+    NOTE(review): how ``get_task`` behaves for an unknown id is not visible
+    in this hunk -- confirm it results in a client error rather than a 500.
+    """
+    # Fall back to the raw value when it is not a key of the map.
+    mapped_id = _DIFFICULTY_MAP.get(task_id, task_id)
+    task = get_task(mapped_id)
+    difficulty_label = task.difficulty  # "easy" | "medium" | "hard"
+    return {
+        "questions": [
+            {
+                "id": q.id,
+                "question": q.question,
+                # Difficulty is taken from the task, not from each question.
+                "difficulty": difficulty_label,
+            }
+            for q in task.questions
+        ]
+    }
 # ─── /api/benchmark ───────────────────────────────────────────────
 class BenchmarkRequest(BaseModel):
     task_id: str = "simple_queries"
+    queryIds: Optional[list[str]] = None
 @router.post("/benchmark")
         task = get_task(task_id)
         scores: list[float] = []
+        questions = task.questions
+        if req.queryIds:
+            questions = [q for q in questions if q.id in req.queryIds]
+        for question_obj in questions:
             yield {"data": json.dumps({
                 "type": "query_start",
+                "id": question_obj.id,
                 "question": question_obj.question,
             })}
             yield {"data": json.dumps({
                 "type": "query_result",
+                "id": question_obj.id,
+                "pass": success,
                 "score": task_score,
                 "sql": sql,
                 "attempts": attempt,
+                "reason": "Correct" if success else "Agent exhausted all repair attempts",
             })}
         overall_score = sum(scores) / len(scores) if scores else 0.0
         yield {"data": json.dumps({
             "type": "done",
+            "overallScore": overall_score,
             "task_id": task_id,
         })}

frontend/src/App.tsx CHANGED Viewed

@@ -11,7 +11,7 @@ import { RightSidebar } from './components/RightSidebar'
 import { DemoMode } from './components/DemoMode'
 import { ConnectDB } from './components/ConnectDB'
 import { useStore } from './store/useStore'
-import { fetchInit } from './lib/api'
 type Tab = 'chat' | 'benchmark' | 'er'
@@ -28,7 +28,7 @@ export default function App() {
   const [demoOpen, setDemoOpen] = useState(false)
   const [connectDbOpen, setConnectDbOpen] = useState(false)
-  const { theme, setDbSeeded, setTables, setSchemaGraph, setDbLabel } = useStore()
   // Apply theme on mount / change
   useEffect(() => {
@@ -62,6 +62,30 @@ export default function App() {
       .catch(() => { /* noop */ })
   }, [setDbSeeded, setTables, setSchemaGraph])
   // Close mobile sidebars on tab change
   useEffect(() => {
     setLeftOpen(false)

 import { DemoMode } from './components/DemoMode'
 import { ConnectDB } from './components/ConnectDB'
 import { useStore } from './store/useStore'
+import { fetchInit, fetchBenchmarkQuestions } from './lib/api'
 type Tab = 'chat' | 'benchmark' | 'er'
   const [demoOpen, setDemoOpen] = useState(false)
   const [connectDbOpen, setConnectDbOpen] = useState(false)
+  const { theme, setDbSeeded, setTables, setSchemaGraph, setDbLabel, taskDifficulty } = useStore()
   // Apply theme on mount / change
   useEffect(() => {
       .catch(() => { /* noop */ })
   }, [setDbSeeded, setTables, setSchemaGraph])
+  // Load benchmark questions from API on mount
+  useEffect(() => {
+    const { setBenchmarkResults } = useStore.getState()
+    fetchBenchmarkQuestions(taskDifficulty)
+      .then(({ questions }) => {
+        setBenchmarkResults(
+          questions.map((q) => ({
+            id: q.id,
+            question: q.question,
+            difficulty: q.difficulty as 'easy' | 'medium' | 'hard',
+            status: 'pending' as const,
+            score: null,
+            sql: null,
+            reason: null,
+            attempts: null,
+            refRowCount: null,
+            agentRowCount: null,
+          }))
+        )
+      })
+      .catch(() => { /* noop */ })
+  // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [])
   // Close mobile sidebars on tab change
   useEffect(() => {
     setLeftOpen(false)

frontend/src/components/BenchmarkPanel.tsx CHANGED Viewed

@@ -356,23 +356,6 @@ export function BenchmarkPanel() {
         )}
       </div>
-      {/* Difficulty tabs */}
-      <div className="flex items-center gap-1 px-4 py-2 border-b border-white/[0.06] shrink-0">
-        {DIFFICULTY_TABS.map((tab) => (
-          <button
-            key={tab.id}
-            onClick={() => setTaskDifficulty(tab.id)}
-            className={`px-3 py-1 rounded-lg text-xs font-medium transition-all ${
-              taskDifficulty === tab.id
-                ? 'bg-violet-600/20 text-violet-300 border border-violet-500/30'
-                : 'text-gray-500 hover:text-gray-300 hover:bg-white/5 border border-transparent'
-            }`}
-          >
-            {tab.label}
-          </button>
-        ))}
-      </div>
       {/* Query list */}
       <div className="flex-1 overflow-y-auto">
         <div className="p-2 flex flex-col gap-1">

         )}
       </div>
       {/* Query list */}
       <div className="flex-1 overflow-y-auto">
         <div className="p-2 flex flex-col gap-1">

frontend/src/lib/api.ts CHANGED Viewed

@@ -96,6 +96,14 @@ export async function fetchPromptHistory() {
   return res.json()
 }
 export async function connectExternalDb(path: string): Promise<{ success: boolean; message: string; tables: { name: string; rows: number }[]; dbLabel: string }> {
   const res = await fetch(`${BASE_URL}/api/connect-db`, {
     method: 'POST',

   return res.json()
 }
+/**
+ * Fetches the benchmark question list from `GET /api/benchmark-questions`.
+ *
+ * @param taskId - Sent as the `task_id` query parameter (URI-encoded).
+ * @returns The parsed JSON body. The body is returned as-is; its shape is
+ *   asserted by the return type only and is not validated at runtime.
+ * @throws Error with message `HTTP <status>` when the response is not OK.
+ */
+export async function fetchBenchmarkQuestions(
+  taskId: string
+): Promise<{ questions: { id: string; question: string; difficulty: string }[] }> {
+  const res = await fetch(`${BASE_URL}/api/benchmark-questions?task_id=${encodeURIComponent(taskId)}`)
+  // Surface non-2xx responses as a rejection so callers can handle them in `.catch`.
+  if (!res.ok) throw new Error(`HTTP ${res.status}`)
+  return res.json()
+}
 export async function connectExternalDb(path: string): Promise<{ success: boolean; message: string; tables: { name: string; rows: number }[]; dbLabel: string }> {
   const res = await fetch(`${BASE_URL}/api/connect-db`, {
     method: 'POST',

frontend/src/store/useStore.ts CHANGED Viewed

@@ -8,6 +8,7 @@ import type {
   PromptSnapshot,
   Difficulty,
 } from '../lib/types'
 interface Store {
   // Theme
@@ -64,28 +65,12 @@ interface Store {
   setPromptData: (data: { prompt: string; generation: number; history: PromptSnapshot[] }) => void
 }
-const EASY_QUERIES: BenchmarkResult[] = [
-  { id: 'E1', question: 'Show all products', difficulty: 'easy', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'E2', question: 'List all users from the USA', difficulty: 'easy', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'E3', question: 'What product categories exist?', difficulty: 'easy', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'E4', question: 'How many orders are in the database?', difficulty: 'easy', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'E5', question: 'Show all sellers with their names', difficulty: 'easy', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-]
-const MEDIUM_QUERIES: BenchmarkResult[] = [
-  { id: 'M1', question: 'Top 5 sellers by total revenue', difficulty: 'medium', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'M2', question: 'Average order value by country', difficulty: 'medium', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'M3', question: 'Products with stock below 10 units', difficulty: 'medium', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'M4', question: 'Monthly order count for the last 12 months', difficulty: 'medium', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'M5', question: 'Categories ranked by number of products', difficulty: 'medium', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-]
-const HARD_QUERIES: BenchmarkResult[] = [
-  { id: 'H1', question: 'Rolling 7-day revenue for the past 30 days', difficulty: 'hard', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'H2', question: 'Seller ranking with rank change from previous month', difficulty: 'hard', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'H3', question: 'Cohort retention analysis by signup month', difficulty: 'hard', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'H4', question: 'Identify top products contributing to 80% of revenue (Pareto)', difficulty: 'hard', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
-  { id: 'H5', question: 'Customer lifetime value segmented by acquisition channel', difficulty: 'hard', status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null },
 ]
 export const useStore = create<Store>((set) => ({
@@ -105,13 +90,16 @@ export const useStore = create<Store>((set) => ({
   setTaskId: (id) => set({ taskId: id }),
   setTaskDifficulty: (d) => {
     const taskId = d === 'easy' ? 'simple_queries' : d === 'medium' ? 'join_queries' : 'complex_queries'
-    set({
-      taskDifficulty: d,
-      taskId,
-      benchmarkResults:
-        d === 'easy' ? EASY_QUERIES : d === 'medium' ? MEDIUM_QUERIES : HARD_QUERIES,
-      overallScore: null,
-    })
   },
   // DB
   dbLabel: 'benchmark (built-in)',
@@ -139,7 +127,7 @@ export const useStore = create<Store>((set) => ({
   setOptimizingBanner: (v) => set({ optimizingBanner: v }),
   // Benchmark
-  benchmarkResults: EASY_QUERIES,
   setBenchmarkResults: (r) => set({ benchmarkResults: r }),
   updateBenchmarkResult: (r) =>
     set((s) => ({

   PromptSnapshot,
   Difficulty,
 } from '../lib/types'
+import { fetchBenchmarkQuestions } from '../lib/api'
 interface Store {
   // Theme
   setPromptData: (data: { prompt: string; generation: number; history: PromptSnapshot[] }) => void
 }
+/**
+ * Builds a `BenchmarkResult` in its initial state: `status` is `'pending'`
+ * and every outcome field (`score`, `sql`, `reason`, `attempts`,
+ * `refRowCount`, `agentRowCount`) is `null`.
+ *
+ * @param id - Question identifier stored on the result.
+ * @param question - Question text stored on the result.
+ * @param difficulty - Difficulty stored on the result.
+ * @returns A new pending result object.
+ */
+function makePending(id: string, question: string, difficulty: Difficulty): BenchmarkResult {
+  return { id, question, difficulty, status: 'pending', score: null, sql: null, reason: null, attempts: null, refRowCount: null, agentRowCount: null }
+}
+const PLACEHOLDER_QUERIES: BenchmarkResult[] = [
+  makePending('loading', 'Loading questions…', 'easy'),
 ]
 export const useStore = create<Store>((set) => ({
   setTaskId: (id) => set({ taskId: id }),
   setTaskDifficulty: (d) => {
     const taskId = d === 'easy' ? 'simple_queries' : d === 'medium' ? 'join_queries' : 'complex_queries'
+    set({ taskDifficulty: d, taskId, overallScore: null })
+    fetchBenchmarkQuestions(d)
+      .then(({ questions }) => {
+        set({
+          benchmarkResults: questions.map((q) =>
+            makePending(q.id, q.question, q.difficulty as Difficulty)
+          ),
+        })
+      })
+      .catch(() => { /* keep current list on error */ })
   },
   // DB
   dbLabel: 'benchmark (built-in)',
   setOptimizingBanner: (v) => set({ optimizingBanner: v }),
   // Benchmark
+  benchmarkResults: PLACEHOLDER_QUERIES,
   setBenchmarkResults: (r) => set({ benchmarkResults: r }),
   updateBenchmarkResult: (r) =>
     set((s) => ({