weijiang99 commited on
Commit
cb5acaf
·
verified ·
1 Parent(s): 52ea128

Update SpatialBench pipeline

Browse files
Files changed (2) hide show
  1. app.py +183 -186
  2. requirements.txt +0 -7
app.py CHANGED
@@ -3,26 +3,27 @@ app.py — SpatialBench Gradio application
3
  -----------------------------------------
4
  Entrypoint for the HuggingFace Space "SpatialBench".
5
 
6
- Two tabs:
7
- 1. Leaderboard — visualize pre-computed results from all three tasks
8
- 2. Run — launch experiments directly via API keys (no SLURM needed)
9
- (on HF Space, set API keys as Space Secrets)
 
10
 
11
  To run locally:
12
  cd pipeline/
13
  python app.py
14
 
15
  To deploy on HuggingFace Spaces:
16
- - Set Space Secrets: GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, DEEPSEEK_API_KEY
17
- - The Space entrypoint is this file (app.py)
18
  """
19
 
20
  from __future__ import annotations
21
 
22
  import os
23
  import sys
24
- import threading
25
- import time
26
  from pathlib import Path
27
 
28
  import gradio as gr
@@ -43,8 +44,7 @@ if _env.exists():
43
  # Add repo root to path so pipeline imports work
44
  sys.path.insert(0, str(Path(__file__).parent))
45
 
46
- from pipeline.task_builder import load_config, build_all_jobs
47
- from pipeline.job_monitor import JobMonitor, submit_direct
48
  from pipeline.results_loader import (
49
  load_all_results,
50
  maze_navigation_leaderboard,
@@ -53,17 +53,11 @@ from pipeline.results_loader import (
53
  )
54
 
55
  # ---------------------------------------------------------------------------
56
- # Paths
57
  # ---------------------------------------------------------------------------
58
  CONFIG_PATH = Path(__file__).parent / "configs" / "experiments.yaml"
59
  CFG = load_config(CONFIG_PATH)
60
  MODEL_CHOICES = list(CFG["models"].keys())
61
- MODEL_DISPLAY = {k: v["display_name"] for k, v in CFG["models"].items()}
62
-
63
- # Global job monitor (direct mode only — HF Space has no SLURM)
64
- _monitor = JobMonitor(mode="direct")
65
- _monitor_lock = threading.Lock()
66
-
67
 
68
  # ---------------------------------------------------------------------------
69
  # Leaderboard helpers
@@ -72,8 +66,12 @@ _monitor_lock = threading.Lock()
72
  def _load_results():
73
  try:
74
  return load_all_results(CONFIG_PATH)
75
- except Exception as e:
76
- return {"maze_navigation": pd.DataFrame(), "point_reuse": pd.DataFrame(), "compositional_distance": pd.DataFrame()}
 
 
 
 
77
 
78
 
79
  def _make_empty_fig(msg: str) -> go.Figure:
@@ -86,7 +84,7 @@ def _make_empty_fig(msg: str) -> go.Figure:
86
  return fig
87
 
88
 
89
- # ── Task 1 plots ────────────────────────────────────────────────────────────
90
 
91
  def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]:
92
  results = _load_results()
@@ -122,7 +120,6 @@ def plot_task1_format_comparison() -> go.Figure:
122
  if df.empty:
123
  return _make_empty_fig("No Task 1 results found.")
124
 
125
- # Average over grid sizes, compare raw vs visual at k=0 with CoT
126
  sub = df[(df["k_shot"] == 0) & (df["prompt_strategy"] == "cot")]
127
  if sub.empty:
128
  sub = df[df["k_shot"] == 0]
@@ -140,7 +137,7 @@ def plot_task1_format_comparison() -> go.Figure:
140
  return fig
141
 
142
 
143
- # ── Task 2 plots ────────────────────────────────────────────────────────────
144
 
145
  def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]:
146
  results = _load_results()
@@ -191,7 +188,7 @@ def plot_task2_by_grid() -> go.Figure:
191
  return fig
192
 
193
 
194
- # ── Task 3 plots ────────────────────────────────────────────────────────────
195
 
196
  def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]:
197
  results = _load_results()
@@ -242,61 +239,54 @@ def plot_task3_by_grid() -> go.Figure:
242
 
243
 
244
  # ---------------------------------------------------------------------------
245
- # Run-experiments tab
246
  # ---------------------------------------------------------------------------
247
 
248
- # Map from env-var name → user-provided key (populated at runtime from form)
249
- _USER_KEYS: dict[str, str] = {}
250
- _USER_KEYS_LOCK = threading.Lock()
 
 
 
251
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
- def launch_experiments(
 
254
  tasks: list[str],
255
  models: list[str],
256
  grid_sizes_str: str,
257
  formats: list[str],
258
  strategies: list[str],
259
- gemini_key: str,
260
- openai_key: str,
261
- anthropic_key: str,
262
- deepseek_key: str,
263
- ) -> tuple[str, list[list[str]]]:
264
- """Called when the user clicks 'Run' in the Gradio UI."""
265
- # Build a key map from only what the user explicitly typed — never os.environ
266
- user_keys: dict[str, str] = {}
267
- if gemini_key.strip():
268
- user_keys["GEMINI_API_KEY"] = gemini_key.strip()
269
- if openai_key.strip():
270
- user_keys["OPENAI_API_KEY"] = openai_key.strip()
271
- if anthropic_key.strip():
272
- user_keys["ANTHROPIC_API_KEY"] = anthropic_key.strip()
273
- if deepseek_key.strip():
274
- user_keys["DEEPSEEK_API_KEY"] = deepseek_key.strip()
275
-
276
- if not user_keys:
277
- return (
278
- "No API keys provided. Please enter at least one API key to run experiments.",
279
- [],
280
- )
281
 
282
- # Parse grid sizes
283
  try:
284
  grid_sizes = [int(g.strip()) for g in grid_sizes_str.split(",") if g.strip()]
285
  except ValueError:
286
- return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", []
287
 
288
- if not tasks:
289
- return "Select at least one task.", []
290
- if not models:
291
- return "Select at least one model.", []
292
-
293
- # Map display choices back to internal IDs
294
- task_map = {
295
- "Maze Navigation": "maze_navigation",
296
- "Sequential Point Reuse": "point_reuse",
297
- "Compositional Distance Comparison": "compositional_distance",
298
- }
299
- selected_tasks = [task_map[t] for t in tasks if t in task_map]
300
 
301
  jobs = build_all_jobs(
302
  cfg=CFG,
@@ -309,52 +299,75 @@ def launch_experiments(
309
  )
310
 
311
  if not jobs:
312
- return "No jobs matched the selected filters.", []
313
-
314
- launched = 0
315
- skipped = 0
316
- skipped_models: list[str] = []
317
- with _monitor_lock:
318
- for job in jobs:
319
- # Only use the key the user provided — never fall back to server env
320
- api_key = user_keys.get(job.api_key_env, "")
321
- if not api_key:
322
- skipped += 1
323
- skipped_models.append(job.model)
324
- continue
325
- job.output_dir.mkdir(parents=True, exist_ok=True)
326
- proc = submit_direct(
327
- cmd=job.python_cmd,
328
- working_dir=str(job.working_dir),
329
- env={job.api_key_env: api_key},
330
- )
331
- _monitor.add_direct(
332
- proc=proc,
333
- label=job.label,
334
- task_id=job.task_id,
335
- model=job.model,
336
- output_dir=str(job.output_dir),
337
- )
338
- launched += 1
339
- time.sleep(1) # avoid API rate limits on burst start
340
-
341
- status_msg = f"Launched {launched} job(s)."
342
- if skipped:
343
- missing = sorted(set(skipped_models))
344
- status_msg += (
345
- f" Skipped {skipped} job(s) for {', '.join(missing)} "
346
- f"— no API key provided for those models."
347
- )
348
-
349
- return status_msg, _monitor.as_table()
350
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
- def refresh_status() -> tuple[list[list[str]], str]:
353
- _monitor.refresh()
354
- summary = _monitor.summary()
355
- counts = summary["counts"]
356
- msg = " ".join(f"{s}: {n}" for s, n in counts.items()) or "No jobs submitted yet."
357
- return _monitor.as_table(), msg
358
 
359
 
360
  # ---------------------------------------------------------------------------
@@ -380,12 +393,10 @@ CSS = """
380
  *, body, .gradio-container { font-family: 'Inter', ui-sans-serif, system-ui, sans-serif !important; }
381
  code, pre, .monospace { font-family: 'IBM Plex Mono', ui-monospace, monospace !important; }
382
  .leaderboard-table { font-size: 0.9em; }
383
- .status-badge-running { color: #2196F3; font-weight: bold; }
384
- .status-badge-completed { color: #4CAF50; font-weight: bold; }
385
- .status-badge-failed { color: #F44336; font-weight: bold; }
386
  footer { display: none !important; }
387
  """
388
 
 
389
  def build_ui() -> gr.Blocks:
390
  with gr.Blocks(
391
  title="SpatialBench — Do LLMs Build Spatial World Models?",
@@ -449,14 +460,10 @@ def build_ui() -> gr.Blocks:
449
  "Do models reuse their earlier computation, or start from scratch?"
450
  )
451
 
452
- t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5,
453
- label="Grid Size")
454
  t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy")
455
  t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes")
456
- t2_lb = gr.Dataframe(
457
- label="Leaderboard",
458
- elem_classes=["leaderboard-table"],
459
- )
460
 
461
  def refresh_task2(gs):
462
  fig, lb = plot_task2_q0_q3(int(gs))
@@ -508,7 +515,6 @@ def build_ui() -> gr.Blocks:
508
  ],
509
  )
510
 
511
- # Initial load
512
  demo.load(
513
  refresh_all_leaderboard,
514
  outputs=[
@@ -519,99 +525,93 @@ def build_ui() -> gr.Blocks:
519
  )
520
 
521
  # ================================================================
522
- # Tab 2: Run Experiments
523
  # ================================================================
524
- with gr.Tab(" Run Experiments"):
525
  gr.Markdown(
526
- "## Launch Experiments\n"
527
- "Experiments call LLM APIs directly — no compute cluster needed.\n\n"
528
- "> **Your API keys are used only for your session and are never stored or logged.** \n"
529
- "> Enter keys only for the model(s) you want to evaluate. "
530
- "Jobs for models without a key will be skipped."
 
 
 
 
 
 
 
 
 
531
  )
532
 
533
  with gr.Row():
534
  with gr.Column(scale=2):
535
- # Task / model / grid selection
536
- run_tasks = gr.CheckboxGroup(
537
- choices=[
538
- "Maze Navigation",
539
- "Sequential Point Reuse",
540
- "Compositional Distance Comparison",
541
- ],
542
  value=["Maze Navigation"],
543
  label="Tasks",
544
  )
545
- run_models = gr.CheckboxGroup(
546
  choices=MODEL_CHOICES,
547
  value=["gemini-2.5-flash"],
548
  label="Models",
549
  )
550
- run_grids = gr.Textbox(
551
- value="5,6,7",
552
  label="Grid Sizes",
553
- info="Comma-separated integers. Maze dataset supports 5–9 (and beyond if regenerated).",
554
  )
555
  with gr.Row():
556
- run_formats = gr.CheckboxGroup(
557
  choices=["raw", "visual"],
558
- value=["raw"],
559
  label="Input Formats (Task 1 only)",
560
  )
561
- run_strategies = gr.CheckboxGroup(
562
  choices=["base", "cot", "reasoning"],
563
- value=["cot"],
564
  label="Prompt Strategies",
565
  )
566
 
567
  with gr.Column(scale=1):
568
- gr.Markdown("### API Keys")
569
- gr.Markdown(
570
- "Enter the key(s) for the model(s) you selected. "
571
- "Keys are used only for this session."
572
- )
573
- gemini_key = gr.Textbox(
574
- label="GEMINI_API_KEY", type="password", placeholder="AIza...",
575
- )
576
- openai_key = gr.Textbox(
577
- label="OPENAI_API_KEY", type="password", placeholder="sk-...",
578
- )
579
- anthropic_key = gr.Textbox(
580
- label="ANTHROPIC_API_KEY", type="password", placeholder="sk-ant-...",
581
  )
582
- deepseek_key = gr.Textbox(
583
- label="DEEPSEEK_API_KEY", type="password",
 
 
584
  )
585
 
586
  with gr.Row():
587
- run_btn = gr.Button("🚀 Launch Experiments", variant="primary", scale=2)
588
- refresh_btn = gr.Button("🔄 Refresh Status", scale=1)
589
 
590
- launch_msg = gr.Textbox(label="Launch Status", interactive=False)
591
-
592
- job_table = gr.Dataframe(
593
- headers=["Task", "Model", "Label", "Status", "Elapsed", "Started"],
594
- label="Job Status",
595
  interactive=False,
596
- wrap=True,
 
597
  )
598
- status_summary = gr.Textbox(
599
- label="Summary", interactive=False,
 
600
  )
601
 
602
- run_btn.click(
603
- launch_experiments,
604
  inputs=[
605
- run_tasks, run_models, run_grids,
606
- run_formats, run_strategies,
607
- gemini_key, openai_key, anthropic_key, deepseek_key,
608
  ],
609
- outputs=[launch_msg, job_table],
610
- )
611
-
612
- refresh_btn.click(
613
- refresh_status,
614
- outputs=[job_table, status_summary],
615
  )
616
 
617
  # ================================================================
@@ -651,18 +651,15 @@ SpatialBench is the evaluation platform accompanying the paper:
651
  ### Grid Sizes
652
 
653
  Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default.
654
- The underlying `maze-dataset` library supports larger grids — adjust in the **Run** tab.
655
 
656
- ### Adding a New Model
 
 
657
 
658
- Edit `pipeline/configs/experiments.yaml`:
659
- ```yaml
660
- models:
661
- your-model-id:
662
- api_key_env: YOUR_API_KEY_ENV_VAR
663
- display_name: "Your Model Name"
664
  ```
665
- Then add inference support in `utils/llm_inference.py`.
666
 
667
  ### Citation
668
  ```bibtex
 
3
  -----------------------------------------
4
  Entrypoint for the HuggingFace Space "SpatialBench".
5
 
6
+ Three tabs:
7
+ 1. Leaderboard — visualize pre-computed results from all three tasks
8
+ 2. Get Scripts — generate ready-to-run SLURM scripts (or plain shell
9
+ scripts) as a downloadable zip; no compute needed here
10
+ 3. About — paper info and citation
11
 
12
  To run locally:
13
  cd pipeline/
14
  python app.py
15
 
16
  To deploy on HuggingFace Spaces:
17
+ - No secrets required for the Leaderboard or Get Scripts tabs.
18
+ - The Space entrypoint is this file (app.py).
19
  """
20
 
21
  from __future__ import annotations
22
 
23
  import os
24
  import sys
25
+ import zipfile
26
+ import tempfile
27
  from pathlib import Path
28
 
29
  import gradio as gr
 
44
  # Add repo root to path so pipeline imports work
45
  sys.path.insert(0, str(Path(__file__).parent))
46
 
47
+ from pipeline.task_builder import load_config, build_all_jobs, make_sbatch_script
 
48
  from pipeline.results_loader import (
49
  load_all_results,
50
  maze_navigation_leaderboard,
 
53
  )
54
 
55
  # ---------------------------------------------------------------------------
56
+ # Paths / config
57
  # ---------------------------------------------------------------------------
58
  CONFIG_PATH = Path(__file__).parent / "configs" / "experiments.yaml"
59
  CFG = load_config(CONFIG_PATH)
60
  MODEL_CHOICES = list(CFG["models"].keys())
 
 
 
 
 
 
61
 
62
  # ---------------------------------------------------------------------------
63
  # Leaderboard helpers
 
66
  def _load_results():
67
  try:
68
  return load_all_results(CONFIG_PATH)
69
+ except Exception:
70
+ return {
71
+ "maze_navigation": pd.DataFrame(),
72
+ "point_reuse": pd.DataFrame(),
73
+ "compositional_distance": pd.DataFrame(),
74
+ }
75
 
76
 
77
  def _make_empty_fig(msg: str) -> go.Figure:
 
84
  return fig
85
 
86
 
87
+ # ── Task 1 plots ────────────────────────────────────────────────────────────
88
 
89
  def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]:
90
  results = _load_results()
 
120
  if df.empty:
121
  return _make_empty_fig("No Task 1 results found.")
122
 
 
123
  sub = df[(df["k_shot"] == 0) & (df["prompt_strategy"] == "cot")]
124
  if sub.empty:
125
  sub = df[df["k_shot"] == 0]
 
137
  return fig
138
 
139
 
140
+ # ── Task 2 plots ────────────────────────────────────────────────────────────
141
 
142
  def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]:
143
  results = _load_results()
 
188
  return fig
189
 
190
 
191
+ # ── Task 3 plots ────────────────────────────────────────────────────────────
192
 
193
  def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]:
194
  results = _load_results()
 
239
 
240
 
241
  # ---------------------------------------------------------------------------
242
+ # Script generation tab
243
  # ---------------------------------------------------------------------------
244
 
245
+ TASK_DISPLAY_MAP = {
246
+ "Maze Navigation": "maze_navigation",
247
+ "Sequential Point Reuse": "point_reuse",
248
+ "Compositional Distance Comparison": "compositional_distance",
249
+ }
250
+
251
 
252
+ def _make_plain_script(job, api_key_placeholder: str) -> str:
253
+ """Return a plain bash script (no SLURM headers) for running a job directly."""
254
+ lines = [
255
+ "#!/usr/bin/env bash",
256
+ f"# {job.label}",
257
+ f"export {job.api_key_env}={api_key_placeholder}",
258
+ "",
259
+ f"cd {job.working_dir}",
260
+ " \\\n ".join(job.python_cmd),
261
+ "",
262
+ ]
263
+ return "\n".join(lines)
264
 
265
+
266
+ def generate_scripts(
267
  tasks: list[str],
268
  models: list[str],
269
  grid_sizes_str: str,
270
  formats: list[str],
271
  strategies: list[str],
272
+ script_type: str,
273
+ repo_path: str,
274
+ ) -> tuple[str, str | None]:
275
+ """
276
+ Build experiment scripts and return (preview_text, zip_path).
277
+ zip_path is a temp file the user can download.
278
+ """
279
+ if not tasks:
280
+ return "Select at least one task.", None
281
+ if not models:
282
+ return "Select at least one model.", None
 
 
 
 
 
 
 
 
 
 
 
283
 
 
284
  try:
285
  grid_sizes = [int(g.strip()) for g in grid_sizes_str.split(",") if g.strip()]
286
  except ValueError:
287
+ return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", None
288
 
289
+ selected_tasks = [TASK_DISPLAY_MAP[t] for t in tasks if t in TASK_DISPLAY_MAP]
 
 
 
 
 
 
 
 
 
 
 
290
 
291
  jobs = build_all_jobs(
292
  cfg=CFG,
 
299
  )
300
 
301
  if not jobs:
302
+ return "No jobs matched the selected filters.", None
303
+
304
+ # Optionally override repo path in working_dir
305
+ repo_override = repo_path.strip() if repo_path.strip() else None
306
+
307
+ use_slurm = (script_type == "SLURM (.sh with #SBATCH headers)")
308
+ log_dir = Path(repo_override or ".") / "maze-solver" / "eval_llm_logs"
309
+
310
+ script_contents: dict[str, str] = {}
311
+ for job in jobs:
312
+ safe = job.label.replace(" ", "_").replace("|", "").replace("/", "_").strip("_")
313
+ filename = f"{safe}.sh"
314
+
315
+ # If a repo path override was provided, patch working_dir in the job
316
+ if repo_override:
317
+ # Rebase working_dir: replace the config-derived root with the user's path
318
+ try:
319
+ rel = job.working_dir.relative_to(CONFIG_PATH.parent.parent.parent)
320
+ job.working_dir = Path(repo_override) / rel
321
+ except ValueError:
322
+ pass
323
+ # Rebase output_dir similarly
324
+ try:
325
+ rel_out = job.output_dir.relative_to(CONFIG_PATH.parent.parent.parent)
326
+ job.output_dir = Path(repo_override) / rel_out
327
+ except ValueError:
328
+ pass
329
+ # Rebase python_cmd paths (first two tokens are "python" and script path)
330
+ if len(job.python_cmd) >= 2:
331
+ script_abs = Path(job.python_cmd[1])
332
+ try:
333
+ rel_script = script_abs.relative_to(CONFIG_PATH.parent.parent.parent)
334
+ job.python_cmd[1] = str(Path(repo_override) / rel_script)
335
+ except ValueError:
336
+ pass
337
+
338
+ if use_slurm:
339
+ content = make_sbatch_script(job, log_dir)
340
+ else:
341
+ content = _make_plain_script(job, f'"${{{job.api_key_env}}}"')
342
+
343
+ script_contents[filename] = content
344
+
345
+ # Write zip to a named temp file (Gradio File component needs a real path)
346
+ tmp = tempfile.NamedTemporaryFile(
347
+ delete=False, suffix=".zip", prefix="spatialbench_scripts_"
348
+ )
349
+ with zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zf:
350
+ for fname, content in script_contents.items():
351
+ zf.writestr(fname, content)
352
+ # Also include a README and a master run_all.sh
353
+ run_all_lines = ["#!/usr/bin/env bash", "# Run all generated scripts sequentially", ""]
354
+ for fname in sorted(script_contents):
355
+ run_all_lines.append(f"bash {fname}")
356
+ zf.writestr("run_all.sh", "\n".join(run_all_lines) + "\n")
357
+
358
+ tmp.close()
359
+
360
+ # Preview: show first script + summary
361
+ n = len(script_contents)
362
+ first_name, first_content = next(iter(script_contents.items()))
363
+ preview = (
364
+ f"Generated {n} script(s) for {len(models)} model(s) across {len(selected_tasks)} task(s).\n"
365
+ f"Download the zip below, unzip in your cluster, then run: bash run_all.sh\n\n"
366
+ f"── {first_name} ──\n{first_content}"
367
+ + (f"\n\n... and {n - 1} more script(s) in the zip." if n > 1 else "")
368
+ )
369
 
370
+ return preview, tmp.name
 
 
 
 
 
371
 
372
 
373
  # ---------------------------------------------------------------------------
 
393
  *, body, .gradio-container { font-family: 'Inter', ui-sans-serif, system-ui, sans-serif !important; }
394
  code, pre, .monospace { font-family: 'IBM Plex Mono', ui-monospace, monospace !important; }
395
  .leaderboard-table { font-size: 0.9em; }
 
 
 
396
  footer { display: none !important; }
397
  """
398
 
399
+
400
  def build_ui() -> gr.Blocks:
401
  with gr.Blocks(
402
  title="SpatialBench — Do LLMs Build Spatial World Models?",
 
460
  "Do models reuse their earlier computation, or start from scratch?"
461
  )
462
 
463
+ t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5, label="Grid Size")
 
464
  t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy")
465
  t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes")
466
+ t2_lb = gr.Dataframe(label="Leaderboard", elem_classes=["leaderboard-table"])
 
 
 
467
 
468
  def refresh_task2(gs):
469
  fig, lb = plot_task2_q0_q3(int(gs))
 
515
  ],
516
  )
517
 
 
518
  demo.load(
519
  refresh_all_leaderboard,
520
  outputs=[
 
525
  )
526
 
527
  # ================================================================
528
+ # Tab 2: Get Scripts
529
  # ================================================================
530
+ with gr.Tab("⬇️ Get Scripts"):
531
  gr.Markdown(
532
+ "## Generate Experiment Scripts\n"
533
+ "Configure the experiments you want to run, then download a zip of ready-to-run "
534
+ "shell scripts.\n\n"
535
+ "**How to use:**\n"
536
+ "1. Select tasks, models, and settings below\n"
537
+ "2. Enter the path to your local clone of the repo (so paths in the scripts are correct)\n"
538
+ "3. Click **Generate** — a preview appears and a zip is ready to download\n"
539
+ "4. Unzip on your cluster, set your API key(s) as environment variables, then:\n"
540
+ " ```bash\n"
541
+ " export GEMINI_API_KEY=your_key_here\n"
542
+ " bash run_all.sh # run sequentially\n"
543
+ " # — or submit individually:\n"
544
+ " sbatch Task_1__Maze_Navigation__gemini-2.5-flash__raw__cot.sh\n"
545
+ " ```"
546
  )
547
 
548
  with gr.Row():
549
  with gr.Column(scale=2):
550
+ gen_tasks = gr.CheckboxGroup(
551
+ choices=list(TASK_DISPLAY_MAP.keys()),
 
 
 
 
 
552
  value=["Maze Navigation"],
553
  label="Tasks",
554
  )
555
+ gen_models = gr.CheckboxGroup(
556
  choices=MODEL_CHOICES,
557
  value=["gemini-2.5-flash"],
558
  label="Models",
559
  )
560
+ gen_grids = gr.Textbox(
561
+ value="5,6,7,8,9",
562
  label="Grid Sizes",
563
+ info="Comma-separated. Paper used 5–9.",
564
  )
565
  with gr.Row():
566
+ gen_formats = gr.CheckboxGroup(
567
  choices=["raw", "visual"],
568
+ value=["raw", "visual"],
569
  label="Input Formats (Task 1 only)",
570
  )
571
+ gen_strategies = gr.CheckboxGroup(
572
  choices=["base", "cot", "reasoning"],
573
+ value=["base", "cot", "reasoning"],
574
  label="Prompt Strategies",
575
  )
576
 
577
  with gr.Column(scale=1):
578
+ gen_script_type = gr.Radio(
579
+ choices=[
580
+ "SLURM (.sh with #SBATCH headers)",
581
+ "Plain bash (.sh, no SLURM)",
582
+ ],
583
+ value="SLURM (.sh with #SBATCH headers)",
584
+ label="Script Type",
585
+ info="Use SLURM if you have a cluster. Plain bash runs directly.",
 
 
 
 
 
586
  )
587
+ gen_repo_path = gr.Textbox(
588
+ label="Repo path on your cluster",
589
+ placeholder="/path/to/llm-maze-solver",
590
+ info="Absolute path to the llm-maze-solver repo root on the machine where you'll run the scripts. Leave blank to use relative paths.",
591
  )
592
 
593
  with gr.Row():
594
+ gen_btn = gr.Button("⚙️ Generate Scripts", variant="primary", scale=2)
 
595
 
596
+ gen_preview = gr.Textbox(
597
+ label="Preview (first script)",
 
 
 
598
  interactive=False,
599
+ lines=20,
600
+ max_lines=30,
601
  )
602
+ gen_download = gr.File(
603
+ label="Download Scripts (.zip)",
604
+ interactive=False,
605
  )
606
 
607
+ gen_btn.click(
608
+ generate_scripts,
609
  inputs=[
610
+ gen_tasks, gen_models, gen_grids,
611
+ gen_formats, gen_strategies,
612
+ gen_script_type, gen_repo_path,
613
  ],
614
+ outputs=[gen_preview, gen_download],
 
 
 
 
 
615
  )
616
 
617
  # ================================================================
 
651
  ### Grid Sizes
652
 
653
  Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default.
 
654
 
655
+ ### Reproducing Experiments
656
+
657
+ Clone the repo and use the **Get Scripts** tab above to generate SLURM scripts, or use the CLI directly:
658
 
659
+ ```bash
660
+ cd pipeline/
661
+ python run_experiments.py --tasks maze_navigation --models gemini-2.5-flash --mode slurm --dry-run
 
 
 
662
  ```
 
663
 
664
  ### Citation
665
  ```bibtex
requirements.txt CHANGED
@@ -15,12 +15,5 @@ numpy>=1.24.0
15
  # Config parsing
16
  PyYAML>=6.0
17
 
18
- # LLM API clients
19
- openai>=1.14.0
20
- anthropic>=0.25.0
21
- google-generativeai>=0.5.0
22
-
23
- # (DeepSeek uses the OpenAI-compatible client — no extra package needed)
24
-
25
  # Environment variable loading
26
  python-dotenv>=1.0.0
 
15
  # Config parsing
16
  PyYAML>=6.0
17
 
 
 
 
 
 
 
 
18
  # Environment variable loading
19
  python-dotenv>=1.0.0