Spaces:

weijiang99
/

SpatialBench

Sleeping

App Files Files Community

weijiang99 committed on 14 days ago

Commit

cb5acaf

verified ·

1 Parent(s): 52ea128

Update SpatialBench pipeline

Browse files

Files changed (2) hide show

app.py +183 -186
requirements.txt +0 -7

app.py CHANGED Viewed

@@ -3,26 +3,27 @@ app.py — SpatialBench Gradio application
 -----------------------------------------
 Entrypoint for the HuggingFace Space "SpatialBench".
-Two tabs:
-  1. Leaderboard  — visualize pre-computed results from all three tasks
-  2. Run          — launch experiments directly via API keys (no SLURM needed)
-                    (on HF Space, set API keys as Space Secrets)
 To run locally:
     cd pipeline/
     python app.py
 To deploy on HuggingFace Spaces:
-    - Set Space Secrets: GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, DEEPSEEK_API_KEY
-    - The Space entrypoint is this file (app.py)
 """
 from __future__ import annotations
 import os
 import sys
-import threading
-import time
 from pathlib import Path
 import gradio as gr
@@ -43,8 +44,7 @@ if _env.exists():
 # Add repo root to path so pipeline imports work
 sys.path.insert(0, str(Path(__file__).parent))
-from pipeline.task_builder import load_config, build_all_jobs
-from pipeline.job_monitor import JobMonitor, submit_direct
 from pipeline.results_loader import (
     load_all_results,
     maze_navigation_leaderboard,
@@ -53,17 +53,11 @@ from pipeline.results_loader import (
 )
 # ---------------------------------------------------------------------------
-# Paths
 # ---------------------------------------------------------------------------
 CONFIG_PATH = Path(__file__).parent / "configs" / "experiments.yaml"
 CFG = load_config(CONFIG_PATH)
 MODEL_CHOICES = list(CFG["models"].keys())
-MODEL_DISPLAY = {k: v["display_name"] for k, v in CFG["models"].items()}
-# Global job monitor (direct mode only — HF Space has no SLURM)
-_monitor = JobMonitor(mode="direct")
-_monitor_lock = threading.Lock()
 # ---------------------------------------------------------------------------
 # Leaderboard helpers
@@ -72,8 +66,12 @@ _monitor_lock = threading.Lock()
 def _load_results():
     try:
         return load_all_results(CONFIG_PATH)
-    except Exception as e:
-        return {"maze_navigation": pd.DataFrame(), "point_reuse": pd.DataFrame(), "compositional_distance": pd.DataFrame()}
 def _make_empty_fig(msg: str) -> go.Figure:
@@ -86,7 +84,7 @@ def _make_empty_fig(msg: str) -> go.Figure:
     return fig
-# ── Task 1 plots ────────────────────────────────────────────────────────────
 def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]:
     results = _load_results()
@@ -122,7 +120,6 @@ def plot_task1_format_comparison() -> go.Figure:
     if df.empty:
         return _make_empty_fig("No Task 1 results found.")
-    # Average over grid sizes, compare raw vs visual at k=0 with CoT
     sub = df[(df["k_shot"] == 0) & (df["prompt_strategy"] == "cot")]
     if sub.empty:
         sub = df[df["k_shot"] == 0]
@@ -140,7 +137,7 @@ def plot_task1_format_comparison() -> go.Figure:
     return fig
-# ── Task 2 plots ────────────────────────────────────────────────────────────
 def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]:
     results = _load_results()
@@ -191,7 +188,7 @@ def plot_task2_by_grid() -> go.Figure:
     return fig
-# ── Task 3 plots ────────────────────────────────────────────────────────────
 def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]:
     results = _load_results()
@@ -242,61 +239,54 @@ def plot_task3_by_grid() -> go.Figure:
 # ---------------------------------------------------------------------------
-# Run-experiments tab
 # ---------------------------------------------------------------------------
-# Map from env-var name → user-provided key (populated at runtime from form)
-_USER_KEYS: dict[str, str] = {}
-_USER_KEYS_LOCK = threading.Lock()
-def launch_experiments(
     tasks: list[str],
     models: list[str],
     grid_sizes_str: str,
     formats: list[str],
     strategies: list[str],
-    gemini_key: str,
-    openai_key: str,
-    anthropic_key: str,
-    deepseek_key: str,
-) -> tuple[str, list[list[str]]]:
-    """Called when the user clicks 'Run' in the Gradio UI."""
-    # Build a key map from only what the user explicitly typed — never os.environ
-    user_keys: dict[str, str] = {}
-    if gemini_key.strip():
-        user_keys["GEMINI_API_KEY"] = gemini_key.strip()
-    if openai_key.strip():
-        user_keys["OPENAI_API_KEY"] = openai_key.strip()
-    if anthropic_key.strip():
-        user_keys["ANTHROPIC_API_KEY"] = anthropic_key.strip()
-    if deepseek_key.strip():
-        user_keys["DEEPSEEK_API_KEY"] = deepseek_key.strip()
-    if not user_keys:
-        return (
-            "No API keys provided. Please enter at least one API key to run experiments.",
-            [],
-        )
-    # Parse grid sizes
     try:
         grid_sizes = [int(g.strip()) for g in grid_sizes_str.split(",") if g.strip()]
     except ValueError:
-        return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", []
-    if not tasks:
-        return "Select at least one task.", []
-    if not models:
-        return "Select at least one model.", []
-    # Map display choices back to internal IDs
-    task_map = {
-        "Maze Navigation":                   "maze_navigation",
-        "Sequential Point Reuse":            "point_reuse",
-        "Compositional Distance Comparison": "compositional_distance",
-    }
-    selected_tasks = [task_map[t] for t in tasks if t in task_map]
     jobs = build_all_jobs(
         cfg=CFG,
@@ -309,52 +299,75 @@ def launch_experiments(
     )
     if not jobs:
-        return "No jobs matched the selected filters.", []
-    launched = 0
-    skipped = 0
-    skipped_models: list[str] = []
-    with _monitor_lock:
-        for job in jobs:
-            # Only use the key the user provided — never fall back to server env
-            api_key = user_keys.get(job.api_key_env, "")
-            if not api_key:
-                skipped += 1
-                skipped_models.append(job.model)
-                continue
-            job.output_dir.mkdir(parents=True, exist_ok=True)
-            proc = submit_direct(
-                cmd=job.python_cmd,
-                working_dir=str(job.working_dir),
-                env={job.api_key_env: api_key},
-            )
-            _monitor.add_direct(
-                proc=proc,
-                label=job.label,
-                task_id=job.task_id,
-                model=job.model,
-                output_dir=str(job.output_dir),
-            )
-            launched += 1
-            time.sleep(1)  # avoid API rate limits on burst start
-    status_msg = f"Launched {launched} job(s)."
-    if skipped:
-        missing = sorted(set(skipped_models))
-        status_msg += (
-            f" Skipped {skipped} job(s) for {', '.join(missing)} "
-            f"— no API key provided for those models."
-        )
-    return status_msg, _monitor.as_table()
-def refresh_status() -> tuple[list[list[str]], str]:
-    _monitor.refresh()
-    summary = _monitor.summary()
-    counts = summary["counts"]
-    msg = "  ".join(f"{s}: {n}" for s, n in counts.items()) or "No jobs submitted yet."
-    return _monitor.as_table(), msg
 # ---------------------------------------------------------------------------
@@ -380,12 +393,10 @@ CSS = """
 *, body, .gradio-container { font-family: 'Inter', ui-sans-serif, system-ui, sans-serif !important; }
 code, pre, .monospace { font-family: 'IBM Plex Mono', ui-monospace, monospace !important; }
 .leaderboard-table { font-size: 0.9em; }
-.status-badge-running { color: #2196F3; font-weight: bold; }
-.status-badge-completed { color: #4CAF50; font-weight: bold; }
-.status-badge-failed { color: #F44336; font-weight: bold; }
 footer { display: none !important; }
 """
 def build_ui() -> gr.Blocks:
     with gr.Blocks(
         title="SpatialBench — Do LLMs Build Spatial World Models?",
@@ -449,14 +460,10 @@ def build_ui() -> gr.Blocks:
                     "Do models reuse their earlier computation, or start from scratch?"
                 )
-                t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5,
-                                    label="Grid Size")
                 t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy")
                 t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes")
-                t2_lb = gr.Dataframe(
-                    label="Leaderboard",
-                    elem_classes=["leaderboard-table"],
-                )
                 def refresh_task2(gs):
                     fig, lb = plot_task2_q0_q3(int(gs))
@@ -508,7 +515,6 @@ def build_ui() -> gr.Blocks:
                     ],
                 )
-                # Initial load
                 demo.load(
                     refresh_all_leaderboard,
                     outputs=[
@@ -519,99 +525,93 @@ def build_ui() -> gr.Blocks:
                 )
             # ================================================================
-            # Tab 2: Run Experiments
             # ================================================================
-            with gr.Tab("⚡ Run Experiments"):
                 gr.Markdown(
-                    "## Launch Experiments\n"
-                    "Experiments call LLM APIs directly — no compute cluster needed.\n\n"
-                    "> **Your API keys are used only for your session and are never stored or logged.**  \n"
-                    "> Enter keys only for the model(s) you want to evaluate. "
-                    "Jobs for models without a key will be skipped."
                 )
                 with gr.Row():
                     with gr.Column(scale=2):
-                        # Task / model / grid selection
-                        run_tasks = gr.CheckboxGroup(
-                            choices=[
-                                "Maze Navigation",
-                                "Sequential Point Reuse",
-                                "Compositional Distance Comparison",
-                            ],
                             value=["Maze Navigation"],
                             label="Tasks",
                         )
-                        run_models = gr.CheckboxGroup(
                             choices=MODEL_CHOICES,
                             value=["gemini-2.5-flash"],
                             label="Models",
                         )
-                        run_grids = gr.Textbox(
-                            value="5,6,7",
                             label="Grid Sizes",
-                            info="Comma-separated integers. Maze dataset supports 5–9 (and beyond if regenerated).",
                         )
                         with gr.Row():
-                            run_formats = gr.CheckboxGroup(
                                 choices=["raw", "visual"],
-                                value=["raw"],
                                 label="Input Formats (Task 1 only)",
                             )
-                            run_strategies = gr.CheckboxGroup(
                                 choices=["base", "cot", "reasoning"],
-                                value=["cot"],
                                 label="Prompt Strategies",
                             )
                     with gr.Column(scale=1):
-                        gr.Markdown("### API Keys")
-                        gr.Markdown(
-                            "Enter the key(s) for the model(s) you selected. "
-                            "Keys are used only for this session."
-                        )
-                        gemini_key = gr.Textbox(
-                            label="GEMINI_API_KEY", type="password", placeholder="AIza...",
-                        )
-                        openai_key = gr.Textbox(
-                            label="OPENAI_API_KEY", type="password", placeholder="sk-...",
-                        )
-                        anthropic_key = gr.Textbox(
-                            label="ANTHROPIC_API_KEY", type="password", placeholder="sk-ant-...",
                         )
-                        deepseek_key = gr.Textbox(
-                            label="DEEPSEEK_API_KEY", type="password",
                         )
                 with gr.Row():
-                    run_btn = gr.Button("🚀 Launch Experiments", variant="primary", scale=2)
-                    refresh_btn = gr.Button("🔄 Refresh Status", scale=1)
-                launch_msg = gr.Textbox(label="Launch Status", interactive=False)
-                job_table = gr.Dataframe(
-                    headers=["Task", "Model", "Label", "Status", "Elapsed", "Started"],
-                    label="Job Status",
                     interactive=False,
-                    wrap=True,
                 )
-                status_summary = gr.Textbox(
-                    label="Summary", interactive=False,
                 )
-                run_btn.click(
-                    launch_experiments,
                     inputs=[
-                        run_tasks, run_models, run_grids,
-                        run_formats, run_strategies,
-                        gemini_key, openai_key, anthropic_key, deepseek_key,
                     ],
-                    outputs=[launch_msg, job_table],
-                )
-                refresh_btn.click(
-                    refresh_status,
-                    outputs=[job_table, status_summary],
                 )
             # ================================================================
@@ -651,18 +651,15 @@ SpatialBench is the evaluation platform accompanying the paper:
 ### Grid Sizes
 Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default.
-The underlying `maze-dataset` library supports larger grids — adjust in the **Run** tab.
-### Adding a New Model
-Edit `pipeline/configs/experiments.yaml`:
-```yaml
-models:
-  your-model-id:
-    api_key_env: YOUR_API_KEY_ENV_VAR
-    display_name: "Your Model Name"
 ```
-Then add inference support in `utils/llm_inference.py`.
 ### Citation
 ```bibtex

 -----------------------------------------
 Entrypoint for the HuggingFace Space "SpatialBench".
+Three tabs:
+  1. Leaderboard     — visualize pre-computed results from all three tasks
+  2. Get Scripts     — generate ready-to-run SLURM scripts (or plain shell
+                       scripts) as a downloadable zip; no compute needed here
+  3. About           — paper info and citation
 To run locally:
     cd pipeline/
     python app.py
 To deploy on HuggingFace Spaces:
+    - No secrets required for the Leaderboard or Get Scripts tabs.
+    - The Space entrypoint is this file (app.py).
 """
 from __future__ import annotations
 import os
 import sys
+import zipfile
+import tempfile
 from pathlib import Path
 import gradio as gr
 # Add repo root to path so pipeline imports work
 sys.path.insert(0, str(Path(__file__).parent))
+from pipeline.task_builder import load_config, build_all_jobs, make_sbatch_script
 from pipeline.results_loader import (
     load_all_results,
     maze_navigation_leaderboard,
 )
 # ---------------------------------------------------------------------------
+# Paths / config
 # ---------------------------------------------------------------------------
 CONFIG_PATH = Path(__file__).parent / "configs" / "experiments.yaml"
 CFG = load_config(CONFIG_PATH)
 MODEL_CHOICES = list(CFG["models"].keys())
 # ---------------------------------------------------------------------------
 # Leaderboard helpers
 def _load_results():
     try:
         return load_all_results(CONFIG_PATH)
+    except Exception:
+        return {
+            "maze_navigation": pd.DataFrame(),
+            "point_reuse": pd.DataFrame(),
+            "compositional_distance": pd.DataFrame(),
+        }
 def _make_empty_fig(msg: str) -> go.Figure:
     return fig
+# ── Task 1 plots ─────────────────────────────────────────────────────────────
 def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]:
     results = _load_results()
     if df.empty:
         return _make_empty_fig("No Task 1 results found.")
     sub = df[(df["k_shot"] == 0) & (df["prompt_strategy"] == "cot")]
     if sub.empty:
         sub = df[df["k_shot"] == 0]
     return fig
+# ── Task 2 plots ─────────────────────────────────────────────────────────────
 def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]:
     results = _load_results()
     return fig
+# ── Task 3 plots ─────────────────────────────────────────────────────────────
 def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]:
     results = _load_results()
 # ---------------------------------------------------------------------------
+# Script generation tab
 # ---------------------------------------------------------------------------
+TASK_DISPLAY_MAP = {
+    "Maze Navigation":                   "maze_navigation",
+    "Sequential Point Reuse":            "point_reuse",
+    "Compositional Distance Comparison": "compositional_distance",
+}
+def _make_plain_script(job, api_key_placeholder: str) -> str:
+    """Return a plain bash script (no SLURM headers) for running a job directly."""
+    lines = [
+        "#!/usr/bin/env bash",
+        f"# {job.label}",
+        f"export {job.api_key_env}={api_key_placeholder}",
+        "",
+        f"cd {job.working_dir}",
+        " \\\n    ".join(job.python_cmd),
+        "",
+    ]
+    return "\n".join(lines)
+def generate_scripts(
     tasks: list[str],
     models: list[str],
     grid_sizes_str: str,
     formats: list[str],
     strategies: list[str],
+    script_type: str,
+    repo_path: str,
+) -> tuple[str, str | None]:
+    """
+    Build experiment scripts and return (preview_text, zip_path).
+    zip_path is a temp file the user can download.
+    """
+    if not tasks:
+        return "Select at least one task.", None
+    if not models:
+        return "Select at least one model.", None
     try:
         grid_sizes = [int(g.strip()) for g in grid_sizes_str.split(",") if g.strip()]
     except ValueError:
+        return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", None
+    selected_tasks = [TASK_DISPLAY_MAP[t] for t in tasks if t in TASK_DISPLAY_MAP]
     jobs = build_all_jobs(
         cfg=CFG,
     )
     if not jobs:
+        return "No jobs matched the selected filters.", None
+    # Optionally override repo path in working_dir
+    repo_override = repo_path.strip() if repo_path.strip() else None
+    use_slurm = (script_type == "SLURM (.sh with #SBATCH headers)")
+    log_dir = Path(repo_override or ".") / "maze-solver" / "eval_llm_logs"
+    script_contents: dict[str, str] = {}
+    for job in jobs:
+        safe = job.label.replace(" ", "_").replace("|", "").replace("/", "_").strip("_")
+        filename = f"{safe}.sh"
+        # If a repo path override was provided, patch working_dir in the job
+        if repo_override:
+            # Rebase working_dir: replace the config-derived root with the user's path
+            try:
+                rel = job.working_dir.relative_to(CONFIG_PATH.parent.parent.parent)
+                job.working_dir = Path(repo_override) / rel
+            except ValueError:
+                pass
+            # Rebase output_dir similarly
+            try:
+                rel_out = job.output_dir.relative_to(CONFIG_PATH.parent.parent.parent)
+                job.output_dir = Path(repo_override) / rel_out
+            except ValueError:
+                pass
+            # Rebase python_cmd paths (first two tokens are "python" and script path)
+            if len(job.python_cmd) >= 2:
+                script_abs = Path(job.python_cmd[1])
+                try:
+                    rel_script = script_abs.relative_to(CONFIG_PATH.parent.parent.parent)
+                    job.python_cmd[1] = str(Path(repo_override) / rel_script)
+                except ValueError:
+                    pass
+        if use_slurm:
+            content = make_sbatch_script(job, log_dir)
+        else:
+            content = _make_plain_script(job, f'"${{{job.api_key_env}}}"')
+        script_contents[filename] = content
+    # Write zip to a named temp file (Gradio File component needs a real path)
+    tmp = tempfile.NamedTemporaryFile(
+        delete=False, suffix=".zip", prefix="spatialbench_scripts_"
+    )
+    with zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zf:
+        for fname, content in script_contents.items():
+            zf.writestr(fname, content)
+        # Also include a master run_all.sh that runs every script in sorted order
+        run_all_lines = ["#!/usr/bin/env bash", "# Run all generated scripts sequentially", ""]
+        for fname in sorted(script_contents):
+            run_all_lines.append(f"bash {fname}")
+        zf.writestr("run_all.sh", "\n".join(run_all_lines) + "\n")
+    tmp.close()
+    # Preview: show first script + summary
+    n = len(script_contents)
+    first_name, first_content = next(iter(script_contents.items()))
+    preview = (
+        f"Generated {n} script(s) for {len(models)} model(s) across {len(selected_tasks)} task(s).\n"
+        f"Download the zip below, unzip in your cluster, then run: bash run_all.sh\n\n"
+        f"── {first_name} ──\n{first_content}"
+        + (f"\n\n... and {n - 1} more script(s) in the zip." if n > 1 else "")
+    )
+    return preview, tmp.name
 # ---------------------------------------------------------------------------
 *, body, .gradio-container { font-family: 'Inter', ui-sans-serif, system-ui, sans-serif !important; }
 code, pre, .monospace { font-family: 'IBM Plex Mono', ui-monospace, monospace !important; }
 .leaderboard-table { font-size: 0.9em; }
 footer { display: none !important; }
 """
 def build_ui() -> gr.Blocks:
     with gr.Blocks(
         title="SpatialBench — Do LLMs Build Spatial World Models?",
                     "Do models reuse their earlier computation, or start from scratch?"
                 )
+                t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5, label="Grid Size")
                 t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy")
                 t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes")
+                t2_lb = gr.Dataframe(label="Leaderboard", elem_classes=["leaderboard-table"])
                 def refresh_task2(gs):
                     fig, lb = plot_task2_q0_q3(int(gs))
                     ],
                 )
                 demo.load(
                     refresh_all_leaderboard,
                     outputs=[
                 )
             # ================================================================
+            # Tab 2: Get Scripts
             # ================================================================
+            with gr.Tab("⬇️ Get Scripts"):
                 gr.Markdown(
+                    "## Generate Experiment Scripts\n"
+                    "Configure the experiments you want to run, then download a zip of ready-to-run "
+                    "shell scripts.\n\n"
+                    "**How to use:**\n"
+                    "1. Select tasks, models, and settings below\n"
+                    "2. Enter the path to your local clone of the repo (so paths in the scripts are correct)\n"
+                    "3. Click **Generate** — a preview appears and a zip is ready to download\n"
+                    "4. Unzip on your cluster, set your API key(s) as environment variables, then:\n"
+                    "   ```bash\n"
+                    "   export GEMINI_API_KEY=your_key_here\n"
+                    "   bash run_all.sh        # run sequentially\n"
+                    "   # — or submit individually:\n"
+                    "   sbatch Task_1__Maze_Navigation__gemini-2.5-flash__raw__cot.sh\n"
+                    "   ```"
                 )
                 with gr.Row():
                     with gr.Column(scale=2):
+                        gen_tasks = gr.CheckboxGroup(
+                            choices=list(TASK_DISPLAY_MAP.keys()),
                             value=["Maze Navigation"],
                             label="Tasks",
                         )
+                        gen_models = gr.CheckboxGroup(
                             choices=MODEL_CHOICES,
                             value=["gemini-2.5-flash"],
                             label="Models",
                         )
+                        gen_grids = gr.Textbox(
+                            value="5,6,7,8,9",
                             label="Grid Sizes",
+                            info="Comma-separated. Paper used 5–9.",
                         )
                         with gr.Row():
+                            gen_formats = gr.CheckboxGroup(
                                 choices=["raw", "visual"],
+                                value=["raw", "visual"],
                                 label="Input Formats (Task 1 only)",
                             )
+                            gen_strategies = gr.CheckboxGroup(
                                 choices=["base", "cot", "reasoning"],
+                                value=["base", "cot", "reasoning"],
                                 label="Prompt Strategies",
                             )
                     with gr.Column(scale=1):
+                        gen_script_type = gr.Radio(
+                            choices=[
+                                "SLURM (.sh with #SBATCH headers)",
+                                "Plain bash (.sh, no SLURM)",
+                            ],
+                            value="SLURM (.sh with #SBATCH headers)",
+                            label="Script Type",
+                            info="Use SLURM if you have a cluster. Plain bash runs directly.",
                         )
+                        gen_repo_path = gr.Textbox(
+                            label="Repo path on your cluster",
+                            placeholder="/path/to/llm-maze-solver",
+                            info="Absolute path to the llm-maze-solver repo root on the machine where you'll run the scripts. Leave blank to use relative paths.",
                         )
                 with gr.Row():
+                    gen_btn = gr.Button("⚙️ Generate Scripts", variant="primary", scale=2)
+                gen_preview = gr.Textbox(
+                    label="Preview (first script)",
                     interactive=False,
+                    lines=20,
+                    max_lines=30,
                 )
+                gen_download = gr.File(
+                    label="Download Scripts (.zip)",
+                    interactive=False,
                 )
+                gen_btn.click(
+                    generate_scripts,
                     inputs=[
+                        gen_tasks, gen_models, gen_grids,
+                        gen_formats, gen_strategies,
+                        gen_script_type, gen_repo_path,
                     ],
+                    outputs=[gen_preview, gen_download],
                 )
             # ================================================================
 ### Grid Sizes
 Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default.
+### Reproducing Experiments
+Clone the repo and use the **Get Scripts** tab above to generate SLURM scripts, or use the CLI directly:
+```bash
+cd pipeline/
+python run_experiments.py --tasks maze_navigation --models gemini-2.5-flash --mode slurm --dry-run
 ```
 ### Citation
 ```bibtex

requirements.txt CHANGED Viewed

@@ -15,12 +15,5 @@ numpy>=1.24.0
 # Config parsing
 PyYAML>=6.0
-# LLM API clients
-openai>=1.14.0
-anthropic>=0.25.0
-google-generativeai>=0.5.0
-# (DeepSeek uses the OpenAI-compatible client — no extra package needed)
 # Environment variable loading
 python-dotenv>=1.0.0

 # Config parsing
 PyYAML>=6.0
 # Environment variable loading
 python-dotenv>=1.0.0