weijiang99 commited on
Commit
cffeecf
·
verified ·
1 Parent(s): 022c2d7

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,12 +1,25 @@
1
  ---
2
  title: SpatialBench
3
- emoji: 🌖
4
- colorFrom: pink
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.11.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: SpatialBench
3
+ emoji: 🧩
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: "4.44.0"
8
  app_file: app.py
9
+ pinned: true
10
+ short_description: Do LLMs Build Spatial World Models? Evidence from Maze Tasks
11
  ---
12
 
13
+ # SpatialBench
14
+
15
+ Evaluation platform for **"Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks"** (ICLR 2026 Workshop).
16
+
17
+ Three tasks probe whether LLMs construct internal spatial representations:
18
+
19
+ | Task | Type | Description |
20
+ |------|------|-------------|
21
+ | **Maze Navigation** | Planning | Find shortest path from start to goal |
22
+ | **Sequential Point Reuse** | Reasoning | Q3 = Q0 — do models reuse earlier computation? |
23
+ | **Compositional Distance** | Reasoning | Compose corner→center distances for Q2 |
24
+
25
+ Models evaluated: Gemini 2.5 Flash, GPT-5 Mini, Claude Haiku 4.5, DeepSeek Chat.
app.py ADDED
@@ -0,0 +1,688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — SpatialBench Gradio application
3
+ -----------------------------------------
4
+ Entrypoint for the HuggingFace Space "SpatialBench".
5
+
6
+ Two tabs:
7
+ 1. Leaderboard — visualize pre-computed results from all three tasks
8
+ 2. Run — launch experiments directly via API keys (no SLURM needed)
9
+ (on HF Space, set API keys as Space Secrets)
10
+
11
+ To run locally:
12
+ cd pipeline/
13
+ python app.py
14
+
15
+ To deploy on HuggingFace Spaces:
16
+ - Set Space Secrets: GEMINI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY, DEEPSEEK_API_KEY
17
+ - The Space entrypoint is this file (app.py)
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import os
23
+ import sys
24
+ import threading
25
+ import time
26
+ from pathlib import Path
27
+
28
+ import gradio as gr
29
+ import pandas as pd
30
+ import plotly.express as px
31
+ import plotly.graph_objects as go
32
+
33
# Load .env if running locally
def _load_dotenv(path: Path) -> None:
    """Populate os.environ from a simple KEY=VALUE dotenv file.

    Blank lines, comment lines starting with '#', and lines without '='
    are skipped.  Values wrapped in matching single or double quotes are
    unquoted (standard .env convention).  Existing environment variables
    are never overwritten (setdefault semantics), so real Space Secrets
    always take precedence over the local file.
    """
    with open(path) as fh:
        for raw in fh:
            line = raw.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            value = value.strip()
            # Strip matching surrounding quotes — a value like FOO="bar"
            # should yield bar, not "bar".
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                value = value[1:-1]
            os.environ.setdefault(key.strip(), value)


_env = Path(__file__).parent / ".env"
if _env.exists():
    _load_dotenv(_env)
42
+
43
# Add repo root to path so pipeline imports work
sys.path.insert(0, str(Path(__file__).parent))

from pipeline.task_builder import load_config, build_all_jobs
from pipeline.job_monitor import JobMonitor, submit_direct
from pipeline.results_loader import (
    load_all_results,
    maze_navigation_leaderboard,
    point_reuse_leaderboard,
    compositional_distance_leaderboard,
)

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# experiments.yaml is the single source of truth for models, tasks and grids.
CONFIG_PATH = Path(__file__).parent / "configs" / "experiments.yaml"
CFG = load_config(CONFIG_PATH)
# Internal model ids (config keys) and their human-readable display names.
MODEL_CHOICES = list(CFG["models"].keys())
MODEL_DISPLAY = {k: v["display_name"] for k, v in CFG["models"].items()}

# Global job monitor (direct mode only — HF Space has no SLURM)
# Shared across all Gradio sessions; mutations are guarded by _monitor_lock.
_monitor = JobMonitor(mode="direct")
_monitor_lock = threading.Lock()
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Leaderboard helpers
70
+ # ---------------------------------------------------------------------------
71
+
72
+ def _load_results():
73
+ try:
74
+ return load_all_results(CONFIG_PATH)
75
+ except Exception as e:
76
+ return {"maze_navigation": pd.DataFrame(), "point_reuse": pd.DataFrame(), "compositional_distance": pd.DataFrame()}
77
+
78
+
79
def _make_empty_fig(msg: str) -> go.Figure:
    """Return a blank placeholder figure with *msg* centred on it."""
    placeholder = go.Figure()
    placeholder.add_annotation(
        text=msg,
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        showarrow=False,
        font=dict(size=16),
    )
    # Hide both axes and use a transparent background so the placeholder
    # blends into the surrounding page.
    placeholder.update_layout(
        xaxis_visible=False,
        yaxis_visible=False,
        height=300,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
    )
    return placeholder
87
+
88
+
89
+ # ── Task 1 plots ────────────────────────────────────────────────────────────
90
+
91
def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]:
    """Accuracy-vs-grid-size lines for Task 1, plus the Task 1 leaderboard."""
    df = _load_results()["maze_navigation"]
    if df.empty:
        return _make_empty_fig("No Task 1 results found.\nRun experiments first."), pd.DataFrame()

    selector = (df["k_shot"] == k_shot) & (df["input_format"] == input_format)
    subset = df[selector]
    if subset.empty:
        return _make_empty_fig(f"No results for k={k_shot}, format={input_format}"), pd.DataFrame()

    line_fig = px.line(
        subset,
        x="grid_size",
        y="accuracy",
        color="display_name",
        line_dash="prompt_strategy",
        markers=True,
        labels={"grid_size": "Grid Size (n×n)", "accuracy": "Accuracy",
                "display_name": "Model", "prompt_strategy": "Strategy"},
        title=f"Task 1 — Maze Navigation ({input_format} format, {k_shot}-shot)",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    line_fig.update_layout(
        yaxis_range=[0, 1],
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        height=420,
    )
    # The leaderboard is built from the full (unfiltered) frame, not the
    # plotted subset — only the k-shot value is passed through.
    return line_fig, maze_navigation_leaderboard(df, k_shot=k_shot)
117
+
118
+
119
def plot_task1_format_comparison() -> go.Figure:
    """Grouped bars comparing raw vs visual input formats, averaged over grid sizes."""
    df = _load_results()["maze_navigation"]
    if df.empty:
        return _make_empty_fig("No Task 1 results found.")

    # Prefer the 0-shot CoT slice; fall back to all 0-shot rows if no CoT
    # results exist.
    zero_shot_cot = df[(df["k_shot"] == 0) & (df["prompt_strategy"] == "cot")]
    selection = zero_shot_cot if not zero_shot_cot.empty else df[df["k_shot"] == 0]

    mean_acc = (
        selection.groupby(["display_name", "input_format"])["accuracy"]
        .mean()
        .reset_index()
    )

    bar_fig = px.bar(
        mean_acc,
        x="display_name",
        y="accuracy",
        color="input_format",
        barmode="group",
        labels={"display_name": "Model", "accuracy": "Mean Accuracy",
                "input_format": "Input Format"},
        title="Task 1 — Raw vs Visual Format (0-shot, CoT, averaged over grid sizes)",
        color_discrete_map={"raw": "#2196F3", "visual": "#FF9800"},
    )
    bar_fig.update_layout(yaxis_range=[0, 1], height=380)
    return bar_fig
141
+
142
+
143
+ # ── Task 2 plots ────────────────────────────────────────────────────────────
144
+
145
def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]:
    """Grouped bars of Q0 vs Q3 accuracy per model for one grid size."""
    df = _load_results()["point_reuse"]
    if df.empty:
        return _make_empty_fig("No Task 2 results found.\nRun experiments first."), pd.DataFrame()

    at_size = df[df["grid_size"] == grid_size]
    if at_size.empty:
        return _make_empty_fig(f"No Task 2 results for {grid_size}×{grid_size}"), pd.DataFrame()

    def _question_mean(idx: int, name: str) -> pd.Series:
        # Per-model mean accuracy for a single question index.
        return (
            at_size[at_size["question_idx"] == idx]
            .groupby("display_name")["accuracy"]
            .mean()
            .rename(name)
        )

    melted = (
        pd.concat([_question_mean(0, "Q0"), _question_mean(3, "Q3")], axis=1)
        .reset_index()
        .melt(id_vars="display_name", var_name="Question", value_name="Accuracy")
    )

    bar_fig = px.bar(
        melted,
        x="display_name",
        y="Accuracy",
        color="Question",
        barmode="group",
        labels={"display_name": "Model"},
        title=(
            f"Task 2 — Q0 vs Q3 Accuracy ({grid_size}×{grid_size} maze)\n"
            "Q3 = Q0 (same question repeated — tests information reuse)"
        ),
        color_discrete_map={"Q0": "#4CAF50", "Q3": "#F44336"},
    )
    bar_fig.update_layout(yaxis_range=[0, 1], height=400)
    # Leaderboard spans all grid sizes, not just the selected one.
    return bar_fig, point_reuse_leaderboard(df)
171
+
172
+
173
def plot_task2_by_grid() -> go.Figure:
    """Line plot: Q3 (= repeated Q0) accuracy per model across grid sizes."""
    df = _load_results()["point_reuse"]
    if df.empty:
        return _make_empty_fig("No Task 2 results found.")

    q3_means = (
        df[df["question_idx"] == 3]
        .groupby(["display_name", "grid_size"])["accuracy"]
        .mean()
        .reset_index()
    )

    line_fig = px.line(
        q3_means,
        x="grid_size",
        y="accuracy",
        color="display_name",
        markers=True,
        labels={"grid_size": "Grid Size", "accuracy": "Q3 Accuracy",
                "display_name": "Model"},
        title="Task 2 — Q3 Accuracy by Grid Size (Q3 = Q0 repeated)",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    line_fig.update_layout(yaxis_range=[0, 1], height=380)
    return line_fig
192
+
193
+
194
+ # ── Task 3 plots ────────────────────────────────────────────────────────────
195
+
196
def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]:
    """Grouped bars of Q0/Q1/Q2 accuracy per model, plus Task 3 leaderboard."""
    df = _load_results()["compositional_distance"]
    if df.empty:
        return _make_empty_fig("No Task 3 results found.\nRun experiments first."), pd.DataFrame()

    per_question = (
        df.groupby(["display_name", "question_idx"])["accuracy"].mean().reset_index()
    )
    # Human-readable labels for the three question indices.
    per_question["Question"] = per_question["question_idx"].map(
        {0: "Q0: A→M", 1: "Q1: D→M", 2: "Q2: B→C (compositional)"}
    )

    bar_fig = px.bar(
        per_question,
        x="display_name",
        y="accuracy",
        color="Question",
        barmode="group",
        labels={"display_name": "Model", "accuracy": "Accuracy"},
        title=(
            "Task 3 — Compositional Distance Comparison\n"
            "Q2 can be composed from Q0+Q1 (corner→center distances)"
        ),
        color_discrete_map={
            "Q0: A→M": "#2196F3",
            "Q1: D→M": "#9C27B0",
            "Q2: B→C (compositional)": "#FF5722",
        },
    )
    bar_fig.update_layout(yaxis_range=[0, 1], height=420)
    return bar_fig, compositional_distance_leaderboard(df)
221
+
222
+
223
def plot_task3_by_grid() -> go.Figure:
    """Line plot: Q2 (compositional) accuracy per model across grid sizes."""
    df = _load_results()["compositional_distance"]
    if df.empty:
        return _make_empty_fig("No Task 3 results found.")

    q2_means = (
        df[df["question_idx"] == 2]
        .groupby(["display_name", "grid_size"])["accuracy"]
        .mean()
        .reset_index()
    )

    line_fig = px.line(
        q2_means,
        x="grid_size",
        y="accuracy",
        color="display_name",
        markers=True,
        labels={"grid_size": "Grid Size", "accuracy": "Q2 Accuracy",
                "display_name": "Model"},
        title="Task 3 — Q2 (Compositional) Accuracy by Grid Size",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    line_fig.update_layout(yaxis_range=[0, 1], height=380)
    return line_fig
242
+
243
+
244
+ # ---------------------------------------------------------------------------
245
+ # Run-experiments tab
246
+ # ---------------------------------------------------------------------------
247
+
248
# Map from env-var name → user-provided key (populated at runtime from form)
# NOTE(review): _USER_KEYS and _USER_KEYS_LOCK appear unused in this module —
# launch_experiments builds its own local key map instead.  Confirm nothing
# imports them from elsewhere before removing.
_USER_KEYS: dict[str, str] = {}
_USER_KEYS_LOCK = threading.Lock()
251
+
252
+
253
def launch_experiments(
    tasks: list[str],
    models: list[str],
    grid_sizes_str: str,
    formats: list[str],
    strategies: list[str],
    gemini_key: str,
    openai_key: str,
    anthropic_key: str,
    deepseek_key: str,
) -> tuple[str, list[list[str]]]:
    """Called when the user clicks 'Run' in the Gradio UI.

    Validates the form inputs, expands them into concrete jobs via
    build_all_jobs, and launches each job as a direct subprocess tracked
    by the global monitor.  Jobs whose provider key was not supplied are
    skipped rather than failed.

    Args:
        tasks: Task display names from the UI checkbox group.
        models: Internal model ids (keys of CFG["models"]).
        grid_sizes_str: Comma-separated integers, e.g. "5,6,7".
        formats: Input formats for Task 1 ("raw"/"visual").
        strategies: Prompt strategy ids ("base"/"cot"/"reasoning").
        gemini_key/openai_key/anthropic_key/deepseek_key: Keys typed by the
            user; by design the server environment is never consulted.

    Returns:
        (status message, job-status table rows for the UI Dataframe).
    """
    # Build a key map from only what the user explicitly typed — never os.environ
    user_keys: dict[str, str] = {}
    if gemini_key.strip():
        user_keys["GEMINI_API_KEY"] = gemini_key.strip()
    if openai_key.strip():
        user_keys["OPENAI_API_KEY"] = openai_key.strip()
    if anthropic_key.strip():
        user_keys["ANTHROPIC_API_KEY"] = anthropic_key.strip()
    if deepseek_key.strip():
        user_keys["DEEPSEEK_API_KEY"] = deepseek_key.strip()

    if not user_keys:
        return (
            "No API keys provided. Please enter at least one API key to run experiments.",
            [],
        )

    # Parse grid sizes
    try:
        grid_sizes = [int(g.strip()) for g in grid_sizes_str.split(",") if g.strip()]
    except ValueError:
        return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", []

    if not tasks:
        return "Select at least one task.", []
    if not models:
        return "Select at least one model.", []

    # Map display choices back to internal IDs
    task_map = {
        "Maze Navigation": "maze_navigation",
        "Sequential Point Reuse": "point_reuse",
        "Compositional Distance Comparison": "compositional_distance",
    }
    selected_tasks = [task_map[t] for t in tasks if t in task_map]

    jobs = build_all_jobs(
        cfg=CFG,
        tasks=selected_tasks,
        models=models,
        grid_sizes=grid_sizes or None,
        input_formats=formats or None,
        prompt_strategies=strategies or None,
        config_path=CONFIG_PATH,
    )

    if not jobs:
        return "No jobs matched the selected filters.", []

    launched = 0
    skipped = 0
    skipped_models: list[str] = []
    with _monitor_lock:
        for job in jobs:
            # Only use the key the user provided — never fall back to server env
            api_key = user_keys.get(job.api_key_env, "")
            if not api_key:
                skipped += 1
                skipped_models.append(job.model)
                continue
            job.output_dir.mkdir(parents=True, exist_ok=True)
            # The key is passed to the subprocess via its environment only.
            proc = submit_direct(
                cmd=job.python_cmd,
                working_dir=str(job.working_dir),
                env={job.api_key_env: api_key},
            )
            _monitor.add_direct(
                proc=proc,
                label=job.label,
                task_id=job.task_id,
                model=job.model,
                output_dir=str(job.output_dir),
            )
            launched += 1
            # NOTE(review): this sleep runs while _monitor_lock is held, so
            # concurrent sessions' launches are fully serialized — confirm
            # that is acceptable for multi-user Spaces.
            time.sleep(1)  # avoid API rate limits on burst start

    status_msg = f"Launched {launched} job(s)."
    if skipped:
        missing = sorted(set(skipped_models))
        status_msg += (
            f" Skipped {skipped} job(s) for {', '.join(missing)} "
            f"— no API key provided for those models."
        )

    return status_msg, _monitor.as_table()
350
+
351
+
352
def refresh_status() -> tuple[list[list[str]], str]:
    """Poll the global job monitor; return (table rows, one-line summary)."""
    _monitor.refresh()
    counts = _monitor.summary()["counts"]
    if counts:
        msg = " ".join(f"{s}: {n}" for s, n in counts.items())
    else:
        msg = "No jobs submitted yet."
    return _monitor.as_table(), msg
358
+
359
+
360
+ # ---------------------------------------------------------------------------
361
+ # Gradio UI
362
+ # ---------------------------------------------------------------------------
363
+
364
# Markdown summary rendered at the top of the Leaderboard tab.
PAPER_ABSTRACT = """
**Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks**

We systematically evaluate the spatial understanding of large language models through maze tasks—a
controlled testing context requiring multi-step planning and spatial abstraction. Across experiments
with Gemini-2.5-Flash, GPT-5-mini, Claude-Haiku-4.5, and DeepSeek-Chat, we uncover significant
discrepancies in spatial reasoning that challenge assumptions about LLM planning capabilities.

Key findings:
- **Representation sensitivity**: Gemini drops from 86% (raw tokenized) to 34% (visual grid) on 5×5 mazes with CoT
- **Prompting dependency**: Claude-Haiku fails completely without CoT, recovers to 78% with it
- **No spatial memory**: Models treat sequential questions independently, failing to reuse computed spatial knowledge
"""

# Custom CSS: compact leaderboard tables, colored status badges, and a
# hidden Gradio footer.
CSS = """
.leaderboard-table { font-size: 0.9em; }
.status-badge-running { color: #2196F3; font-weight: bold; }
.status-badge-completed { color: #4CAF50; font-weight: bold; }
.status-badge-failed { color: #F44336; font-weight: bold; }
footer { display: none !important; }
"""
385
+
386
def build_ui() -> gr.Blocks:
    """Assemble the Gradio Blocks app.

    Three tabs: Leaderboard (pre-computed results with interactive
    filters), Run Experiments (direct API-key launches), and About
    (paper summary and citation).  Returns the un-launched Blocks
    object; the caller invokes .launch().
    """
    with gr.Blocks(
        title="SpatialBench — Do LLMs Build Spatial World Models?",
        css=CSS,
        theme=gr.themes.Soft(primary_hue="blue"),
    ) as demo:

        gr.Markdown("# 🧩 SpatialBench")
        gr.Markdown(
            "**Evaluating Spatial World Models in Large Language Models** · "
            "[Paper (ICLR 2026 Workshop)](https://arxiv.org/abs/...) · "
            "[Code](https://github.com/...)"
        )

        with gr.Tabs():

            # ================================================================
            # Tab 1: Leaderboard
            # ================================================================
            with gr.Tab("📊 Leaderboard"):
                gr.Markdown(PAPER_ABSTRACT)

                gr.Markdown("---")
                gr.Markdown("## Task 1 — Maze Navigation (Planning)")
                gr.Markdown(
                    "Models find shortest paths through mazes. "
                    "Two input formats: **raw** tokenized adjacency lists vs **visual** character grids."
                )

                with gr.Row():
                    t1_k = gr.Radio(
                        choices=[0, 3, 5], value=0, label="K-shot",
                        info="Number of in-context examples",
                    )
                    t1_fmt = gr.Radio(
                        choices=["raw", "visual"], value="raw", label="Input Format",
                    )

                t1_plot = gr.Plot(label="Accuracy by Grid Size")
                t1_lb = gr.Dataframe(
                    label="Leaderboard (mean accuracy across grid sizes)",
                    elem_classes=["leaderboard-table"],
                )
                t1_fmt_plot = gr.Plot(label="Raw vs Visual Format Comparison")

                # Re-render all Task 1 outputs whenever either control changes.
                def refresh_task1(k, fmt):
                    fig, lb = plot_task1_accuracy(int(k), fmt)
                    fmt_fig = plot_task1_format_comparison()
                    return fig, lb, fmt_fig

                for inp in [t1_k, t1_fmt]:
                    inp.change(
                        refresh_task1, inputs=[t1_k, t1_fmt],
                        outputs=[t1_plot, t1_lb, t1_fmt_plot],
                    )

                gr.Markdown("---")
                gr.Markdown("## Task 2 — Sequential Reasoning with Point Reuse")
                gr.Markdown(
                    "Models answer 4 proximity questions. **Q3 = Q0** (same question repeated). "
                    "Do models reuse their earlier computation, or start from scratch?"
                )

                t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5,
                                    label="Grid Size")
                t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy")
                t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes")
                t2_lb = gr.Dataframe(
                    label="Leaderboard",
                    elem_classes=["leaderboard-table"],
                )

                def refresh_task2(gs):
                    fig, lb = plot_task2_q0_q3(int(gs))
                    grid_fig = plot_task2_by_grid()
                    return fig, grid_fig, lb

                t2_grid.change(
                    refresh_task2, inputs=[t2_grid],
                    outputs=[t2_plot, t2_grid_plot, t2_lb],
                )

                gr.Markdown("---")
                gr.Markdown("## Task 3 — Compositional Distance Comparison")
                gr.Markdown(
                    "Models answer 3 questions about maze corners (A, B, C, D) and center M. "
                    "**Q2** (B→C) can potentially be composed from Q0 (A→M) and Q1 (D→M). "
                    "Δ = Q2 accuracy − avg(Q0, Q1)."
                )

                # Task 3 has no interactive filter controls; its plots refresh
                # only via the button below and the initial demo.load.
                t3_plot = gr.Plot(label="Q0 / Q1 / Q2 Accuracy by Model")
                t3_grid_plot = gr.Plot(label="Q2 Accuracy Across Grid Sizes")
                t3_lb = gr.Dataframe(
                    label="Leaderboard (Δ shows compositional benefit)",
                    elem_classes=["leaderboard-table"],
                )

                with gr.Row():
                    refresh_lb_btn = gr.Button("🔄 Refresh Results", variant="secondary")

                # Rebuild every leaderboard output using the default control
                # values (k=0, raw format, 5×5 grid).  The return order must
                # match the outputs= lists below exactly.
                def refresh_all_leaderboard(_=None):
                    t1_fig, t1_table = plot_task1_accuracy(0, "raw")
                    t1_ff = plot_task1_format_comparison()
                    t2_fig, t2_lb_table = plot_task2_q0_q3(5)
                    t2_gfig = plot_task2_by_grid()
                    t3_fig, t3_lb_table = plot_task3_compositional()
                    t3_gfig = plot_task3_by_grid()
                    return (
                        t1_fig, t1_table, t1_ff,
                        t2_fig, t2_gfig, t2_lb_table,
                        t3_fig, t3_gfig, t3_lb_table,
                    )

                refresh_lb_btn.click(
                    refresh_all_leaderboard,
                    outputs=[
                        t1_plot, t1_lb, t1_fmt_plot,
                        t2_plot, t2_grid_plot, t2_lb,
                        t3_plot, t3_grid_plot, t3_lb,
                    ],
                )

                # Initial load
                demo.load(
                    refresh_all_leaderboard,
                    outputs=[
                        t1_plot, t1_lb, t1_fmt_plot,
                        t2_plot, t2_grid_plot, t2_lb,
                        t3_plot, t3_grid_plot, t3_lb,
                    ],
                )

            # ================================================================
            # Tab 2: Run Experiments
            # ================================================================
            with gr.Tab("⚡ Run Experiments"):
                gr.Markdown(
                    "## Launch Experiments\n"
                    "Experiments call LLM APIs directly — no compute cluster needed.\n\n"
                    "> **Your API keys are used only for your session and are never stored or logged.** \n"
                    "> Enter keys only for the model(s) you want to evaluate. "
                    "Jobs for models without a key will be skipped."
                )

                with gr.Row():
                    with gr.Column(scale=2):
                        # Task / model / grid selection
                        run_tasks = gr.CheckboxGroup(
                            choices=[
                                "Maze Navigation",
                                "Sequential Point Reuse",
                                "Compositional Distance Comparison",
                            ],
                            value=["Maze Navigation"],
                            label="Tasks",
                        )
                        run_models = gr.CheckboxGroup(
                            choices=MODEL_CHOICES,
                            value=["gemini-2.5-flash"],
                            label="Models",
                        )
                        run_grids = gr.Textbox(
                            value="5,6,7",
                            label="Grid Sizes",
                            info="Comma-separated integers. Maze dataset supports 5–9 (and beyond if regenerated).",
                        )
                        with gr.Row():
                            run_formats = gr.CheckboxGroup(
                                choices=["raw", "visual"],
                                value=["raw"],
                                label="Input Formats (Task 1 only)",
                            )
                            run_strategies = gr.CheckboxGroup(
                                choices=["base", "cot", "reasoning"],
                                value=["cot"],
                                label="Prompt Strategies",
                            )

                    with gr.Column(scale=1):
                        gr.Markdown("### API Keys")
                        gr.Markdown(
                            "Enter the key(s) for the model(s) you selected. "
                            "Keys are used only for this session."
                        )
                        gemini_key = gr.Textbox(
                            label="GEMINI_API_KEY", type="password", placeholder="AIza...",
                        )
                        openai_key = gr.Textbox(
                            label="OPENAI_API_KEY", type="password", placeholder="sk-...",
                        )
                        anthropic_key = gr.Textbox(
                            label="ANTHROPIC_API_KEY", type="password", placeholder="sk-ant-...",
                        )
                        deepseek_key = gr.Textbox(
                            label="DEEPSEEK_API_KEY", type="password",
                        )

                with gr.Row():
                    run_btn = gr.Button("🚀 Launch Experiments", variant="primary", scale=2)
                    refresh_btn = gr.Button("🔄 Refresh Status", scale=1)

                launch_msg = gr.Textbox(label="Launch Status", interactive=False)

                job_table = gr.Dataframe(
                    headers=["Task", "Model", "Label", "Status", "Elapsed", "Started"],
                    label="Job Status",
                    interactive=False,
                    wrap=True,
                )
                status_summary = gr.Textbox(
                    label="Summary", interactive=False,
                )

                run_btn.click(
                    launch_experiments,
                    inputs=[
                        run_tasks, run_models, run_grids,
                        run_formats, run_strategies,
                        gemini_key, openai_key, anthropic_key, deepseek_key,
                    ],
                    outputs=[launch_msg, job_table],
                )

                refresh_btn.click(
                    refresh_status,
                    outputs=[job_table, status_summary],
                )

            # ================================================================
            # Tab 3: About
            # ================================================================
            with gr.Tab("ℹ️ About"):
                gr.Markdown("""
## About SpatialBench

SpatialBench is the evaluation platform accompanying the paper:

> **Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks**
> *Under review at ICLR 2026 Workshop*

### Three Tasks

| Task | Type | What it tests |
|------|------|---------------|
| **Task 1: Maze Navigation** | Planning | Find shortest path from start to goal |
| **Task 2: Sequential Point Reuse** | Reasoning | Reuse Q0 computation when Q3=Q0 |
| **Task 3: Compositional Distance** | Reasoning | Compose corner→center distances for Q2 |

### Input Representations

- **Raw (tokenized)**: `<ADJLIST_START> (0,0) <--> (0,1) ... <ADJLIST_END>`
- **Visual (grid)**: `Row 0: ['.', 'S', '.', '#'] Row 1: ['#', '.', '.', 'E']`

### Models Evaluated

| Model | Provider |
|-------|----------|
| Gemini 2.5 Flash | Google |
| GPT-5 Mini | OpenAI |
| Claude Haiku 4.5 | Anthropic |
| DeepSeek Chat | DeepSeek |

### Grid Sizes

Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default.
The underlying `maze-dataset` library supports larger grids — adjust in the **Run** tab.

### Adding a New Model

Edit `pipeline/configs/experiments.yaml`:
```yaml
models:
  your-model-id:
    api_key_env: YOUR_API_KEY_ENV_VAR
    display_name: "Your Model Name"
```
Then add inference support in `utils/llm_inference.py`.

### Citation
```bibtex
@inproceedings{spatialbench2026,
  title = {Do {LLMs} Build Spatial World Models? Evidence from Grid-World Maze Tasks},
  author = {Anonymous},
  booktitle = {ICLR 2026 Workshop},
  year = {2026},
}
```
""")

    return demo
676
+
677
+
678
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    demo = build_ui()
    # Bind to 0.0.0.0 so the app is reachable from outside the HF Space
    # container; errors are surfaced in the UI for easier debugging.
    demo.launch(
        server_name="0.0.0.0",
        share=False,
        show_error=True,
    )
configs/experiments.yaml ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # SpatialBench Experiment Configuration
3
+ # =============================================================================
4
+ # This file is the single source of truth for all experiments.
5
+ # Add a new model by adding an entry under `models`.
6
+ # Add a new grid size by extending `grid_sizes` in any task.
7
+ # All paths are relative to llm-maze-solver/ (the repo root).
8
+ # =============================================================================
9
+
10
+ # ---------------------------------------------------------------------------
11
+ # Global defaults — overridden per-task or per-experiment as needed
12
+ # ---------------------------------------------------------------------------
13
+ defaults:
14
+ n_test_mazes: 50
15
+ seed: 42
16
+ temperature: 0.1
17
+ max_tokens: 8192
18
+ sbatch:
19
+ cpus: 2
20
+ mem: "8G"
21
+ time: "10:00:00"
22
+ partition: "short"
23
+ log_dir: "maze-solver/eval_llm_logs"
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Models
27
+ # Each entry defines the model identifier used in API calls and the
28
+ # environment variable that must hold the API key.
29
+ # ---------------------------------------------------------------------------
30
+ models:
31
+ gemini-2.5-flash:
32
+ api_key_env: GEMINI_API_KEY
33
+ display_name: "Gemini 2.5 Flash"
34
+ gpt-5-mini:
35
+ api_key_env: OPENAI_API_KEY
36
+ display_name: "GPT-5 Mini"
37
+ claude-haiku-4-5:
38
+ api_key_env: ANTHROPIC_API_KEY
39
+ display_name: "Claude Haiku 4.5"
40
+ deepseek-chat:
41
+ api_key_env: DEEPSEEK_API_KEY
42
+ display_name: "DeepSeek Chat"
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Maze Navigation (Planning)
46
+ # Paper: Table 1, Table 5 (3-shot), Table 6 (5-shot)
47
+ # Script: maze-solver/eval_llm_maze_solver.py
48
+ # ---------------------------------------------------------------------------
49
+ maze_navigation:
50
+ description: >
51
+ Models find shortest paths through mazes represented in two formats
52
+ (raw tokenized adjacency lists vs visual character grids), tested
53
+ across k-shot settings and three prompting strategies.
54
+ script: "maze-solver/eval_llm_maze_solver.py"
55
+ working_dir: "maze-solver"
56
+ output_base: "maze-solver/llm-maze-evaluation-results"
57
+
58
+ # Grid sizes: paper used 5-9; extend freely up to maze-dataset limits
59
+ grid_sizes: [5, 6, 7, 8, 9]
60
+
61
+ # Input representations
62
+ input_formats: ["raw", "visual"]
63
+
64
+ # Prompting strategies (maps to script flags)
65
+ prompt_strategies:
66
+ base:
67
+ flags: []
68
+ display_name: "Base"
69
+ cot:
70
+ flags: ["--chain_of_thought"]
71
+ display_name: "Chain-of-Thought"
72
+ reasoning:
73
+ flags: ["--reasoning"]
74
+ display_name: "Post-hoc Reasoning"
75
+
76
+ # K-shot values tested simultaneously in one script run
77
+ k_shots: "0,3,5"
78
+
79
+ # Fixed params
80
+ maze_type: "cycles"
81
+ percolation_p: 0.2
82
+ visualize: true
83
+
84
+ sbatch:
85
+ time: "10:00:00"
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # Sequential Reasoning with Point Reuse (Q3 = Q0)
89
+ # Paper: Table 2, Table 7
90
+ # Script: maze-solver/spatial_reasoning/eval_proximity_comparison.py
91
+ # ---------------------------------------------------------------------------
92
+ point_reuse:
93
+ description: >
94
+ Models answer four sequential proximity questions about the same maze.
95
+ Q3 is identical to Q0, probing whether models reuse previously
96
+ computed spatial information or treat each question independently.
97
+ script: "maze-solver/spatial_reasoning/eval_proximity_comparison.py"
98
+ working_dir: "spatial_reasoning/spatial_reasoning_experiments"
99
+
100
+ # Paper used 5-9; extend freely
101
+ grid_sizes: [5, 6, 7, 8, 9]
102
+
103
+ input_format: "raw"
104
+ strategy: "point_reuse"
105
+ reuse_pattern: "last_first_same"
106
+ n_questions_per_maze: 4
107
+ sequential_questions: true
108
+
109
+ # Prompting strategies
110
+ prompt_strategies:
111
+ base:
112
+ prompt_type: "baseline"
113
+ display_name: "Base"
114
+ cot:
115
+ prompt_type: "cot"
116
+ display_name: "Chain-of-Thought"
117
+ reasoning:
118
+ prompt_type: "reasoning"
119
+ display_name: "Post-hoc Reasoning"
120
+
121
+ output_base: "spatial_reasoning/spatial-reasoning-results-point-reuse-q3-q0"
122
+ visualize: true
123
+ save_details: true
124
+
125
+ sbatch:
126
+ time: "10:30:00"
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # Compositional Distance Comparison
130
+ # Paper: Table 3, Table 8, Table 9
131
+ # Script: maze-solver/spatial_reasoning/eval_extended_experiments.py
132
+ # Corner pattern: corners_to_center (Q0: top-left→center,
133
+ # Q1: bottom-right→center,
134
+ # Q2: corner→corner compositional)
135
+ # ---------------------------------------------------------------------------
136
+ compositional_distance:
137
+ description: >
138
+ Models answer three questions about maze corners (A=top-left,
139
+ B=top-right, C=bottom-left, D=bottom-right) and center M.
140
+ Q2 can be composed from information established in Q0 and Q1,
141
+ probing whether models build cumulative spatial knowledge.
142
+ script: "maze-solver/spatial_reasoning/eval_extended_experiments.py"
143
+ working_dir: "spatial_reasoning/spatial_reasoning_experiments"
144
+
145
+ # Paper reported 5-9 in Tables 8/9; scripts originally only ran 5-7
146
+ # Extended to match paper
147
+ grid_sizes: [5, 6, 7, 8, 9]
148
+
149
+ input_format: "raw"
150
+ strategy: "orthogonal"
151
+ corner_pattern: "corners_to_center" # matches paper Q0/Q1/Q2 design
152
+ n_questions_per_maze: 3
153
+
154
+ # Prompting strategies
155
+ prompt_strategies:
156
+ base:
157
+ prompt_type: "baseline"
158
+ display_name: "Base"
159
+ cot:
160
+ prompt_type: "cot"
161
+ display_name: "Chain-of-Thought"
162
+ reasoning:
163
+ prompt_type: "reasoning"
164
+ display_name: "Post-hoc Reasoning"
165
+
166
+ output_base: "spatial_reasoning/spatial-reasoning-results-orthogonal"
167
+ visualize: true
168
+ save_details: true
169
+
170
+ sbatch:
171
+ time: "06:30:00"
pipeline/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ "SpatialBench pipeline modules."
pipeline/job_monitor.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ job_monitor.py
3
+ --------------
4
+ Tracks and displays the status of experiment jobs.
5
+
6
+ Supports two backends:
7
+ - SLURM : polls `squeue` for cluster jobs (used when running locally)
8
+ - Direct : tracks subprocess-launched jobs (used when running via API keys
9
+ on HF Space or without a cluster)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import subprocess
15
+ import time
16
+ import threading
17
+ from dataclasses import dataclass, field
18
+ from datetime import datetime
19
+ from enum import Enum
20
+ from typing import Callable
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Status model
25
+ # ---------------------------------------------------------------------------
26
+
27
class JobStatus(str, Enum):
    """Lifecycle state of an experiment job.

    Subclasses ``str`` so values serialize/compare directly as plain strings
    (used by JobRecord.as_dict and the Gradio table).
    """
    PENDING = "pending"      # submitted, not yet running
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    UNKNOWN = "unknown"      # squeue reported a state we don't map
34
+
35
+
36
@dataclass
class JobRecord:
    """Bookkeeping for one submitted experiment job (SLURM or subprocess)."""
    job_id: str          # SLURM job-id or process PID (as str)
    label: str           # human-readable experiment label
    task_id: str
    model: str
    status: JobStatus = JobStatus.PENDING
    submitted_at: datetime = field(default_factory=datetime.now)
    finished_at: datetime | None = None   # set when a terminal status is recorded
    output_dir: str = ""
    log_out: str = ""    # stdout log path, if any
    log_err: str = ""    # stderr log path, if any

    def elapsed(self) -> str:
        """Wall-clock time since submission, formatted HH:MM:SS.

        Freezes at finished_at once the job is done; ticks with "now"
        while it is still pending/running.
        """
        end = self.finished_at or datetime.now()
        secs = int((end - self.submitted_at).total_seconds())
        h, rem = divmod(secs, 3600)
        m, s = divmod(rem, 60)
        return f"{h:02d}:{m:02d}:{s:02d}"

    def as_dict(self) -> dict:
        """JSON-friendly snapshot (consumed by JobMonitor.summary())."""
        return {
            "job_id": self.job_id,
            "label": self.label,
            "task_id": self.task_id,
            "model": self.model,
            "status": self.status.value,
            "submitted_at": self.submitted_at.strftime("%Y-%m-%d %H:%M:%S"),
            "elapsed": self.elapsed(),
            "output_dir": self.output_dir,
        }
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # SLURM monitor
71
+ # ---------------------------------------------------------------------------
72
+
73
# Map squeue state codes → JobStatus.
# "CG" (completing) still counts as RUNNING; "TO" (timeout) and "OOM"
# (out-of-memory) are treated as failures. Unlisted codes fall back to
# JobStatus.UNKNOWN at the lookup site.
_SLURM_STATE_MAP = {
    "PD": JobStatus.PENDING,
    "R": JobStatus.RUNNING,
    "CG": JobStatus.RUNNING,
    "CD": JobStatus.COMPLETED,
    "F": JobStatus.FAILED,
    "CA": JobStatus.CANCELLED,
    "TO": JobStatus.FAILED,
    "OOM": JobStatus.FAILED,
}
84
+
85
+
86
+ def _query_slurm(job_ids: list[str]) -> dict[str, JobStatus]:
87
+ """Return {job_id: JobStatus} for a batch of SLURM job ids."""
88
+ if not job_ids:
89
+ return {}
90
+ try:
91
+ result = subprocess.run(
92
+ ["squeue", "--jobs", ",".join(job_ids), "--format=%i %t", "--noheader"],
93
+ capture_output=True, text=True, timeout=15,
94
+ )
95
+ statuses: dict[str, JobStatus] = {}
96
+ for line in result.stdout.strip().splitlines():
97
+ parts = line.split()
98
+ if len(parts) >= 2:
99
+ jid, state = parts[0], parts[1]
100
+ statuses[jid] = _SLURM_STATE_MAP.get(state, JobStatus.UNKNOWN)
101
+ return statuses
102
+ except Exception:
103
+ return {}
104
+
105
+
106
+ def submit_sbatch(script_text: str, script_path: str) -> str | None:
107
+ """Write script_text to script_path, submit via sbatch, return job_id."""
108
+ with open(script_path, "w") as f:
109
+ f.write(script_text)
110
+ try:
111
+ result = subprocess.run(
112
+ ["sbatch", script_path],
113
+ capture_output=True, text=True, timeout=30,
114
+ )
115
+ # sbatch output: "Submitted batch job 12345"
116
+ for token in result.stdout.split():
117
+ if token.isdigit():
118
+ return token
119
+ except Exception:
120
+ pass
121
+ return None
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Direct (subprocess) monitor
126
+ # ---------------------------------------------------------------------------
127
+
128
def submit_direct(
    cmd: list[str],
    working_dir: str,
    env: dict | None = None,
    on_finish: Callable[[int], None] | None = None,
) -> subprocess.Popen:
    """Launch a job as a subprocess and optionally call on_finish(returncode).

    The child inherits the current environment, updated with *env*.
    When *on_finish* is given, a daemon thread waits on the process and
    invokes the callback with its return code.

    NOTE(review): stdout/stderr are PIPEd but nothing in this module
    drains them; a chatty child can fill the OS pipe buffer and block
    forever, deadlocking both the child and the waiting thread — confirm
    that some caller reads proc.stdout/proc.stderr, or consider
    redirecting to log files instead.
    """
    import os  # local import keeps the module-level import list unchanged
    proc_env = os.environ.copy()
    if env:
        proc_env.update(env)

    proc = subprocess.Popen(
        cmd,
        cwd=working_dir,
        env=proc_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    if on_finish:
        # Fire-and-forget watcher; daemon=True so it never blocks shutdown.
        def _wait():
            proc.wait()
            on_finish(proc.returncode)
        threading.Thread(target=_wait, daemon=True).start()

    return proc
156
+
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # JobMonitor — unified tracker
160
+ # ---------------------------------------------------------------------------
161
+
162
class JobMonitor:
    """
    Tracks a collection of JobRecords across two backends.

    Usage (SLURM):
        monitor = JobMonitor(mode="slurm")
        record = monitor.add(job_id="12345", label="Task1|gemini|raw|cot", ...)
        monitor.refresh()   # updates statuses from squeue
        monitor.wait_all()  # blocks until all done

    Usage (direct):
        monitor = JobMonitor(mode="direct")
        proc = submit_direct(cmd, wdir, env)
        record = monitor.add_direct(proc, label=..., ...)
        monitor.wait_all()

    Thread-safety: every access to the record/process maps is guarded by
    self._lock; watcher threads spawned by add_direct take the same lock
    when flipping a record to a terminal status.
    """

    def __init__(self, mode: str = "slurm"):
        # Only two backends exist; fail fast on typos.
        assert mode in ("slurm", "direct")
        self.mode = mode
        self._records: dict[str, JobRecord] = {}  # job_id → JobRecord
        self._procs: dict[str, subprocess.Popen] = {}  # direct mode only
        self._lock = threading.Lock()

    # -- adding jobs --------------------------------------------------------

    def add(
        self,
        job_id: str,
        label: str,
        task_id: str,
        model: str,
        output_dir: str = "",
        log_out: str = "",
        log_err: str = "",
    ) -> JobRecord:
        """Register an already-submitted job (e.g. a SLURM job id)."""
        record = JobRecord(
            job_id=job_id, label=label,
            task_id=task_id, model=model,
            output_dir=output_dir, log_out=log_out, log_err=log_err,
        )
        with self._lock:
            self._records[job_id] = record
        return record

    def add_direct(
        self,
        proc: subprocess.Popen,
        label: str,
        task_id: str,
        model: str,
        output_dir: str = "",
    ) -> JobRecord:
        """Register a subprocess job; a daemon thread marks it terminal.

        The PID doubles as the job id. Return code 0 → COMPLETED,
        anything else → FAILED.
        """
        job_id = str(proc.pid)
        record = self.add(
            job_id=job_id, label=label,
            task_id=task_id, model=model, output_dir=output_dir,
        )
        record.status = JobStatus.RUNNING
        with self._lock:
            self._procs[job_id] = proc

        def _monitor():
            proc.wait()
            with self._lock:
                record.status = (
                    JobStatus.COMPLETED if proc.returncode == 0
                    else JobStatus.FAILED
                )
                record.finished_at = datetime.now()

        threading.Thread(target=_monitor, daemon=True).start()
        return record

    # -- status refreshing --------------------------------------------------

    def refresh(self) -> None:
        """Update statuses from SLURM (no-op for direct mode).

        Collects the still-active ids under the lock, queries squeue
        WITHOUT holding the lock (the subprocess call can take seconds),
        then applies the results under the lock again.

        NOTE(review): any active job absent from the squeue reply is
        assumed finished; since _query_slurm returns {} on error, a
        transient squeue failure will mark every RUNNING job COMPLETED.
        Also, vanished PENDING jobs are left PENDING forever. Consider
        falling back to `sacct` to distinguish these cases.
        """
        if self.mode != "slurm":
            return
        with self._lock:
            active_ids = [
                jid for jid, r in self._records.items()
                if r.status in (JobStatus.PENDING, JobStatus.RUNNING)
            ]
        if not active_ids:
            return
        statuses = _query_slurm(active_ids)
        with self._lock:
            for jid, status in statuses.items():
                if jid in self._records:
                    old = self._records[jid].status
                    self._records[jid].status = status
                    # Stamp finished_at only on the transition into a
                    # terminal state so elapsed() freezes correctly.
                    if old != status and status in (
                        JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED
                    ):
                        self._records[jid].finished_at = datetime.now()
            # Any job no longer appearing in squeue is done
            for jid in active_ids:
                if jid not in statuses and jid in self._records:
                    r = self._records[jid]
                    if r.status == JobStatus.RUNNING:
                        r.status = JobStatus.COMPLETED
                        r.finished_at = datetime.now()

    def wait_all(self, poll_interval: int = 30, callback: Callable | None = None) -> None:
        """Block until all jobs are in a terminal state.

        *callback*, if given, receives self.summary() once per poll cycle
        (useful for streaming progress to a UI).
        """
        while True:
            self.refresh()
            active = self.active_jobs()
            if callback:
                callback(self.summary())
            if not active:
                break
            time.sleep(poll_interval)

    # -- queries ------------------------------------------------------------

    def active_jobs(self) -> list[JobRecord]:
        """Records still PENDING or RUNNING (snapshot under the lock)."""
        with self._lock:
            return [
                r for r in self._records.values()
                if r.status in (JobStatus.PENDING, JobStatus.RUNNING)
            ]

    def all_records(self) -> list[JobRecord]:
        """Snapshot of every tracked record."""
        with self._lock:
            return list(self._records.values())

    def summary(self) -> dict:
        """Aggregate view: total count, per-status counts, serialized records."""
        records = self.all_records()
        counts: dict[str, int] = {}
        for r in records:
            counts[r.status.value] = counts.get(r.status.value, 0) + 1
        return {
            "total": len(records),
            "counts": counts,
            "records": [r.as_dict() for r in records],
        }

    def as_table(self) -> list[list[str]]:
        """Return rows suitable for a Gradio Dataframe component.

        Column order matches TABLE_HEADERS below.
        """
        records = self.all_records()
        rows = []
        for r in records:
            rows.append([
                r.task_id, r.model, r.label,
                r.status.value, r.elapsed(),
                r.submitted_at.strftime("%H:%M:%S"),
            ])
        return rows

    # Column labels for as_table(); kept adjacent so they stay in sync.
    TABLE_HEADERS = ["Task", "Model", "Label", "Status", "Elapsed", "Started"]
pipeline/results_loader.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ results_loader.py
3
+ -----------------
4
+ Scans experiment output directories and assembles results into pandas
5
+ DataFrames ready for display in the Gradio leaderboard.
6
+
7
+ Output directory conventions (from experiments.yaml):
8
+ Task 1: <output_base>/<model>/<fmt>_input_<strat>/
9
+ → results_{grid_size}x{grid_size}_k{k}.csv OR summary.json
10
+ Task 2: <output_base>/<model>/point_reuse_q3q0_<strat>/
11
+ → proximity_comparison_results.csv
12
+ Task 3: <output_base>/<model>/orthogonal_corners_to_center_<strat>/
13
+ → results.csv OR summary_stats.json
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import os
20
+ from pathlib import Path
21
+
22
+ import pandas as pd
23
+ import yaml
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Config helpers
28
+ # ---------------------------------------------------------------------------
29
+
30
def load_config(config_path: str | Path) -> dict:
    """Parse the experiments YAML config into a plain dict."""
    text = Path(config_path).read_text()
    return yaml.safe_load(text)
33
+
34
+
35
+ def _model_display(cfg: dict, model_id: str) -> str:
36
+ return cfg["models"].get(model_id, {}).get("display_name", model_id)
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Task 1 results
41
+ # ---------------------------------------------------------------------------
42
+
43
def load_maze_navigation_results(cfg: dict, repo_root: Path) -> pd.DataFrame:
    """
    Scan Task 1 output dirs and return a DataFrame with columns:
        model, display_name, input_format, prompt_strategy, grid_size, k_shot, accuracy

    Walks every (model × input_format × prompt_strategy) combination under
    <output_base>; missing directories are skipped silently so partial runs
    still yield a (smaller) table.
    """
    task = cfg["maze_navigation"]
    base = repo_root / task["output_base"]
    rows = []

    for model_id, model_meta in cfg["models"].items():
        display = model_meta["display_name"]
        for fmt in task["input_formats"]:
            for strat in task["prompt_strategies"]:
                # Dots/dashes in model ids are flattened to underscores to
                # mirror how the run scripts name their output directories.
                subdir = base / model_id.replace(".", "_").replace("-", "_") / f"{fmt}_input_{strat}"
                if not subdir.exists():
                    continue
                # Look for summary JSON first, then CSVs
                summary_file = subdir / "summary.json"
                if summary_file.exists():
                    _parse_task1_summary(summary_file, rows, model_id, display, fmt, strat)
                else:
                    # Fall back to per-grid CSVs
                    for csv_file in sorted(subdir.glob("*.csv")):
                        _parse_task1_csv(csv_file, rows, model_id, display, fmt, strat)

    if not rows:
        # Empty frame with the full schema so downstream pivots don't KeyError.
        return pd.DataFrame(columns=[
            "model", "display_name", "input_format", "prompt_strategy",
            "grid_size", "k_shot", "accuracy"
        ])
    return pd.DataFrame(rows)
74
+
75
+
76
+ def _parse_task1_summary(path: Path, rows: list, model_id, display, fmt, strat):
77
+ try:
78
+ with open(path) as f:
79
+ data = json.load(f)
80
+ # Expected: {grid_size: {k: accuracy, ...}, ...}
81
+ for grid_key, k_dict in data.items():
82
+ try:
83
+ grid_size = int(str(grid_key).replace("x", "").split("_")[0])
84
+ except ValueError:
85
+ continue
86
+ if isinstance(k_dict, dict):
87
+ for k, acc in k_dict.items():
88
+ rows.append({
89
+ "model": model_id, "display_name": display,
90
+ "input_format": fmt, "prompt_strategy": strat,
91
+ "grid_size": grid_size, "k_shot": int(k),
92
+ "accuracy": float(acc),
93
+ })
94
+ elif isinstance(k_dict, (int, float)):
95
+ rows.append({
96
+ "model": model_id, "display_name": display,
97
+ "input_format": fmt, "prompt_strategy": strat,
98
+ "grid_size": grid_size, "k_shot": 0,
99
+ "accuracy": float(k_dict),
100
+ })
101
+ except Exception:
102
+ pass
103
+
104
+
105
def _parse_task1_csv(path: Path, rows: list, model_id, display, fmt, strat):
    """Append accuracy rows parsed from one per-grid results CSV.

    Grid size and k-shot are recovered from filename tokens such as
    "results_5x5_k2"; a ``grid_size`` column in the CSV takes precedence
    over the filename. Unreadable files are skipped silently.
    """
    try:
        frame = pd.read_csv(path)
    except Exception:
        return
    try:
        # Recover metadata from the filename stem.
        grid_from_name = None
        k_shot = 0
        for token in path.stem.split("_"):
            if token.startswith("k") and token[1:].isdigit():
                k_shot = int(token[1:])
            if "x" in token:
                try:
                    grid_from_name = int(token.split("x")[0])
                except ValueError:
                    pass

        def _emit(grid: int, sub: pd.DataFrame) -> None:
            rows.append({
                "model": model_id, "display_name": display,
                "input_format": fmt, "prompt_strategy": strat,
                "grid_size": grid, "k_shot": k_shot,
                "accuracy": _df_accuracy(sub),
            })

        if "grid_size" in frame.columns:
            for gs, gdf in frame.groupby("grid_size"):
                _emit(int(gs), gdf)
        elif grid_from_name is not None:
            _emit(grid_from_name, frame)
    except Exception:
        pass
140
+
141
+
142
+ def _df_accuracy(df: pd.DataFrame) -> float:
143
+ for col in ("is_correct", "exact_match", "correct", "accuracy"):
144
+ if col in df.columns:
145
+ return float(df[col].mean())
146
+ return float("nan")
147
+
148
+
149
+ # ---------------------------------------------------------------------------
150
+ # Task 2 results
151
+ # ---------------------------------------------------------------------------
152
+
153
def load_point_reuse_results(cfg: dict, repo_root: Path) -> pd.DataFrame:
    """
    Return DataFrame with columns:
        model, display_name, prompt_strategy, grid_size, question_idx, accuracy

    Scans every (model × prompt_strategy) output directory; tries the
    pipeline's own naming first, then the legacy layout produced by the
    existing scripts. Rows without a detectable question column are
    aggregated per grid size with question_idx == -1.
    """
    task = cfg["point_reuse"]
    base = repo_root / task["output_base"]
    rows = []

    for model_id, model_meta in cfg["models"].items():
        display = model_meta["display_name"]
        for strat, strat_cfg in task["prompt_strategies"].items():
            subdir = (
                base
                / model_id.replace(".", "_").replace("-", "_")
                / f"point_reuse_q3q0_{strat}"
            )
            if not subdir.exists():
                # Also try the pattern used by existing scripts
                subdir = base / model_id / f"proximity_comparison_point_reuse_last_first_same_{strat_cfg['prompt_type']}"
                if not subdir.exists():
                    continue

            csv_files = list(subdir.glob("*.csv"))
            for csv_file in csv_files:
                try:
                    df = pd.read_csv(csv_file)
                    # grid_size is mandatory; skip files that lack it.
                    if "grid_size" not in df.columns:
                        continue
                    # Accept any of the known question-index column names.
                    q_col = next(
                        (c for c in ("question_idx", "question_index", "q_idx") if c in df.columns),
                        None,
                    )
                    for gs, gdf in df.groupby("grid_size"):
                        if q_col:
                            for qi, qdf in gdf.groupby(q_col):
                                rows.append({
                                    "model": model_id, "display_name": display,
                                    "prompt_strategy": strat,
                                    "grid_size": int(gs),
                                    "question_idx": int(qi),
                                    "accuracy": _df_accuracy(qdf),
                                })
                        else:
                            # No per-question breakdown available; -1 marks
                            # a grid-level aggregate.
                            rows.append({
                                "model": model_id, "display_name": display,
                                "prompt_strategy": strat,
                                "grid_size": int(gs),
                                "question_idx": -1,
                                "accuracy": _df_accuracy(gdf),
                            })
                except Exception:
                    # Best effort: a single bad CSV must not sink the scan.
                    pass

    if not rows:
        # Empty frame with the full schema so callers can still select columns.
        return pd.DataFrame(columns=[
            "model", "display_name", "prompt_strategy",
            "grid_size", "question_idx", "accuracy"
        ])
    return pd.DataFrame(rows)
213
+
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # Task 3 results
217
+ # ---------------------------------------------------------------------------
218
+
219
def load_compositional_distance_results(cfg: dict, repo_root: Path) -> pd.DataFrame:
    """
    Return DataFrame with columns:
        model, display_name, prompt_strategy, grid_size, question_idx, accuracy, delta

    ``delta`` is populated only on Q2 rows (Q2 accuracy minus the mean of
    Q0/Q1 accuracies) and is NaN everywhere else. Per directory, a
    summary_stats.json is preferred; raw CSVs are the fallback.
    """
    task = cfg["compositional_distance"]
    base = repo_root / task["output_base"]
    rows = []

    for model_id, model_meta in cfg["models"].items():
        display = model_meta["display_name"]
        for strat, strat_cfg in task["prompt_strategies"].items():
            tag = f"orthogonal_{task['corner_pattern']}_{strat}"
            subdir = (
                base
                / model_id.replace(".", "_").replace("-", "_")
                / tag
            )
            if not subdir.exists():
                continue

            # Prefer summary_stats.json
            stats_file = subdir / "summary_stats.json"
            if stats_file.exists():
                try:
                    with open(stats_file) as f:
                        data = json.load(f)
                    _parse_task3_stats(data, rows, model_id, display, strat)
                    continue
                except Exception:
                    # Fall through to the CSV path on a corrupt stats file.
                    pass

            # Fall back to results.csv
            for csv_file in sorted(subdir.glob("*.csv")):
                try:
                    df = pd.read_csv(csv_file)
                    if "grid_size" not in df.columns:
                        continue
                    q_col = next(
                        (c for c in ("question_idx", "question_index") if c in df.columns),
                        None,
                    )
                    # Note: unlike the Task 2 loader, CSVs without a question
                    # column contribute nothing here (no else-branch).
                    for gs, gdf in df.groupby("grid_size"):
                        if q_col:
                            q_accs = {}
                            for qi, qdf in gdf.groupby(q_col):
                                acc = _df_accuracy(qdf)
                                q_accs[int(qi)] = acc
                                rows.append({
                                    "model": model_id, "display_name": display,
                                    "prompt_strategy": strat,
                                    "grid_size": int(gs),
                                    "question_idx": int(qi),
                                    "accuracy": acc,
                                    "delta": float("nan"),
                                })
                            # Compute delta for Q2 vs avg(Q0, Q1) and backfill
                            # it onto the Q2 row appended above (linear scan
                            # over rows — fine at these result sizes).
                            if 0 in q_accs and 1 in q_accs and 2 in q_accs:
                                delta = q_accs[2] - (q_accs[0] + q_accs[1]) / 2
                                for r in rows:
                                    if (r["model"] == model_id and
                                            r["prompt_strategy"] == strat and
                                            r["grid_size"] == int(gs) and
                                            r["question_idx"] == 2):
                                        r["delta"] = round(delta, 4)
                except Exception:
                    pass

    if not rows:
        # Empty frame with the full schema for downstream consumers.
        return pd.DataFrame(columns=[
            "model", "display_name", "prompt_strategy",
            "grid_size", "question_idx", "accuracy", "delta"
        ])
    return pd.DataFrame(rows)
293
+
294
+
295
+ def _parse_task3_stats(data: dict, rows: list, model_id, display, strat):
296
+ """Parse summary_stats.json for task3."""
297
+ try:
298
+ by_q = data.get("accuracy_by_question", data.get("per_question", {}))
299
+ by_gs = data.get("accuracy_by_grid_size", {})
300
+ for gs_key, gs_data in by_gs.items():
301
+ try:
302
+ gs = int(str(gs_key).replace("x", "").split("_")[0])
303
+ except ValueError:
304
+ continue
305
+ if isinstance(gs_data, dict):
306
+ q_accs = {}
307
+ for qi_key, acc in gs_data.items():
308
+ try:
309
+ qi = int(qi_key)
310
+ q_accs[qi] = float(acc)
311
+ rows.append({
312
+ "model": model_id, "display_name": display,
313
+ "prompt_strategy": strat,
314
+ "grid_size": gs, "question_idx": qi,
315
+ "accuracy": float(acc), "delta": float("nan"),
316
+ })
317
+ except (ValueError, TypeError):
318
+ pass
319
+ if 0 in q_accs and 1 in q_accs and 2 in q_accs:
320
+ delta = q_accs[2] - (q_accs[0] + q_accs[1]) / 2
321
+ for r in rows:
322
+ if (r["model"] == model_id and r["prompt_strategy"] == strat
323
+ and r["grid_size"] == gs and r["question_idx"] == 2):
324
+ r["delta"] = round(delta, 4)
325
+ except Exception:
326
+ pass
327
+
328
+
329
+ # ---------------------------------------------------------------------------
330
+ # Leaderboard aggregators
331
+ # ---------------------------------------------------------------------------
332
+
333
def maze_navigation_leaderboard(df: pd.DataFrame, k_shot: int = 0) -> pd.DataFrame:
    """
    Pivot Task 1 results into a leaderboard table.

    Rows = models, columns = (format × strategy) flattened to
    "<format>_<strategy>", values = mean accuracy at *k_shot*.
    Empty input (or no rows at that k) yields an empty frame.
    """
    if df.empty:
        return pd.DataFrame()
    at_k = df[df["k_shot"] == k_shot]
    if at_k.empty:
        return pd.DataFrame()
    table = at_k.pivot_table(
        index=["display_name"],
        columns=["input_format", "prompt_strategy"],
        values="accuracy",
        aggfunc="mean",
    )
    # Flatten the two-level column index into single "<fmt>_<strat>" labels.
    table.columns = [f"{fmt}_{strat}" for fmt, strat in table.columns]
    table = table.reset_index().rename(columns={"display_name": "Model"})
    return table.round(3)
352
+
353
+
354
def point_reuse_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """
    Task 2 leaderboard: per-model mean accuracy at Q0 and Q3 across all
    grid sizes, plus the Q3-Q0 gap (Q3 repeats Q0's question, so a large
    negative gap means the model failed to reuse its earlier answer).
    """
    if df.empty:
        return pd.DataFrame()

    def _mean_at(q: int, name: str) -> pd.Series:
        return (
            df[df["question_idx"] == q]
            .groupby("display_name")["accuracy"]
            .mean()
            .rename(name)
        )

    out = pd.concat([_mean_at(0, "Q0 acc"), _mean_at(3, "Q3 acc")], axis=1)
    out = out.reset_index().rename(columns={"display_name": "Model"})
    out["Q3-Q0 diff"] = (out["Q3 acc"] - out["Q0 acc"]).round(3)
    return out.round(3)
366
+
367
+
368
def compositional_distance_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """
    Task 3 leaderboard: per-model mean Q0/Q1/Q2 accuracy plus delta
    (Q2 minus the average of Q0 and Q1). Delta is NaN when any of the
    three question accuracies is missing.
    """
    if df.empty:
        return pd.DataFrame()
    records = []
    for name, sub in df.groupby("display_name"):
        accs = {
            q: sub.loc[sub["question_idx"] == q, "accuracy"].mean()
            for q in (0, 1, 2)
        }
        if any(pd.isna(a) for a in accs.values()):
            delta = float("nan")
        else:
            delta = accs[2] - (accs[0] + accs[1]) / 2
        records.append({
            "Model": name,
            "Q0 (A→M)": round(accs[0], 3),
            "Q1 (D→M)": round(accs[1], 3),
            "Q2 (B→C)": round(accs[2], 3),
            "Δ Q2 vs avg(Q0,Q1)": round(delta, 3),
        })
    return pd.DataFrame(records)
388
+
389
+
390
+ # ---------------------------------------------------------------------------
391
+ # Full results loader (called by app.py)
392
+ # ---------------------------------------------------------------------------
393
+
394
def load_all_results(config_path: str | Path) -> dict[str, pd.DataFrame]:
    """Load results for all three tasks. Returns dict of DataFrames keyed
    "maze_navigation", "point_reuse", "compositional_distance"."""
    cfg = load_config(config_path)
    # pipeline/configs/experiments.yaml → llm-maze-solver/ (three levels up).
    # NOTE(review): assumes the config always lives at exactly that depth —
    # confirm against the caller in app.py.
    repo_root = Path(config_path).parent.parent.parent
    return {
        "maze_navigation": load_maze_navigation_results(cfg, repo_root),
        "point_reuse": load_point_reuse_results(cfg, repo_root),
        "compositional_distance": load_compositional_distance_results(cfg, repo_root),
    }
pipeline/task_builder.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ task_builder.py
3
+ ---------------
4
+ Translates experiments.yaml into concrete shell commands (direct or sbatch).
5
+
6
+ Each public function returns a list of ExperimentJob dataclasses, one per
7
+ (model × format × prompt_strategy × grid_sizes) combination. The caller
8
+ decides whether to run them directly or wrap them in sbatch.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import itertools
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import yaml
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Data structures
23
+ # ---------------------------------------------------------------------------
24
+
25
@dataclass
class ExperimentJob:
    """A single runnable experiment unit.

    Produced by the build_* functions; the caller either launches
    python_cmd directly as a subprocess or wraps it in an sbatch script.
    """
    task_id: str            # e.g. "maze_navigation"
    model: str              # e.g. "gemini-2.5-flash"
    label: str              # human-readable label for this job
    working_dir: Path       # where to cd before running
    python_cmd: list[str]   # [python, script.py, --arg, value, ...]
    api_key_env: str        # env-var name that must be set
    output_dir: Path        # where results land
    sbatch_cfg: dict        # mem, time, cpus, partition, log_dir
    grid_sizes: list[int]   # for display / filtering
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Config loader
41
+ # ---------------------------------------------------------------------------
42
+
43
def load_config(config_path: str | Path) -> dict:
    """Parse the experiments YAML config into a plain dict."""
    raw = Path(config_path).read_text()
    return yaml.safe_load(raw)
46
+
47
+
48
+ def _repo_root(config_path: Path) -> Path:
49
+ """pipeline/configs/experiments.yaml → llm-maze-solver/"""
50
+ return config_path.parent.parent.parent
51
+
52
+
53
+ # ---------------------------------------------------------------------------
54
+ # Internal helpers
55
+ # ---------------------------------------------------------------------------
56
+
57
+ def _merge_sbatch(defaults: dict, override: dict) -> dict:
58
+ merged = dict(defaults)
59
+ merged.update(override)
60
+ return merged
61
+
62
+
63
+ def _grid_str(grid_sizes: list[int]) -> str:
64
+ return ",".join(str(g) for g in grid_sizes)
65
+
66
+
67
+ def _output_subdir(base: str, model: str, tag: str) -> str:
68
+ """Produce a deterministic output subdirectory path."""
69
+ return f"{base}/{model.replace('.', '_').replace('-', '_')}/{tag}"
70
+
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Maze Navigation
74
+ # ---------------------------------------------------------------------------
75
+
76
def build_maze_navigation_jobs(
    cfg: dict,
    models: list[str] | None = None,
    grid_sizes: list[int] | None = None,
    input_formats: list[str] | None = None,
    prompt_strategies: list[str] | None = None,
    config_path: Path | None = None,
) -> list[ExperimentJob]:
    """Build jobs for Maze Navigation (planning, k-shot).

    One ExperimentJob per (model × input format × prompt strategy);
    grid sizes go to the script as a single comma-separated argument.
    Model ids absent from cfg["models"] are skipped silently so callers
    may pass a superset.
    """
    task = cfg["maze_navigation"]
    defaults = cfg["defaults"]
    model_cfg = cfg["models"]

    selected_models = models or list(model_cfg.keys())
    selected_formats = input_formats or task["input_formats"]
    selected_strategies = prompt_strategies or list(task["prompt_strategies"].keys())
    selected_grids = grid_sizes or task["grid_sizes"]

    # Resolve repo-relative paths; default to CWD when no config path given.
    repo = _repo_root(config_path) if config_path else Path(".")
    script = repo / task["script"]
    wdir = repo / task["working_dir"]

    # experiments.yaml may declare k_shots as a YAML list (e.g. [0, 1, 2]);
    # every argv element must be a string, so normalise here. The original
    # passed the raw value straight into the argv list, which makes
    # subprocess fail for list-valued configs.
    k_shots = task["k_shots"]
    if isinstance(k_shots, (list, tuple)):
        k_shots_arg = ",".join(str(k) for k in k_shots)
    else:
        k_shots_arg = str(k_shots)

    jobs: list[ExperimentJob] = []

    for model, fmt, strat in itertools.product(
        selected_models, selected_formats, selected_strategies
    ):
        if model not in model_cfg:
            continue
        strat_cfg = task["prompt_strategies"][strat]
        tag = f"{fmt}_input_{strat}"
        out_dir = repo / _output_subdir(task["output_base"], model, tag)

        cmd = [
            "python", str(script),
            "--model_name", model,
            "--input_format", fmt,
            "--k_shots", k_shots_arg,
            "--n_test_mazes", str(defaults["n_test_mazes"]),
            "--test_grid_sizes", _grid_str(selected_grids),
            "--maze_type", task["maze_type"],
            "--seed", str(defaults["seed"]),
            "--output_dir", str(out_dir),
        ]
        # Strategy-specific CLI flags (e.g. --cot) come straight from config.
        cmd.extend(strat_cfg["flags"])
        if task.get("visualize"):
            cmd.append("--visualize")

        jobs.append(ExperimentJob(
            task_id="maze_navigation",
            model=model,
            label=f"Maze Navigation | {model} | {fmt} | {strat}",
            working_dir=wdir,
            python_cmd=cmd,
            api_key_env=model_cfg[model]["api_key_env"],
            output_dir=out_dir,
            sbatch_cfg=_merge_sbatch(defaults["sbatch"], task.get("sbatch", {})),
            grid_sizes=selected_grids,
        ))

    return jobs
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Sequential Reasoning with Point Reuse (Q3 = Q0)
142
+ # ---------------------------------------------------------------------------
143
+
144
def build_point_reuse_jobs(
    cfg: dict,
    models: list[str] | None = None,
    grid_sizes: list[int] | None = None,
    prompt_strategies: list[str] | None = None,
    config_path: Path | None = None,
) -> list[ExperimentJob]:
    """Build jobs for Sequential Reasoning with Point Reuse (Q3=Q0).

    One ExperimentJob per (model × prompt strategy); model ids absent
    from cfg["models"] are skipped silently.
    """
    task = cfg["point_reuse"]
    defaults = cfg["defaults"]
    model_cfg = cfg["models"]

    selected_models = models or list(model_cfg.keys())
    selected_strategies = prompt_strategies or list(task["prompt_strategies"].keys())
    selected_grids = grid_sizes or task["grid_sizes"]

    # Resolve repo-relative paths; default to CWD when no config path given.
    repo = _repo_root(config_path) if config_path else Path(".")
    script = repo / task["script"]
    wdir = repo / task["working_dir"]

    jobs: list[ExperimentJob] = []

    for model, strat in itertools.product(selected_models, selected_strategies):
        if model not in model_cfg:
            continue
        strat_cfg = task["prompt_strategies"][strat]
        tag = f"point_reuse_q3q0_{strat}"
        out_dir = repo / _output_subdir(task["output_base"], model, tag)

        cmd = [
            "python", str(script),
            "--model_name", model,
            "--input_format", task["input_format"],
            "--strategy", task["strategy"],
            "--reuse_pattern", task["reuse_pattern"],
            "--prompt_type", strat_cfg["prompt_type"],
            "--n_questions_per_maze", str(task["n_questions_per_maze"]),
            "--n_test_mazes", str(defaults["n_test_mazes"]),
            "--test_grid_sizes", _grid_str(selected_grids),
            "--output_dir", str(out_dir),
        ]
        # Boolean config switches become bare CLI flags.
        if task.get("sequential_questions"):
            cmd.append("--sequential_questions")
        if task.get("visualize"):
            cmd.append("--visualize")
        if task.get("save_details"):
            cmd.append("--save_details")

        jobs.append(ExperimentJob(
            task_id="point_reuse",
            model=model,
            label=f"Point Reuse | {model} | {strat}",
            working_dir=wdir,
            python_cmd=cmd,
            api_key_env=model_cfg[model]["api_key_env"],
            output_dir=out_dir,
            sbatch_cfg=_merge_sbatch(defaults["sbatch"], task.get("sbatch", {})),
            grid_sizes=selected_grids,
        ))

    return jobs
205
+
206
+
207
+ # ---------------------------------------------------------------------------
208
+ # Compositional Distance Comparison
209
+ # ---------------------------------------------------------------------------
210
+
211
def build_compositional_distance_jobs(
    cfg: dict,
    models: list[str] | None = None,
    grid_sizes: list[int] | None = None,
    prompt_strategies: list[str] | None = None,
    config_path: Path | None = None,
) -> list[ExperimentJob]:
    """Build jobs for Compositional Distance Comparison (corners-to-center).

    Args:
        cfg: Parsed experiments.yaml contents (needs "compositional_distance",
            "defaults" and "models" sections).
        models: Model IDs to run; defaults to every model in cfg["models"].
        grid_sizes: Grid sizes to test; defaults to the task's configured list.
        prompt_strategies: Strategy names to run; defaults to all configured.
        config_path: Path to the YAML file, used to resolve the repo root.
            When None, paths are resolved relative to the current directory.

    Returns:
        One ExperimentJob per (model, strategy) combination.
    """
    task = cfg["compositional_distance"]
    defaults = cfg["defaults"]
    model_cfg = cfg["models"]

    selected_models = models or list(model_cfg.keys())
    selected_strategies = prompt_strategies or list(task["prompt_strategies"].keys())
    selected_grids = grid_sizes or task["grid_sizes"]

    repo = _repo_root(config_path) if config_path else Path(".")
    script = repo / task["script"]
    wdir = repo / task["working_dir"]

    jobs: list[ExperimentJob] = []

    for model, strat in itertools.product(selected_models, selected_strategies):
        # Skip names not present in the config (consistent for both models
        # and strategies — previously an unknown strategy raised KeyError).
        if model not in model_cfg:
            continue
        if strat not in task["prompt_strategies"]:
            continue
        strat_cfg = task["prompt_strategies"][strat]
        tag = f"orthogonal_{task['corner_pattern']}_{strat}"
        out_dir = repo / _output_subdir(task["output_base"], model, tag)

        cmd = [
            "python", str(script),
            "--model_name", model,
            "--input_format", task["input_format"],
            "--strategy", task["strategy"],
            "--corner_pattern", task["corner_pattern"],
            "--prompt_type", strat_cfg["prompt_type"],
            "--n_questions_per_maze", str(task["n_questions_per_maze"]),
            "--n_test_mazes", str(defaults["n_test_mazes"]),
            "--test_grid_sizes", _grid_str(selected_grids),
            "--output_dir", str(out_dir),
        ]
        # Optional boolean flags, driven by the task config.
        if task.get("visualize"):
            cmd.append("--visualize")
        if task.get("save_details"):
            cmd.append("--save_details")

        jobs.append(ExperimentJob(
            task_id="compositional_distance",
            model=model,
            label=f"Compositional Distance | {model} | {strat}",
            working_dir=wdir,
            python_cmd=cmd,
            api_key_env=model_cfg[model]["api_key_env"],
            output_dir=out_dir,
            sbatch_cfg=_merge_sbatch(defaults["sbatch"], task.get("sbatch", {})),
            grid_sizes=selected_grids,
        ))

    return jobs
270
+
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # Unified builder
274
+ # ---------------------------------------------------------------------------
275
+
276
def build_all_jobs(
    cfg: dict,
    tasks: list[str] | None = None,
    models: list[str] | None = None,
    grid_sizes: list[int] | None = None,
    input_formats: list[str] | None = None,
    prompt_strategies: list[str] | None = None,
    config_path: Path | None = None,
) -> list[ExperimentJob]:
    """Build jobs for all requested tasks.

    Args:
        cfg: Parsed experiments.yaml contents.
        tasks: Task IDs to build; defaults to all three tasks.
        models, grid_sizes, prompt_strategies, config_path: Shared filters,
            forwarded to every per-task builder.
        input_formats: Input formats ("raw"/"visual"); only maze_navigation
            uses this filter.

    Returns:
        The concatenated job list, in task order.
    """
    selected_tasks = tasks or ["maze_navigation", "point_reuse", "compositional_distance"]
    jobs: list[ExperimentJob] = []
    # Shared keyword arguments for every per-task builder.
    kw = dict(
        models=models,
        grid_sizes=grid_sizes,
        prompt_strategies=prompt_strategies,
        config_path=config_path,
    )
    if "maze_navigation" in selected_tasks:
        jobs += build_maze_navigation_jobs(cfg, input_formats=input_formats, **kw)
    if "point_reuse" in selected_tasks:
        jobs += build_point_reuse_jobs(cfg, **kw)
    if "compositional_distance" in selected_tasks:
        jobs += build_compositional_distance_jobs(cfg, **kw)
    return jobs
301
+
302
+
303
+ # ---------------------------------------------------------------------------
304
+ # sbatch script generator
305
+ # ---------------------------------------------------------------------------
306
+
307
def make_sbatch_script(job: ExperimentJob, log_dir: Path) -> str:
    """Return the text of an sbatch submission script for a job.

    Creates *log_dir* if needed. The script's -o/-e log files are named
    ``{safe_label}_%j.out`` / ``.err`` (``%j`` is SLURM's job-id pattern).
    The working directory and every command token are shell-quoted so
    paths or arguments containing spaces cannot break the script.
    """
    import shlex  # local: only needed when generating scripts

    s = job.sbatch_cfg
    log_dir.mkdir(parents=True, exist_ok=True)
    # Filesystem-safe version of the human-readable label.
    safe_label = job.label.replace(" ", "_").replace("|", "").replace("/", "_")

    lines = [
        "#!/bin/bash",
        f"#SBATCH -c {s.get('cpus', 2)}",
        f"#SBATCH -t {s.get('time', '10:00:00')}",
        f"#SBATCH -p {s.get('partition', 'short')}",
        f"#SBATCH --mem={s.get('mem', '8G')}",
        f"#SBATCH -o {log_dir}/{safe_label}_%j.out",
        f"#SBATCH -e {log_dir}/{safe_label}_%j.err",
        "",
        f"# {job.label}",
        # Re-export the key inside the batch shell (sbatch propagates the
        # submitting environment, but this makes the dependency explicit).
        f"export {job.api_key_env}=${{{job.api_key_env}}}",
        "",
        f"cd {shlex.quote(str(job.working_dir))}",
        # One quoted token per line-continuation for readability.
        " \\\n    ".join(shlex.quote(str(tok)) for tok in job.python_cmd),
    ]
    return "\n".join(lines) + "\n"
requirements.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SpatialBench — pipeline dependencies
2
+ # Install with: pip install -r requirements.txt
3
+
4
+ # Gradio UI (HuggingFace Space entrypoint)
5
+ gradio>=4.20.0
6
+
7
+ # Plotting
8
+ plotly>=5.18.0
9
+
10
+ # Data
11
+ pandas>=2.0.0
12
+ numpy>=1.24.0
13
+
14
+ # Config parsing
15
+ PyYAML>=6.0
16
+
17
+ # LLM API clients
18
+ openai>=1.14.0
19
+ anthropic>=0.25.0
20
+ google-generativeai>=0.5.0
21
+
22
+ # (DeepSeek uses the OpenAI-compatible client — no extra package needed)
23
+
24
+ # Sentence embeddings for reasoning quality analysis
25
+ sentence-transformers>=2.6.0
26
+
27
+ # ROUGE for reasoning quality analysis
28
+ rouge-score>=0.1.2
29
+
30
+ # Environment variable loading
31
+ python-dotenv>=1.0.0
run_experiments.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ run_experiments.py
4
+ ------------------
5
+ CLI orchestrator for SpatialBench experiments.
6
+
7
+ Run on the cluster with SLURM:
8
+ python run_experiments.py --tasks maze_navigation point_reuse compositional_distance --mode slurm
9
+
10
+ Run directly (uses API keys, no SLURM required):
11
+ python run_experiments.py --tasks maze_navigation --models gemini-2.5-flash --mode direct
12
+
13
+ Dry-run (print commands without executing):
14
+ python run_experiments.py --tasks maze_navigation point_reuse compositional_distance --dry-run
15
+
16
+ Filter experiments:
17
+ python run_experiments.py --tasks maze_navigation \\
18
+ --models gemini-2.5-flash claude-haiku-4-5 \\
19
+ --grid-sizes 5 6 7 \\
20
+ --formats raw \\
21
+ --strategies cot reasoning
22
+
23
+ Show status of running SLURM jobs:
24
+ python run_experiments.py --status
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import argparse
30
+ import json
31
+ import os
32
+ import subprocess
33
+ import sys
34
+ import tempfile
35
+ import time
36
+ from pathlib import Path
37
+
38
+ # Load .env if present (before importing pipeline modules)
39
+ _env_file = Path(__file__).parent / ".env"
40
+ if _env_file.exists():
41
+ with open(_env_file) as _f:
42
+ for _line in _f:
43
+ _line = _line.strip()
44
+ if _line and not _line.startswith("#") and "=" in _line:
45
+ _k, _v = _line.split("=", 1)
46
+ os.environ.setdefault(_k.strip(), _v.strip())
47
+
48
+ from pipeline.task_builder import (
49
+ load_config, build_all_jobs, make_sbatch_script, ExperimentJob,
50
+ )
51
+ from pipeline.job_monitor import JobMonitor, submit_sbatch, submit_direct
52
+
53
+ CONFIG_PATH = Path(__file__).parent / "configs" / "experiments.yaml"
54
+ REPO_ROOT = CONFIG_PATH.parent.parent.parent # llm-maze-solver/
55
+
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Helpers
59
+ # ---------------------------------------------------------------------------
60
+
61
+ def _check_api_key(job: ExperimentJob) -> bool:
62
+ val = os.environ.get(job.api_key_env, "")
63
+ if not val:
64
+ print(f" [WARN] {job.api_key_env} not set — skipping: {job.label}")
65
+ return False
66
+ return True
67
+
68
+
69
+ def _print_job(job: ExperimentJob) -> None:
70
+ print(f"\n {job.label}")
71
+ print(f" cmd : {' '.join(job.python_cmd[:4])} ...")
72
+ print(f" wdir: {job.working_dir}")
73
+ print(f" out : {job.output_dir}")
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Run modes
78
+ # ---------------------------------------------------------------------------
79
+
80
def run_slurm(jobs: list[ExperimentJob], monitor: JobMonitor, dry_run: bool) -> None:
    """Submit each job via sbatch and register it with *monitor*.

    With *dry_run* the generated sbatch script is printed instead of
    submitted. Jobs whose API-key env var is unset are skipped.
    """
    log_dir = REPO_ROOT / "maze-solver" / "eval_llm_logs"
    log_dir.mkdir(parents=True, exist_ok=True)

    for job in jobs:
        if not _check_api_key(job):
            continue
        script_text = make_sbatch_script(job, log_dir)
        if dry_run:
            _print_job(job)
            print(" --- sbatch script ---")
            print(script_text)
            continue

        # Persist the script (delete=False) so SLURM can read it after
        # submission and the file remains for debugging.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".sh", prefix="spatialbench_",
            dir=log_dir, delete=False
        ) as tmp:
            tmp.write(script_text)
            script_path = tmp.name

        job_id = submit_sbatch(script_text, script_path)
        if job_id:
            # BUGFIX: register log paths matching the #SBATCH -o/-e pattern
            # in make_sbatch_script ({safe_label}_%j.out) — previously the
            # monitor was pointed at bare {job_id}.out files that SLURM
            # never writes.
            safe_label = job.label.replace(" ", "_").replace("|", "").replace("/", "_")
            monitor.add(
                job_id=job_id,
                label=job.label,
                task_id=job.task_id,
                model=job.model,
                output_dir=str(job.output_dir),
                log_out=str(log_dir / f"{safe_label}_{job_id}.out"),
                log_err=str(log_dir / f"{safe_label}_{job_id}.err"),
            )
            print(f" Submitted {job.label} → SLURM job {job_id}")
        else:
            print(f" [ERROR] Failed to submit: {job.label}")
115
+
116
+
117
def run_direct(jobs: list[ExperimentJob], monitor: JobMonitor, dry_run: bool) -> None:
    """Launch each job as a local subprocess (no SLURM) and track it."""
    for job in jobs:
        if not _check_api_key(job):
            continue
        if dry_run:
            _print_job(job)
            print(f" cmd: {' '.join(job.python_cmd)}\n")
            continue

        job.output_dir.mkdir(parents=True, exist_ok=True)
        key_env = {job.api_key_env: os.environ[job.api_key_env]}

        print(f" Starting: {job.label}")
        child = submit_direct(
            cmd=job.python_cmd,
            working_dir=str(job.working_dir),
            env=key_env,
        )
        monitor.add_direct(
            proc=child,
            label=job.label,
            task_id=job.task_id,
            model=job.model,
            output_dir=str(job.output_dir),
        )
        # Small gap to avoid hammering APIs simultaneously
        time.sleep(2)
144
+
145
+
146
+ # ---------------------------------------------------------------------------
147
+ # Status display
148
+ # ---------------------------------------------------------------------------
149
+
150
def show_status(monitor: JobMonitor) -> None:
    """Refresh job states and print an aggregate plus per-job report."""
    monitor.refresh()
    summary = monitor.summary()
    report = [f"\nTotal jobs: {summary['total']}"]
    report.extend(
        f" {status:12s}: {count}" for status, count in summary["counts"].items()
    )
    report.append("")
    report.extend(
        f" [{rec['status']:9s}] {rec['label']:<60s} elapsed: {rec['elapsed']}"
        for rec in summary["records"]
    )
    print("\n".join(report))
159
+
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # Argument parsing
163
+ # ---------------------------------------------------------------------------
164
+
165
def parse_args() -> argparse.Namespace:
    """Parse the orchestrator's command-line arguments.

    Returns a Namespace with: tasks, models, grid_sizes, formats,
    strategies, mode, dry_run, no_wait, status, job_ids, config,
    poll_interval. See the module docstring (used as --help epilog)
    for worked examples.
    """
    parser = argparse.ArgumentParser(
        description="SpatialBench experiment orchestrator",
        # Raw formatter so the module docstring's example commands keep
        # their line breaks in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # --- experiment selection -------------------------------------------
    parser.add_argument(
        "--tasks", nargs="+",
        default=["maze_navigation", "point_reuse", "compositional_distance"],
        choices=["maze_navigation", "point_reuse", "compositional_distance"],
        help="Which tasks to run (default: all three)",
    )
    parser.add_argument(
        "--models", nargs="+", default=None,
        help="Model IDs to run (default: all models in config)",
    )
    parser.add_argument(
        "--grid-sizes", nargs="+", type=int, default=None,
        dest="grid_sizes",
        help="Grid sizes to evaluate, e.g. --grid-sizes 5 6 7 (default: per-task config)",
    )
    parser.add_argument(
        "--formats", nargs="+", default=None,
        choices=["raw", "visual"],
        help="Input formats for Task 1 (default: both raw and visual)",
    )
    parser.add_argument(
        "--strategies", nargs="+", default=None,
        choices=["base", "cot", "reasoning"],
        help="Prompt strategies (default: all)",
    )

    # --- execution control ----------------------------------------------
    parser.add_argument(
        "--mode", default="slurm", choices=["slurm", "direct"],
        help="Execution mode: 'slurm' submits sbatch jobs, 'direct' runs inline (default: slurm)",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Print commands without executing them",
    )
    parser.add_argument(
        "--no-wait", action="store_true",
        help="Return immediately after submission (don't poll for completion)",
    )

    # --- status / monitoring --------------------------------------------
    parser.add_argument(
        "--status", action="store_true",
        help="Query and display SLURM job status (requires --job-ids or a running monitor)",
    )
    parser.add_argument(
        "--job-ids", nargs="+", default=None,
        help="SLURM job IDs to check status for (used with --status)",
    )
    parser.add_argument(
        "--config", default=str(CONFIG_PATH),
        help=f"Path to experiments.yaml (default: {CONFIG_PATH})",
    )
    parser.add_argument(
        "--poll-interval", type=int, default=60,
        dest="poll_interval",
        help="Seconds between SLURM status polls when waiting (default: 60)",
    )

    return parser.parse_args()
228
+
229
+
230
+ # ---------------------------------------------------------------------------
231
+ # Main
232
+ # ---------------------------------------------------------------------------
233
+
234
def main() -> None:
    """Entry point: parse args, build jobs, submit, optionally wait.

    Flow: (1) --status short-circuits into a query-only mode;
    (2) otherwise build the filtered job list, run it in slurm or
    direct mode, and — unless --dry-run/--no-wait — poll the monitor
    until every job finishes, then print a pass/fail summary.
    """
    args = parse_args()
    cfg = load_config(args.config)

    # Status-only mode: no jobs are built or submitted.
    if args.status:
        monitor = JobMonitor(mode="slurm")
        if args.job_ids:
            # Seed the monitor with externally supplied SLURM job IDs;
            # task/model are unknown here, hence the "?" placeholders.
            for jid in args.job_ids:
                monitor.add(job_id=jid, label=jid, task_id="?", model="?")
        show_status(monitor)
        return

    # Build jobs from the config, applying all CLI filters.
    jobs = build_all_jobs(
        cfg=cfg,
        tasks=args.tasks,
        models=args.models,
        grid_sizes=args.grid_sizes,
        input_formats=args.formats,
        prompt_strategies=args.strategies,
        config_path=Path(args.config),
    )

    if not jobs:
        print("No jobs matched the requested filters.")
        return

    # Echo the effective run configuration before submitting anything.
    print(f"\nSpatialBench — {len(jobs)} job(s) to run")
    print(f" mode : {args.mode}")
    print(f" tasks : {args.tasks}")
    print(f" models : {args.models or 'all'}")
    print(f" grids : {args.grid_sizes or 'per-task default'}")
    print(f" formats : {args.formats or 'per-task default'}")
    print(f" strategies: {args.strategies or 'all'}")
    print(f" dry-run : {args.dry_run}")
    print()

    monitor = JobMonitor(mode=args.mode)

    if args.mode == "slurm":
        run_slurm(jobs, monitor, dry_run=args.dry_run)
    else:
        run_direct(jobs, monitor, dry_run=args.dry_run)

    # Fire-and-forget paths: nothing was submitted on a dry run, so only
    # the --no-wait case prints the submitted count.
    if args.dry_run or args.no_wait:
        if not args.dry_run:
            print(f"\nSubmitted {len(monitor.all_records())} job(s). Use --status to check progress.")
        return

    # Wait for completion, printing a status-count line on each poll.
    print("\nWaiting for jobs to complete...")

    def _progress(summary: dict) -> None:
        # Callback invoked by the monitor on every poll tick.
        counts = summary["counts"]
        parts = [f"{s}: {n}" for s, n in counts.items()]
        print(f" [{time.strftime('%H:%M:%S')}] {' | '.join(parts)}")

    monitor.wait_all(poll_interval=args.poll_interval, callback=_progress)

    # Final summary
    summary = monitor.summary()
    print(f"\nDone. {summary['counts'].get('completed', 0)} completed, "
          f"{summary['counts'].get('failed', 0)} failed.")

    failed = [r for r in summary["records"] if r["status"] == "failed"]
    if failed:
        print("\nFailed jobs:")
        for r in failed:
            print(f" {r['label']} (job_id={r['job_id']})")


if __name__ == "__main__":
    main()