SpatialBench / app.py
weijiang99's picture
Update SpatialBench pipeline
cb5acaf verified
"""
app.py — SpatialBench Gradio application
-----------------------------------------
Entrypoint for the HuggingFace Space "SpatialBench".
Three tabs:
  1. Leaderboard — visualize pre-computed results from all three tasks
  2. Get Scripts — generate ready-to-run SLURM scripts (or plain shell
     scripts) as a downloadable zip; no compute needed here
  3. About — paper info and citation
To run locally:
cd pipeline/
python app.py
To deploy on HuggingFace Spaces:
- No secrets required for the Leaderboard or Get Scripts tabs.
- The Space entrypoint is this file (app.py).
"""
from __future__ import annotations

import logging
import os
import shlex
import sys
import tempfile
import zipfile
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# Load .env if running locally
def _load_dotenv(path: Path) -> None:
    """Populate ``os.environ`` from a simple ``KEY=VALUE`` file at *path*.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored.  Only the first ``=`` separates key from value, and both sides
    are whitespace-stripped.  Variables that are already set are left
    untouched (``setdefault``), so the real environment wins over the file.
    A missing file is a no-op.
    """
    if not path.exists():
        return
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, value = entry.split("=", 1)
            key = key.strip()
            if not key:
                # A line such as "=value" has no variable name; an empty
                # name is rejected by the OS on common platforms, which
                # would abort start-up, so skip it.
                continue
            os.environ.setdefault(key, value.strip())


_env = Path(__file__).parent / ".env"
_load_dotenv(_env)
# Add repo root to path so pipeline imports work
sys.path.insert(0, str(Path(__file__).parent))
from pipeline.task_builder import load_config, build_all_jobs, make_sbatch_script
from pipeline.results_loader import (
load_all_results,
maze_navigation_leaderboard,
point_reuse_leaderboard,
compositional_distance_leaderboard,
)
# ---------------------------------------------------------------------------
# Paths / config
# ---------------------------------------------------------------------------
# The experiment configuration sits in configs/ next to this file.
CONFIG_PATH = Path(__file__).parent.joinpath("configs", "experiments.yaml")
# Parsed once at import time and shared by every tab.
CFG = load_config(CONFIG_PATH)
# Model identifiers offered in the UI: the keys of the config's "models" mapping.
MODEL_CHOICES = [*CFG["models"].keys()]
# ---------------------------------------------------------------------------
# Leaderboard helpers
# ---------------------------------------------------------------------------
def _load_results():
    """Load every task's results, degrading to empty tables on failure.

    Returns whatever ``load_all_results(CONFIG_PATH)`` returns.  If that call
    raises, the error is logged with its traceback and a dict holding one
    empty ``DataFrame`` per task key is returned instead, so the plot
    helpers can still show their "no results" placeholders.
    """
    try:
        return load_all_results(CONFIG_PATH)
    except Exception:
        # Deliberate best-effort boundary for the UI, but never silent:
        # a broken or missing results file should show up in the logs.
        logging.getLogger(__name__).exception(
            "Could not load results; showing an empty leaderboard"
        )
        return {
            "maze_navigation": pd.DataFrame(),
            "point_reuse": pd.DataFrame(),
            "compositional_distance": pd.DataFrame(),
        }
def _make_empty_fig(msg: str) -> go.Figure:
    """Return a blank, axis-less figure showing *msg* in the middle.

    Used as a placeholder when there is nothing to plot.  Newline characters
    in *msg* are converted to ``<br>`` because Plotly text only breaks lines
    on that HTML-style tag.
    """
    fig = go.Figure()
    fig.add_annotation(
        text=msg.replace("\n", "<br>"),
        # Paper coordinates: (0.5, 0.5) is the centre of the plotting area.
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        showarrow=False,
        font=dict(size=16),
    )
    fig.update_layout(
        xaxis_visible=False,
        yaxis_visible=False,
        height=300,
        # Fully transparent backgrounds.
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
    )
    return fig
# ── Task 1 plots ─────────────────────────────────────────────────────────────
def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]:
    """Plot Task 1 accuracy against grid size for one k-shot / format setting.

    Returns ``(figure, leaderboard)``.  The figure has one line per model
    (colour) and prompt strategy (dash style), drawn from the rows matching
    *k_shot* and *input_format*.  The leaderboard comes from
    ``maze_navigation_leaderboard`` called on all loaded Task 1 rows with
    ``k_shot=k_shot``.  When there is nothing to show, a placeholder figure
    and an empty ``DataFrame`` are returned.
    """
    df = _load_results()["maze_navigation"]
    if df.empty:
        return _make_empty_fig("No Task 1 results found.\nRun experiments first."), pd.DataFrame()

    matches = (df["k_shot"] == k_shot) & (df["input_format"] == input_format)
    # px.line joins points in row order, so order the rows along the x axis;
    # a stable sort keeps the relative order of rows that tie on grid_size.
    sub = df[matches].sort_values("grid_size", kind="stable")
    if sub.empty:
        return _make_empty_fig(f"No results for k={k_shot}, format={input_format}"), pd.DataFrame()

    fig = px.line(
        sub,
        x="grid_size",
        y="accuracy",
        color="display_name",
        line_dash="prompt_strategy",
        markers=True,
        labels={
            "grid_size": "Grid Size (n×n)",
            "accuracy": "Accuracy",
            "display_name": "Model",
            "prompt_strategy": "Strategy",
        },
        title=f"Task 1 — Maze Navigation ({input_format} format, {k_shot}-shot)",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    fig.update_layout(
        yaxis_range=[0, 1],
        # Horizontal legend placed just above the plot, right-aligned.
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        height=420,
    )
    lb = maze_navigation_leaderboard(df, k_shot=k_shot)
    return fig, lb
def plot_task1_format_comparison() -> go.Figure:
    """Bar chart of mean 0-shot Task 1 accuracy per model, raw vs visual.

    Chain-of-thought rows are preferred; if there are none, all 0-shot rows
    are used and the title says so.  Accuracy is averaged per
    (model, input format) over whatever rows remain.  Returns a placeholder
    figure when there is no data.
    """
    df = _load_results()["maze_navigation"]
    if df.empty:
        return _make_empty_fig("No Task 1 results found.")

    zero_shot = df[df["k_shot"] == 0]
    sub = zero_shot[zero_shot["prompt_strategy"] == "cot"]
    strategy_note = "CoT"
    if sub.empty:
        # Fall back to every strategy, and keep the title truthful about it.
        sub = zero_shot
        strategy_note = "all strategies"
    if sub.empty:
        return _make_empty_fig("No 0-shot Task 1 results found.")

    agg = sub.groupby(["display_name", "input_format"])["accuracy"].mean().reset_index()
    fig = px.bar(
        agg,
        x="display_name",
        y="accuracy",
        color="input_format",
        barmode="group",
        labels={
            "display_name": "Model",
            "accuracy": "Mean Accuracy",
            "input_format": "Input Format",
        },
        title=f"Task 1 — Raw vs Visual Format (0-shot, {strategy_note}, averaged over grid sizes)",
        color_discrete_map={"raw": "#2196F3", "visual": "#FF9800"},
    )
    fig.update_layout(yaxis_range=[0, 1], height=380)
    return fig
# ── Task 2 plots ─────────────────────────────────────────────────────────────
def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]:
    """Compare per-model Q0 and Q3 accuracy on Task 2 for one grid size.

    Returns ``(figure, leaderboard)``.  The figure is a grouped bar chart of
    mean accuracy on question index 0 and question index 3 for rows with the
    given *grid_size*.  The leaderboard comes from
    ``point_reuse_leaderboard`` called on all loaded Task 2 rows.  When
    there is nothing to show, a placeholder figure and an empty
    ``DataFrame`` are returned.
    """
    df = _load_results()["point_reuse"]
    if df.empty:
        return _make_empty_fig("No Task 2 results found.\nRun experiments first."), pd.DataFrame()

    sub = df[df["grid_size"] == grid_size]
    if sub.empty:
        return _make_empty_fig(f"No Task 2 results for {grid_size}×{grid_size}"), pd.DataFrame()

    q0 = sub[sub["question_idx"] == 0].groupby("display_name")["accuracy"].mean().rename("Q0")
    q3 = sub[sub["question_idx"] == 3].groupby("display_name")["accuracy"].mean().rename("Q3")
    # Align both series on the model name, then go long-form for px.bar.
    plot_df = pd.concat([q0, q3], axis=1).reset_index()
    plot_df_melt = plot_df.melt(id_vars="display_name", var_name="Question", value_name="Accuracy")

    fig = px.bar(
        plot_df_melt,
        x="display_name",
        y="Accuracy",
        color="Question",
        barmode="group",
        labels={"display_name": "Model"},
        # Plotly titles break lines on <br>, not on newline characters.
        title=(
            f"Task 2 — Q0 vs Q3 Accuracy ({grid_size}×{grid_size} maze)<br>"
            "Q3 = Q0 (same question repeated — tests information reuse)"
        ),
        color_discrete_map={"Q0": "#4CAF50", "Q3": "#F44336"},
    )
    fig.update_layout(yaxis_range=[0, 1], height=400)
    lb = point_reuse_leaderboard(df)
    return fig, lb
def plot_task2_by_grid() -> go.Figure:
    """Line chart of mean Task 2 Q3 accuracy per model across grid sizes.

    Only rows with question index 3 are used, averaged per
    (model, grid size).  Returns a placeholder figure when no Task 2
    results are loaded.
    """
    df = _load_results()["point_reuse"]
    if df.empty:
        return _make_empty_fig("No Task 2 results found.")

    q3 = (
        df[df["question_idx"] == 3]
        .groupby(["display_name", "grid_size"])["accuracy"]
        .mean()
        .reset_index()
    )
    fig = px.line(
        q3,
        x="grid_size",
        y="accuracy",
        color="display_name",
        markers=True,
        labels={
            "grid_size": "Grid Size",
            "accuracy": "Q3 Accuracy",
            "display_name": "Model",
        },
        title="Task 2 — Q3 Accuracy by Grid Size (Q3 = Q0 repeated)",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    fig.update_layout(yaxis_range=[0, 1], height=380)
    return fig
# ── Task 3 plots ─────────────────────────────────────────────────────────────
def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]:
    """Grouped bar chart of Task 3 accuracy per model and question.

    Returns ``(figure, leaderboard)``.  Accuracy is averaged per
    (model, question index); indices 0, 1 and 2 are given readable labels.
    The leaderboard comes from ``compositional_distance_leaderboard`` called
    on all loaded Task 3 rows.  When no Task 3 results are loaded, a
    placeholder figure and an empty ``DataFrame`` are returned.
    """
    df = _load_results()["compositional_distance"]
    if df.empty:
        return _make_empty_fig("No Task 3 results found.\nRun experiments first."), pd.DataFrame()

    agg = df.groupby(["display_name", "question_idx"])["accuracy"].mean().reset_index()
    q_labels = {0: "Q0: A→M", 1: "Q1: D→M", 2: "Q2: B→C (compositional)"}
    agg["Question"] = agg["question_idx"].map(q_labels)

    fig = px.bar(
        agg,
        x="display_name",
        y="accuracy",
        color="Question",
        barmode="group",
        labels={"display_name": "Model", "accuracy": "Accuracy"},
        # Plotly titles break lines on <br>, not on newline characters.
        title=(
            "Task 3 — Compositional Distance Comparison<br>"
            "Q2 can be composed from Q0+Q1 (corner→center distances)"
        ),
        color_discrete_map={
            q_labels[0]: "#2196F3",
            q_labels[1]: "#9C27B0",
            q_labels[2]: "#FF5722",
        },
    )
    fig.update_layout(yaxis_range=[0, 1], height=420)
    lb = compositional_distance_leaderboard(df)
    return fig, lb
def plot_task3_by_grid() -> go.Figure:
    """Line chart of mean Task 3 Q2 accuracy per model across grid sizes.

    Only rows with question index 2 are used, averaged per
    (model, grid size).  Returns a placeholder figure when no Task 3
    results are loaded.
    """
    df = _load_results()["compositional_distance"]
    if df.empty:
        return _make_empty_fig("No Task 3 results found.")

    q2 = (
        df[df["question_idx"] == 2]
        .groupby(["display_name", "grid_size"])["accuracy"]
        .mean()
        .reset_index()
    )
    fig = px.line(
        q2,
        x="grid_size",
        y="accuracy",
        color="display_name",
        markers=True,
        labels={
            "grid_size": "Grid Size",
            "accuracy": "Q2 Accuracy",
            "display_name": "Model",
        },
        title="Task 3 — Q2 (Compositional) Accuracy by Grid Size",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    fig.update_layout(yaxis_range=[0, 1], height=380)
    return fig
# ---------------------------------------------------------------------------
# Script generation tab
# ---------------------------------------------------------------------------
# (label shown in the UI, internal task key) pairs, in display order.
_TASK_LABELS_AND_KEYS = (
    ("Maze Navigation", "maze_navigation"),
    ("Sequential Point Reuse", "point_reuse"),
    ("Compositional Distance Comparison", "compositional_distance"),
)
# Lookup from the human-readable task label to the internal task key.
TASK_DISPLAY_MAP = dict(_TASK_LABELS_AND_KEYS)
def _make_plain_script(job, api_key_placeholder: str) -> str:
    """Return a plain bash script (no SLURM headers) for running a job directly.

    *job* must expose ``label``, ``api_key_env``, ``working_dir`` and
    ``python_cmd`` (a sequence of argv tokens).  *api_key_placeholder* is
    written verbatim as the right-hand side of the ``export`` line, so the
    caller controls its quoting.  The working directory and every command
    token are shell-quoted, so paths containing spaces or shell
    metacharacters reach the program unchanged.
    """
    # One argv token per line, joined with backslash continuations.
    command = " \\\n    ".join(shlex.quote(str(token)) for token in job.python_cmd)
    lines = [
        "#!/usr/bin/env bash",
        f"# {job.label}",
        f"export {job.api_key_env}={api_key_placeholder}",
        "",
        # Stop if the directory is missing rather than running the command
        # from wherever the script happened to be started.
        f"cd {shlex.quote(str(job.working_dir))} || exit 1",
        command,
        "",
    ]
    return "\n".join(lines)
def _rebase_path(path, old_root: Path, new_root: Path) -> Path | None:
    """Return *path* moved from under *old_root* to under *new_root*, or None if it is not under *old_root*."""
    try:
        return new_root / Path(path).relative_to(old_root)
    except ValueError:
        return None


def generate_scripts(
    tasks: list[str],
    models: list[str],
    grid_sizes_str: str,
    formats: list[str],
    strategies: list[str],
    script_type: str,
    repo_path: str,
) -> tuple[str, str | None]:
    """
    Build experiment scripts and return (preview_text, zip_path).

    zip_path is a temp file the user can download; it is ``None`` whenever
    preview_text carries a validation message instead of a preview.

    Parameters:
        tasks: task labels as shown in the UI (keys of ``TASK_DISPLAY_MAP``).
        models: model identifiers to generate jobs for.
        grid_sizes_str: comma-separated positive integers; empty means no
            grid-size filter is passed to ``build_all_jobs``.
        formats, strategies: optional filters; empty means no filter.
        script_type: the SLURM option label selects ``make_sbatch_script``;
            anything else produces plain bash scripts.
        repo_path: optional repo root on the target machine.  When given,
            job paths under the directory three levels above the config file
            are re-rooted onto it.
    """
    if not tasks:
        return "Select at least one task.", None
    if not models:
        return "Select at least one model.", None
    try:
        grid_sizes = [int(g.strip()) for g in (grid_sizes_str or "").split(",") if g.strip()]
    except ValueError:
        return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", None
    if any(size <= 0 for size in grid_sizes):
        return "Invalid grid sizes — grid sizes must be positive integers, e.g. 5,6,7", None

    selected_tasks = [TASK_DISPLAY_MAP[t] for t in tasks if t in TASK_DISPLAY_MAP]
    if not selected_tasks:
        # Every label was unrecognised; do not pass an empty task filter on.
        return "Select at least one task.", None

    jobs = build_all_jobs(
        cfg=CFG,
        tasks=selected_tasks,
        models=models,
        grid_sizes=grid_sizes or None,
        input_formats=formats or None,
        prompt_strategies=strategies or None,
        config_path=CONFIG_PATH,
    )
    if not jobs:
        return "No jobs matched the selected filters.", None

    # Optionally override repo path in working_dir
    repo_override = (repo_path or "").strip() or None
    use_slurm = script_type == "SLURM (.sh with #SBATCH headers)"
    log_dir = Path(repo_override or ".") / "maze-solver" / "eval_llm_logs"
    # Config-derived job paths are rebased relative to this directory.
    config_root = CONFIG_PATH.parent.parent.parent

    script_contents: dict[str, str] = {}
    for job in jobs:
        safe = job.label.replace(" ", "_").replace("|", "").replace("/", "_").strip("_") or "job"
        filename = f"{safe}.sh"
        # Two jobs with the same label must not overwrite each other.
        suffix = 2
        while filename in script_contents:
            filename = f"{safe}_{suffix}.sh"
            suffix += 1

        if repo_override:
            new_root = Path(repo_override)
            # Paths outside config_root are left exactly as they were.
            for attr in ("working_dir", "output_dir"):
                moved = _rebase_path(getattr(job, attr), config_root, new_root)
                if moved is not None:
                    setattr(job, attr, moved)
            # python_cmd is ["python", <script path>, ...]; rebase the script path.
            if len(job.python_cmd) >= 2:
                moved = _rebase_path(job.python_cmd[1], config_root, new_root)
                if moved is not None:
                    job.python_cmd[1] = str(moved)

        if use_slurm:
            content = make_sbatch_script(job, log_dir)
        else:
            content = _make_plain_script(job, f'"${{{job.api_key_env}}}"')
        script_contents[filename] = content

    # Write zip to a named temp file (Gradio File component needs a real path).
    # delete=False keeps it on disk after closing so it can be served.
    tmp = tempfile.NamedTemporaryFile(
        delete=False, suffix=".zip", prefix="spatialbench_scripts_"
    )
    # Entering `tmp` as a context manager closes the handle even on error.
    with tmp, zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zf:
        for fname, content in script_contents.items():
            zf.writestr(fname, content)
        # Also include a master run_all.sh that runs every script in name order.
        run_all_lines = ["#!/usr/bin/env bash", "# Run all generated scripts sequentially", ""]
        run_all_lines.extend(f"bash {shlex.quote(fname)}" for fname in sorted(script_contents))
        zf.writestr("run_all.sh", "\n".join(run_all_lines) + "\n")

    # Preview: show first script + summary
    n = len(script_contents)
    first_name, first_content = next(iter(script_contents.items()))
    preview = (
        f"Generated {n} script(s) for {len(models)} model(s) across {len(selected_tasks)} task(s).\n"
        f"Download the zip below, unzip in your cluster, then run: bash run_all.sh\n\n"
        f"── {first_name} ──\n{first_content}"
        + (f"\n\n... and {n - 1} more script(s) in the zip." if n > 1 else "")
    )
    return preview, tmp.name
# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
# Markdown shown at the top of the Leaderboard tab.  The blank lines matter:
# they separate the title, the paragraph and the bullet list for the renderer.
PAPER_ABSTRACT = """
**Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks**

We systematically evaluate the spatial understanding of large language models through maze tasks—a
controlled testing context requiring multi-step planning and spatial abstraction. Across experiments
with Gemini-2.5-Flash, GPT-5-mini, Claude-Haiku-4.5, and DeepSeek-Chat, we uncover significant
discrepancies in spatial reasoning that challenge assumptions about LLM planning capabilities.

Key findings:

- **Representation sensitivity**: Gemini drops from 86% (raw tokenized) to 34% (visual grid) on 5×5 mazes with CoT
- **Prompting dependency**: Claude-Haiku fails completely without CoT, recovers to 78% with it
- **No spatial memory**: Models treat sequential questions independently, failing to reuse computed spatial knowledge
"""
# Custom stylesheet handed to gr.Blocks(css=CSS): imports the Inter and
# IBM Plex Mono web fonts, applies them to general text and to code/pre
# elements, shrinks the font of elements with the "leaderboard-table" class,
# and hides the page footer.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&family=IBM+Plex+Mono:wght@400;500&display=swap');
*, body, .gradio-container { font-family: 'Inter', ui-sans-serif, system-ui, sans-serif !important; }
code, pre, .monospace { font-family: 'IBM Plex Mono', ui-monospace, monospace !important; }
.leaderboard-table { font-size: 0.9em; }
footer { display: none !important; }
"""
def build_ui() -> gr.Blocks:
    """Assemble the three-tab Gradio app and return it without launching.

    Tabs:
      * Leaderboard - plots and tables for the three tasks.  Everything is
        drawn on page load and on the Refresh button, using the values the
        Task 1 / Task 2 controls currently show; changing one of those
        controls redraws that task's outputs.
      * Get Scripts - form that calls ``generate_scripts`` and offers the
        resulting zip for download.
      * About - static description of the benchmark and its citation.
    """
    with gr.Blocks(
        title="SpatialBench — Do LLMs Build Spatial World Models?",
        css=CSS,
        theme=gr.themes.Soft(primary_hue="blue"),
    ) as demo:
        gr.Markdown("# 🧩 SpatialBench")
        gr.Markdown(
            "**Evaluating Spatial World Models in Large Language Models** · "
            "[Paper (ICLR 2026 Workshop)](https://arxiv.org/abs/...) · "
            "[Code](https://github.com/...)"
        )
        with gr.Tabs():
            # ================================================================
            # Tab 1: Leaderboard
            # ================================================================
            with gr.Tab("📊 Leaderboard"):
                gr.Markdown(PAPER_ABSTRACT)
                gr.Markdown("---")
                gr.Markdown("## Task 1 — Maze Navigation (Planning)")
                gr.Markdown(
                    "Models find shortest paths through mazes. "
                    "Two input formats: **raw** tokenized adjacency lists vs **visual** character grids."
                )
                with gr.Row():
                    t1_k = gr.Radio(
                        choices=[0, 3, 5], value=0, label="K-shot",
                        info="Number of in-context examples",
                    )
                    t1_fmt = gr.Radio(
                        choices=["raw", "visual"], value="raw", label="Input Format",
                    )
                t1_plot = gr.Plot(label="Accuracy by Grid Size")
                t1_lb = gr.Dataframe(
                    label="Leaderboard (mean accuracy across grid sizes)",
                    elem_classes=["leaderboard-table"],
                )
                t1_fmt_plot = gr.Plot(label="Raw vs Visual Format Comparison")

                def refresh_task1(k, fmt):
                    fig, lb = plot_task1_accuracy(int(k), fmt)
                    fmt_fig = plot_task1_format_comparison()
                    return fig, lb, fmt_fig

                for inp in [t1_k, t1_fmt]:
                    inp.change(
                        refresh_task1, inputs=[t1_k, t1_fmt],
                        outputs=[t1_plot, t1_lb, t1_fmt_plot],
                    )

                gr.Markdown("---")
                gr.Markdown("## Task 2 — Sequential Reasoning with Point Reuse")
                gr.Markdown(
                    "Models answer 4 proximity questions. **Q3 = Q0** (same question repeated). "
                    "Do models reuse their earlier computation, or start from scratch?"
                )
                t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5, label="Grid Size")
                t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy")
                t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes")
                t2_lb = gr.Dataframe(label="Leaderboard", elem_classes=["leaderboard-table"])

                def refresh_task2(gs):
                    fig, lb = plot_task2_q0_q3(int(gs))
                    grid_fig = plot_task2_by_grid()
                    return fig, grid_fig, lb

                t2_grid.change(
                    refresh_task2, inputs=[t2_grid],
                    outputs=[t2_plot, t2_grid_plot, t2_lb],
                )

                gr.Markdown("---")
                gr.Markdown("## Task 3 — Compositional Distance Comparison")
                gr.Markdown(
                    "Models answer 3 questions about maze corners (A, B, C, D) and center M. "
                    "**Q2** (B→C) can potentially be composed from Q0 (A→M) and Q1 (D→M). "
                    "Δ = Q2 accuracy − avg(Q0, Q1)."
                )
                t3_plot = gr.Plot(label="Q0 / Q1 / Q2 Accuracy by Model")
                t3_grid_plot = gr.Plot(label="Q2 Accuracy Across Grid Sizes")
                t3_lb = gr.Dataframe(
                    label="Leaderboard (Δ shows compositional benefit)",
                    elem_classes=["leaderboard-table"],
                )
                with gr.Row():
                    refresh_lb_btn = gr.Button("🔄 Refresh Results", variant="secondary")

                def refresh_all_leaderboard(k=0, fmt="raw", gs=5):
                    # Redraw with the values the controls currently show, so
                    # a refresh never leaves plots and widgets out of sync.
                    t1_fig, t1_table = plot_task1_accuracy(int(k), fmt)
                    t1_ff = plot_task1_format_comparison()
                    t2_fig, t2_lb_table = plot_task2_q0_q3(int(gs))
                    t2_gfig = plot_task2_by_grid()
                    t3_fig, t3_lb_table = plot_task3_compositional()
                    t3_gfig = plot_task3_by_grid()
                    return (
                        t1_fig, t1_table, t1_ff,
                        t2_fig, t2_gfig, t2_lb_table,
                        t3_fig, t3_gfig, t3_lb_table,
                    )

                leaderboard_inputs = [t1_k, t1_fmt, t2_grid]
                # Order must match the tuple returned by refresh_all_leaderboard.
                leaderboard_outputs = [
                    t1_plot, t1_lb, t1_fmt_plot,
                    t2_plot, t2_grid_plot, t2_lb,
                    t3_plot, t3_grid_plot, t3_lb,
                ]
                refresh_lb_btn.click(
                    refresh_all_leaderboard,
                    inputs=leaderboard_inputs,
                    outputs=leaderboard_outputs,
                )
                demo.load(
                    refresh_all_leaderboard,
                    inputs=leaderboard_inputs,
                    outputs=leaderboard_outputs,
                )

            # ================================================================
            # Tab 2: Get Scripts
            # ================================================================
            with gr.Tab("⬇️ Get Scripts"):
                gr.Markdown(
                    "## Generate Experiment Scripts\n"
                    "Configure the experiments you want to run, then download a zip of ready-to-run "
                    "shell scripts.\n\n"
                    "**How to use:**\n"
                    "1. Select tasks, models, and settings below\n"
                    "2. Enter the path to your local clone of the repo (so paths in the scripts are correct)\n"
                    "3. Click **Generate** — a preview appears and a zip is ready to download\n"
                    "4. Unzip on your cluster, set your API key(s) as environment variables, then:\n"
                    "   ```bash\n"
                    "   export GEMINI_API_KEY=your_key_here\n"
                    "   bash run_all.sh          # run sequentially\n"
                    "   # — or submit individually:\n"
                    "   sbatch Task_1__Maze_Navigation__gemini-2.5-flash__raw__cot.sh\n"
                    "   ```"
                )
                # Pre-select the paper's first model when the config has it,
                # otherwise fall back to whatever model is listed first.
                default_models = (
                    [m for m in ("gemini-2.5-flash",) if m in MODEL_CHOICES]
                    or MODEL_CHOICES[:1]
                )
                with gr.Row():
                    with gr.Column(scale=2):
                        gen_tasks = gr.CheckboxGroup(
                            choices=list(TASK_DISPLAY_MAP.keys()),
                            value=["Maze Navigation"],
                            label="Tasks",
                        )
                        gen_models = gr.CheckboxGroup(
                            choices=MODEL_CHOICES,
                            value=default_models,
                            label="Models",
                        )
                        gen_grids = gr.Textbox(
                            value="5,6,7,8,9",
                            label="Grid Sizes",
                            info="Comma-separated. Paper used 5–9.",
                        )
                        with gr.Row():
                            gen_formats = gr.CheckboxGroup(
                                choices=["raw", "visual"],
                                value=["raw", "visual"],
                                label="Input Formats (Task 1 only)",
                            )
                            gen_strategies = gr.CheckboxGroup(
                                choices=["base", "cot", "reasoning"],
                                value=["base", "cot", "reasoning"],
                                label="Prompt Strategies",
                            )
                    with gr.Column(scale=1):
                        # generate_scripts compares against the first label
                        # verbatim, so keep the two in sync.
                        gen_script_type = gr.Radio(
                            choices=[
                                "SLURM (.sh with #SBATCH headers)",
                                "Plain bash (.sh, no SLURM)",
                            ],
                            value="SLURM (.sh with #SBATCH headers)",
                            label="Script Type",
                            info="Use SLURM if you have a cluster. Plain bash runs directly.",
                        )
                        gen_repo_path = gr.Textbox(
                            label="Repo path on your cluster",
                            placeholder="/path/to/llm-maze-solver",
                            info="Absolute path to the llm-maze-solver repo root on the machine where you'll run the scripts. Leave blank to use relative paths.",
                        )
                with gr.Row():
                    gen_btn = gr.Button("⚙️ Generate Scripts", variant="primary", scale=2)
                gen_preview = gr.Textbox(
                    label="Preview (first script)",
                    interactive=False,
                    lines=20,
                    max_lines=30,
                )
                gen_download = gr.File(
                    label="Download Scripts (.zip)",
                    interactive=False,
                )
                gen_btn.click(
                    generate_scripts,
                    inputs=[
                        gen_tasks, gen_models, gen_grids,
                        gen_formats, gen_strategies,
                        gen_script_type, gen_repo_path,
                    ],
                    outputs=[gen_preview, gen_download],
                )

            # ================================================================
            # Tab 3: About
            # ================================================================
            with gr.Tab("ℹ️ About"):
                # Kept flush-left so no Markdown line picks up leading
                # indentation from the surrounding code.
                gr.Markdown("""
## About SpatialBench

SpatialBench is the evaluation platform accompanying the paper:

> **Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks**
> *Under review at ICLR 2026 Workshop*

### Three Tasks

| Task | Type | What it tests |
|------|------|---------------|
| **Task 1: Maze Navigation** | Planning | Find shortest path from start to goal |
| **Task 2: Sequential Point Reuse** | Reasoning | Reuse Q0 computation when Q3=Q0 |
| **Task 3: Compositional Distance** | Reasoning | Compose corner→center distances for Q2 |

### Input Representations

- **Raw (tokenized)**: `<ADJLIST_START> (0,0) <--> (0,1) ... <ADJLIST_END>`
- **Visual (grid)**: `Row 0: ['.', 'S', '.', '#'] Row 1: ['#', '.', '.', 'E']`

### Models Evaluated

| Model | Provider |
|-------|----------|
| Gemini 2.5 Flash | Google |
| GPT-5 Mini | OpenAI |
| Claude Haiku 4.5 | Anthropic |
| DeepSeek Chat | DeepSeek |

### Grid Sizes

Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default.

### Reproducing Experiments

Clone the repo and use the **Get Scripts** tab above to generate SLURM scripts, or use the CLI directly:

```bash
cd pipeline/
python run_experiments.py --tasks maze_navigation --models gemini-2.5-flash --mode slurm --dry-run
```

### Citation

```bibtex
@inproceedings{spatialbench2026,
  title = {Do {LLMs} Build Spatial World Models? Evidence from Grid-World Maze Tasks},
  author = {Anonymous},
  booktitle = {ICLR 2026 Workshop},
  year = {2026},
}
```
""")
    return demo
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Options for serving the app when this file is run as a script:
    # listen on all network interfaces, do not create a public share link,
    # and surface handler errors in the browser.
    launch_options = {
        "server_name": "0.0.0.0",
        "share": False,
        "show_error": True,
    }
    demo = build_ui()
    demo.launch(**launch_options)