""" app.py — SpatialBench Gradio application ----------------------------------------- Entrypoint for the HuggingFace Space "SpatialBench". Three tabs: 1. Leaderboard — visualize pre-computed results from all three tasks 2. Get Scripts — generate ready-to-run SLURM scripts (or plain shell scripts) as a downloadable zip; no compute needed here 3. About — paper info and citation To run locally: cd pipeline/ python app.py To deploy on HuggingFace Spaces: - No secrets required for the Leaderboard or Get Scripts tabs. - The Space entrypoint is this file (app.py). """ from __future__ import annotations import os import sys import zipfile import tempfile from pathlib import Path import gradio as gr import pandas as pd import plotly.express as px import plotly.graph_objects as go # Load .env if running locally _env = Path(__file__).parent / ".env" if _env.exists(): with open(_env) as _f: for _line in _f: _line = _line.strip() if _line and not _line.startswith("#") and "=" in _line: _k, _v = _line.split("=", 1) os.environ.setdefault(_k.strip(), _v.strip()) # Add repo root to path so pipeline imports work sys.path.insert(0, str(Path(__file__).parent)) from pipeline.task_builder import load_config, build_all_jobs, make_sbatch_script from pipeline.results_loader import ( load_all_results, maze_navigation_leaderboard, point_reuse_leaderboard, compositional_distance_leaderboard, ) # --------------------------------------------------------------------------- # Paths / config # --------------------------------------------------------------------------- CONFIG_PATH = Path(__file__).parent / "configs" / "experiments.yaml" CFG = load_config(CONFIG_PATH) MODEL_CHOICES = list(CFG["models"].keys()) # --------------------------------------------------------------------------- # Leaderboard helpers # --------------------------------------------------------------------------- def _load_results(): try: return load_all_results(CONFIG_PATH) except Exception: return { "maze_navigation": pd.DataFrame(), "point_reuse": pd.DataFrame(), "compositional_distance": pd.DataFrame(), } def _make_empty_fig(msg: str) -> go.Figure: fig = go.Figure() fig.add_annotation(text=msg, x=0.5, y=0.5, showarrow=False, font=dict(size=16), xref="paper", yref="paper") fig.update_layout(xaxis_visible=False, yaxis_visible=False, height=300, paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)") return fig # ── Task 1 plots ───────────────────────────────────────────────────────────── def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]: results = _load_results() df = results["maze_navigation"] if df.empty: return _make_empty_fig("No Task 1 results found.\nRun experiments first."), pd.DataFrame() sub = df[(df["k_shot"] == k_shot) & (df["input_format"] == input_format)] if sub.empty: return _make_empty_fig(f"No results for k={k_shot}, format={input_format}"), pd.DataFrame() fig = px.line( sub, x="grid_size", y="accuracy", color="display_name", line_dash="prompt_strategy", markers=True, labels={"grid_size": "Grid Size (n×n)", "accuracy": "Accuracy", "display_name": "Model", "prompt_strategy": "Strategy"}, title=f"Task 1 — Maze Navigation ({input_format} format, {k_shot}-shot)", color_discrete_sequence=px.colors.qualitative.Set2, ) fig.update_layout( yaxis_range=[0, 1], legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), height=420, ) lb = maze_navigation_leaderboard(df, k_shot=k_shot) return fig, lb def plot_task1_format_comparison() -> go.Figure: results = _load_results() df = results["maze_navigation"] if df.empty: return _make_empty_fig("No Task 1 results found.") sub = df[(df["k_shot"] == 0) & (df["prompt_strategy"] == "cot")] if sub.empty: sub = df[df["k_shot"] == 0] agg = sub.groupby(["display_name", "input_format"])["accuracy"].mean().reset_index() fig = px.bar( agg, x="display_name", y="accuracy", color="input_format", barmode="group", labels={"display_name": "Model", "accuracy": "Mean Accuracy", "input_format": "Input Format"}, title="Task 1 — Raw vs Visual Format (0-shot, CoT, averaged over grid sizes)", color_discrete_map={"raw": "#2196F3", "visual": "#FF9800"}, ) fig.update_layout(yaxis_range=[0, 1], height=380) return fig # ── Task 2 plots ───────────────────────────────────────────────────────────── def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]: results = _load_results() df = results["point_reuse"] if df.empty: return _make_empty_fig("No Task 2 results found.\nRun experiments first."), pd.DataFrame() sub = df[df["grid_size"] == grid_size] if sub.empty: return _make_empty_fig(f"No Task 2 results for {grid_size}×{grid_size}"), pd.DataFrame() q0 = sub[sub["question_idx"] == 0].groupby("display_name")["accuracy"].mean().rename("Q0") q3 = sub[sub["question_idx"] == 3].groupby("display_name")["accuracy"].mean().rename("Q3") plot_df = pd.concat([q0, q3], axis=1).reset_index() plot_df_melt = plot_df.melt(id_vars="display_name", var_name="Question", value_name="Accuracy") fig = px.bar( plot_df_melt, x="display_name", y="Accuracy", color="Question", barmode="group", labels={"display_name": "Model"}, title=f"Task 2 — Q0 vs Q3 Accuracy ({grid_size}×{grid_size} maze)\n" "Q3 = Q0 (same question repeated — tests information reuse)", color_discrete_map={"Q0": "#4CAF50", "Q3": "#F44336"}, ) fig.update_layout(yaxis_range=[0, 1], height=400) lb = point_reuse_leaderboard(df) return fig, lb def plot_task2_by_grid() -> go.Figure: results = _load_results() df = results["point_reuse"] if df.empty: return _make_empty_fig("No Task 2 results found.") q3 = df[df["question_idx"] == 3].groupby( ["display_name", "grid_size"])["accuracy"].mean().reset_index() fig = px.line( q3, x="grid_size", y="accuracy", color="display_name", markers=True, labels={"grid_size": "Grid Size", "accuracy": "Q3 Accuracy", "display_name": "Model"}, title="Task 2 — Q3 Accuracy by Grid Size (Q3 = Q0 repeated)", color_discrete_sequence=px.colors.qualitative.Set2, ) fig.update_layout(yaxis_range=[0, 1], height=380) return fig # ── Task 3 plots ───────────────────────────────────────────────────────────── def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]: results = _load_results() df = results["compositional_distance"] if df.empty: return _make_empty_fig("No Task 3 results found.\nRun experiments first."), pd.DataFrame() agg = df.groupby(["display_name", "question_idx"])["accuracy"].mean().reset_index() q_labels = {0: "Q0: A→M", 1: "Q1: D→M", 2: "Q2: B→C (compositional)"} agg["Question"] = agg["question_idx"].map(q_labels) fig = px.bar( agg, x="display_name", y="accuracy", color="Question", barmode="group", labels={"display_name": "Model", "accuracy": "Accuracy"}, title="Task 3 — Compositional Distance Comparison\n" "Q2 can be composed from Q0+Q1 (corner→center distances)", color_discrete_map={ "Q0: A→M": "#2196F3", "Q1: D→M": "#9C27B0", "Q2: B→C (compositional)": "#FF5722", }, ) fig.update_layout(yaxis_range=[0, 1], height=420) lb = compositional_distance_leaderboard(df) return fig, lb def plot_task3_by_grid() -> go.Figure: results = _load_results() df = results["compositional_distance"] if df.empty: return _make_empty_fig("No Task 3 results found.") q2 = df[df["question_idx"] == 2].groupby( ["display_name", "grid_size"])["accuracy"].mean().reset_index() fig = px.line( q2, x="grid_size", y="accuracy", color="display_name", markers=True, labels={"grid_size": "Grid Size", "accuracy": "Q2 Accuracy", "display_name": "Model"}, title="Task 3 — Q2 (Compositional) Accuracy by Grid Size", color_discrete_sequence=px.colors.qualitative.Set2, ) fig.update_layout(yaxis_range=[0, 1], height=380) return fig # --------------------------------------------------------------------------- # Script generation tab # --------------------------------------------------------------------------- TASK_DISPLAY_MAP = { "Maze Navigation": "maze_navigation", "Sequential Point Reuse": "point_reuse", "Compositional Distance Comparison": "compositional_distance", } def _make_plain_script(job, api_key_placeholder: str) -> str: """Return a plain bash script (no SLURM headers) for running a job directly.""" lines = [ "#!/usr/bin/env bash", f"# {job.label}", f"export {job.api_key_env}={api_key_placeholder}", "", f"cd {job.working_dir}", " \\\n ".join(job.python_cmd), "", ] return "\n".join(lines) def generate_scripts( tasks: list[str], models: list[str], grid_sizes_str: str, formats: list[str], strategies: list[str], script_type: str, repo_path: str, ) -> tuple[str, str | None]: """ Build experiment scripts and return (preview_text, zip_path). zip_path is a temp file the user can download. """ if not tasks: return "Select at least one task.", None if not models: return "Select at least one model.", None try: grid_sizes = [int(g.strip()) for g in grid_sizes_str.split(",") if g.strip()] except ValueError: return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", None selected_tasks = [TASK_DISPLAY_MAP[t] for t in tasks if t in TASK_DISPLAY_MAP] jobs = build_all_jobs( cfg=CFG, tasks=selected_tasks, models=models, grid_sizes=grid_sizes or None, input_formats=formats or None, prompt_strategies=strategies or None, config_path=CONFIG_PATH, ) if not jobs: return "No jobs matched the selected filters.", None # Optionally override repo path in working_dir repo_override = repo_path.strip() if repo_path.strip() else None use_slurm = (script_type == "SLURM (.sh with #SBATCH headers)") log_dir = Path(repo_override or ".") / "maze-solver" / "eval_llm_logs" script_contents: dict[str, str] = {} for job in jobs: safe = job.label.replace(" ", "_").replace("|", "").replace("/", "_").strip("_") filename = f"{safe}.sh" # If a repo path override was provided, patch working_dir in the job if repo_override: # Rebase working_dir: replace the config-derived root with the user's path try: rel = job.working_dir.relative_to(CONFIG_PATH.parent.parent.parent) job.working_dir = Path(repo_override) / rel except ValueError: pass # Rebase output_dir similarly try: rel_out = job.output_dir.relative_to(CONFIG_PATH.parent.parent.parent) job.output_dir = Path(repo_override) / rel_out except ValueError: pass # Rebase python_cmd paths (first two tokens are "python" and script path) if len(job.python_cmd) >= 2: script_abs = Path(job.python_cmd[1]) try: rel_script = script_abs.relative_to(CONFIG_PATH.parent.parent.parent) job.python_cmd[1] = str(Path(repo_override) / rel_script) except ValueError: pass if use_slurm: content = make_sbatch_script(job, log_dir) else: content = _make_plain_script(job, f'"${{{job.api_key_env}}}"') script_contents[filename] = content # Write zip to a named temp file (Gradio File component needs a real path) tmp = tempfile.NamedTemporaryFile( delete=False, suffix=".zip", prefix="spatialbench_scripts_" ) with zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zf: for fname, content in script_contents.items(): zf.writestr(fname, content) # Also include a README and a master run_all.sh run_all_lines = ["#!/usr/bin/env bash", "# Run all generated scripts sequentially", ""] for fname in sorted(script_contents): run_all_lines.append(f"bash {fname}") zf.writestr("run_all.sh", "\n".join(run_all_lines) + "\n") tmp.close() # Preview: show first script + summary n = len(script_contents) first_name, first_content = next(iter(script_contents.items())) preview = ( f"Generated {n} script(s) for {len(models)} model(s) across {len(selected_tasks)} task(s).\n" f"Download the zip below, unzip in your cluster, then run: bash run_all.sh\n\n" f"── {first_name} ──\n{first_content}" + (f"\n\n... and {n - 1} more script(s) in the zip." if n > 1 else "") ) return preview, tmp.name # --------------------------------------------------------------------------- # Gradio UI # --------------------------------------------------------------------------- PAPER_ABSTRACT = """ **Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks** We systematically evaluate the spatial understanding of large language models through maze tasks—a controlled testing context requiring multi-step planning and spatial abstraction. Across experiments with Gemini-2.5-Flash, GPT-5-mini, Claude-Haiku-4.5, and DeepSeek-Chat, we uncover significant discrepancies in spatial reasoning that challenge assumptions about LLM planning capabilities. Key findings: - **Representation sensitivity**: Gemini drops from 86% (raw tokenized) to 34% (visual grid) on 5×5 mazes with CoT - **Prompting dependency**: Claude-Haiku fails completely without CoT, recovers to 78% with it - **No spatial memory**: Models treat sequential questions independently, failing to reuse computed spatial knowledge """ CSS = """ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&family=IBM+Plex+Mono:wght@400;500&display=swap'); *, body, .gradio-container { font-family: 'Inter', ui-sans-serif, system-ui, sans-serif !important; } code, pre, .monospace { font-family: 'IBM Plex Mono', ui-monospace, monospace !important; } .leaderboard-table { font-size: 0.9em; } footer { display: none !important; } """ def build_ui() -> gr.Blocks: with gr.Blocks( title="SpatialBench — Do LLMs Build Spatial World Models?", css=CSS, theme=gr.themes.Soft(primary_hue="blue"), ) as demo: gr.Markdown("# 🧩 SpatialBench") gr.Markdown( "**Evaluating Spatial World Models in Large Language Models** · " "[Paper (ICLR 2026 Workshop)](https://arxiv.org/abs/...) · " "[Code](https://github.com/...)" ) with gr.Tabs(): # ================================================================ # Tab 1: Leaderboard # ================================================================ with gr.Tab("📊 Leaderboard"): gr.Markdown(PAPER_ABSTRACT) gr.Markdown("---") gr.Markdown("## Task 1 — Maze Navigation (Planning)") gr.Markdown( "Models find shortest paths through mazes. " "Two input formats: **raw** tokenized adjacency lists vs **visual** character grids." ) with gr.Row(): t1_k = gr.Radio( choices=[0, 3, 5], value=0, label="K-shot", info="Number of in-context examples", ) t1_fmt = gr.Radio( choices=["raw", "visual"], value="raw", label="Input Format", ) t1_plot = gr.Plot(label="Accuracy by Grid Size") t1_lb = gr.Dataframe( label="Leaderboard (mean accuracy across grid sizes)", elem_classes=["leaderboard-table"], ) t1_fmt_plot = gr.Plot(label="Raw vs Visual Format Comparison") def refresh_task1(k, fmt): fig, lb = plot_task1_accuracy(int(k), fmt) fmt_fig = plot_task1_format_comparison() return fig, lb, fmt_fig for inp in [t1_k, t1_fmt]: inp.change( refresh_task1, inputs=[t1_k, t1_fmt], outputs=[t1_plot, t1_lb, t1_fmt_plot], ) gr.Markdown("---") gr.Markdown("## Task 2 — Sequential Reasoning with Point Reuse") gr.Markdown( "Models answer 4 proximity questions. **Q3 = Q0** (same question repeated). " "Do models reuse their earlier computation, or start from scratch?" ) t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5, label="Grid Size") t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy") t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes") t2_lb = gr.Dataframe(label="Leaderboard", elem_classes=["leaderboard-table"]) def refresh_task2(gs): fig, lb = plot_task2_q0_q3(int(gs)) grid_fig = plot_task2_by_grid() return fig, grid_fig, lb t2_grid.change( refresh_task2, inputs=[t2_grid], outputs=[t2_plot, t2_grid_plot, t2_lb], ) gr.Markdown("---") gr.Markdown("## Task 3 — Compositional Distance Comparison") gr.Markdown( "Models answer 3 questions about maze corners (A, B, C, D) and center M. " "**Q2** (B→C) can potentially be composed from Q0 (A→M) and Q1 (D→M). " "Δ = Q2 accuracy − avg(Q0, Q1)." ) t3_plot = gr.Plot(label="Q0 / Q1 / Q2 Accuracy by Model") t3_grid_plot = gr.Plot(label="Q2 Accuracy Across Grid Sizes") t3_lb = gr.Dataframe( label="Leaderboard (Δ shows compositional benefit)", elem_classes=["leaderboard-table"], ) with gr.Row(): refresh_lb_btn = gr.Button("🔄 Refresh Results", variant="secondary") def refresh_all_leaderboard(_=None): t1_fig, t1_table = plot_task1_accuracy(0, "raw") t1_ff = plot_task1_format_comparison() t2_fig, t2_lb_table = plot_task2_q0_q3(5) t2_gfig = plot_task2_by_grid() t3_fig, t3_lb_table = plot_task3_compositional() t3_gfig = plot_task3_by_grid() return ( t1_fig, t1_table, t1_ff, t2_fig, t2_gfig, t2_lb_table, t3_fig, t3_gfig, t3_lb_table, ) refresh_lb_btn.click( refresh_all_leaderboard, outputs=[ t1_plot, t1_lb, t1_fmt_plot, t2_plot, t2_grid_plot, t2_lb, t3_plot, t3_grid_plot, t3_lb, ], ) demo.load( refresh_all_leaderboard, outputs=[ t1_plot, t1_lb, t1_fmt_plot, t2_plot, t2_grid_plot, t2_lb, t3_plot, t3_grid_plot, t3_lb, ], ) # ================================================================ # Tab 2: Get Scripts # ================================================================ with gr.Tab("⬇️ Get Scripts"): gr.Markdown( "## Generate Experiment Scripts\n" "Configure the experiments you want to run, then download a zip of ready-to-run " "shell scripts.\n\n" "**How to use:**\n" "1. Select tasks, models, and settings below\n" "2. Enter the path to your local clone of the repo (so paths in the scripts are correct)\n" "3. Click **Generate** — a preview appears and a zip is ready to download\n" "4. Unzip on your cluster, set your API key(s) as environment variables, then:\n" " ```bash\n" " export GEMINI_API_KEY=your_key_here\n" " bash run_all.sh # run sequentially\n" " # — or submit individually:\n" " sbatch Task_1__Maze_Navigation__gemini-2.5-flash__raw__cot.sh\n" " ```" ) with gr.Row(): with gr.Column(scale=2): gen_tasks = gr.CheckboxGroup( choices=list(TASK_DISPLAY_MAP.keys()), value=["Maze Navigation"], label="Tasks", ) gen_models = gr.CheckboxGroup( choices=MODEL_CHOICES, value=["gemini-2.5-flash"], label="Models", ) gen_grids = gr.Textbox( value="5,6,7,8,9", label="Grid Sizes", info="Comma-separated. Paper used 5–9.", ) with gr.Row(): gen_formats = gr.CheckboxGroup( choices=["raw", "visual"], value=["raw", "visual"], label="Input Formats (Task 1 only)", ) gen_strategies = gr.CheckboxGroup( choices=["base", "cot", "reasoning"], value=["base", "cot", "reasoning"], label="Prompt Strategies", ) with gr.Column(scale=1): gen_script_type = gr.Radio( choices=[ "SLURM (.sh with #SBATCH headers)", "Plain bash (.sh, no SLURM)", ], value="SLURM (.sh with #SBATCH headers)", label="Script Type", info="Use SLURM if you have a cluster. Plain bash runs directly.", ) gen_repo_path = gr.Textbox( label="Repo path on your cluster", placeholder="/path/to/llm-maze-solver", info="Absolute path to the llm-maze-solver repo root on the machine where you'll run the scripts. Leave blank to use relative paths.", ) with gr.Row(): gen_btn = gr.Button("⚙️ Generate Scripts", variant="primary", scale=2) gen_preview = gr.Textbox( label="Preview (first script)", interactive=False, lines=20, max_lines=30, ) gen_download = gr.File( label="Download Scripts (.zip)", interactive=False, ) gen_btn.click( generate_scripts, inputs=[ gen_tasks, gen_models, gen_grids, gen_formats, gen_strategies, gen_script_type, gen_repo_path, ], outputs=[gen_preview, gen_download], ) # ================================================================ # Tab 3: About # ================================================================ with gr.Tab("ℹ️ About"): gr.Markdown(""" ## About SpatialBench SpatialBench is the evaluation platform accompanying the paper: > **Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks** > *Under review at ICLR 2026 Workshop* ### Three Tasks | Task | Type | What it tests | |------|------|---------------| | **Task 1: Maze Navigation** | Planning | Find shortest path from start to goal | | **Task 2: Sequential Point Reuse** | Reasoning | Reuse Q0 computation when Q3=Q0 | | **Task 3: Compositional Distance** | Reasoning | Compose corner→center distances for Q2 | ### Input Representations - **Raw (tokenized)**: ` (0,0) <--> (0,1) ... ` - **Visual (grid)**: `Row 0: ['.', 'S', '.', '#'] Row 1: ['#', '.', '.', 'E']` ### Models Evaluated | Model | Provider | |-------|----------| | Gemini 2.5 Flash | Google | | GPT-5 Mini | OpenAI | | Claude Haiku 4.5 | Anthropic | | DeepSeek Chat | DeepSeek | ### Grid Sizes Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default. ### Reproducing Experiments Clone the repo and use the **Get Scripts** tab above to generate SLURM scripts, or use the CLI directly: ```bash cd pipeline/ python run_experiments.py --tasks maze_navigation --models gemini-2.5-flash --mode slurm --dry-run ``` ### Citation ```bibtex @inproceedings{spatialbench2026, title = {Do {LLMs} Build Spatial World Models? Evidence from Grid-World Maze Tasks}, author = {Anonymous}, booktitle = {ICLR 2026 Workshop}, year = {2026}, } ``` """) return demo # --------------------------------------------------------------------------- # Entry point # --------------------------------------------------------------------------- if __name__ == "__main__": demo = build_ui() demo.launch( server_name="0.0.0.0", share=False, show_error=True, )