Spaces:
Running
Running
"""
app.py — SpatialBench Gradio application
-----------------------------------------
Entrypoint for the HuggingFace Space "SpatialBench".

Three tabs:
  1. Leaderboard — visualize pre-computed results from all three tasks
  2. Get Scripts — generate ready-to-run SLURM scripts (or plain shell
     scripts) as a downloadable zip; no compute needed here
  3. About — paper info and citation

To run locally:
    cd pipeline/
    python app.py

To deploy on HuggingFace Spaces:
  - No secrets required for the Leaderboard or Get Scripts tabs.
  - The Space entrypoint is this file (app.py).
"""
from __future__ import annotations

import logging
import os
import shlex
import sys
import tempfile
import zipfile
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
def _load_dotenv(path: Path) -> None:
    """Export ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines, ``#`` comments and lines without ``=`` are skipped. Only the
    first ``=`` splits key from value, so values may contain ``=`` themselves.
    Variables that are already set in the environment are left untouched.

    Args:
        path: Location of the file to read.
    """
    # Explicit encoding keeps parsing independent of the host locale.
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            key = key.strip()
            # A line such as "=value" has no usable name; os.environ rejects
            # empty names, so skip it rather than crash at import time.
            if key:
                os.environ.setdefault(key, value.strip())


# Load .env if running locally
_env = Path(__file__).parent / ".env"
if _env.exists():
    _load_dotenv(_env)

# Add repo root to path so pipeline imports work
sys.path.insert(0, str(Path(__file__).parent))
| from pipeline.task_builder import load_config, build_all_jobs, make_sbatch_script | |
| from pipeline.results_loader import ( | |
| load_all_results, | |
| maze_navigation_leaderboard, | |
| point_reuse_leaderboard, | |
| compositional_distance_leaderboard, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Paths / config | |
| # --------------------------------------------------------------------------- | |
# Experiment configuration file, resolved relative to this module.
CONFIG_PATH = Path(__file__).parent.joinpath("configs", "experiments.yaml")
# Parsed configuration, loaded once at import time.
CFG = load_config(CONFIG_PATH)
# Model identifiers taken from the keys of the config's "models" section.
MODEL_CHOICES = [*CFG["models"].keys()]
| # --------------------------------------------------------------------------- | |
| # Leaderboard helpers | |
| # --------------------------------------------------------------------------- | |
def _load_results():
    """Load every task's results, falling back to empty frames on failure.

    Returns:
        The value of ``load_all_results(CONFIG_PATH)``. If that call raises,
        a dict with one empty ``DataFrame`` under each of the keys
        ``"maze_navigation"``, ``"point_reuse"`` and
        ``"compositional_distance"`` is returned instead.
    """
    try:
        return load_all_results(CONFIG_PATH)
    except Exception:
        # Stay best-effort so the UI still renders its "no results"
        # placeholders, but record the failure instead of hiding it.
        logging.getLogger(__name__).exception(
            "Could not load results; showing empty leaderboards instead."
        )
        task_keys = ("maze_navigation", "point_reuse", "compositional_distance")
        return {key: pd.DataFrame() for key in task_keys}
def _make_empty_fig(msg: str) -> go.Figure:
    """Return an axis-free placeholder figure showing *msg* at its centre.

    Args:
        msg: Text to display. Newlines are converted to ``<br>`` because
            Plotly text collapses raw line breaks and only breaks lines on
            its HTML-like ``<br>`` tag.

    Returns:
        A 300px-high figure with transparent backgrounds and hidden axes.
    """
    fig = go.Figure()
    fig.add_annotation(
        text=msg.replace("\n", "<br>"),
        x=0.5,
        y=0.5,
        xref="paper",
        yref="paper",
        showarrow=False,
        font=dict(size=16),
    )
    fig.update_layout(
        xaxis_visible=False,
        yaxis_visible=False,
        height=300,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
    )
    return fig
# ── Task 1 plots ─────────────────────────────────────────────────────────────
def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]:
    """Build the Task 1 accuracy-by-grid-size chart and its leaderboard table.

    Args:
        k_shot: Value of the ``k_shot`` column to keep.
        input_format: Value of the ``input_format`` column to keep.

    Returns:
        ``(figure, leaderboard)``. When there are no Task 1 results, or none
        match the filters, the figure is a text placeholder and the
        leaderboard is an empty ``DataFrame``.
    """
    df = _load_results()["maze_navigation"]
    if df.empty:
        return (
            _make_empty_fig("No Task 1 results found.<br>Run experiments first."),
            pd.DataFrame(),
        )

    matches = (df["k_shot"] == k_shot) & (df["input_format"] == input_format)
    sub = df[matches]
    if sub.empty:
        return (
            _make_empty_fig(f"No results for k={k_shot}, format={input_format}"),
            pd.DataFrame(),
        )

    # px.line joins points in row order, so order rows along the x axis;
    # a stable sort keeps the existing relative order within each grid size.
    sub = sub.sort_values("grid_size", kind="stable")

    fig = px.line(
        sub,
        x="grid_size",
        y="accuracy",
        color="display_name",
        line_dash="prompt_strategy",
        markers=True,
        labels={
            "grid_size": "Grid Size (n×n)",
            "accuracy": "Accuracy",
            "display_name": "Model",
            "prompt_strategy": "Strategy",
        },
        title=f"Task 1 — Maze Navigation ({input_format} format, {k_shot}-shot)",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    fig.update_layout(
        yaxis_range=[0, 1],
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        height=420,
    )

    # NOTE(review): the leaderboard is filtered by k_shot only, not by
    # input_format — confirm that this is intended.
    leaderboard = maze_navigation_leaderboard(df, k_shot=k_shot)
    return fig, leaderboard
def plot_task1_format_comparison() -> go.Figure:
    """Build a grouped bar chart of 0-shot mean accuracy per model and input format.

    Chain-of-thought rows (``prompt_strategy == "cot"``) are preferred. If
    there are none, all 0-shot rows are used and the title says so.

    Returns:
        The bar chart, or a text placeholder when there are no Task 1
        results or no 0-shot rows at all.
    """
    df = _load_results()["maze_navigation"]
    if df.empty:
        return _make_empty_fig("No Task 1 results found.")

    zero_shot = df[df["k_shot"] == 0]
    sub = zero_shot[zero_shot["prompt_strategy"] == "cot"]
    strategy_note = "CoT"
    if sub.empty:
        # No CoT runs: fall back to every 0-shot row and label the chart
        # accordingly instead of still claiming "CoT".
        sub = zero_shot
        strategy_note = "all strategies"
    if sub.empty:
        return _make_empty_fig("No 0-shot Task 1 results found.")

    agg = sub.groupby(["display_name", "input_format"])["accuracy"].mean().reset_index()
    fig = px.bar(
        agg,
        x="display_name",
        y="accuracy",
        color="input_format",
        barmode="group",
        labels={
            "display_name": "Model",
            "accuracy": "Mean Accuracy",
            "input_format": "Input Format",
        },
        title=(
            "Task 1 — Raw vs Visual Format "
            f"(0-shot, {strategy_note}, averaged over grid sizes)"
        ),
        color_discrete_map={"raw": "#2196F3", "visual": "#FF9800"},
    )
    fig.update_layout(yaxis_range=[0, 1], height=380)
    return fig
# ── Task 2 plots ─────────────────────────────────────────────────────────────
def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]:
    """Build the Task 2 Q0-vs-Q3 bar chart for one grid size, plus the leaderboard.

    Args:
        grid_size: Value of the ``grid_size`` column to keep.

    Returns:
        ``(figure, leaderboard)``. When there are no Task 2 results, or none
        for this grid size, the figure is a text placeholder and the
        leaderboard is an empty ``DataFrame``.
    """
    df = _load_results()["point_reuse"]
    if df.empty:
        return (
            _make_empty_fig("No Task 2 results found.<br>Run experiments first."),
            pd.DataFrame(),
        )

    sub = df[df["grid_size"] == grid_size]
    if sub.empty:
        return (
            _make_empty_fig(f"No Task 2 results for {grid_size}×{grid_size}"),
            pd.DataFrame(),
        )

    def _mean_by_model(question_idx: int, label: str) -> pd.Series:
        """Mean accuracy per model for one question, named *label*."""
        rows = sub[sub["question_idx"] == question_idx]
        return rows.groupby("display_name")["accuracy"].mean().rename(label)

    wide = (
        pd.concat([_mean_by_model(0, "Q0"), _mean_by_model(3, "Q3")], axis=1)
        # Pin the index name so melt() finds "display_name" even if concat
        # does not carry it over (e.g. one side is empty).
        .rename_axis("display_name")
        .reset_index()
    )
    long = wide.melt(id_vars="display_name", var_name="Question", value_name="Accuracy")

    fig = px.bar(
        long,
        x="display_name",
        y="Accuracy",
        color="Question",
        barmode="group",
        labels={"display_name": "Model"},
        # Plotly breaks title lines on <br>; a raw newline renders as a space.
        title=(
            f"Task 2 — Q0 vs Q3 Accuracy ({grid_size}×{grid_size} maze)<br>"
            "Q3 = Q0 (same question repeated — tests information reuse)"
        ),
        color_discrete_map={"Q0": "#4CAF50", "Q3": "#F44336"},
    )
    fig.update_layout(yaxis_range=[0, 1], height=400)

    leaderboard = point_reuse_leaderboard(df)
    return fig, leaderboard
def plot_task2_by_grid() -> go.Figure:
    """Build a line chart of mean Q3 accuracy per model across grid sizes (Task 2).

    Returns:
        The line chart, or a text placeholder when there are no Task 2 results.
    """
    df = _load_results()["point_reuse"]
    if df.empty:
        return _make_empty_fig("No Task 2 results found.")

    q3_rows = df[df["question_idx"] == 3]
    q3_mean = (
        q3_rows.groupby(["display_name", "grid_size"])["accuracy"].mean().reset_index()
    )
    fig = px.line(
        q3_mean,
        x="grid_size",
        y="accuracy",
        color="display_name",
        markers=True,
        labels={
            "grid_size": "Grid Size",
            "accuracy": "Q3 Accuracy",
            "display_name": "Model",
        },
        title="Task 2 — Q3 Accuracy by Grid Size (Q3 = Q0 repeated)",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    fig.update_layout(yaxis_range=[0, 1], height=380)
    return fig
# ── Task 3 plots ─────────────────────────────────────────────────────────────
def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]:
    """Build the Task 3 per-question accuracy bar chart and its leaderboard.

    Returns:
        ``(figure, leaderboard)``. When there are no Task 3 results the
        figure is a text placeholder and the leaderboard is an empty
        ``DataFrame``.
    """
    df = _load_results()["compositional_distance"]
    if df.empty:
        return (
            _make_empty_fig("No Task 3 results found.<br>Run experiments first."),
            pd.DataFrame(),
        )

    # Legend label and bar colour for each question index.
    q_labels = {0: "Q0: A→M", 1: "Q1: D→M", 2: "Q2: B→C (compositional)"}
    q_colors = {
        q_labels[0]: "#2196F3",
        q_labels[1]: "#9C27B0",
        q_labels[2]: "#FF5722",
    }

    agg = df.groupby(["display_name", "question_idx"])["accuracy"].mean().reset_index()
    agg["Question"] = agg["question_idx"].map(q_labels)

    fig = px.bar(
        agg,
        x="display_name",
        y="accuracy",
        color="Question",
        barmode="group",
        labels={"display_name": "Model", "accuracy": "Accuracy"},
        # Plotly breaks title lines on <br>; a raw newline renders as a space.
        title=(
            "Task 3 — Compositional Distance Comparison<br>"
            "Q2 can be composed from Q0+Q1 (corner→center distances)"
        ),
        color_discrete_map=q_colors,
    )
    fig.update_layout(yaxis_range=[0, 1], height=420)

    leaderboard = compositional_distance_leaderboard(df)
    return fig, leaderboard
def plot_task3_by_grid() -> go.Figure:
    """Build a line chart of mean Q2 accuracy per model across grid sizes (Task 3).

    Returns:
        The line chart, or a text placeholder when there are no Task 3 results.
    """
    df = _load_results()["compositional_distance"]
    if df.empty:
        return _make_empty_fig("No Task 3 results found.")

    q2_rows = df[df["question_idx"] == 2]
    q2_mean = (
        q2_rows.groupby(["display_name", "grid_size"])["accuracy"].mean().reset_index()
    )
    fig = px.line(
        q2_mean,
        x="grid_size",
        y="accuracy",
        color="display_name",
        markers=True,
        labels={
            "grid_size": "Grid Size",
            "accuracy": "Q2 Accuracy",
            "display_name": "Model",
        },
        title="Task 3 — Q2 (Compositional) Accuracy by Grid Size",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    fig.update_layout(yaxis_range=[0, 1], height=380)
    return fig
| # --------------------------------------------------------------------------- | |
| # Script generation tab | |
| # --------------------------------------------------------------------------- | |
# Task names as shown in the UI, paired with the task keys used by the pipeline.
_TASK_DISPLAY_PAIRS = (
    ("Maze Navigation", "maze_navigation"),
    ("Sequential Point Reuse", "point_reuse"),
    ("Compositional Distance Comparison", "compositional_distance"),
)
TASK_DISPLAY_MAP = dict(_TASK_DISPLAY_PAIRS)
def _make_plain_script(job, api_key_placeholder: str) -> str:
    """Return a plain bash script (no SLURM headers) for running a job directly.

    Args:
        job: Object providing ``label``, ``api_key_env``, ``working_dir`` and
            ``python_cmd`` (a sequence of command tokens).
        api_key_placeholder: Text written verbatim after ``export <env>=``.

    Returns:
        The script text, ending with a newline.
    """
    # Coerce tokens to str so Path entries in python_cmd do not break join().
    command = " \\\n ".join(str(token) for token in job.python_cmd)
    lines = [
        "#!/usr/bin/env bash",
        f"# {job.label}",
        f"export {job.api_key_env}={api_key_placeholder}",
        "",
        # Abort rather than run the command from the wrong directory.
        f"cd {job.working_dir} || exit 1",
        command,
        "",
    ]
    return "\n".join(lines)
def _rebase_path(path, old_root: Path, new_root: Path) -> Path | None:
    """Return *path* moved from under *old_root* to under *new_root*, or None if it is not under *old_root*."""
    try:
        return new_root / Path(path).relative_to(old_root)
    except ValueError:
        return None


def _rebase_job(job, old_root: Path, new_root: Path) -> None:
    """Re-root a job's working dir, output dir and script path in place; paths outside *old_root* are left alone."""
    for attr in ("working_dir", "output_dir"):
        moved = _rebase_path(getattr(job, attr), old_root, new_root)
        if moved is not None:
            setattr(job, attr, moved)
    # The first two command tokens are "python" and the script path.
    if len(job.python_cmd) >= 2:
        moved_script = _rebase_path(job.python_cmd[1], old_root, new_root)
        if moved_script is not None:
            job.python_cmd[1] = str(moved_script)


def generate_scripts(
    tasks: list[str],
    models: list[str],
    grid_sizes_str: str,
    formats: list[str],
    strategies: list[str],
    script_type: str,
    repo_path: str,
) -> tuple[str, str | None]:
    """Build experiment scripts and return ``(preview_text, zip_path)``.

    Args:
        tasks: Display names of the selected tasks (keys of ``TASK_DISPLAY_MAP``).
        models: Selected model identifiers.
        grid_sizes_str: Comma-separated integers; empty means no grid filter.
        formats: Selected input formats; empty means no format filter.
        strategies: Selected prompt strategies; empty means no strategy filter.
        script_type: Script-type radio label; the SLURM label selects sbatch
            scripts, anything else plain bash.
        repo_path: Optional repo root used to re-root paths in the scripts.

    Returns:
        On success, a preview string and the path of a temporary zip holding
        one script per job plus ``run_all.sh``. On invalid input, or when no
        job matches, an explanatory message and ``None``.
    """
    if not tasks:
        return "Select at least one task.", None
    if not models:
        return "Select at least one model.", None

    try:
        # `or ""` tolerates a cleared textbox delivering None.
        grid_sizes = [
            int(part.strip()) for part in (grid_sizes_str or "").split(",") if part.strip()
        ]
    except ValueError:
        return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", None

    selected_tasks = [TASK_DISPLAY_MAP[t] for t in tasks if t in TASK_DISPLAY_MAP]
    jobs = build_all_jobs(
        cfg=CFG,
        tasks=selected_tasks,
        models=models,
        grid_sizes=grid_sizes or None,
        input_formats=formats or None,
        prompt_strategies=strategies or None,
        config_path=CONFIG_PATH,
    )
    if not jobs:
        return "No jobs matched the selected filters.", None

    # Optional user-supplied repo root that replaces the config-derived one.
    repo_override = (repo_path or "").strip() or None
    use_slurm = script_type == "SLURM (.sh with #SBATCH headers)"
    log_dir = Path(repo_override or ".") / "maze-solver" / "eval_llm_logs"
    config_root = CONFIG_PATH.parent.parent.parent

    script_contents: dict[str, str] = {}
    for job in jobs:
        safe = job.label.replace(" ", "_").replace("|", "").replace("/", "_").strip("_")
        filename = f"{safe}.sh"
        if repo_override:
            _rebase_job(job, config_root, Path(repo_override))
        if use_slurm:
            content = make_sbatch_script(job, log_dir)
        else:
            content = _make_plain_script(job, f'"${{{job.api_key_env}}}"')
        script_contents[filename] = content

    # Master script that runs every generated script in sorted order; names
    # are shell-quoted because labels may contain shell metacharacters.
    run_all_lines = ["#!/usr/bin/env bash", "# Run all generated scripts sequentially", ""]
    run_all_lines.extend(f"bash {shlex.quote(fname)}" for fname in sorted(script_contents))

    # Gradio's File component needs a real path, so write a named temp file
    # that outlives this call (delete=False). `with` closes the handle even
    # if writing the archive fails.
    tmp = tempfile.NamedTemporaryFile(
        delete=False, suffix=".zip", prefix="spatialbench_scripts_"
    )
    with tmp, zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zf:
        for fname, content in script_contents.items():
            zf.writestr(fname, content)
        zf.writestr("run_all.sh", "\n".join(run_all_lines) + "\n")

    # Preview: summary plus the first script.
    n = len(script_contents)
    first_name, first_content = next(iter(script_contents.items()))
    preview = (
        f"Generated {n} script(s) for {len(models)} model(s) across {len(selected_tasks)} task(s).\n"
        f"Download the zip below, unzip in your cluster, then run: bash run_all.sh\n\n"
        f"── {first_name} ──\n{first_content}"
        + (f"\n\n... and {n - 1} more script(s) in the zip." if n > 1 else "")
    )
    return preview, tmp.name
| # --------------------------------------------------------------------------- | |
| # Gradio UI | |
| # --------------------------------------------------------------------------- | |
# Markdown shown at the top of the Leaderboard tab.
PAPER_ABSTRACT = """
**Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks**

We systematically evaluate the spatial understanding of large language models through maze tasks—a
controlled testing context requiring multi-step planning and spatial abstraction. Across experiments
with Gemini-2.5-Flash, GPT-5-mini, Claude-Haiku-4.5, and DeepSeek-Chat, we uncover significant
discrepancies in spatial reasoning that challenge assumptions about LLM planning capabilities.

Key findings:

- **Representation sensitivity**: Gemini drops from 86% (raw tokenized) to 34% (visual grid) on 5×5 mazes with CoT
- **Prompting dependency**: Claude-Haiku fails completely without CoT, recovers to 78% with it
- **No spatial memory**: Models treat sequential questions independently, failing to reuse computed spatial knowledge
"""

# Custom stylesheet passed to gr.Blocks: web fonts, a smaller leaderboard
# table font, and a hidden footer.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&family=IBM+Plex+Mono:wght@400;500&display=swap');
*, body, .gradio-container { font-family: 'Inter', ui-sans-serif, system-ui, sans-serif !important; }
code, pre, .monospace { font-family: 'IBM Plex Mono', ui-monospace, monospace !important; }
.leaderboard-table { font-size: 0.9em; }
footer { display: none !important; }
"""
def build_ui() -> gr.Blocks:
    """Assemble the three-tab Gradio interface and wire up its event handlers.

    Tabs: Leaderboard (plots and tables for the three tasks), Get Scripts
    (form that calls ``generate_scripts``) and About (static Markdown).

    Returns:
        The constructed ``gr.Blocks`` app; the caller launches it.
    """
    with gr.Blocks(
        title="SpatialBench — Do LLMs Build Spatial World Models?",
        css=CSS,
        theme=gr.themes.Soft(primary_hue="blue"),
    ) as demo:
        gr.Markdown("# 🧩 SpatialBench")
        gr.Markdown(
            "**Evaluating Spatial World Models in Large Language Models** · "
            "[Paper (ICLR 2026 Workshop)](https://arxiv.org/abs/...) · "
            "[Code](https://github.com/...)"
        )

        with gr.Tabs():
            # ================================================================
            # Tab 1: Leaderboard
            # ================================================================
            with gr.Tab("🏆 Leaderboard"):
                gr.Markdown(PAPER_ABSTRACT)
                gr.Markdown("---")

                gr.Markdown("## Task 1 — Maze Navigation (Planning)")
                gr.Markdown(
                    "Models find shortest paths through mazes. "
                    "Two input formats: **raw** tokenized adjacency lists vs **visual** character grids."
                )
                with gr.Row():
                    t1_k = gr.Radio(
                        choices=[0, 3, 5], value=0, label="K-shot",
                        info="Number of in-context examples",
                    )
                    t1_fmt = gr.Radio(
                        choices=["raw", "visual"], value="raw", label="Input Format",
                    )
                t1_plot = gr.Plot(label="Accuracy by Grid Size")
                t1_lb = gr.Dataframe(
                    label="Leaderboard (mean accuracy across grid sizes)",
                    elem_classes=["leaderboard-table"],
                )
                t1_fmt_plot = gr.Plot(label="Raw vs Visual Format Comparison")

                def refresh_task1(k, fmt):
                    fig, lb = plot_task1_accuracy(int(k), fmt)
                    fmt_fig = plot_task1_format_comparison()
                    return fig, lb, fmt_fig

                for inp in [t1_k, t1_fmt]:
                    inp.change(
                        refresh_task1, inputs=[t1_k, t1_fmt],
                        outputs=[t1_plot, t1_lb, t1_fmt_plot],
                    )

                gr.Markdown("---")
                gr.Markdown("## Task 2 — Sequential Reasoning with Point Reuse")
                gr.Markdown(
                    "Models answer 4 proximity questions. **Q3 = Q0** (same question repeated). "
                    "Do models reuse their earlier computation, or start from scratch?"
                )
                t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5, label="Grid Size")
                t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy")
                t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes")
                t2_lb = gr.Dataframe(label="Leaderboard", elem_classes=["leaderboard-table"])

                def refresh_task2(gs):
                    fig, lb = plot_task2_q0_q3(int(gs))
                    grid_fig = plot_task2_by_grid()
                    return fig, grid_fig, lb

                t2_grid.change(
                    refresh_task2, inputs=[t2_grid],
                    outputs=[t2_plot, t2_grid_plot, t2_lb],
                )

                gr.Markdown("---")
                gr.Markdown("## Task 3 — Compositional Distance Comparison")
                gr.Markdown(
                    "Models answer 3 questions about maze corners (A, B, C, D) and center M. "
                    "**Q2** (B→C) can potentially be composed from Q0 (A→M) and Q1 (D→M). "
                    "Δ = Q2 accuracy − avg(Q0, Q1)."
                )
                t3_plot = gr.Plot(label="Q0 / Q1 / Q2 Accuracy by Model")
                t3_grid_plot = gr.Plot(label="Q2 Accuracy Across Grid Sizes")
                t3_lb = gr.Dataframe(
                    label="Leaderboard (Δ shows compositional benefit)",
                    elem_classes=["leaderboard-table"],
                )

                with gr.Row():
                    refresh_lb_btn = gr.Button("🔄 Refresh Results", variant="secondary")

                def refresh_all_leaderboard(k=0, fmt="raw", gs=5):
                    # Redraw with the controls' current values so a manual
                    # refresh does not reset the plots to the defaults while
                    # the controls still show another selection.
                    t1_fig, t1_table, t1_ff = refresh_task1(k, fmt)
                    t2_fig, t2_gfig, t2_lb_table = refresh_task2(gs)
                    t3_fig, t3_lb_table = plot_task3_compositional()
                    t3_gfig = plot_task3_by_grid()
                    return (
                        t1_fig, t1_table, t1_ff,
                        t2_fig, t2_gfig, t2_lb_table,
                        t3_fig, t3_gfig, t3_lb_table,
                    )

                leaderboard_inputs = [t1_k, t1_fmt, t2_grid]
                leaderboard_outputs = [
                    t1_plot, t1_lb, t1_fmt_plot,
                    t2_plot, t2_grid_plot, t2_lb,
                    t3_plot, t3_grid_plot, t3_lb,
                ]
                refresh_lb_btn.click(
                    refresh_all_leaderboard,
                    inputs=leaderboard_inputs,
                    outputs=leaderboard_outputs,
                )
                # Populate every plot and table when the page first loads.
                demo.load(
                    refresh_all_leaderboard,
                    inputs=leaderboard_inputs,
                    outputs=leaderboard_outputs,
                )

            # ================================================================
            # Tab 2: Get Scripts
            # ================================================================
            with gr.Tab("⬇️ Get Scripts"):
                gr.Markdown(
                    "## Generate Experiment Scripts\n"
                    "Configure the experiments you want to run, then download a zip of ready-to-run "
                    "shell scripts.\n\n"
                    "**How to use:**\n"
                    "1. Select tasks, models, and settings below\n"
                    "2. Enter the path to your local clone of the repo (so paths in the scripts are correct)\n"
                    "3. Click **Generate** — a preview appears and a zip is ready to download\n"
                    "4. Unzip on your cluster, set your API key(s) as environment variables, then:\n"
                    # The fenced block is indented so it stays inside list item 4.
                    "   ```bash\n"
                    "   export GEMINI_API_KEY=your_key_here\n"
                    "   bash run_all.sh # run sequentially\n"
                    "   # — or submit individually:\n"
                    "   sbatch Task_1__Maze_Navigation__gemini-2.5-flash__raw__cot.sh\n"
                    "   ```"
                )

                # Preselect the usual default model only if the config offers
                # it; otherwise fall back to the first configured model.
                default_models = (
                    ["gemini-2.5-flash"]
                    if "gemini-2.5-flash" in MODEL_CHOICES
                    else MODEL_CHOICES[:1]
                )

                with gr.Row():
                    with gr.Column(scale=2):
                        gen_tasks = gr.CheckboxGroup(
                            choices=list(TASK_DISPLAY_MAP.keys()),
                            value=["Maze Navigation"],
                            label="Tasks",
                        )
                        gen_models = gr.CheckboxGroup(
                            choices=MODEL_CHOICES,
                            value=default_models,
                            label="Models",
                        )
                        gen_grids = gr.Textbox(
                            value="5,6,7,8,9",
                            label="Grid Sizes",
                            info="Comma-separated. Paper used 5–9.",
                        )
                        with gr.Row():
                            gen_formats = gr.CheckboxGroup(
                                choices=["raw", "visual"],
                                value=["raw", "visual"],
                                label="Input Formats (Task 1 only)",
                            )
                            gen_strategies = gr.CheckboxGroup(
                                choices=["base", "cot", "reasoning"],
                                value=["base", "cot", "reasoning"],
                                label="Prompt Strategies",
                            )
                    with gr.Column(scale=1):
                        gen_script_type = gr.Radio(
                            choices=[
                                "SLURM (.sh with #SBATCH headers)",
                                "Plain bash (.sh, no SLURM)",
                            ],
                            value="SLURM (.sh with #SBATCH headers)",
                            label="Script Type",
                            info="Use SLURM if you have a cluster. Plain bash runs directly.",
                        )
                        gen_repo_path = gr.Textbox(
                            label="Repo path on your cluster",
                            placeholder="/path/to/llm-maze-solver",
                            info="Absolute path to the llm-maze-solver repo root on the machine where you'll run the scripts. Leave blank to use relative paths.",
                        )

                with gr.Row():
                    gen_btn = gr.Button("⚙️ Generate Scripts", variant="primary", scale=2)

                gen_preview = gr.Textbox(
                    label="Preview (first script)",
                    interactive=False,
                    lines=20,
                    max_lines=30,
                )
                gen_download = gr.File(
                    label="Download Scripts (.zip)",
                    interactive=False,
                )
                gen_btn.click(
                    generate_scripts,
                    inputs=[
                        gen_tasks, gen_models, gen_grids,
                        gen_formats, gen_strategies,
                        gen_script_type, gen_repo_path,
                    ],
                    outputs=[gen_preview, gen_download],
                )

            # ================================================================
            # Tab 3: About
            # ================================================================
            with gr.Tab("ℹ️ About"):
                gr.Markdown("""
## About SpatialBench

SpatialBench is the evaluation platform accompanying the paper:

> **Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks**
> *Under review at ICLR 2026 Workshop*

### Three Tasks

| Task | Type | What it tests |
|------|------|---------------|
| **Task 1: Maze Navigation** | Planning | Find shortest path from start to goal |
| **Task 2: Sequential Point Reuse** | Reasoning | Reuse Q0 computation when Q3=Q0 |
| **Task 3: Compositional Distance** | Reasoning | Compose corner→center distances for Q2 |

### Input Representations

- **Raw (tokenized)**: `<ADJLIST_START> (0,0) <--> (0,1) ... <ADJLIST_END>`
- **Visual (grid)**: `Row 0: ['.', 'S', '.', '#'] Row 1: ['#', '.', '.', 'E']`

### Models Evaluated

| Model | Provider |
|-------|----------|
| Gemini 2.5 Flash | Google |
| GPT-5 Mini | OpenAI |
| Claude Haiku 4.5 | Anthropic |
| DeepSeek Chat | DeepSeek |

### Grid Sizes

Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default.

### Reproducing Experiments

Clone the repo and use the **Get Scripts** tab above to generate SLURM scripts, or use the CLI directly:

```bash
cd pipeline/
python run_experiments.py --tasks maze_navigation --models gemini-2.5-flash --mode slurm --dry-run
```

### Citation

```bibtex
@inproceedings{spatialbench2026,
title = {Do {LLMs} Build Spatial World Models? Evidence from Grid-World Maze Tasks},
author = {Anonymous},
booktitle = {ICLR 2026 Workshop},
year = {2026},
}
```
""")
    return demo
| # --------------------------------------------------------------------------- | |
| # Entry point | |
| # --------------------------------------------------------------------------- | |
if __name__ == "__main__":
    # "0.0.0.0" makes the server listen on all network interfaces.
    launch_options = {
        "server_name": "0.0.0.0",
        "share": False,
        "show_error": True,
    }
    demo = build_ui()
    demo.launch(**launch_options)