Spaces:

weijiang99
/

SpatialBench

Running

App Files Files Community

SpatialBench / app.py

weijiang99

Update SpatialBench pipeline

cb5acaf verified 11 days ago

raw

history blame contribute delete

27.4 kB

	"""
	app.py — SpatialBench Gradio application
	-----------------------------------------
	Entrypoint for the HuggingFace Space "SpatialBench".

	Three tabs:
	1. Leaderboard — visualize pre-computed results from all three tasks
	2. Get Scripts — generate ready-to-run SLURM scripts (or plain shell
	scripts) as a downloadable zip; no compute needed here
	3. About — paper info and citation

	To run locally:
	cd pipeline/
	python app.py

	To deploy on HuggingFace Spaces:
	- No secrets required for the Leaderboard or Get Scripts tabs.
	- The Space entrypoint is this file (app.py).
	"""

	from __future__ import annotations

	import os
	import sys
	import zipfile
	import tempfile
	from pathlib import Path

	import gradio as gr
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go

	# Load .env if running locally
	def _load_dotenv(path: Path) -> None:
	    """Copy KEY=VALUE pairs from the file at *path* into ``os.environ``.

	    Blank lines, lines starting with ``#`` and lines without ``=`` are
	    skipped. Variables that are already set in the environment are left
	    untouched (``setdefault``), so real environment variables always win
	    over the file. A missing file is not an error.

	    Args:
	        path: Location of the dotenv-style file.
	    """
	    if not path.exists():
	        return
	    # Explicit encoding: the platform default is not UTF-8 everywhere.
	    with open(path, encoding="utf-8") as fh:
	        for raw in fh:
	            line = raw.strip()
	            if not line or line.startswith("#") or "=" not in line:
	                continue
	            key, value = (part.strip() for part in line.split("=", 1))
	            # KEY="value" / KEY='value' is the usual dotenv spelling; keep
	            # the quotes out of the stored value.
	            if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
	                value = value[1:-1]
	            # A line such as "=foo" has no usable name; skip it instead of
	            # handing an empty key to os.environ.
	            if key:
	                os.environ.setdefault(key, value)


	_env = Path(__file__).parent / ".env"
	_load_dotenv(_env)

	# Add repo root to path so pipeline imports work
	sys.path.insert(0, str(Path(__file__).parent))

	from pipeline.task_builder import load_config, build_all_jobs, make_sbatch_script
	from pipeline.results_loader import (
	load_all_results,
	maze_navigation_leaderboard,
	point_reuse_leaderboard,
	compositional_distance_leaderboard,
	)

	# ---------------------------------------------------------------------------
	# Paths / config
	# ---------------------------------------------------------------------------
	# Experiment configuration file, resolved relative to this module.
	CONFIG_PATH = Path(__file__).parent / "configs" / "experiments.yaml"
	# Parsed once at import time and shared by the script generator.
	CFG = load_config(CONFIG_PATH)
	# Keys of the config's "models" mapping; offered as the model checkboxes
	# in the "Get Scripts" tab.
	MODEL_CHOICES = list(CFG["models"].keys())

	# ---------------------------------------------------------------------------
	# Leaderboard helpers
	# ---------------------------------------------------------------------------

	def _load_results():
	    """Load the result tables for all three tasks, never raising.

	    Returns:
	        A dict keyed by ``"maze_navigation"``, ``"point_reuse"`` and
	        ``"compositional_distance"``. When loading succeeds this is whatever
	        ``load_all_results(CONFIG_PATH)`` returns; when it fails for any
	        reason every key maps to an empty ``DataFrame`` so the plot helpers
	        can show their "no results" placeholders.
	    """
	    try:
	        return load_all_results(CONFIG_PATH)
	    except Exception:
	        # Best-effort by design (the UI must still render), but record the
	        # traceback so a broken results directory is diagnosable instead of
	        # looking identical to "no experiments have been run yet".
	        import logging

	        logging.getLogger(__name__).exception(
	            "Could not load results; falling back to empty tables"
	        )
	        return {
	            "maze_navigation": pd.DataFrame(),
	            "point_reuse": pd.DataFrame(),
	            "compositional_distance": pd.DataFrame(),
	        }


	def _make_empty_fig(msg: str) -> go.Figure:
	    """Return an axis-free placeholder figure with *msg* centred on it.

	    Args:
	        msg: Text to display. Newline characters are shown as line breaks.

	    Returns:
	        A 300px-high figure with transparent backgrounds, hidden axes and a
	        single annotation positioned in paper coordinates at (0.5, 0.5).
	    """
	    fig = go.Figure()
	    fig.add_annotation(
	        # Plotly lays text out as pseudo-HTML and only breaks lines on
	        # <br>; callers pass "\n", so translate it here.
	        text=msg.replace("\n", "<br>"),
	        x=0.5,
	        y=0.5,
	        showarrow=False,
	        font=dict(size=16),
	        xref="paper",
	        yref="paper",
	    )
	    fig.update_layout(
	        xaxis_visible=False,
	        yaxis_visible=False,
	        height=300,
	        paper_bgcolor="rgba(0,0,0,0)",
	        plot_bgcolor="rgba(0,0,0,0)",
	    )
	    return fig


	# ── Task 1 plots ─────────────────────────────────────────────────────────────

	def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]:
	    """Plot Task 1 accuracy against grid size for one k-shot / format setting.

	    Args:
	        k_shot: Value matched against the ``k_shot`` column.
	        input_format: Value matched against the ``input_format`` column.

	    Returns:
	        ``(figure, leaderboard)``. The figure has one line per model
	        (colour) and prompt strategy (dash style). The leaderboard comes
	        from ``maze_navigation_leaderboard`` applied to the unfiltered Task 1
	        table with the same ``k_shot``. When there is nothing to plot, a
	        placeholder figure and an empty ``DataFrame`` are returned instead.
	    """
	    maze_df = _load_results()["maze_navigation"]
	    if maze_df.empty:
	        placeholder = _make_empty_fig("No Task 1 results found.\nRun experiments first.")
	        return placeholder, pd.DataFrame()

	    matches_k = maze_df["k_shot"] == k_shot
	    matches_format = maze_df["input_format"] == input_format
	    selected = maze_df[matches_k & matches_format]
	    if selected.empty:
	        placeholder = _make_empty_fig(f"No results for k={k_shot}, format={input_format}")
	        return placeholder, pd.DataFrame()

	    axis_labels = {
	        "grid_size": "Grid Size (n×n)",
	        "accuracy": "Accuracy",
	        "display_name": "Model",
	        "prompt_strategy": "Strategy",
	    }
	    # Horizontal legend parked above the plot area, right-aligned.
	    legend_style = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)

	    figure = px.line(
	        selected,
	        x="grid_size",
	        y="accuracy",
	        color="display_name",
	        line_dash="prompt_strategy",
	        markers=True,
	        labels=axis_labels,
	        title=f"Task 1 — Maze Navigation ({input_format} format, {k_shot}-shot)",
	        color_discrete_sequence=px.colors.qualitative.Set2,
	    )
	    figure.update_layout(yaxis_range=[0, 1], legend=legend_style, height=420)

	    return figure, maze_navigation_leaderboard(maze_df, k_shot=k_shot)


	def plot_task1_format_comparison() -> go.Figure:
	    """Grouped bar chart of mean 0-shot Task 1 accuracy per model and input format.

	    Uses the 0-shot rows with ``prompt_strategy == "cot"``; if there are
	    none, falls back to all 0-shot rows. Accuracy is averaged per
	    (``display_name``, ``input_format``) pair. The title states which of the
	    two selections was actually plotted.

	    Returns:
	        The bar chart, or a placeholder figure when there are no Task 1
	        results or no 0-shot rows at all.
	    """
	    df = _load_results()["maze_navigation"]
	    if df.empty:
	        return _make_empty_fig("No Task 1 results found.")

	    zero_shot = df[df["k_shot"] == 0]
	    sub = zero_shot[zero_shot["prompt_strategy"] == "cot"]
	    strategy_note = "CoT"
	    if sub.empty:
	        # No CoT runs: show every strategy, and say so in the title rather
	        # than mislabelling the chart as CoT.
	        sub = zero_shot
	        strategy_note = "all strategies"
	    if sub.empty:
	        # Nothing to aggregate; an empty bar chart would just be blank axes.
	        return _make_empty_fig("No 0-shot Task 1 results found.")

	    agg = sub.groupby(["display_name", "input_format"])["accuracy"].mean().reset_index()

	    fig = px.bar(
	        agg,
	        x="display_name",
	        y="accuracy",
	        color="input_format",
	        barmode="group",
	        labels={
	            "display_name": "Model",
	            "accuracy": "Mean Accuracy",
	            "input_format": "Input Format",
	        },
	        title=(
	            "Task 1 — Raw vs Visual Format "
	            f"(0-shot, {strategy_note}, averaged over grid sizes)"
	        ),
	        color_discrete_map={"raw": "#2196F3", "visual": "#FF9800"},
	    )
	    fig.update_layout(yaxis_range=[0, 1], height=380)
	    return fig


	# ── Task 2 plots ─────────────────────────────────────────────────────────────

	def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]:
	    """Compare per-model accuracy on question 0 and question 3 of Task 2.

	    Args:
	        grid_size: Value matched against the ``grid_size`` column.

	    Returns:
	        ``(figure, leaderboard)``. The figure is a grouped bar chart with one
	        Q0 and one Q3 bar per model, each the mean accuracy over the rows
	        with that ``question_idx``. The leaderboard comes from
	        ``point_reuse_leaderboard`` applied to the unfiltered Task 2 table
	        (all grid sizes). When there is nothing to plot, a placeholder figure
	        and an empty ``DataFrame`` are returned.
	    """
	    df = _load_results()["point_reuse"]
	    if df.empty:
	        return _make_empty_fig("No Task 2 results found.\nRun experiments first."), pd.DataFrame()

	    sub = df[df["grid_size"] == grid_size]
	    if sub.empty:
	        return _make_empty_fig(f"No Task 2 results for {grid_size}×{grid_size}"), pd.DataFrame()

	    q0 = sub[sub["question_idx"] == 0].groupby("display_name")["accuracy"].mean().rename("Q0")
	    q3 = sub[sub["question_idx"] == 3].groupby("display_name")["accuracy"].mean().rename("Q3")
	    # Side-by-side columns aligned on model name, then reshaped to long form
	    # (one row per model/question) as px.bar expects.
	    plot_df = pd.concat([q0, q3], axis=1).reset_index()
	    plot_df_melt = plot_df.melt(id_vars="display_name", var_name="Question", value_name="Accuracy")

	    fig = px.bar(
	        plot_df_melt,
	        x="display_name",
	        y="Accuracy",
	        color="Question",
	        barmode="group",
	        labels={"display_name": "Model"},
	        # Plotly titles break lines on <br>; a "\n" would be rendered as a
	        # plain space and run the two sentences together.
	        title=f"Task 2 — Q0 vs Q3 Accuracy ({grid_size}×{grid_size} maze)<br>"
	              "Q3 = Q0 (same question repeated — tests information reuse)",
	        color_discrete_map={"Q0": "#4CAF50", "Q3": "#F44336"},
	    )
	    fig.update_layout(yaxis_range=[0, 1], height=400)
	    lb = point_reuse_leaderboard(df)
	    return fig, lb


	def plot_task2_by_grid() -> go.Figure:
	    """Line chart of mean Task 2 question-3 accuracy per model across grid sizes.

	    Returns:
	        One line per model, or a placeholder figure when no Task 2 results
	        are available.
	    """
	    point_df = _load_results()["point_reuse"]
	    if point_df.empty:
	        return _make_empty_fig("No Task 2 results found.")

	    # Keep only the repeated question, then average per model and grid size.
	    is_q3 = point_df["question_idx"] == 3
	    grouped = point_df[is_q3].groupby(["display_name", "grid_size"])
	    q3_means = grouped["accuracy"].mean().reset_index()

	    axis_labels = {
	        "grid_size": "Grid Size",
	        "accuracy": "Q3 Accuracy",
	        "display_name": "Model",
	    }
	    figure = px.line(
	        q3_means,
	        x="grid_size",
	        y="accuracy",
	        color="display_name",
	        markers=True,
	        labels=axis_labels,
	        title="Task 2 — Q3 Accuracy by Grid Size (Q3 = Q0 repeated)",
	        color_discrete_sequence=px.colors.qualitative.Set2,
	    )
	    figure.update_layout(yaxis_range=[0, 1], height=380)
	    return figure


	# ── Task 3 plots ─────────────────────────────────────────────────────────────

	def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]:
	    """Grouped bar chart of mean Task 3 accuracy per model and question.

	    Accuracy is averaged per (``display_name``, ``question_idx``) pair over
	    all rows, and question indices 0, 1 and 2 are mapped to readable labels
	    used for bar colour.

	    Returns:
	        ``(figure, leaderboard)``, where the leaderboard comes from
	        ``compositional_distance_leaderboard`` applied to the full Task 3
	        table. When there are no Task 3 results, a placeholder figure and an
	        empty ``DataFrame`` are returned.
	    """
	    df = _load_results()["compositional_distance"]
	    if df.empty:
	        return _make_empty_fig("No Task 3 results found.\nRun experiments first."), pd.DataFrame()

	    agg = df.groupby(["display_name", "question_idx"])["accuracy"].mean().reset_index()
	    q_labels = {0: "Q0: A→M", 1: "Q1: D→M", 2: "Q2: B→C (compositional)"}
	    # Indices outside 0-2 have no label and become NaN here.
	    agg["Question"] = agg["question_idx"].map(q_labels)

	    fig = px.bar(
	        agg,
	        x="display_name",
	        y="accuracy",
	        color="Question",
	        barmode="group",
	        labels={"display_name": "Model", "accuracy": "Accuracy"},
	        # Plotly titles break lines on <br>; a "\n" would be rendered as a
	        # plain space and run the two sentences together.
	        title="Task 3 — Compositional Distance Comparison<br>"
	              "Q2 can be composed from Q0+Q1 (corner→center distances)",
	        color_discrete_map={
	            "Q0: A→M": "#2196F3",
	            "Q1: D→M": "#9C27B0",
	            "Q2: B→C (compositional)": "#FF5722",
	        },
	    )
	    fig.update_layout(yaxis_range=[0, 1], height=420)
	    lb = compositional_distance_leaderboard(df)
	    return fig, lb


	def plot_task3_by_grid() -> go.Figure:
	    """Line chart of mean Task 3 question-2 accuracy per model across grid sizes.

	    Returns:
	        One line per model, or a placeholder figure when no Task 3 results
	        are available.
	    """
	    comp_df = _load_results()["compositional_distance"]
	    if comp_df.empty:
	        return _make_empty_fig("No Task 3 results found.")

	    # Keep only the compositional question, then average per model and grid size.
	    is_q2 = comp_df["question_idx"] == 2
	    grouped = comp_df[is_q2].groupby(["display_name", "grid_size"])
	    q2_means = grouped["accuracy"].mean().reset_index()

	    axis_labels = {
	        "grid_size": "Grid Size",
	        "accuracy": "Q2 Accuracy",
	        "display_name": "Model",
	    }
	    figure = px.line(
	        q2_means,
	        x="grid_size",
	        y="accuracy",
	        color="display_name",
	        markers=True,
	        labels=axis_labels,
	        title="Task 3 — Q2 (Compositional) Accuracy by Grid Size",
	        color_discrete_sequence=px.colors.qualitative.Set2,
	    )
	    figure.update_layout(yaxis_range=[0, 1], height=380)
	    return figure


	# ---------------------------------------------------------------------------
	# Script generation tab
	# ---------------------------------------------------------------------------

	# Maps the task names shown as checkboxes in the "Get Scripts" tab to the
	# task identifiers passed on to build_all_jobs.
	TASK_DISPLAY_MAP = {
	    "Maze Navigation": "maze_navigation",
	    "Sequential Point Reuse": "point_reuse",
	    "Compositional Distance Comparison": "compositional_distance",
	}


	def _make_plain_script(job, api_key_placeholder: str) -> str:
	    """Return a plain bash script (no SLURM headers) for running a job directly.

	    Args:
	        job: Object providing ``label``, ``api_key_env``, ``working_dir`` and
	            ``python_cmd`` (a sequence of command tokens).
	        api_key_placeholder: Text written verbatim after
	            ``export <api_key_env>=``.

	    Returns:
	        The script text: shebang, a comment with the job label, the export
	        line, a ``cd`` into the working directory, and the command with one
	        token per continuation line. Ends with a trailing newline.
	    """
	    import shlex  # local import keeps this block self-contained

	    # The working directory can derive from a user-typed repo path, so quote
	    # it: an unquoted path containing spaces or shell metacharacters would
	    # be split or interpreted by bash.
	    workdir = shlex.quote(str(job.working_dir))
	    lines = [
	        "#!/usr/bin/env bash",
	        f"# {job.label}",
	        f"export {job.api_key_env}={api_key_placeholder}",
	        "",
	        # Stop if the directory is missing rather than launching the job
	        # from wherever the script happened to be invoked.
	        f"cd {workdir} || exit 1",
	        # NOTE(review): command tokens are emitted unquoted, as before; they
	        # may intentionally carry shell syntax — confirm before quoting.
	        " \\\n ".join(job.python_cmd),
	        "",
	    ]
	    return "\n".join(lines)


	def _safe_script_stem(label: str) -> str:
	    """Turn a job label into a filename stem: spaces and slashes become underscores, pipes are dropped."""
	    return label.replace(" ", "_").replace("|", "").replace("/", "_").strip("_")


	def _rebase_job_paths(job, old_root: Path, new_root: Path) -> None:
	    """Re-root a job's working dir, output dir and script path from *old_root* onto *new_root*, in place."""
	    for attr in ("working_dir", "output_dir"):
	        try:
	            rel = getattr(job, attr).relative_to(old_root)
	        except ValueError:
	            # Path does not live under old_root; leave it as configured.
	            continue
	        setattr(job, attr, new_root / rel)

	    # The first two command tokens are "python" and the script path.
	    if len(job.python_cmd) >= 2:
	        try:
	            rel_script = Path(job.python_cmd[1]).relative_to(old_root)
	        except ValueError:
	            return
	        job.python_cmd[1] = str(new_root / rel_script)


	def generate_scripts(
	    tasks: list[str],
	    models: list[str],
	    grid_sizes_str: str,
	    formats: list[str],
	    strategies: list[str],
	    script_type: str,
	    repo_path: str,
	) -> tuple[str, str | None]:
	    """
	    Build experiment scripts and return (preview_text, zip_path).
	    zip_path is a temp file the user can download.

	    Args:
	        tasks: Display names of the selected tasks (keys of
	            ``TASK_DISPLAY_MAP``); unknown names are ignored.
	        models: Selected model identifiers.
	        grid_sizes_str: Comma-separated integers; empty means "no filter".
	        formats: Selected input formats; empty means "no filter".
	        strategies: Selected prompt strategies; empty means "no filter".
	        script_type: The script-type radio label; the SLURM label selects
	            sbatch scripts, anything else plain bash.
	        repo_path: Optional repo root to rebase generated paths onto.

	    Returns:
	        ``(preview_text, zip_path)``. On a validation problem or when no job
	        matches, ``preview_text`` is the message and ``zip_path`` is ``None``.
	        Otherwise the zip holds one ``.sh`` per job plus ``run_all.sh``; the
	        file is deliberately not deleted because the download component
	        needs a real path.
	    """
	    if not tasks:
	        return "Select at least one task.", None
	    if not models:
	        return "Select at least one model.", None

	    try:
	        # `or ""` tolerates a cleared textbox delivering None.
	        grid_sizes = [int(g.strip()) for g in (grid_sizes_str or "").split(",") if g.strip()]
	    except ValueError:
	        return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", None

	    selected_tasks = [TASK_DISPLAY_MAP[t] for t in tasks if t in TASK_DISPLAY_MAP]

	    jobs = build_all_jobs(
	        cfg=CFG,
	        tasks=selected_tasks,
	        models=models,
	        grid_sizes=grid_sizes or None,
	        input_formats=formats or None,
	        prompt_strategies=strategies or None,
	        config_path=CONFIG_PATH,
	    )

	    if not jobs:
	        return "No jobs matched the selected filters.", None

	    # Optionally override repo path in working_dir
	    repo_override = (repo_path or "").strip() or None

	    use_slurm = (script_type == "SLURM (.sh with #SBATCH headers)")
	    log_dir = Path(repo_override or ".") / "maze-solver" / "eval_llm_logs"
	    # Root that config-derived paths are expressed under; computed once
	    # instead of three times per job.
	    config_root = CONFIG_PATH.parent.parent.parent

	    script_contents: dict[str, str] = {}
	    for job in jobs:
	        stem = _safe_script_stem(job.label)
	        filename = f"{stem}.sh"
	        # Two labels can sanitize to the same stem; number the later ones so
	        # no script silently overwrites another.
	        suffix = 2
	        while filename in script_contents:
	            filename = f"{stem}_{suffix}.sh"
	            suffix += 1

	        # If a repo path override was provided, patch the job's paths
	        if repo_override:
	            _rebase_job_paths(job, config_root, Path(repo_override))

	        if use_slurm:
	            content = make_sbatch_script(job, log_dir)
	        else:
	            content = _make_plain_script(job, f'"${{{job.api_key_env}}}"')

	        script_contents[filename] = content

	    # Write zip to a named temp file (Gradio File component needs a real path).
	    # The context manager closes the handle even if writing fails.
	    with tempfile.NamedTemporaryFile(
	        delete=False, suffix=".zip", prefix="spatialbench_scripts_"
	    ) as tmp:
	        with zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zf:
	            for fname, content in script_contents.items():
	                zf.writestr(fname, content)
	            # Also include a master run_all.sh
	            run_all_lines = ["#!/usr/bin/env bash", "# Run all generated scripts sequentially", ""]
	            run_all_lines.extend(f"bash {fname}" for fname in sorted(script_contents))
	            zf.writestr("run_all.sh", "\n".join(run_all_lines) + "\n")
	        zip_path = tmp.name

	    # Preview: show first script + summary
	    n = len(script_contents)
	    first_name, first_content = next(iter(script_contents.items()))
	    preview = (
	        f"Generated {n} script(s) for {len(models)} model(s) across {len(selected_tasks)} task(s).\n"
	        f"Download the zip below, unzip in your cluster, then run: bash run_all.sh\n\n"
	        f"── {first_name} ──\n{first_content}"
	        + (f"\n\n... and {n - 1} more script(s) in the zip." if n > 1 else "")
	    )

	    return preview, zip_path


	# ---------------------------------------------------------------------------
	# Gradio UI
	# ---------------------------------------------------------------------------

	# Markdown shown at the top of the Leaderboard tab.
	PAPER_ABSTRACT = """
	Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks

	We systematically evaluate the spatial understanding of large language models through maze tasks—a
	controlled testing context requiring multi-step planning and spatial abstraction. Across experiments
	with Gemini-2.5-Flash, GPT-5-mini, Claude-Haiku-4.5, and DeepSeek-Chat, we uncover significant
	discrepancies in spatial reasoning that challenge assumptions about LLM planning capabilities.

	Key findings:
	- Representation sensitivity: Gemini drops from 86% (raw tokenized) to 34% (visual grid) on 5×5 mazes with CoT
	- Prompting dependency: Claude-Haiku fails completely without CoT, recovers to 78% with it
	- No spatial memory: Models treat sequential questions independently, failing to reuse computed spatial knowledge
	"""

	# Stylesheet passed to gr.Blocks: loads the Inter and IBM Plex Mono web
	# fonts, shrinks leaderboard tables slightly, and hides the page footer.
	CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&family=IBM+Plex+Mono:wght@400;500&display=swap');
	*, body, .gradio-container { font-family: 'Inter', ui-sans-serif, system-ui, sans-serif !important; }
	code, pre, .monospace { font-family: 'IBM Plex Mono', ui-monospace, monospace !important; }
	.leaderboard-table { font-size: 0.9em; }
	footer { display: none !important; }
	"""


	def build_ui() -> gr.Blocks:
	    """Assemble the three-tab Gradio interface and return it.

	    Tabs:
	        1. Leaderboard — plots and tables for the three tasks, re-rendered
	           when a control changes, when "Refresh Results" is clicked, and on
	           page load.
	        2. Get Scripts — form wired to ``generate_scripts``; shows a preview
	           and offers the zip for download.
	        3. About — static markdown.

	    Returns:
	        The ``gr.Blocks`` app, not yet launched.
	    """
	    with gr.Blocks(
	        title="SpatialBench — Do LLMs Build Spatial World Models?",
	        css=CSS,
	        theme=gr.themes.Soft(primary_hue="blue"),
	    ) as demo:

	        gr.Markdown("# 🧩 SpatialBench")
	        gr.Markdown(
	            "Evaluating Spatial World Models in Large Language Models · "
	            "[Paper (ICLR 2026 Workshop)](https://arxiv.org/abs/...) · "
	            "[Code](https://github.com/...)"
	        )

	        with gr.Tabs():

	            # ================================================================
	            # Tab 1: Leaderboard
	            # ================================================================
	            with gr.Tab("📊 Leaderboard"):
	                gr.Markdown(PAPER_ABSTRACT)

	                gr.Markdown("---")
	                gr.Markdown("## Task 1 — Maze Navigation (Planning)")
	                gr.Markdown(
	                    "Models find shortest paths through mazes. "
	                    "Two input formats: raw tokenized adjacency lists vs visual character grids."
	                )

	                with gr.Row():
	                    t1_k = gr.Radio(
	                        choices=[0, 3, 5], value=0, label="K-shot",
	                        info="Number of in-context examples",
	                    )
	                    t1_fmt = gr.Radio(
	                        choices=["raw", "visual"], value="raw", label="Input Format",
	                    )

	                t1_plot = gr.Plot(label="Accuracy by Grid Size")
	                t1_lb = gr.Dataframe(
	                    label="Leaderboard (mean accuracy across grid sizes)",
	                    elem_classes=["leaderboard-table"],
	                )
	                t1_fmt_plot = gr.Plot(label="Raw vs Visual Format Comparison")

	                def refresh_task1(k, fmt):
	                    fig, lb = plot_task1_accuracy(int(k), fmt)
	                    fmt_fig = plot_task1_format_comparison()
	                    return fig, lb, fmt_fig

	                for inp in [t1_k, t1_fmt]:
	                    inp.change(
	                        refresh_task1, inputs=[t1_k, t1_fmt],
	                        outputs=[t1_plot, t1_lb, t1_fmt_plot],
	                    )

	                gr.Markdown("---")
	                gr.Markdown("## Task 2 — Sequential Reasoning with Point Reuse")
	                gr.Markdown(
	                    "Models answer 4 proximity questions. Q3 = Q0 (same question repeated). "
	                    "Do models reuse their earlier computation, or start from scratch?"
	                )

	                t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5, label="Grid Size")
	                t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy")
	                t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes")
	                t2_lb = gr.Dataframe(label="Leaderboard", elem_classes=["leaderboard-table"])

	                def refresh_task2(gs):
	                    fig, lb = plot_task2_q0_q3(int(gs))
	                    grid_fig = plot_task2_by_grid()
	                    return fig, grid_fig, lb

	                t2_grid.change(
	                    refresh_task2, inputs=[t2_grid],
	                    outputs=[t2_plot, t2_grid_plot, t2_lb],
	                )

	                gr.Markdown("---")
	                gr.Markdown("## Task 3 — Compositional Distance Comparison")
	                gr.Markdown(
	                    "Models answer 3 questions about maze corners (A, B, C, D) and center M. "
	                    "Q2 (B→C) can potentially be composed from Q0 (A→M) and Q1 (D→M). "
	                    "Δ = Q2 accuracy − avg(Q0, Q1)."
	                )

	                t3_plot = gr.Plot(label="Q0 / Q1 / Q2 Accuracy by Model")
	                t3_grid_plot = gr.Plot(label="Q2 Accuracy Across Grid Sizes")
	                t3_lb = gr.Dataframe(
	                    label="Leaderboard (Δ shows compositional benefit)",
	                    elem_classes=["leaderboard-table"],
	                )

	                with gr.Row():
	                    refresh_lb_btn = gr.Button("🔄 Refresh Results", variant="secondary")

	                def refresh_all_leaderboard(k=0, fmt="raw", gs=5):
	                    # Re-render every leaderboard output for the *current*
	                    # control values. Rendering fixed defaults here would
	                    # leave the plots out of sync with the radios/slider
	                    # after a refresh.
	                    t1_fig, t1_table, t1_ff = refresh_task1(k, fmt)
	                    t2_fig, t2_gfig, t2_lb_table = refresh_task2(gs)
	                    t3_fig, t3_lb_table = plot_task3_compositional()
	                    t3_gfig = plot_task3_by_grid()
	                    return (
	                        t1_fig, t1_table, t1_ff,
	                        t2_fig, t2_gfig, t2_lb_table,
	                        t3_fig, t3_gfig, t3_lb_table,
	                    )

	                # Shared by the refresh button and the page-load handler.
	                leaderboard_inputs = [t1_k, t1_fmt, t2_grid]
	                leaderboard_outputs = [
	                    t1_plot, t1_lb, t1_fmt_plot,
	                    t2_plot, t2_grid_plot, t2_lb,
	                    t3_plot, t3_grid_plot, t3_lb,
	                ]

	                refresh_lb_btn.click(
	                    refresh_all_leaderboard,
	                    inputs=leaderboard_inputs,
	                    outputs=leaderboard_outputs,
	                )

	                demo.load(
	                    refresh_all_leaderboard,
	                    inputs=leaderboard_inputs,
	                    outputs=leaderboard_outputs,
	                )

	            # ================================================================
	            # Tab 2: Get Scripts
	            # ================================================================
	            with gr.Tab("⬇️ Get Scripts"):
	                gr.Markdown(
	                    "## Generate Experiment Scripts\n"
	                    "Configure the experiments you want to run, then download a zip of ready-to-run "
	                    "shell scripts.\n\n"
	                    "How to use:\n"
	                    "1. Select tasks, models, and settings below\n"
	                    "2. Enter the path to your local clone of the repo (so paths in the scripts are correct)\n"
	                    "3. Click Generate — a preview appears and a zip is ready to download\n"
	                    "4. Unzip on your cluster, set your API key(s) as environment variables, then:\n"
	                    " ```bash\n"
	                    " export GEMINI_API_KEY=your_key_here\n"
	                    " bash run_all.sh # run sequentially\n"
	                    " # — or submit individually:\n"
	                    " sbatch Task_1__Maze_Navigation__gemini-2.5-flash__raw__cot.sh\n"
	                    " ```"
	                )

	                # Preselect the default model only if the loaded config
	                # actually defines it, so the initial value is always one of
	                # the offered choices.
	                default_models = [m for m in ("gemini-2.5-flash",) if m in MODEL_CHOICES]

	                with gr.Row():
	                    with gr.Column(scale=2):
	                        gen_tasks = gr.CheckboxGroup(
	                            choices=list(TASK_DISPLAY_MAP.keys()),
	                            value=["Maze Navigation"],
	                            label="Tasks",
	                        )
	                        gen_models = gr.CheckboxGroup(
	                            choices=MODEL_CHOICES,
	                            value=default_models,
	                            label="Models",
	                        )
	                        gen_grids = gr.Textbox(
	                            value="5,6,7,8,9",
	                            label="Grid Sizes",
	                            info="Comma-separated. Paper used 5–9.",
	                        )
	                        with gr.Row():
	                            gen_formats = gr.CheckboxGroup(
	                                choices=["raw", "visual"],
	                                value=["raw", "visual"],
	                                label="Input Formats (Task 1 only)",
	                            )
	                            gen_strategies = gr.CheckboxGroup(
	                                choices=["base", "cot", "reasoning"],
	                                value=["base", "cot", "reasoning"],
	                                label="Prompt Strategies",
	                            )

	                    with gr.Column(scale=1):
	                        gen_script_type = gr.Radio(
	                            choices=[
	                                "SLURM (.sh with #SBATCH headers)",
	                                "Plain bash (.sh, no SLURM)",
	                            ],
	                            value="SLURM (.sh with #SBATCH headers)",
	                            label="Script Type",
	                            info="Use SLURM if you have a cluster. Plain bash runs directly.",
	                        )
	                        gen_repo_path = gr.Textbox(
	                            label="Repo path on your cluster",
	                            placeholder="/path/to/llm-maze-solver",
	                            info="Absolute path to the llm-maze-solver repo root on the machine where you'll run the scripts. Leave blank to use relative paths.",
	                        )

	                with gr.Row():
	                    gen_btn = gr.Button("⚙️ Generate Scripts", variant="primary", scale=2)

	                gen_preview = gr.Textbox(
	                    label="Preview (first script)",
	                    interactive=False,
	                    lines=20,
	                    max_lines=30,
	                )
	                gen_download = gr.File(
	                    label="Download Scripts (.zip)",
	                    interactive=False,
	                )

	                gen_btn.click(
	                    generate_scripts,
	                    inputs=[
	                        gen_tasks, gen_models, gen_grids,
	                        gen_formats, gen_strategies,
	                        gen_script_type, gen_repo_path,
	                    ],
	                    outputs=[gen_preview, gen_download],
	                )

	            # ================================================================
	            # Tab 3: About
	            # ================================================================
	            with gr.Tab("ℹ️ About"):
	                gr.Markdown("""
	## About SpatialBench

	SpatialBench is the evaluation platform accompanying the paper:

	> Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks
	> Under review at ICLR 2026 Workshop

	### Three Tasks

	| Task | Type | What it tests |
	|------|------|---------------|
	| Task 1: Maze Navigation | Planning | Find shortest path from start to goal |
	| Task 2: Sequential Point Reuse | Reasoning | Reuse Q0 computation when Q3=Q0 |
	| Task 3: Compositional Distance | Reasoning | Compose corner→center distances for Q2 |

	### Input Representations

	- Raw (tokenized): `<ADJLIST_START> (0,0) <--> (0,1) ... <ADJLIST_END>`
	- Visual (grid): `Row 0: ['.', 'S', '.', '#'] Row 1: ['#', '.', '.', 'E']`

	### Models Evaluated

	| Model | Provider |
	|-------|----------|
	| Gemini 2.5 Flash | Google |
	| GPT-5 Mini | OpenAI |
	| Claude Haiku 4.5 | Anthropic |
	| DeepSeek Chat | DeepSeek |

	### Grid Sizes

	Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default.

	### Reproducing Experiments

	Clone the repo and use the Get Scripts tab above to generate SLURM scripts, or use the CLI directly:

	```bash
	cd pipeline/
	python run_experiments.py --tasks maze_navigation --models gemini-2.5-flash --mode slurm --dry-run
	```

	### Citation
	```bibtex
	@inproceedings{spatialbench2026,
	title = {Do {LLMs} Build Spatial World Models? Evidence from Grid-World Maze Tasks},
	author = {Anonymous},
	booktitle = {ICLR 2026 Workshop},
	year = {2026},
	}
	```
	""")

	    return demo


	# ---------------------------------------------------------------------------
	# Entry point
	# ---------------------------------------------------------------------------

	# Runs only when the file is executed as a script: build the UI and serve it.
	if __name__ == "__main__":
	    demo = build_ui()
	    demo.launch(
	        # NOTE(review): "0.0.0.0" presumably makes the server reachable from
	        # outside the container/host, not only via localhost — confirm.
	        server_name="0.0.0.0",
	        share=False,
	        show_error=True,
	    )