"""
app.py — SpatialBench Gradio application
-----------------------------------------
Entrypoint for the HuggingFace Space "SpatialBench".

Three tabs:
  1. Leaderboard     — visualize pre-computed results from all three tasks
  2. Get Scripts     — generate ready-to-run SLURM scripts (or plain shell
                       scripts) as a downloadable zip; no compute needed here
  3. About           — paper info and citation

To run locally:
    cd pipeline/
    python app.py

To deploy on HuggingFace Spaces:
    - No secrets required for the Leaderboard or Get Scripts tabs.
    - The Space entrypoint is this file (app.py).
"""

from __future__ import annotations

import os
import sys
import zipfile
import tempfile
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load .env if running locally.
# Minimal KEY=VALUE parser: blank lines, "#" comments and lines without "="
# are ignored, and variables already present in the environment always win
# (setdefault), so real deployment settings are never overridden by the file.
_env = Path(__file__).parent / ".env"
if _env.exists():
    # Explicit UTF-8 so parsing does not depend on the platform locale encoding.
    with open(_env, encoding="utf-8") as _f:
        for _line in _f:
            _line = _line.strip()
            if not _line or _line.startswith("#") or "=" not in _line:
                continue
            _k, _v = _line.split("=", 1)
            _k, _v = _k.strip(), _v.strip()
            # Drop one pair of matching surrounding quotes (KEY="value") so the
            # quotes do not become part of the stored value.
            if len(_v) >= 2 and _v[0] == _v[-1] and _v[0] in "\"'":
                _v = _v[1:-1]
            # A line such as "=value" has no variable name; setting an empty
            # name in os.environ raises, so skip it.
            if _k:
                os.environ.setdefault(_k, _v)

# Add repo root to path so pipeline imports work
sys.path.insert(0, str(Path(__file__).parent))

from pipeline.task_builder import load_config, build_all_jobs, make_sbatch_script
from pipeline.results_loader import (
    load_all_results,
    maze_navigation_leaderboard,
    point_reuse_leaderboard,
    compositional_distance_leaderboard,
)

# ---------------------------------------------------------------------------
# Paths / config
# ---------------------------------------------------------------------------
# The experiment definition file sits in ./configs next to this module; the
# parsed config is loaded once at import time and its "models" keys become the
# model choices offered in the UI.
CONFIG_PATH = Path(__file__).parent.joinpath("configs", "experiments.yaml")
CFG = load_config(CONFIG_PATH)
MODEL_CHOICES = list(CFG["models"])

# ---------------------------------------------------------------------------
# Leaderboard helpers
# ---------------------------------------------------------------------------

def _load_results() -> dict[str, pd.DataFrame]:
    """Load the results for all three tasks, falling back to empty frames.

    Returns:
        Whatever ``load_all_results(CONFIG_PATH)`` returns. If that call raises,
        a dict with the keys ``"maze_navigation"``, ``"point_reuse"`` and
        ``"compositional_distance"``, each mapped to an empty DataFrame, so the
        plotting helpers can show their "no results" placeholders instead of
        crashing the UI.
    """
    import logging

    try:
        return load_all_results(CONFIG_PATH)
    except Exception:
        # Deliberate best-effort: the leaderboard must still render without
        # results. Log the traceback so a broken config or results file is not
        # mistaken for "no experiments have been run yet".
        logging.getLogger(__name__).exception(
            "Could not load benchmark results; showing empty leaderboards"
        )
        task_keys = ("maze_navigation", "point_reuse", "compositional_distance")
        return {key: pd.DataFrame() for key in task_keys}


def _make_empty_fig(msg: str) -> go.Figure:
    """Return a blank placeholder figure showing *msg* in the middle.

    Args:
        msg: Text to display. Newline characters are converted to ``<br>``
            because Plotly text does not render ``"\\n"`` as a line break.

    Returns:
        A 300px-high figure with hidden axes, transparent backgrounds and a
        single annotation anchored at the centre of the paper.
    """
    fig = go.Figure()
    fig.add_annotation(
        text=msg.replace("\n", "<br>"),
        x=0.5,
        y=0.5,
        # "paper" coordinates place the text relative to the plotting area,
        # independent of any axis range.
        xref="paper",
        yref="paper",
        showarrow=False,
        font=dict(size=16),
    )
    fig.update_layout(
        xaxis_visible=False,
        yaxis_visible=False,
        height=300,
        paper_bgcolor="rgba(0,0,0,0)",
        plot_bgcolor="rgba(0,0,0,0)",
    )
    return fig


# ── Task 1 plots ─────────────────────────────────────────────────────────────

def plot_task1_accuracy(k_shot: int, input_format: str) -> tuple[go.Figure, pd.DataFrame]:
    """Plot Task 1 accuracy against grid size for one k-shot / input-format pair.

    Args:
        k_shot: Value matched against the ``k_shot`` column.
        input_format: Value matched against the ``input_format`` column.

    Returns:
        ``(figure, leaderboard)``. The figure has one line per model (colour)
        and prompt strategy (dash style). The leaderboard is the result of
        ``maze_navigation_leaderboard`` called on the unfiltered Task 1 frame
        with the same ``k_shot``. When there is nothing to plot, a placeholder
        figure and an empty DataFrame are returned.
    """
    results = _load_results()
    df = results["maze_navigation"]
    if df.empty:
        return _make_empty_fig("No Task 1 results found.<br>Run experiments first."), pd.DataFrame()

    sub = df[(df["k_shot"] == k_shot) & (df["input_format"] == input_format)]
    if sub.empty:
        return _make_empty_fig(f"No results for k={k_shot}, format={input_format}"), pd.DataFrame()

    # px.line joins points in row order; sort by grid size so every line runs
    # left to right even if the results were stored in a different order.
    # A stable sort keeps the existing relative order within each grid size.
    sub = sub.sort_values("grid_size", kind="stable")

    fig = px.line(
        sub, x="grid_size", y="accuracy",
        color="display_name", line_dash="prompt_strategy",
        markers=True,
        labels={"grid_size": "Grid Size (n×n)", "accuracy": "Accuracy",
                "display_name": "Model", "prompt_strategy": "Strategy"},
        title=f"Task 1 — Maze Navigation ({input_format} format, {k_shot}-shot)",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    fig.update_layout(
        yaxis_range=[0, 1],
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        height=420,
    )
    lb = maze_navigation_leaderboard(df, k_shot=k_shot)
    return fig, lb


def plot_task1_format_comparison() -> go.Figure:
    """Compare mean 0-shot Task 1 accuracy per model for each input format.

    Uses the rows with ``prompt_strategy == "cot"`` when any exist; otherwise
    falls back to every 0-shot row, and the title says so. Accuracy is averaged
    per (model, input format) pair.

    Returns:
        A grouped bar chart, or a placeholder figure when there are no Task 1
        results or no 0-shot rows.
    """
    results = _load_results()
    df = results["maze_navigation"]
    if df.empty:
        return _make_empty_fig("No Task 1 results found.")

    zero_shot = df[df["k_shot"] == 0]
    if zero_shot.empty:
        # Without this guard an axes-only chart with no bars would be drawn.
        return _make_empty_fig("No 0-shot Task 1 results found.")

    sub = zero_shot[zero_shot["prompt_strategy"] == "cot"]
    strategy_label = "CoT"
    if sub.empty:
        # No CoT runs available: average over whatever strategies exist and
        # label the chart accordingly instead of claiming "CoT".
        sub = zero_shot
        strategy_label = "all strategies"
    agg = sub.groupby(["display_name", "input_format"])["accuracy"].mean().reset_index()

    fig = px.bar(
        agg, x="display_name", y="accuracy", color="input_format",
        barmode="group",
        labels={"display_name": "Model", "accuracy": "Mean Accuracy",
                "input_format": "Input Format"},
        title=f"Task 1 — Raw vs Visual Format (0-shot, {strategy_label}, averaged over grid sizes)",
        color_discrete_map={"raw": "#2196F3", "visual": "#FF9800"},
    )
    fig.update_layout(yaxis_range=[0, 1], height=380)
    return fig


# ── Task 2 plots ─────────────────────────────────────────────────────────────

def plot_task2_q0_q3(grid_size: int) -> tuple[go.Figure, pd.DataFrame]:
    """Plot per-model mean accuracy on Q0 and Q3 of Task 2 for one grid size.

    Args:
        grid_size: Value matched against the ``grid_size`` column.

    Returns:
        ``(figure, leaderboard)``. The figure is a grouped bar chart with one
        Q0 bar and one Q3 bar per model. The leaderboard is the result of
        ``point_reuse_leaderboard`` called on the unfiltered Task 2 frame.
        When there is nothing to plot, a placeholder figure and an empty
        DataFrame are returned.
    """
    results = _load_results()
    df = results["point_reuse"]
    if df.empty:
        return _make_empty_fig("No Task 2 results found.<br>Run experiments first."), pd.DataFrame()

    sub = df[df["grid_size"] == grid_size]
    if sub.empty:
        return _make_empty_fig(f"No Task 2 results for {grid_size}×{grid_size}"), pd.DataFrame()

    q0 = sub[sub["question_idx"] == 0].groupby("display_name")["accuracy"].mean().rename("Q0")
    q3 = sub[sub["question_idx"] == 3].groupby("display_name")["accuracy"].mean().rename("Q3")
    # Align the two series on model name, then reshape to long form so that
    # "Question" can drive the bar colour.
    plot_df = pd.concat([q0, q3], axis=1).reset_index()
    plot_df_melt = plot_df.melt(id_vars="display_name", var_name="Question", value_name="Accuracy")

    fig = px.bar(
        plot_df_melt, x="display_name", y="Accuracy", color="Question",
        barmode="group",
        labels={"display_name": "Model"},
        # Plotly titles break lines on <br>; a "\n" would not start a new line.
        title=f"Task 2 — Q0 vs Q3 Accuracy ({grid_size}×{grid_size} maze)<br>"
              "Q3 = Q0 (same question repeated — tests information reuse)",
        color_discrete_map={"Q0": "#4CAF50", "Q3": "#F44336"},
    )
    fig.update_layout(yaxis_range=[0, 1], height=400)
    lb = point_reuse_leaderboard(df)
    return fig, lb


def plot_task2_by_grid() -> go.Figure:
    """Line chart of mean Q3 accuracy per model across grid sizes (Task 2).

    Returns a placeholder figure when no Task 2 results are available.
    """
    frame = _load_results()["point_reuse"]
    if frame.empty:
        return _make_empty_fig("No Task 2 results found.")

    # Only the repeated question (index 3) is of interest here.
    q3_rows = frame[frame["question_idx"] == 3]
    per_model_and_grid = q3_rows.groupby(["display_name", "grid_size"])
    q3_means = per_model_and_grid["accuracy"].mean().reset_index()

    axis_labels = {
        "grid_size": "Grid Size",
        "accuracy": "Q3 Accuracy",
        "display_name": "Model",
    }
    chart = px.line(
        q3_means,
        x="grid_size",
        y="accuracy",
        color="display_name",
        markers=True,
        labels=axis_labels,
        title="Task 2 — Q3 Accuracy by Grid Size (Q3 = Q0 repeated)",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    chart.update_layout(yaxis_range=[0, 1], height=380)
    return chart


# ── Task 3 plots ─────────────────────────────────────────────────────────────

def plot_task3_compositional() -> tuple[go.Figure, pd.DataFrame]:
    """Plot per-model mean accuracy on the three Task 3 questions.

    Returns:
        ``(figure, leaderboard)``. The figure is a grouped bar chart with one
        bar per question (Q0, Q1, Q2) for every model. The leaderboard is the
        result of ``compositional_distance_leaderboard`` called on the Task 3
        frame. With no Task 3 results, a placeholder figure and an empty
        DataFrame are returned.
    """
    results = _load_results()
    df = results["compositional_distance"]
    if df.empty:
        return _make_empty_fig("No Task 3 results found.<br>Run experiments first."), pd.DataFrame()

    agg = df.groupby(["display_name", "question_idx"])["accuracy"].mean().reset_index()
    # Human-readable legend entries; the same strings key the colour map below.
    q_labels = {0: "Q0: A→M", 1: "Q1: D→M", 2: "Q2: B→C (compositional)"}
    agg["Question"] = agg["question_idx"].map(q_labels)

    fig = px.bar(
        agg, x="display_name", y="accuracy", color="Question",
        barmode="group",
        labels={"display_name": "Model", "accuracy": "Accuracy"},
        # Plotly titles break lines on <br>; a "\n" would not start a new line.
        title="Task 3 — Compositional Distance Comparison<br>"
              "Q2 can be composed from Q0+Q1 (corner→center distances)",
        color_discrete_map={
            "Q0: A→M": "#2196F3",
            "Q1: D→M": "#9C27B0",
            "Q2: B→C (compositional)": "#FF5722",
        },
    )
    fig.update_layout(yaxis_range=[0, 1], height=420)
    lb = compositional_distance_leaderboard(df)
    return fig, lb


def plot_task3_by_grid() -> go.Figure:
    """Line chart of mean Q2 accuracy per model across grid sizes (Task 3).

    Returns a placeholder figure when no Task 3 results are available.
    """
    frame = _load_results()["compositional_distance"]
    if frame.empty:
        return _make_empty_fig("No Task 3 results found.")

    # Question index 2 is the one labelled "compositional" in the UI.
    q2_rows = frame[frame["question_idx"] == 2]
    per_model_and_grid = q2_rows.groupby(["display_name", "grid_size"])
    q2_means = per_model_and_grid["accuracy"].mean().reset_index()

    axis_labels = {
        "grid_size": "Grid Size",
        "accuracy": "Q2 Accuracy",
        "display_name": "Model",
    }
    chart = px.line(
        q2_means,
        x="grid_size",
        y="accuracy",
        color="display_name",
        markers=True,
        labels=axis_labels,
        title="Task 3 — Q2 (Compositional) Accuracy by Grid Size",
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    chart.update_layout(yaxis_range=[0, 1], height=380)
    return chart


# ---------------------------------------------------------------------------
# Script generation tab
# ---------------------------------------------------------------------------

# Maps the task names shown in the "Get Scripts" checkbox group to the task
# keys that are passed on to build_all_jobs.
TASK_DISPLAY_MAP = {
    "Maze Navigation": "maze_navigation",
    "Sequential Point Reuse": "point_reuse",
    "Compositional Distance Comparison": "compositional_distance",
}


def _make_plain_script(job, api_key_placeholder: str) -> str:
    """Render *job* as a standalone bash script without any SLURM directives.

    The script consists of a shebang, a comment carrying the job label, an
    ``export`` of the job's API-key variable set to *api_key_placeholder*, a
    ``cd`` into the job's working directory, and the job's command with one
    token per backslash-continued line.
    """
    command = " \\\n    ".join(job.python_cmd)
    return (
        "#!/usr/bin/env bash\n"
        f"# {job.label}\n"
        f"export {job.api_key_env}={api_key_placeholder}\n"
        "\n"
        f"cd {job.working_dir}\n"
        f"{command}\n"
    )


def _rebase_path(path, old_root: Path, new_root: Path):
    """Return *path* moved from under *old_root* to *new_root*; unchanged if it is not under *old_root*."""
    try:
        return new_root / Path(path).relative_to(old_root)
    except ValueError:
        return path


def _unique_script_name(stem: str, taken) -> str:
    """Return ``<stem>.sh``, adding a numeric suffix if that name is already in *taken*."""
    name = f"{stem}.sh"
    counter = 2
    while name in taken:
        name = f"{stem}_{counter}.sh"
        counter += 1
    return name


def generate_scripts(
    tasks: list[str],
    models: list[str],
    grid_sizes_str: str,
    formats: list[str],
    strategies: list[str],
    script_type: str,
    repo_path: str,
) -> tuple[str, str | None]:
    """
    Build experiment scripts and return (preview_text, zip_path).
    zip_path is a temp file the user can download.

    Args:
        tasks: Display names of the selected tasks (keys of TASK_DISPLAY_MAP);
            unknown names are ignored.
        models: Selected model names, forwarded to build_all_jobs.
        grid_sizes_str: Comma-separated integers. Empty means "no filter".
        formats: Input formats to include. Empty means "no filter".
        strategies: Prompt strategies to include. Empty means "no filter".
        script_type: The "SLURM (.sh with #SBATCH headers)" choice selects
            sbatch scripts; any other value produces plain bash scripts.
        repo_path: Optional repo root on the target machine. When given,
            job paths located under this app's repo root are rebased onto it.

    Returns:
        ``(preview_text, zip_path)``. On invalid input or when no job matches,
        ``preview_text`` is an explanatory message and ``zip_path`` is None.
        Otherwise the zip holds one script per job plus ``run_all.sh``.
    """
    import shlex

    if not tasks:
        return "Select at least one task.", None
    if not models:
        return "Select at least one model.", None

    try:
        # `or ""` guards against a cleared textbox delivering None.
        grid_sizes = [int(g.strip()) for g in (grid_sizes_str or "").split(",") if g.strip()]
    except ValueError:
        return "Invalid grid sizes — enter comma-separated integers, e.g. 5,6,7", None

    selected_tasks = [TASK_DISPLAY_MAP[t] for t in tasks if t in TASK_DISPLAY_MAP]

    jobs = build_all_jobs(
        cfg=CFG,
        tasks=selected_tasks,
        models=models,
        grid_sizes=grid_sizes or None,
        input_formats=formats or None,
        prompt_strategies=strategies or None,
        config_path=CONFIG_PATH,
    )

    if not jobs:
        return "No jobs matched the selected filters.", None

    # Optional override of the repo root used inside the generated scripts
    repo_override = (repo_path or "").strip() or None
    new_root = Path(repo_override) if repo_override else None
    # Root that the config-derived job paths are relative to.
    repo_root = CONFIG_PATH.parent.parent.parent

    use_slurm = (script_type == "SLURM (.sh with #SBATCH headers)")
    log_dir = Path(repo_override or ".") / "maze-solver" / "eval_llm_logs"

    script_contents: dict[str, str] = {}
    for job in jobs:
        safe = job.label.replace(" ", "_").replace("|", "").replace("/", "_").strip("_")
        # Two labels can sanitise to the same stem; never overwrite a script.
        filename = _unique_script_name(safe, script_contents)

        if new_root is not None:
            job.working_dir = _rebase_path(job.working_dir, repo_root, new_root)
            job.output_dir = _rebase_path(job.output_dir, repo_root, new_root)
            # First two command tokens are "python" and the script path
            if len(job.python_cmd) >= 2:
                job.python_cmd[1] = str(_rebase_path(job.python_cmd[1], repo_root, new_root))

        if use_slurm:
            content = make_sbatch_script(job, log_dir)
        else:
            content = _make_plain_script(job, f'"${{{job.api_key_env}}}"')

        script_contents[filename] = content

    # Master script that runs every generated script in sorted order. File
    # names are shell-quoted so unusual characters in a label cannot break it.
    run_all_lines = ["#!/usr/bin/env bash", "# Run all generated scripts sequentially", ""]
    run_all_lines.extend(f"bash {shlex.quote(fname)}" for fname in sorted(script_contents))

    # Write zip to a named temp file (Gradio File component needs a real path).
    # delete=False keeps the file after closing so it can be downloaded.
    tmp = tempfile.NamedTemporaryFile(
        delete=False, suffix=".zip", prefix="spatialbench_scripts_"
    )
    try:
        with tmp, zipfile.ZipFile(tmp, "w", zipfile.ZIP_DEFLATED) as zf:
            for fname, content in script_contents.items():
                zf.writestr(fname, content)
            zf.writestr("run_all.sh", "\n".join(run_all_lines) + "\n")
    except Exception:
        # Do not leave a half-written archive behind in the temp directory.
        Path(tmp.name).unlink(missing_ok=True)
        raise

    # Preview: show first script + summary
    n = len(script_contents)
    first_name, first_content = next(iter(script_contents.items()))
    preview = (
        f"Generated {n} script(s) for {len(models)} model(s) across {len(selected_tasks)} task(s).\n"
        f"Download the zip below, unzip in your cluster, then run: bash run_all.sh\n\n"
        f"── {first_name} ──\n{first_content}"
        + (f"\n\n... and {n - 1} more script(s) in the zip." if n > 1 else "")
    )

    return preview, tmp.name


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------

# Markdown summary of the paper, rendered at the top of the Leaderboard tab.
PAPER_ABSTRACT = """
**Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks**

We systematically evaluate the spatial understanding of large language models through maze tasks—a
controlled testing context requiring multi-step planning and spatial abstraction. Across experiments
with Gemini-2.5-Flash, GPT-5-mini, Claude-Haiku-4.5, and DeepSeek-Chat, we uncover significant
discrepancies in spatial reasoning that challenge assumptions about LLM planning capabilities.

Key findings:
- **Representation sensitivity**: Gemini drops from 86% (raw tokenized) to 34% (visual grid) on 5×5 mazes with CoT
- **Prompting dependency**: Claude-Haiku fails completely without CoT, recovers to 78% with it
- **No spatial memory**: Models treat sequential questions independently, failing to reuse computed spatial knowledge
"""

# Custom stylesheet passed to gr.Blocks: loads the Inter and IBM Plex Mono web
# fonts, applies them to body text and code respectively, shrinks the
# leaderboard tables slightly, and hides the page footer.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&family=IBM+Plex+Mono:wght@400;500&display=swap');
*, body, .gradio-container { font-family: 'Inter', ui-sans-serif, system-ui, sans-serif !important; }
code, pre, .monospace { font-family: 'IBM Plex Mono', ui-monospace, monospace !important; }
.leaderboard-table { font-size: 0.9em; }
footer { display: none !important; }
"""


def build_ui() -> gr.Blocks:
    """Assemble the three-tab Gradio interface and return it without launching.

    Tabs:
      * Leaderboard: plots and tables for the three tasks. The Task 1 and
        Task 2 controls re-render their own section on change. The refresh
        button and the initial page load re-render every section using the
        values currently shown in those controls.
      * Get Scripts: a form wired to ``generate_scripts`` that produces a
        preview and a downloadable zip.
      * About: static project information and citation.

    Returns:
        The configured ``gr.Blocks`` app.
    """
    with gr.Blocks(
        title="SpatialBench — Do LLMs Build Spatial World Models?",
        css=CSS,
        theme=gr.themes.Soft(primary_hue="blue"),
    ) as demo:

        gr.Markdown("# 🧩 SpatialBench")
        gr.Markdown(
            "**Evaluating Spatial World Models in Large Language Models** · "
            "[Paper (ICLR 2026 Workshop)](https://arxiv.org/abs/...) · "
            "[Code](https://github.com/...)"
        )

        with gr.Tabs():

            # ================================================================
            # Tab 1: Leaderboard
            # ================================================================
            with gr.Tab("📊 Leaderboard"):
                gr.Markdown(PAPER_ABSTRACT)

                gr.Markdown("---")
                gr.Markdown("## Task 1 — Maze Navigation (Planning)")
                gr.Markdown(
                    "Models find shortest paths through mazes. "
                    "Two input formats: **raw** tokenized adjacency lists vs **visual** character grids."
                )

                with gr.Row():
                    t1_k = gr.Radio(
                        choices=[0, 3, 5], value=0, label="K-shot",
                        info="Number of in-context examples",
                    )
                    t1_fmt = gr.Radio(
                        choices=["raw", "visual"], value="raw", label="Input Format",
                    )

                t1_plot = gr.Plot(label="Accuracy by Grid Size")
                t1_lb = gr.Dataframe(
                    label="Leaderboard (mean accuracy across grid sizes)",
                    elem_classes=["leaderboard-table"],
                )
                t1_fmt_plot = gr.Plot(label="Raw vs Visual Format Comparison")

                def refresh_task1(k, fmt):
                    fig, lb = plot_task1_accuracy(int(k), fmt)
                    fmt_fig = plot_task1_format_comparison()
                    return fig, lb, fmt_fig

                # Either control changing re-renders the whole Task 1 section.
                for inp in [t1_k, t1_fmt]:
                    inp.change(
                        refresh_task1, inputs=[t1_k, t1_fmt],
                        outputs=[t1_plot, t1_lb, t1_fmt_plot],
                    )

                gr.Markdown("---")
                gr.Markdown("## Task 2 — Sequential Reasoning with Point Reuse")
                gr.Markdown(
                    "Models answer 4 proximity questions. **Q3 = Q0** (same question repeated). "
                    "Do models reuse their earlier computation, or start from scratch?"
                )

                t2_grid = gr.Slider(minimum=5, maximum=9, step=1, value=5, label="Grid Size")
                t2_plot = gr.Plot(label="Q0 vs Q3 Accuracy")
                t2_grid_plot = gr.Plot(label="Q3 Accuracy Across Grid Sizes")
                t2_lb = gr.Dataframe(label="Leaderboard", elem_classes=["leaderboard-table"])

                def refresh_task2(gs):
                    fig, lb = plot_task2_q0_q3(int(gs))
                    grid_fig = plot_task2_by_grid()
                    return fig, grid_fig, lb

                t2_grid.change(
                    refresh_task2, inputs=[t2_grid],
                    outputs=[t2_plot, t2_grid_plot, t2_lb],
                )

                gr.Markdown("---")
                gr.Markdown("## Task 3 — Compositional Distance Comparison")
                gr.Markdown(
                    "Models answer 3 questions about maze corners (A, B, C, D) and center M. "
                    "**Q2** (B→C) can potentially be composed from Q0 (A→M) and Q1 (D→M). "
                    "Δ = Q2 accuracy − avg(Q0, Q1)."
                )

                t3_plot = gr.Plot(label="Q0 / Q1 / Q2 Accuracy by Model")
                t3_grid_plot = gr.Plot(label="Q2 Accuracy Across Grid Sizes")
                t3_lb = gr.Dataframe(
                    label="Leaderboard (Δ shows compositional benefit)",
                    elem_classes=["leaderboard-table"],
                )

                with gr.Row():
                    refresh_lb_btn = gr.Button("🔄 Refresh Results", variant="secondary")

                def refresh_all_leaderboard(k=0, fmt="raw", gs=5):
                    # Render with the values currently shown in the controls so
                    # that a refresh never leaves plots and controls out of
                    # sync. The defaults mirror the controls' initial values.
                    t1_fig, t1_table = plot_task1_accuracy(int(k), fmt)
                    t1_ff = plot_task1_format_comparison()
                    t2_fig, t2_lb_table = plot_task2_q0_q3(int(gs))
                    t2_gfig = plot_task2_by_grid()
                    t3_fig, t3_lb_table = plot_task3_compositional()
                    t3_gfig = plot_task3_by_grid()
                    return (
                        t1_fig, t1_table, t1_ff,
                        t2_fig, t2_gfig, t2_lb_table,
                        t3_fig, t3_gfig, t3_lb_table,
                    )

                leaderboard_inputs = [t1_k, t1_fmt, t2_grid]
                # Order must match the tuple returned by refresh_all_leaderboard.
                leaderboard_outputs = [
                    t1_plot, t1_lb, t1_fmt_plot,
                    t2_plot, t2_grid_plot, t2_lb,
                    t3_plot, t3_grid_plot, t3_lb,
                ]

                refresh_lb_btn.click(
                    refresh_all_leaderboard,
                    inputs=leaderboard_inputs,
                    outputs=leaderboard_outputs,
                )

                # Populate every plot and table when the page first loads.
                demo.load(
                    refresh_all_leaderboard,
                    inputs=leaderboard_inputs,
                    outputs=leaderboard_outputs,
                )

            # ================================================================
            # Tab 2: Get Scripts
            # ================================================================
            with gr.Tab("⬇️ Get Scripts"):
                gr.Markdown(
                    "## Generate Experiment Scripts\n"
                    "Configure the experiments you want to run, then download a zip of ready-to-run "
                    "shell scripts.\n\n"
                    "**How to use:**\n"
                    "1. Select tasks, models, and settings below\n"
                    "2. Enter the path to your local clone of the repo (so paths in the scripts are correct)\n"
                    "3. Click **Generate** — a preview appears and a zip is ready to download\n"
                    "4. Unzip on your cluster, set your API key(s) as environment variables, then:\n"
                    "   ```bash\n"
                    "   export GEMINI_API_KEY=your_key_here\n"
                    "   bash run_all.sh        # run sequentially\n"
                    "   # — or submit individually:\n"
                    "   sbatch Task_1__Maze_Navigation__gemini-2.5-flash__raw__cot.sh\n"
                    "   ```"
                )

                # Preselect the paper's primary model when it is configured;
                # otherwise fall back to the first configured model so the
                # default selection is always one of the offered choices.
                default_models = (
                    ["gemini-2.5-flash"]
                    if "gemini-2.5-flash" in MODEL_CHOICES
                    else MODEL_CHOICES[:1]
                )

                with gr.Row():
                    with gr.Column(scale=2):
                        gen_tasks = gr.CheckboxGroup(
                            choices=list(TASK_DISPLAY_MAP.keys()),
                            value=["Maze Navigation"],
                            label="Tasks",
                        )
                        gen_models = gr.CheckboxGroup(
                            choices=MODEL_CHOICES,
                            value=default_models,
                            label="Models",
                        )
                        gen_grids = gr.Textbox(
                            value="5,6,7,8,9",
                            label="Grid Sizes",
                            info="Comma-separated. Paper used 5–9.",
                        )
                        with gr.Row():
                            gen_formats = gr.CheckboxGroup(
                                choices=["raw", "visual"],
                                value=["raw", "visual"],
                                label="Input Formats (Task 1 only)",
                            )
                            gen_strategies = gr.CheckboxGroup(
                                choices=["base", "cot", "reasoning"],
                                value=["base", "cot", "reasoning"],
                                label="Prompt Strategies",
                            )

                    with gr.Column(scale=1):
                        gen_script_type = gr.Radio(
                            choices=[
                                "SLURM (.sh with #SBATCH headers)",
                                "Plain bash (.sh, no SLURM)",
                            ],
                            value="SLURM (.sh with #SBATCH headers)",
                            label="Script Type",
                            info="Use SLURM if you have a cluster. Plain bash runs directly.",
                        )
                        gen_repo_path = gr.Textbox(
                            label="Repo path on your cluster",
                            placeholder="/path/to/llm-maze-solver",
                            info="Absolute path to the llm-maze-solver repo root on the machine where you'll run the scripts. Leave blank to use relative paths.",
                        )

                with gr.Row():
                    gen_btn = gr.Button("⚙️ Generate Scripts", variant="primary", scale=2)

                gen_preview = gr.Textbox(
                    label="Preview (first script)",
                    interactive=False,
                    lines=20,
                    max_lines=30,
                )
                gen_download = gr.File(
                    label="Download Scripts (.zip)",
                    interactive=False,
                )

                # Input order must match the parameter order of generate_scripts.
                gen_btn.click(
                    generate_scripts,
                    inputs=[
                        gen_tasks, gen_models, gen_grids,
                        gen_formats, gen_strategies,
                        gen_script_type, gen_repo_path,
                    ],
                    outputs=[gen_preview, gen_download],
                )

            # ================================================================
            # Tab 3: About
            # ================================================================
            with gr.Tab("ℹ️ About"):
                gr.Markdown("""
## About SpatialBench

SpatialBench is the evaluation platform accompanying the paper:

> **Do LLMs Build Spatial World Models? Evidence from Grid-World Maze Tasks**
> *Under review at ICLR 2026 Workshop*

### Three Tasks

| Task | Type | What it tests |
|------|------|---------------|
| **Task 1: Maze Navigation** | Planning | Find shortest path from start to goal |
| **Task 2: Sequential Point Reuse** | Reasoning | Reuse Q0 computation when Q3=Q0 |
| **Task 3: Compositional Distance** | Reasoning | Compose corner→center distances for Q2 |

### Input Representations

- **Raw (tokenized)**: `<ADJLIST_START> (0,0) <--> (0,1) ... <ADJLIST_END>`
- **Visual (grid)**: `Row 0: ['.', 'S', '.', '#']  Row 1: ['#', '.', '.', 'E']`

### Models Evaluated

| Model | Provider |
|-------|----------|
| Gemini 2.5 Flash | Google |
| GPT-5 Mini | OpenAI |
| Claude Haiku 4.5 | Anthropic |
| DeepSeek Chat | DeepSeek |

### Grid Sizes

Experiments run on n×n grids for n ∈ {5, 6, 7, 8, 9} by default.

### Reproducing Experiments

Clone the repo and use the **Get Scripts** tab above to generate SLURM scripts, or use the CLI directly:

```bash
cd pipeline/
python run_experiments.py --tasks maze_navigation --models gemini-2.5-flash --mode slurm --dry-run
```

### Citation
```bibtex
@inproceedings{spatialbench2026,
  title     = {Do {LLMs} Build Spatial World Models? Evidence from Grid-World Maze Tasks},
  author    = {Anonymous},
  booktitle = {ICLR 2026 Workshop},
  year      = {2026},
}
```
                """)

    return demo


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Listen on all interfaces, no public share link, and surface errors
    # raised by event handlers in the browser.
    launch_options = {
        "server_name": "0.0.0.0",
        "share": False,
        "show_error": True,
    }
    demo = build_ui()
    demo.launch(**launch_options)