Spaces:

Ted412
/

EgoMemReason

Running

File size: 12,872 Bytes

"""EgoMemReason leaderboard — Gradio Space app.

Tabs:
  - Leaderboard   public, auto-refresh, toggle selected-only / show-all
  - Submit        HF login required; JSON upload + metadata form
  - Manage        toggle is_selected on your own past submissions
  - About         paper description + citation
"""

import os

# Workaround for a long-standing gradio_client bug that hits the /info endpoint
# when any component emits a JSON schema with `additionalProperties: True/False`
# (a plain bool). Both get_type() and _json_schema_to_python_type() assume the
# schema is a dict and crash on bools. Patch both before Gradio loads its
# FastAPI routes.
import gradio_client.utils as _gcu

_orig_get_type = _gcu.get_type
def _safe_get_type(schema):
    if not isinstance(schema, dict):
        return "Any"
    return _orig_get_type(schema)
_gcu.get_type = _safe_get_type

_orig_json_schema = _gcu._json_schema_to_python_type
def _safe_json_schema(schema, defs=None):
    if not isinstance(schema, dict):
        # `additionalProperties: True` → accepts anything; `False` → accepts nothing.
        return "Any" if schema else "None"
    return _orig_json_schema(schema, defs)
_gcu._json_schema_to_python_type = _safe_json_schema

import gradio as gr
import pandas as pd

import auth
import evaluator
import ledger


# ---------------------------------------------------------------------------
# Boot: pull annotations_private.json from the private dataset repo.
# ---------------------------------------------------------------------------

# BOOT_ERROR stays None when the private annotations were fetched; otherwise it
# holds the error text. In local dev without HF_TOKEN this lets the app come up
# with a clear banner instead of failing at import time.
BOOT_ERROR = None
try:
    ledger.ensure_private_annotations()
except RuntimeError as boot_exc:
    BOOT_ERROR = str(boot_exc)


# Headers of the public leaderboard table, in display order. Each row built by
# _row_from_submission() must supply its values in exactly this order.
LEADERBOARD_COLUMNS = [
    # Rank and identity.
    "#", "Method", "Team",
    # Scores: overall first, then the six per-task metrics.
    "Overall", "Cumul", "Count", "Order", "Link", "Spatial", "Activity",
    # Self-reported submission metadata.
    "Size", "Ext", "Modality", "Links",
]


def _row_from_submission(sub, rank):
    m = sub["metrics"]
    links = []
    if sub.get("project_url"):
        links.append(f"[project]({sub['project_url']})")
    if sub.get("publication_url"):
        links.append(f"[paper]({sub['publication_url']})")
    return [
        rank,
        sub["method_name"],
        sub["team_name"],
        m["Overall"],
        m["Cumulative State Tracking"],
        m["Temporal Counting"],
        m["Event Ordering"],
        m["Event Linking"],
        m["Spatial Preference"],
        m["Activity Pattern"],
        sub.get("model_size") or "—",
        "yes" if sub.get("uses_external_data") else "no",
        sub.get("uses_video_frames") or "—",
        " · ".join(links) or "—",
    ]


def load_leaderboard(show_all):
    """Build the leaderboard table as a DataFrame.

    Reads every submission from the ledger, keeps only those flagged
    ``is_selected`` unless ``show_all`` is truthy, orders them by descending
    ``Overall`` score and numbers the rows from 1.
    """
    def overall_score(entry):
        return entry["metrics"]["Overall"]

    entries = ledger.list_submissions()
    if show_all:
        visible = entries
    else:
        visible = [entry for entry in entries if entry.get("is_selected")]
    ranked = sorted(visible, key=overall_score, reverse=True)
    table = [
        _row_from_submission(entry, position)
        for position, entry in enumerate(ranked, start=1)
    ]
    return pd.DataFrame(table, columns=LEADERBOARD_COLUMNS)


# ---------------------------------------------------------------------------
# Submit
# ---------------------------------------------------------------------------

def _format_metric_row(name, value):
    """Render one ``| metric | score |`` Markdown table row.

    Numeric scores are shown with two decimals; anything that cannot be
    formatted that way falls back to ``str(value)`` so that an unexpected
    metric type cannot raise after the submission is already persisted.
    """
    try:
        score = f"{value:.2f}"
    except (TypeError, ValueError):
        score = str(value)
    return f"| {name} | **{score}** |"


def handle_submission(file, team_name, method_name, model_size, uses_external,
                      uses_frames, method_description, project_url,
                      publication_url, profile: gr.OAuthProfile | None):
    """Validate, score and log one submission from the Submit tab.

    Args:
        file: Uploaded submission from ``gr.File`` (or ``None``).
        team_name: Required team name.
        method_name: Required method name.
        model_size: Optional free-text model size.
        uses_external: ``"yes"`` or ``"no"``.
        uses_frames: Selected video input modality (required).
        method_description: Optional free-text description.
        project_url: Optional http(s) URL.
        publication_url: Optional http(s) URL.
        profile: OAuth profile of the caller, or ``None`` when not signed in.
            The annotation on this parameter is kept unchanged.

    Returns:
        A Markdown string: either an error / rate-limit message or a
        confirmation containing the submission id and a score table.
    """
    # The boot banner announces that submissions are disabled; enforce it here.
    if BOOT_ERROR:
        return ("**Error:** submissions are disabled because the server failed "
                f"to initialise: {BOOT_ERROR}")

    user = auth.resolve_user(profile)
    if user is None:
        return "**Error:** sign in with Hugging Face first (button at the top of the page)."

    # Strip so that whitespace-only values do not satisfy the required check.
    team_name = (team_name or "").strip()
    method_name = (method_name or "").strip()
    if not team_name or not method_name:
        return "**Error:** `team_name` and `method_name` are required."
    if uses_external not in ("yes", "no"):
        return "**Error:** answer `Uses external data?` (yes/no)."
    if not uses_frames:
        return "**Error:** pick a video input modality."

    # These URLs are later turned into leaderboard links, so accept only
    # absolute http(s) URLs.
    project_url = (project_url or "").strip()
    publication_url = (publication_url or "").strip()
    for label, url in (("Project URL", project_url),
                       ("Publication URL", publication_url)):
        if url and not url.lower().startswith(("http://", "https://")):
            return f"**Error:** `{label}` must start with `http://` or `https://`."

    if file is None:
        return "**Error:** upload a `.json` submission file."

    recent = ledger.count_recent(user, hours=24)
    if recent >= 5:
        return (f"**Rate limit:** you have **{recent}** submissions in the last 24 h "
                "(max 5). Try again later.")

    # Accept both a file-like object exposing `.name` and a plain path string.
    submission_path = getattr(file, "name", file)
    try:
        metrics = evaluator.score_submission(submission_path)
    except ValueError as e:
        return f"**Validation error:**\n```\n{e}\n```"
    except Exception as e:
        # UI boundary: report the failure instead of surfacing a traceback.
        return f"**Internal error scoring submission:** `{type(e).__name__}: {e}`"

    try:
        sid = ledger.append_submission(
            hf_user_id=user,
            team_name=team_name,
            method_name=method_name,
            model_size=model_size,
            uses_external_data=(uses_external == "yes"),
            uses_video_frames=uses_frames,
            method_description=method_description,
            project_url=project_url,
            publication_url=publication_url,
            metrics=metrics,
        )
    except Exception as e:
        # Scoring succeeded, so still show the metrics to the user.
        return (f"**Scored, but failed to persist to ledger:** `{type(e).__name__}: {e}`\n\n"
                f"Your metrics were:\n```\n{metrics}\n```")

    rows = "\n".join(_format_metric_row(k, v) for k, v in metrics.items())
    return (
        f"✅ **Submission logged.** `submission_id = {sid}`\n\n"
        f"| Metric | Score (%) |\n|---|---|\n{rows}\n\n"
        "Go to **Manage my submissions** to mark this as your official entry."
    )


# ---------------------------------------------------------------------------
# Manage
# ---------------------------------------------------------------------------

# Headers of the per-user table on the "Manage my submissions" tab, in the
# order load_my_submissions() fills them.
MANAGE_COLUMNS = [
    "submission_id",
    "method_name",
    "Overall",
    "is_selected",
    "submitted_at_utc",
]


def load_my_submissions(profile: gr.OAuthProfile | None):
    """Return the signed-in user's submissions as a DataFrame.

    An empty table (headers only) is returned when no user can be resolved
    from ``profile``. Otherwise the ledger is filtered on ``hf_user_id`` and
    the rows are ordered by ``submitted_at_utc`` descending (newest first,
    assuming the timestamps sort chronologically as strings).
    """
    user = auth.resolve_user(profile)
    if user is None:
        return pd.DataFrame(columns=MANAGE_COLUMNS)

    mine = [
        [
            entry["submission_id"],
            entry["method_name"],
            entry["metrics"]["Overall"],
            entry.get("is_selected", False),
            entry.get("submitted_at_utc", ""),
        ]
        for entry in ledger.list_submissions()
        if entry.get("hf_user_id") == user
    ]
    # The timestamp is the last cell of every record.
    ordered = sorted(mine, key=lambda record: record[4], reverse=True)
    return pd.DataFrame(ordered, columns=MANAGE_COLUMNS)


def set_my_selected(submission_id, profile: gr.OAuthProfile | None):
    """Mark one of the caller's submissions as their selected entry.

    Returns a ``(message, table)`` pair: a Markdown status line and the
    refreshed output of ``load_my_submissions``. ``ValueError`` and
    ``PermissionError`` raised by the ledger are reported in the message
    rather than propagated.
    """
    user = auth.resolve_user(profile)
    if user is None:
        message = "**Error:** sign in first."
    elif not submission_id or not submission_id.strip():
        message = "**Error:** paste a submission_id."
    else:
        target = submission_id.strip()
        try:
            ledger.set_selected(target, user)
        except (ValueError, PermissionError) as exc:
            message = f"**Error:** {exc}"
        else:
            message = f"✅ `{target}` is now your selected entry."
    # The table is always reloaded last, after any ledger update.
    return message, load_my_submissions(profile)


# ---------------------------------------------------------------------------
# About
# ---------------------------------------------------------------------------

# Markdown source rendered on the "About" tab via gr.Markdown(ABOUT_MD).
# The backslash after the opening quotes keeps the text from starting with a
# blank line. This is user-facing content: edit wording here, not in the UI code.
ABOUT_MD = """\
## EgoMemReason

**A Memory-driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding.**

EgoMemReason is a 500-question multiple-choice benchmark over week-long egocentric
videos (built on [EgoLife](https://egolife-ai.github.io/)). Models must answer
questions whose evidence is sparsely distributed across hours or days, exercising
three memory types:

- **Entity memory** — Cumulative State Tracking, Temporal Counting
- **Event memory** — Event Ordering, Event Linking
- **Behavior memory** — Spatial Preference Inference, Activity Pattern Inference

500 Qs · avg. 5.1 evidence segments / Q · avg. 25.9 h memory backtracking. The
strongest model in the paper reaches **39.6% Overall**.

### Resources

- 🌐 Project page: <https://egomemreason.github.io/>
- 📄 Paper: <https://arxiv.org/abs/2605.09874>
- 💻 Code & reference eval scripts: <https://github.com/Ziyang412/EgoMemReason>
- 📦 Public questions (no answers): <https://huggingface.co/datasets/Ted412/EgoMemReason>
- 🎬 EgoLife video frames: <https://egolife-ai.github.io/>

### Submission

Upload a JSON file with 500 entries:

```json
[
  {"example_id": 1, "predicted_answer": "A"},
  ...
]
```

Questions have 4-10 options (letters A-J) — `predicted_answer` must be a letter
that appears in that question's `options` dict. See
[SUBMISSION_FORMAT.md](https://github.com/Ziyang412/EgoMemReason/blob/main/SUBMISSION_FORMAT.md)
for the full spec.

### License

- **Annotations** (this Space + the public dataset): CC BY-NC 4.0.
- **Video frames**: governed by the [EgoLife data license](https://egolife-ai.github.io/) — you must accept their terms separately.

### Citation

```bibtex
@misc{wang2026egomemreasonmemorydrivenreasoningbenchmark,
      title={EgoMemReason: A Memory-Driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding},
      author={Ziyang Wang and Yue Zhang and Shoubin Yu and Ce Zhang and Zengqi Zhao and Jaehong Yoon and Hyunji Lee and Gedas Bertasius and Mohit Bansal},
      year={2026},
      eprint={2605.09874},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2605.09874},
}
```
"""


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

# Build the Gradio UI. Components are created in display order; event handlers
# are wired right after the components they connect.
with gr.Blocks(title="EgoMemReason Leaderboard") as demo:
    gr.Markdown("# 🧠 EgoMemReason — Leaderboard")
    gr.Markdown(
        "*Memory-driven reasoning over week-long egocentric video. 500 MCQs · "
        "Entity / Event / Behavior memory.*"
    )
    if BOOT_ERROR:
        gr.Markdown(f"⚠️ **Boot warning:** {BOOT_ERROR}\n\nSubmissions are disabled.")

    # Handlers that annotate a parameter as gr.OAuthProfile receive the
    # signed-in user's profile from this login.
    login_btn = gr.LoginButton()

    with gr.Tab("About"):
        gr.Markdown(ABOUT_MD)

    with gr.Tab("Leaderboard"):
        with gr.Row():
            show_all = gr.Checkbox(
                value=False,
                label="Show all submissions (not just each team's selected entry)",
            )
            refresh_btn = gr.Button("Refresh", size="sm")
        leaderboard_df = gr.Dataframe(
            value=load_leaderboard(False),
            headers=LEADERBOARD_COLUMNS,
            # The last column ("Links") holds Markdown links; without the
            # "markdown" datatype they would be displayed as raw text. All
            # other columns keep the default "str" datatype.
            datatype=["str"] * (len(LEADERBOARD_COLUMNS) - 1) + ["markdown"],
            interactive=False,
            wrap=False,
        )
        show_all.change(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])
        refresh_btn.click(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])
        # The initial value above is computed once when the app is built;
        # reload on every page load so visitors see current results.
        demo.load(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])

    with gr.Tab("Submit"):
        gr.Markdown("**Sign in with Hugging Face (button above) before submitting.** "
                    "Limit: 5 submissions per HF user per 24 h.")
        with gr.Row():
            team_name = gr.Textbox(label="Team name *", max_lines=1)
            method_name = gr.Textbox(label="Method name *", max_lines=1)
        with gr.Row():
            model_size = gr.Textbox(label="Model size (e.g. 8B, 32B, API)", max_lines=1)
            uses_external = gr.Radio(
                ["yes", "no"], label="Uses training data beyond EgoLife? *",
            )
        uses_frames = gr.Radio(
            ["frames-only", "video-only", "frames+audio", "captions-only", "other"],
            label="Video input modality *",
        )
        method_description = gr.Textbox(label="Method description", lines=3)
        with gr.Row():
            project_url = gr.Textbox(label="Project URL", max_lines=1)
            publication_url = gr.Textbox(label="Publication URL (arXiv/OpenReview)", max_lines=1)
        submission_file = gr.File(label="submission.json", file_types=[".json"])
        # Match the boot banner: the button is inactive when boot failed.
        submit_btn = gr.Button("Score & log", variant="primary",
                               interactive=not BOOT_ERROR)
        result_md = gr.Markdown()
        # `profile` is not listed in inputs: handle_submission receives it
        # through its gr.OAuthProfile annotation.
        submit_btn.click(
            handle_submission,
            inputs=[submission_file, team_name, method_name, model_size,
                    uses_external, uses_frames, method_description,
                    project_url, publication_url],
            outputs=[result_md],
        )

    with gr.Tab("Manage my submissions"):
        gr.Markdown(
            "Toggle which of your past submissions is the official **selected** entry. "
            "Only your own submissions appear here. "
            "Only one entry per HF user can be selected at a time."
        )
        my_subs = gr.Dataframe(
            value=pd.DataFrame(columns=MANAGE_COLUMNS),
            headers=MANAGE_COLUMNS,
            interactive=False,
            wrap=False,
        )
        selected_id = gr.Textbox(label="submission_id to mark as selected", max_lines=1)
        select_btn = gr.Button("Mark as my selected entry")
        manage_msg = gr.Markdown()
        # Populate the table on page load for whoever is signed in.
        demo.load(load_my_submissions, outputs=[my_subs])
        select_btn.click(set_my_selected, inputs=[selected_id], outputs=[manage_msg, my_subs])


if __name__ == "__main__":
    # Script entry point: turn on Gradio's queue, then start the server.
    queued_app = demo.queue()
    queued_app.launch()