# EgoMemReason leaderboard Space — app.py
# Author: Ziyang Wang (commit 1b38b03, "add arXiv paper")
"""EgoMemReason leaderboard β€” Gradio Space app.
Tabs:
- Leaderboard public, auto-refresh, toggle selected-only / show-all
- Submit HF login required; JSON upload + metadata form
- Manage toggle is_selected on your own past submissions
- About paper description + citation
"""
import os
# Workaround for a long-standing gradio_client bug that hits the /info endpoint
# when any component emits a JSON schema with `additionalProperties: True/False`
# (a plain bool). Both get_type() and _json_schema_to_python_type() assume the
# schema is a dict and crash on bools. Patch both before Gradio loads its
# FastAPI routes.
import gradio_client.utils as _gcu

_orig_get_type = _gcu.get_type


def _safe_get_type(schema):
    """Like gradio_client's get_type, but tolerate non-dict (bool) schemas."""
    return _orig_get_type(schema) if isinstance(schema, dict) else "Any"


_gcu.get_type = _safe_get_type

_orig_json_schema = _gcu._json_schema_to_python_type


def _safe_json_schema(schema, defs=None):
    """Delegate dict schemas to the original converter; map bools directly.

    `additionalProperties: True` accepts anything; `False` accepts nothing.
    """
    if isinstance(schema, dict):
        return _orig_json_schema(schema, defs)
    return "Any" if schema else "None"


_gcu._json_schema_to_python_type = _safe_json_schema
import gradio as gr
import pandas as pd
import auth
import evaluator
import ledger
# ---------------------------------------------------------------------------
# Boot: pull annotations_private.json from the private dataset repo.
# ---------------------------------------------------------------------------
BOOT_ERROR = None
try:
    ledger.ensure_private_annotations()
except RuntimeError as e:
    # Local dev without HF_TOKEN: keep the app up and show a clear banner.
    BOOT_ERROR = str(e)
# Column order must match the rows built by _row_from_submission:
# rank / identity, the seven metric columns, then model/metadata columns.
LEADERBOARD_COLUMNS = [
    "#", "Method", "Team",
    "Overall", "Cumul", "Count", "Order", "Link", "Spatial", "Activity",
    "Size", "Ext", "Modality", "Links",
]
def _row_from_submission(sub, rank):
m = sub["metrics"]
links = []
if sub.get("project_url"):
links.append(f"[project]({sub['project_url']})")
if sub.get("publication_url"):
links.append(f"[paper]({sub['publication_url']})")
return [
rank,
sub["method_name"],
sub["team_name"],
m["Overall"],
m["Cumulative State Tracking"],
m["Temporal Counting"],
m["Event Ordering"],
m["Event Linking"],
m["Spatial Preference"],
m["Activity Pattern"],
sub.get("model_size") or "β€”",
"yes" if sub.get("uses_external_data") else "no",
sub.get("uses_video_frames") or "β€”",
" Β· ".join(links) or "β€”",
]
def load_leaderboard(show_all):
    """Return the leaderboard as a DataFrame, ranked by Overall (descending).

    When `show_all` is False, only each team's selected entry is included.
    """
    submissions = ledger.list_submissions()
    if not show_all:
        submissions = [s for s in submissions if s.get("is_selected")]
    ranked = sorted(submissions, key=lambda s: s["metrics"]["Overall"], reverse=True)
    table = [_row_from_submission(sub, pos) for pos, sub in enumerate(ranked, start=1)]
    return pd.DataFrame(table, columns=LEADERBOARD_COLUMNS)
# ---------------------------------------------------------------------------
# Submit
# ---------------------------------------------------------------------------
def handle_submission(file, team_name, method_name, model_size, uses_external,
                      uses_frames, method_description, project_url,
                      publication_url, profile: gr.OAuthProfile | None):
    """Validate the Submit form, score the uploaded JSON, and log the result.

    Arguments mirror the Submit-tab widgets; `profile` is injected by Gradio
    for the signed-in user (None when not signed in). Returns a Markdown
    status string rendered below the form. Fix: the success checkmark was
    mojibake ("βœ…") from a mis-decoded paste; now "✅".
    """
    user = auth.resolve_user(profile)
    # --- cheap form validation first, one clear error at a time ---
    if user is None:
        return "**Error:** sign in with Hugging Face first (button at the top of the page)."
    if not team_name or not method_name:
        return "**Error:** `team_name` and `method_name` are required."
    if uses_external not in ("yes", "no"):
        return "**Error:** answer `Uses external data?` (yes/no)."
    if not uses_frames:
        return "**Error:** pick a video input modality."
    if file is None:
        return "**Error:** upload a `.json` submission file."
    # --- rate limit: at most 5 submissions per HF user per 24 h ---
    recent = ledger.count_recent(user, hours=24)
    if recent >= 5:
        return (f"**Rate limit:** you have **{recent}** submissions in the last 24 h "
                "(max 5). Try again later.")
    # --- score the predictions file ---
    try:
        metrics = evaluator.score_submission(file.name)
    except ValueError as e:
        # ValueError is the evaluator's contract for a malformed submission.
        return f"**Validation error:**\n```\n{e}\n```"
    except Exception as e:
        return f"**Internal error scoring submission:** `{type(e).__name__}: {e}`"
    # --- persist to the ledger ---
    try:
        sid = ledger.append_submission(
            hf_user_id=user,
            team_name=team_name,
            method_name=method_name,
            model_size=model_size,
            uses_external_data=(uses_external == "yes"),
            uses_video_frames=uses_frames,
            method_description=method_description,
            project_url=project_url,
            publication_url=publication_url,
            metrics=metrics,
        )
    except Exception as e:
        # Scoring succeeded; echo the metrics so the user's result is not lost.
        return (f"**Scored, but failed to persist to ledger:** `{type(e).__name__}: {e}`\n\n"
                f"Your metrics were:\n```\n{metrics}\n```")
    rows = "\n".join(f"| {k} | **{v:.2f}** |" for k, v in metrics.items())
    return (
        f"✅ **Submission logged.** `submission_id = {sid}`\n\n"
        f"| Metric | Score (%) |\n|---|---|\n{rows}\n\n"
        "Go to **Manage my submissions** to mark this as your official entry."
    )
# ---------------------------------------------------------------------------
# Manage
# ---------------------------------------------------------------------------
# Columns of the "Manage my submissions" table; rows are sorted newest-first
# by the last column (submitted_at_utc).
MANAGE_COLUMNS = ["submission_id", "method_name", "Overall", "is_selected", "submitted_at_utc"]
def load_my_submissions(profile: gr.OAuthProfile | None):
    """Return the signed-in user's submissions as a DataFrame, newest first.

    An empty table (MANAGE_COLUMNS only) is returned when not signed in.
    """
    user = auth.resolve_user(profile)
    if user is None:
        return pd.DataFrame(columns=MANAGE_COLUMNS)
    mine = [s for s in ledger.list_submissions() if s.get("hf_user_id") == user]
    rows = [
        [
            s["submission_id"],
            s["method_name"],
            s["metrics"]["Overall"],
            s.get("is_selected", False),
            s.get("submitted_at_utc", ""),
        ]
        for s in mine
    ]
    rows.sort(key=lambda r: r[-1], reverse=True)
    return pd.DataFrame(rows, columns=MANAGE_COLUMNS)
def set_my_selected(submission_id, profile: gr.OAuthProfile | None):
    """Mark one of the caller's submissions as the official selected entry.

    Returns (status_markdown, refreshed_submissions_dataframe). Fixes: the
    success checkmark was mojibake ("βœ…") from a mis-decoded paste, and the
    id is now stripped once instead of three separate `.strip()` calls.
    """
    user = auth.resolve_user(profile)
    if user is None:
        return "**Error:** sign in first.", load_my_submissions(profile)
    sid = (submission_id or "").strip()
    if not sid:
        return "**Error:** paste a submission_id.", load_my_submissions(profile)
    try:
        # Raises ValueError for an unknown id, PermissionError if not owned by `user`.
        ledger.set_selected(sid, user)
    except (ValueError, PermissionError) as e:
        return f"**Error:** {e}", load_my_submissions(profile)
    return f"✅ `{sid}` is now your selected entry.", load_my_submissions(profile)
# ---------------------------------------------------------------------------
# About
# ---------------------------------------------------------------------------
# About-tab Markdown. Fix: repaired mojibake from a mis-decoded paste
# ("β€”"→"—", "Β·"→"·", "πŸ“„"→"📄", "πŸ’»"→"💻", "πŸ“¦"→"📦") and restored the
# paragraph blank lines lost to whitespace mangling.
ABOUT_MD = """\
## EgoMemReason

**A Memory-driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding.**

EgoMemReason is a 500-question multiple-choice benchmark over week-long egocentric
videos (built on [EgoLife](https://egolife-ai.github.io/)). Models must answer
questions whose evidence is sparsely distributed across hours or days, exercising
three memory types:

- **Entity memory** — Cumulative State Tracking, Temporal Counting
- **Event memory** — Event Ordering, Event Linking
- **Behavior memory** — Spatial Preference Inference, Activity Pattern Inference

500 Qs · avg. 5.1 evidence segments / Q · avg. 25.9 h memory backtracking. The
strongest model in the paper reaches **39.6% Overall**.

### Resources
- 🌐 Project page: <https://egomemreason.github.io/>
- 📄 Paper: <https://arxiv.org/abs/2605.09874>
- 💻 Code & reference eval scripts: <https://github.com/Ziyang412/EgoMemReason>
- 📦 Public questions (no answers): <https://huggingface.co/datasets/Ted412/EgoMemReason>
- 🎬 EgoLife video frames: <https://egolife-ai.github.io/>

### Submission
Upload a JSON file with 500 entries:
```json
[
  {"example_id": 1, "predicted_answer": "A"},
  ...
]
```
Questions have 4-10 options (letters A-J) — `predicted_answer` must be a letter
that appears in that question's `options` dict. See
[SUBMISSION_FORMAT.md](https://github.com/Ziyang412/EgoMemReason/blob/main/SUBMISSION_FORMAT.md)
for the full spec.

### License
- **Annotations** (this Space + the public dataset): CC BY-NC 4.0.
- **Video frames**: governed by the [EgoLife data license](https://egolife-ai.github.io/) — you must accept their terms separately.

### Citation
```bibtex
@misc{wang2026egomemreasonmemorydrivenreasoningbenchmark,
      title={EgoMemReason: A Memory-Driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding},
      author={Ziyang Wang and Yue Zhang and Shoubin Yu and Ce Zhang and Zengqi Zhao and Jaehong Yoon and Hyunji Lee and Gedas Bertasius and Mohit Bansal},
      year={2026},
      eprint={2605.09874},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2605.09874},
}
```
"""
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
# Top-level layout: header + login button, then four tabs
# (About / Leaderboard / Submit / Manage my submissions).
# Fix: the H1 em dash and the tagline separator were mojibake ("β€”", "Β·")
# from a mis-decoded paste; now "—" and "·".
with gr.Blocks(title="EgoMemReason Leaderboard") as demo:
    gr.Markdown("# 🧠 EgoMemReason — Leaderboard")
    gr.Markdown(
        "*Memory-driven reasoning over week-long egocentric video. 500 MCQs · "
        "Entity / Event / Behavior memory.*"
    )
    if BOOT_ERROR:
        # Shown when the private annotations could not be fetched at startup.
        gr.Markdown(f"⚠️ **Boot warning:** {BOOT_ERROR}\n\nSubmissions are disabled.")
    login_btn = gr.LoginButton()

    with gr.Tab("About"):
        gr.Markdown(ABOUT_MD)

    with gr.Tab("Leaderboard"):
        with gr.Row():
            show_all = gr.Checkbox(
                value=False,
                label="Show all submissions (not just each team's selected entry)",
            )
            refresh_btn = gr.Button("Refresh", size="sm")
        leaderboard_df = gr.Dataframe(
            value=load_leaderboard(False),
            headers=LEADERBOARD_COLUMNS,
            interactive=False,
            wrap=False,
        )
        # Both the toggle and the Refresh button re-query the ledger.
        show_all.change(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])
        refresh_btn.click(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])

    with gr.Tab("Submit"):
        gr.Markdown("**Sign in with Hugging Face (button above) before submitting.** "
                    "Limit: 5 submissions per HF user per 24 h.")
        with gr.Row():
            team_name = gr.Textbox(label="Team name *", max_lines=1)
            method_name = gr.Textbox(label="Method name *", max_lines=1)
        with gr.Row():
            model_size = gr.Textbox(label="Model size (e.g. 8B, 32B, API)", max_lines=1)
            uses_external = gr.Radio(
                ["yes", "no"], label="Uses training data beyond EgoLife? *",
            )
        uses_frames = gr.Radio(
            ["frames-only", "video-only", "frames+audio", "captions-only", "other"],
            label="Video input modality *",
        )
        method_description = gr.Textbox(label="Method description", lines=3)
        with gr.Row():
            project_url = gr.Textbox(label="Project URL", max_lines=1)
            publication_url = gr.Textbox(label="Publication URL (arXiv/OpenReview)", max_lines=1)
        submission_file = gr.File(label="submission.json", file_types=[".json"])
        submit_btn = gr.Button("Score & log", variant="primary")
        result_md = gr.Markdown()
        # The gr.OAuthProfile parameter of handle_submission is injected by
        # Gradio automatically, so it is not listed in `inputs`.
        submit_btn.click(
            handle_submission,
            inputs=[submission_file, team_name, method_name, model_size,
                    uses_external, uses_frames, method_description,
                    project_url, publication_url],
            outputs=[result_md],
        )

    with gr.Tab("Manage my submissions"):
        gr.Markdown(
            "Toggle which of your past submissions is the official **selected** entry. "
            "Only your own submissions appear here. "
            "Only one entry per HF user can be selected at a time."
        )
        my_subs = gr.Dataframe(
            value=pd.DataFrame(columns=MANAGE_COLUMNS),
            headers=MANAGE_COLUMNS,
            interactive=False,
            wrap=False,
        )
        selected_id = gr.Textbox(label="submission_id to mark as selected", max_lines=1)
        select_btn = gr.Button("Mark as my selected entry")
        manage_msg = gr.Markdown()
        # Populate the table on page load (per-user, via the injected profile).
        demo.load(load_my_submissions, outputs=[my_subs])
        select_btn.click(set_my_selected, inputs=[selected_id], outputs=[manage_msg, my_subs])
# Script entry point: enable Gradio's request queue, then serve the app.
if __name__ == "__main__":
    demo.queue().launch()