"""EgoMemReason leaderboard — Gradio Space app. Tabs: - Leaderboard public, auto-refresh, toggle selected-only / show-all - Submit HF login required; JSON upload + metadata form - Manage toggle is_selected on your own past submissions - About paper description + citation """ import os # Workaround for a long-standing gradio_client bug that hits the /info endpoint # when any component emits a JSON schema with `additionalProperties: True/False` # (a plain bool). Both get_type() and _json_schema_to_python_type() assume the # schema is a dict and crash on bools. Patch both before Gradio loads its # FastAPI routes. import gradio_client.utils as _gcu _orig_get_type = _gcu.get_type def _safe_get_type(schema): if not isinstance(schema, dict): return "Any" return _orig_get_type(schema) _gcu.get_type = _safe_get_type _orig_json_schema = _gcu._json_schema_to_python_type def _safe_json_schema(schema, defs=None): if not isinstance(schema, dict): # `additionalProperties: True` → accepts anything; `False` → accepts nothing. return "Any" if schema else "None" return _orig_json_schema(schema, defs) _gcu._json_schema_to_python_type = _safe_json_schema import gradio as gr import pandas as pd import auth import evaluator import ledger # --------------------------------------------------------------------------- # Boot: pull annotations_private.json from the private dataset repo. # --------------------------------------------------------------------------- try: ledger.ensure_private_annotations() except RuntimeError as e: # In local dev without HF_TOKEN, allow the app to come up with a clear banner. BOOT_ERROR = str(e) else: BOOT_ERROR = None LEADERBOARD_COLUMNS = [ "#", "Method", "Team", "Overall", "Cumul", "Count", "Order", "Link", "Spatial", "Activity", "Size", "Ext", "Modality", "Links", ] def _row_from_submission(sub, rank): m = sub["metrics"] links = [] if sub.get("project_url"): links.append(f"[project]({sub['project_url']})") if sub.get("publication_url"): links.append(f"[paper]({sub['publication_url']})") return [ rank, sub["method_name"], sub["team_name"], m["Overall"], m["Cumulative State Tracking"], m["Temporal Counting"], m["Event Ordering"], m["Event Linking"], m["Spatial Preference"], m["Activity Pattern"], sub.get("model_size") or "—", "yes" if sub.get("uses_external_data") else "no", sub.get("uses_video_frames") or "—", " · ".join(links) or "—", ] def load_leaderboard(show_all): subs = ledger.list_submissions() if not show_all: subs = [s for s in subs if s.get("is_selected")] subs = sorted(subs, key=lambda s: s["metrics"]["Overall"], reverse=True) rows = [_row_from_submission(s, i + 1) for i, s in enumerate(subs)] return pd.DataFrame(rows, columns=LEADERBOARD_COLUMNS) # --------------------------------------------------------------------------- # Submit # --------------------------------------------------------------------------- def handle_submission(file, team_name, method_name, model_size, uses_external, uses_frames, method_description, project_url, publication_url, profile: gr.OAuthProfile | None): user = auth.resolve_user(profile) if user is None: return "**Error:** sign in with Hugging Face first (button at the top of the page)." if not team_name or not method_name: return "**Error:** `team_name` and `method_name` are required." if uses_external not in ("yes", "no"): return "**Error:** answer `Uses external data?` (yes/no)." if not uses_frames: return "**Error:** pick a video input modality." if file is None: return "**Error:** upload a `.json` submission file." 
    recent = ledger.count_recent(user, hours=24)
    if recent >= 5:
        return (f"**Rate limit:** you have **{recent}** submissions in the last 24 h "
                "(max 5). Try again later.")

    try:
        metrics = evaluator.score_submission(file.name)
    except ValueError as e:
        return f"**Validation error:**\n```\n{e}\n```"
    except Exception as e:
        return f"**Internal error scoring submission:** `{type(e).__name__}: {e}`"

    try:
        sid = ledger.append_submission(
            hf_user_id=user,
            team_name=team_name,
            method_name=method_name,
            model_size=model_size,
            uses_external_data=(uses_external == "yes"),
            uses_video_frames=uses_frames,
            method_description=method_description,
            project_url=project_url,
            publication_url=publication_url,
            metrics=metrics,
        )
    except Exception as e:
        return (f"**Scored, but failed to persist to ledger:** `{type(e).__name__}: {e}`\n\n"
                f"Your metrics were:\n```\n{metrics}\n```")

    rows = "\n".join(f"| {k} | **{v:.2f}** |" for k, v in metrics.items())
    return (
        f"✅ **Submission logged.** `submission_id = {sid}`\n\n"
        f"| Metric | Score (%) |\n|---|---|\n{rows}\n\n"
        "Go to **Manage my submissions** to mark this as your official entry."
    )


# ---------------------------------------------------------------------------
# Manage
# ---------------------------------------------------------------------------
MANAGE_COLUMNS = ["submission_id", "method_name", "Overall", "is_selected", "submitted_at_utc"]


def load_my_submissions(profile: gr.OAuthProfile | None):
    user = auth.resolve_user(profile)
    if user is None:
        return pd.DataFrame(columns=MANAGE_COLUMNS)
    rows = []
    for sub in ledger.list_submissions():
        if sub.get("hf_user_id") != user:
            continue
        rows.append([
            sub["submission_id"],
            sub["method_name"],
            sub["metrics"]["Overall"],
            sub.get("is_selected", False),
            sub.get("submitted_at_utc", ""),
        ])
    rows.sort(key=lambda r: r[4], reverse=True)
    return pd.DataFrame(rows, columns=MANAGE_COLUMNS)


def set_my_selected(submission_id, profile: gr.OAuthProfile | None):
    user = auth.resolve_user(profile)
    if user is None:
        return "**Error:** sign in first.", load_my_submissions(profile)
    if not submission_id or not submission_id.strip():
        return "**Error:** paste a submission_id.", load_my_submissions(profile)
    try:
        ledger.set_selected(submission_id.strip(), user)
    except (ValueError, PermissionError) as e:
        return f"**Error:** {e}", load_my_submissions(profile)
    return f"✅ `{submission_id.strip()}` is now your selected entry.", load_my_submissions(profile)


# ---------------------------------------------------------------------------
# About
# ---------------------------------------------------------------------------
ABOUT_MD = """\
## EgoMemReason

**A Memory-driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding.**

EgoMemReason is a 500-question multiple-choice benchmark over week-long egocentric videos
(built on [EgoLife](https://egolife-ai.github.io/)). Models must answer questions whose
evidence is sparsely distributed across hours or days, exercising three memory types:

- **Entity memory** — Cumulative State Tracking, Temporal Counting
- **Event memory** — Event Ordering, Event Linking
- **Behavior memory** — Spatial Preference Inference, Activity Pattern Inference

500 Qs · avg. 5.1 evidence segments / Q · avg. 25.9 h memory backtracking.
The strongest model in the paper reaches **39.6% Overall**.

### Resources

- 🌐 Project page:
- 📄 Paper:
- 💻 Code & reference eval scripts:
- 📦 Public questions (no answers):
- 🎬 EgoLife video frames:

### Submission

Upload a JSON file with 500 entries:

```json
[
  {"example_id": 1, "predicted_answer": "A"},
  ...
]
```

Questions have 4-10 options (letters A-J) — `predicted_answer` must be a letter that appears in
that question's `options` dict. See
[SUBMISSION_FORMAT.md](https://github.com/Ziyang412/EgoMemReason/blob/main/SUBMISSION_FORMAT.md)
for the full spec.
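
For reference, a minimal sketch for writing this file (the `predictions` dict below is a
hypothetical stand-in for however your model stores its answers):

```python
import json

# Hypothetical example: predictions maps each integer example_id to its answer letter.
predictions = {1: "A", 2: "C"}  # ... one entry per question, 500 total

entries = [
    {"example_id": eid, "predicted_answer": letter}
    for eid, letter in sorted(predictions.items())
]

with open("submission.json", "w") as f:
    json.dump(entries, f, indent=2)
```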

### License

- **Annotations** (this Space + the public dataset): CC BY-NC 4.0.
- **Video frames**: governed by the [EgoLife data license](https://egolife-ai.github.io/) — you must accept their terms separately.

### Citation

```bibtex
@misc{wang2026egomemreasonmemorydrivenreasoningbenchmark,
      title={EgoMemReason: A Memory-Driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding},
      author={Ziyang Wang and Yue Zhang and Shoubin Yu and Ce Zhang and Zengqi Zhao and Jaehong Yoon and Hyunji Lee and Gedas Bertasius and Mohit Bansal},
      year={2026},
      eprint={2605.09874},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2605.09874},
}
```
"""


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
with gr.Blocks(title="EgoMemReason Leaderboard") as demo:
    gr.Markdown("# 🧠 EgoMemReason — Leaderboard")
    gr.Markdown(
        "*Memory-driven reasoning over week-long egocentric video. 500 MCQs · "
        "Entity / Event / Behavior memory.*"
    )
    if BOOT_ERROR:
        gr.Markdown(f"⚠️ **Boot warning:** {BOOT_ERROR}\n\nSubmissions are disabled.")

    login_btn = gr.LoginButton()

    with gr.Tab("About"):
        gr.Markdown(ABOUT_MD)

    with gr.Tab("Leaderboard"):
        with gr.Row():
            show_all = gr.Checkbox(
                value=False,
                label="Show all submissions (not just each team's selected entry)",
            )
            refresh_btn = gr.Button("Refresh", size="sm")
        leaderboard_df = gr.Dataframe(
            value=load_leaderboard(False),
            headers=LEADERBOARD_COLUMNS,
            interactive=False,
            wrap=False,
        )
        show_all.change(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])
        refresh_btn.click(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])

    with gr.Tab("Submit"):
        gr.Markdown("**Sign in with Hugging Face (button above) before submitting.** "
                    "Limit: 5 submissions per HF user per 24 h.")
        with gr.Row():
            team_name = gr.Textbox(label="Team name *", max_lines=1)
            method_name = gr.Textbox(label="Method name *", max_lines=1)
        with gr.Row():
            model_size = gr.Textbox(label="Model size (e.g. 8B, 32B, API)", max_lines=1)
            uses_external = gr.Radio(
                ["yes", "no"],
                label="Uses training data beyond EgoLife? *",
            )
        uses_frames = gr.Radio(
            ["frames-only", "video-only", "frames+audio", "captions-only", "other"],
            label="Video input modality *",
        )
        method_description = gr.Textbox(label="Method description", lines=3)
        with gr.Row():
            project_url = gr.Textbox(label="Project URL", max_lines=1)
            publication_url = gr.Textbox(label="Publication URL (arXiv/OpenReview)", max_lines=1)
        submission_file = gr.File(label="submission.json", file_types=[".json"])
        submit_btn = gr.Button("Score & log", variant="primary")
        result_md = gr.Markdown()

        submit_btn.click(
            handle_submission,
            inputs=[submission_file, team_name, method_name, model_size, uses_external,
                    uses_frames, method_description, project_url, publication_url],
            outputs=[result_md],
        )

    with gr.Tab("Manage my submissions"):
        gr.Markdown(
            "Toggle which of your past submissions is the official **selected** entry. "
            "Only your own submissions appear here. "
            "Only one entry per HF user can be selected at a time."
        )
        my_subs = gr.Dataframe(
            value=pd.DataFrame(columns=MANAGE_COLUMNS),
            headers=MANAGE_COLUMNS,
            interactive=False,
            wrap=False,
        )
        selected_id = gr.Textbox(label="submission_id to mark as selected", max_lines=1)
        select_btn = gr.Button("Mark as my selected entry")
        manage_msg = gr.Markdown()

    # On page load: populate the signed-in user's submissions and refresh the
    # public leaderboard so newly logged entries appear without a manual click.
    demo.load(load_my_submissions, outputs=[my_subs])
    demo.load(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])
    select_btn.click(set_my_selected, inputs=[selected_id], outputs=[manage_msg, my_subs])


if __name__ == "__main__":
    demo.queue().launch()