# EgoMemReason leaderboard Space — app.py
# Author: Ziyang Wang (commit 1b38b03, "add arXiv paper")
"""EgoMemReason leaderboard β€” Gradio Space app.
Tabs:
- Leaderboard public, auto-refresh, toggle selected-only / show-all
- Submit HF login required; JSON upload + metadata form
- Manage toggle is_selected on your own past submissions
- About paper description + citation
"""
import os
# Workaround for a long-standing gradio_client bug that hits the /info endpoint
# when any component emits a JSON schema with `additionalProperties: True/False`
# (a plain bool). Both get_type() and _json_schema_to_python_type() assume the
# schema is a dict and crash on bools. Patch both before Gradio loads its
# FastAPI routes.
import gradio_client.utils as _gcu

_orig_get_type = _gcu.get_type


def _safe_get_type(schema):
    """Like gradio_client's get_type, but tolerate non-dict (bool) schemas."""
    return _orig_get_type(schema) if isinstance(schema, dict) else "Any"


_gcu.get_type = _safe_get_type

_orig_json_schema = _gcu._json_schema_to_python_type


def _safe_json_schema(schema, defs=None):
    """Delegate dict schemas to the original converter; map bools directly.

    `additionalProperties: True` accepts anything; `False` accepts nothing.
    """
    if isinstance(schema, dict):
        return _orig_json_schema(schema, defs)
    return "Any" if schema else "None"


_gcu._json_schema_to_python_type = _safe_json_schema
import gradio as gr
import pandas as pd
import auth
import evaluator
import ledger
# ---------------------------------------------------------------------------
# Boot: pull annotations_private.json from the private dataset repo.
# ---------------------------------------------------------------------------
BOOT_ERROR = None
try:
    ledger.ensure_private_annotations()
except RuntimeError as e:
    # Local dev without HF_TOKEN: keep the app up and show a clear banner.
    BOOT_ERROR = str(e)
# Column order must match the rows built by _row_from_submission:
# rank / identity, the seven metric columns, then model/metadata columns.
LEADERBOARD_COLUMNS = [
    "#", "Method", "Team",
    "Overall", "Cumul", "Count", "Order", "Link", "Spatial", "Activity",
    "Size", "Ext", "Modality", "Links",
]
def _row_from_submission(sub, rank):
m = sub["metrics"]
links = []
if sub.get("project_url"):
links.append(f"[project]({sub['project_url']})")
if sub.get("publication_url"):
links.append(f"[paper]({sub['publication_url']})")
return [
rank,
sub["method_name"],
sub["team_name"],
m["Overall"],
m["Cumulative State Tracking"],
m["Temporal Counting"],
m["Event Ordering"],
m["Event Linking"],
m["Spatial Preference"],
m["Activity Pattern"],
sub.get("model_size") or "β€”",
"yes" if sub.get("uses_external_data") else "no",
sub.get("uses_video_frames") or "β€”",
" Β· ".join(links) or "β€”",
]
def load_leaderboard(show_all):
    """Return the leaderboard as a DataFrame, ranked by Overall (descending).

    When `show_all` is False, only each team's selected entry is included.
    """
    submissions = ledger.list_submissions()
    if not show_all:
        submissions = [s for s in submissions if s.get("is_selected")]
    ranked = sorted(submissions, key=lambda s: s["metrics"]["Overall"], reverse=True)
    table = [_row_from_submission(sub, pos) for pos, sub in enumerate(ranked, start=1)]
    return pd.DataFrame(table, columns=LEADERBOARD_COLUMNS)
# ---------------------------------------------------------------------------
# Submit
# ---------------------------------------------------------------------------
def handle_submission(file, team_name, method_name, model_size, uses_external,
                      uses_frames, method_description, project_url,
                      publication_url, profile: gr.OAuthProfile | None):
    """Validate the Submit form, score the uploaded JSON, and log the result.

    Arguments mirror the Submit-tab widgets; `profile` is injected by Gradio
    for the signed-in user (None when not signed in). Returns a Markdown
    status string rendered below the form. Fix: the success checkmark was
    mojibake ("βœ…") from a mis-decoded paste; now "✅".
    """
    user = auth.resolve_user(profile)
    # --- cheap form validation first, one clear error at a time ---
    if user is None:
        return "**Error:** sign in with Hugging Face first (button at the top of the page)."
    if not team_name or not method_name:
        return "**Error:** `team_name` and `method_name` are required."
    if uses_external not in ("yes", "no"):
        return "**Error:** answer `Uses external data?` (yes/no)."
    if not uses_frames:
        return "**Error:** pick a video input modality."
    if file is None:
        return "**Error:** upload a `.json` submission file."
    # --- rate limit: at most 5 submissions per HF user per 24 h ---
    recent = ledger.count_recent(user, hours=24)
    if recent >= 5:
        return (f"**Rate limit:** you have **{recent}** submissions in the last 24 h "
                "(max 5). Try again later.")
    # --- score the predictions file ---
    try:
        metrics = evaluator.score_submission(file.name)
    except ValueError as e:
        # ValueError is the evaluator's contract for a malformed submission.
        return f"**Validation error:**\n```\n{e}\n```"
    except Exception as e:
        return f"**Internal error scoring submission:** `{type(e).__name__}: {e}`"
    # --- persist to the ledger ---
    try:
        sid = ledger.append_submission(
            hf_user_id=user,
            team_name=team_name,
            method_name=method_name,
            model_size=model_size,
            uses_external_data=(uses_external == "yes"),
            uses_video_frames=uses_frames,
            method_description=method_description,
            project_url=project_url,
            publication_url=publication_url,
            metrics=metrics,
        )
    except Exception as e:
        # Scoring succeeded; echo the metrics so the user's result is not lost.
        return (f"**Scored, but failed to persist to ledger:** `{type(e).__name__}: {e}`\n\n"
                f"Your metrics were:\n```\n{metrics}\n```")
    rows = "\n".join(f"| {k} | **{v:.2f}** |" for k, v in metrics.items())
    return (
        f"✅ **Submission logged.** `submission_id = {sid}`\n\n"
        f"| Metric | Score (%) |\n|---|---|\n{rows}\n\n"
        "Go to **Manage my submissions** to mark this as your official entry."
    )
# ---------------------------------------------------------------------------
# Manage
# ---------------------------------------------------------------------------
# Columns of the "Manage my submissions" table; rows are sorted newest-first
# by the last column (submitted_at_utc).
MANAGE_COLUMNS = ["submission_id", "method_name", "Overall", "is_selected", "submitted_at_utc"]
def load_my_submissions(profile: gr.OAuthProfile | None):
    """Return the signed-in user's submissions as a DataFrame, newest first.

    An empty table (MANAGE_COLUMNS only) is returned when not signed in.
    """
    user = auth.resolve_user(profile)
    if user is None:
        return pd.DataFrame(columns=MANAGE_COLUMNS)
    mine = [s for s in ledger.list_submissions() if s.get("hf_user_id") == user]
    rows = [
        [
            s["submission_id"],
            s["method_name"],
            s["metrics"]["Overall"],
            s.get("is_selected", False),
            s.get("submitted_at_utc", ""),
        ]
        for s in mine
    ]
    rows.sort(key=lambda r: r[-1], reverse=True)
    return pd.DataFrame(rows, columns=MANAGE_COLUMNS)
def set_my_selected(submission_id, profile: gr.OAuthProfile | None):
    """Mark one of the caller's submissions as the official selected entry.

    Returns (status_markdown, refreshed_submissions_dataframe). Fixes: the
    success checkmark was mojibake ("βœ…") from a mis-decoded paste, and the
    id is now stripped once instead of three separate `.strip()` calls.
    """
    user = auth.resolve_user(profile)
    if user is None:
        return "**Error:** sign in first.", load_my_submissions(profile)
    sid = (submission_id or "").strip()
    if not sid:
        return "**Error:** paste a submission_id.", load_my_submissions(profile)
    try:
        # Raises ValueError for an unknown id, PermissionError if not owned by `user`.
        ledger.set_selected(sid, user)
    except (ValueError, PermissionError) as e:
        return f"**Error:** {e}", load_my_submissions(profile)
    return f"✅ `{sid}` is now your selected entry.", load_my_submissions(profile)
# ---------------------------------------------------------------------------
# About
# ---------------------------------------------------------------------------
# About-tab Markdown. Fix: repaired mojibake from a mis-decoded paste
# ("β€”"→"—", "Β·"→"·", "πŸ“„"→"📄", "πŸ’»"→"💻", "πŸ“¦"→"📦") and restored the
# paragraph blank lines lost to whitespace mangling.
ABOUT_MD = """\
## EgoMemReason

**A Memory-driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding.**

EgoMemReason is a 500-question multiple-choice benchmark over week-long egocentric
videos (built on [EgoLife](https://egolife-ai.github.io/)). Models must answer
questions whose evidence is sparsely distributed across hours or days, exercising
three memory types:

- **Entity memory** — Cumulative State Tracking, Temporal Counting
- **Event memory** — Event Ordering, Event Linking
- **Behavior memory** — Spatial Preference Inference, Activity Pattern Inference

500 Qs · avg. 5.1 evidence segments / Q · avg. 25.9 h memory backtracking. The
strongest model in the paper reaches **39.6% Overall**.

### Resources
- 🌐 Project page: <https://egomemreason.github.io/>
- 📄 Paper: <https://arxiv.org/abs/2605.09874>
- 💻 Code & reference eval scripts: <https://github.com/Ziyang412/EgoMemReason>
- 📦 Public questions (no answers): <https://huggingface.co/datasets/Ted412/EgoMemReason>
- 🎬 EgoLife video frames: <https://egolife-ai.github.io/>

### Submission
Upload a JSON file with 500 entries:
```json
[
  {"example_id": 1, "predicted_answer": "A"},
  ...
]
```
Questions have 4-10 options (letters A-J) — `predicted_answer` must be a letter
that appears in that question's `options` dict. See
[SUBMISSION_FORMAT.md](https://github.com/Ziyang412/EgoMemReason/blob/main/SUBMISSION_FORMAT.md)
for the full spec.

### License
- **Annotations** (this Space + the public dataset): CC BY-NC 4.0.
- **Video frames**: governed by the [EgoLife data license](https://egolife-ai.github.io/) — you must accept their terms separately.

### Citation
```bibtex
@misc{wang2026egomemreasonmemorydrivenreasoningbenchmark,
      title={EgoMemReason: A Memory-Driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding},
      author={Ziyang Wang and Yue Zhang and Shoubin Yu and Ce Zhang and Zengqi Zhao and Jaehong Yoon and Hyunji Lee and Gedas Bertasius and Mohit Bansal},
      year={2026},
      eprint={2605.09874},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2605.09874},
}
```
"""
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
# Top-level layout: header + login button, then four tabs
# (About / Leaderboard / Submit / Manage my submissions).
# Fix: the H1 em dash and the tagline separator were mojibake ("β€”", "Β·")
# from a mis-decoded paste; now "—" and "·".
with gr.Blocks(title="EgoMemReason Leaderboard") as demo:
    gr.Markdown("# 🧠 EgoMemReason — Leaderboard")
    gr.Markdown(
        "*Memory-driven reasoning over week-long egocentric video. 500 MCQs · "
        "Entity / Event / Behavior memory.*"
    )
    if BOOT_ERROR:
        # Shown when the private annotations could not be fetched at startup.
        gr.Markdown(f"⚠️ **Boot warning:** {BOOT_ERROR}\n\nSubmissions are disabled.")
    login_btn = gr.LoginButton()

    with gr.Tab("About"):
        gr.Markdown(ABOUT_MD)

    with gr.Tab("Leaderboard"):
        with gr.Row():
            show_all = gr.Checkbox(
                value=False,
                label="Show all submissions (not just each team's selected entry)",
            )
            refresh_btn = gr.Button("Refresh", size="sm")
        leaderboard_df = gr.Dataframe(
            value=load_leaderboard(False),
            headers=LEADERBOARD_COLUMNS,
            interactive=False,
            wrap=False,
        )
        # Both the toggle and the Refresh button re-query the ledger.
        show_all.change(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])
        refresh_btn.click(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])

    with gr.Tab("Submit"):
        gr.Markdown("**Sign in with Hugging Face (button above) before submitting.** "
                    "Limit: 5 submissions per HF user per 24 h.")
        with gr.Row():
            team_name = gr.Textbox(label="Team name *", max_lines=1)
            method_name = gr.Textbox(label="Method name *", max_lines=1)
        with gr.Row():
            model_size = gr.Textbox(label="Model size (e.g. 8B, 32B, API)", max_lines=1)
            uses_external = gr.Radio(
                ["yes", "no"], label="Uses training data beyond EgoLife? *",
            )
        uses_frames = gr.Radio(
            ["frames-only", "video-only", "frames+audio", "captions-only", "other"],
            label="Video input modality *",
        )
        method_description = gr.Textbox(label="Method description", lines=3)
        with gr.Row():
            project_url = gr.Textbox(label="Project URL", max_lines=1)
            publication_url = gr.Textbox(label="Publication URL (arXiv/OpenReview)", max_lines=1)
        submission_file = gr.File(label="submission.json", file_types=[".json"])
        submit_btn = gr.Button("Score & log", variant="primary")
        result_md = gr.Markdown()
        # The gr.OAuthProfile parameter of handle_submission is injected by
        # Gradio automatically, so it is not listed in `inputs`.
        submit_btn.click(
            handle_submission,
            inputs=[submission_file, team_name, method_name, model_size,
                    uses_external, uses_frames, method_description,
                    project_url, publication_url],
            outputs=[result_md],
        )

    with gr.Tab("Manage my submissions"):
        gr.Markdown(
            "Toggle which of your past submissions is the official **selected** entry. "
            "Only your own submissions appear here. "
            "Only one entry per HF user can be selected at a time."
        )
        my_subs = gr.Dataframe(
            value=pd.DataFrame(columns=MANAGE_COLUMNS),
            headers=MANAGE_COLUMNS,
            interactive=False,
            wrap=False,
        )
        selected_id = gr.Textbox(label="submission_id to mark as selected", max_lines=1)
        select_btn = gr.Button("Mark as my selected entry")
        manage_msg = gr.Markdown()
        # Populate the table on page load (per-user, via the injected profile).
        demo.load(load_my_submissions, outputs=[my_subs])
        select_btn.click(set_my_selected, inputs=[selected_id], outputs=[manage_msg, my_subs])
# Script entry point: enable Gradio's request queue, then serve the app.
if __name__ == "__main__":
    demo.queue().launch()