File size: 12,872 Bytes
1bf5b23
 
 
 
 
 
 
 
 
 
 
7b3d33e
 
0385d22
 
 
7b3d33e
0385d22
7b3d33e
 
 
 
 
 
 
0385d22
 
 
 
 
 
 
 
1bf5b23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed7ed80
1bf5b23
 
 
ed7ed80
 
 
 
 
 
 
 
1bf5b23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f6d4bc
1b38b03
9cf02ac
1bf5b23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9cf02ac
1bf5b23
 
 
 
 
 
 
 
 
 
1b38b03
 
 
 
 
 
 
 
1bf5b23
 
 
 
 
 
 
 
 
ed7ed80
1bf5b23
 
 
 
 
 
 
 
 
 
483161e
 
 
1bf5b23
 
 
 
 
 
 
 
 
 
 
ed7ed80
1bf5b23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed7ed80
1bf5b23
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
"""EgoMemReason leaderboard β€” Gradio Space app.

Tabs:
  - Leaderboard   public, auto-refresh, toggle selected-only / show-all
  - Submit        HF login required; JSON upload + metadata form
  - Manage        toggle is_selected on your own past submissions
  - About         paper description + citation
"""

import os

# Workaround for a long-standing gradio_client bug that hits the /info endpoint
# when any component emits a JSON schema with `additionalProperties: True/False`
# (a plain bool). Both get_type() and _json_schema_to_python_type() assume the
# schema is a dict and crash on bools. Patch both before Gradio loads its
# FastAPI routes.
import gradio_client.utils as _gcu

# Keep a handle on the library's own implementation so the wrapper can delegate.
_orig_get_type = _gcu.get_type


def _safe_get_type(schema):
    """Return ``"Any"`` for a non-dict schema, else defer to gradio_client."""
    return _orig_get_type(schema) if isinstance(schema, dict) else "Any"


# Install the guarded version in place of the original.
_gcu.get_type = _safe_get_type

# Keep a handle on the library's own implementation so the wrapper can delegate.
_orig_json_schema = _gcu._json_schema_to_python_type


def _safe_json_schema(schema, defs=None):
    """Translate a JSON schema to a type string, tolerating non-dict schemas.

    Dict schemas are forwarded unchanged to the original gradio_client
    function. Anything else (e.g. a bare ``additionalProperties`` boolean)
    maps to ``"Any"`` when truthy and ``"None"`` when falsy.
    """
    if isinstance(schema, dict):
        return _orig_json_schema(schema, defs)
    # `additionalProperties: True` -> accepts anything; `False` -> accepts nothing.
    if schema:
        return "Any"
    return "None"


# Install the guarded version in place of the original.
_gcu._json_schema_to_python_type = _safe_json_schema

import gradio as gr
import pandas as pd

import auth
import evaluator
import ledger


# ---------------------------------------------------------------------------
# Boot: pull annotations_private.json from the private dataset repo.
# ---------------------------------------------------------------------------

# Assume a clean boot; a RuntimeError from the ledger is recorded (not raised)
# so the app can still come up and show a clear banner, e.g. in local dev
# without HF_TOKEN.
BOOT_ERROR = None
try:
    ledger.ensure_private_annotations()
except RuntimeError as boot_exc:
    BOOT_ERROR = str(boot_exc)


# Column headers of the public leaderboard table. The order must match the
# list returned by `_row_from_submission`, because `load_leaderboard` pairs
# rows and headers positionally when building the DataFrame.
LEADERBOARD_COLUMNS = [
    "#",
    "Method",
    "Team",
    "Overall",
    "Cumul",
    "Count",
    "Order",
    "Link",
    "Spatial",
    "Activity",
    "Size",
    "Ext",
    "Modality",
    "Links",
]


def _row_from_submission(sub, rank):
    m = sub["metrics"]
    links = []
    if sub.get("project_url"):
        links.append(f"[project]({sub['project_url']})")
    if sub.get("publication_url"):
        links.append(f"[paper]({sub['publication_url']})")
    return [
        rank,
        sub["method_name"],
        sub["team_name"],
        m["Overall"],
        m["Cumulative State Tracking"],
        m["Temporal Counting"],
        m["Event Ordering"],
        m["Event Linking"],
        m["Spatial Preference"],
        m["Activity Pattern"],
        sub.get("model_size") or "β€”",
        "yes" if sub.get("uses_external_data") else "no",
        sub.get("uses_video_frames") or "β€”",
        " Β· ".join(links) or "β€”",
    ]


def load_leaderboard(show_all):
    """Return the leaderboard as a DataFrame, best ``Overall`` score first.

    Args:
        show_all: When falsy, only submissions whose ``is_selected`` field
            is truthy are included; when truthy, every submission is.

    Returns:
        A ``pandas.DataFrame`` with ``LEADERBOARD_COLUMNS`` as headers and
        ranks numbered from 1.
    """
    candidates = (
        entry
        for entry in ledger.list_submissions()
        if show_all or entry.get("is_selected")
    )
    ranked = sorted(
        candidates,
        key=lambda entry: entry["metrics"]["Overall"],
        reverse=True,
    )
    table = [
        _row_from_submission(entry, position)
        for position, entry in enumerate(ranked, start=1)
    ]
    return pd.DataFrame(table, columns=LEADERBOARD_COLUMNS)


# ---------------------------------------------------------------------------
# Submit
# ---------------------------------------------------------------------------

def handle_submission(file, team_name, method_name, model_size, uses_external,
                      uses_frames, method_description, project_url,
                      publication_url, profile: gr.OAuthProfile | None):
    """Validate, score, and log one uploaded submission.

    Args:
        file: Uploaded file object; its ``.name`` is handed to the evaluator.
        team_name: Required team name (surrounding whitespace is stripped).
        method_name: Required method name (surrounding whitespace is stripped).
        model_size: Optional free-text model size.
        uses_external: Must be ``"yes"`` or ``"no"``.
        uses_frames: Selected video input modality; must be non-empty.
        method_description: Optional free-text description.
        project_url: Optional project link.
        publication_url: Optional publication link.
        profile: OAuth profile of the caller, or ``None`` when not signed in.
            It is not listed among the click handler's input components;
            presumably Gradio injects it based on the type annotation.

    Returns:
        A Markdown string: an error or rate-limit message, or a confirmation
        containing the new ``submission_id`` and a metrics table. Failures
        are reported in the returned text rather than raised.
    """
    # The page banner announces that submissions are disabled when boot
    # failed; enforce that here instead of failing later inside the scorer.
    if BOOT_ERROR:
        return f"**Error:** submissions are disabled: {BOOT_ERROR}"

    user = auth.resolve_user(profile)
    if user is None:
        return "**Error:** sign in with Hugging Face first (button at the top of the page)."

    # Treat whitespace-only names as missing and store the trimmed values.
    team_name = (team_name or "").strip()
    method_name = (method_name or "").strip()
    if not team_name or not method_name:
        return "**Error:** `team_name` and `method_name` are required."
    if uses_external not in ("yes", "no"):
        return "**Error:** answer `Uses external data?` (yes/no)."
    if not uses_frames:
        return "**Error:** pick a video input modality."
    if file is None:
        return "**Error:** upload a `.json` submission file."

    # Check the per-user quota before doing any scoring work.
    recent = ledger.count_recent(user, hours=24)
    if recent >= 5:
        return (f"**Rate limit:** you have **{recent}** submissions in the last 24 h "
                "(max 5). Try again later.")

    try:
        metrics = evaluator.score_submission(file.name)
    except ValueError as e:
        # ValueError is surfaced as a problem with the uploaded file.
        return f"**Validation error:**\n```\n{e}\n```"
    except Exception as e:
        # UI boundary: report anything else instead of crashing the handler.
        return f"**Internal error scoring submission:** `{type(e).__name__}: {e}`"

    try:
        sid = ledger.append_submission(
            hf_user_id=user,
            team_name=team_name,
            method_name=method_name,
            model_size=model_size,
            uses_external_data=(uses_external == "yes"),
            uses_video_frames=uses_frames,
            method_description=method_description,
            project_url=project_url,
            publication_url=publication_url,
            metrics=metrics,
        )
    except Exception as e:
        # Scoring succeeded, so still show the user their metrics.
        return (f"**Scored, but failed to persist to ledger:** `{type(e).__name__}: {e}`\n\n"
                f"Your metrics were:\n```\n{metrics}\n```")

    # One Markdown table row per metric; assumes numeric metric values.
    rows = "\n".join(f"| {k} | **{v:.2f}** |" for k, v in metrics.items())
    return (
        f"✅ **Submission logged.** `submission_id = {sid}`\n\n"
        f"| Metric | Score (%) |\n|---|---|\n{rows}\n\n"
        "Go to **Manage my submissions** to mark this as your official entry."
    )


# ---------------------------------------------------------------------------
# Manage
# ---------------------------------------------------------------------------

# Column headers of the "Manage my submissions" table. The order matches the
# row lists built in `load_my_submissions`, which pairs them positionally.
MANAGE_COLUMNS = ["submission_id", "method_name", "Overall", "is_selected", "submitted_at_utc"]


def load_my_submissions(profile: gr.OAuthProfile | None):
    """Return the signed-in user's submissions as a DataFrame, newest first.

    Args:
        profile: OAuth profile of the caller, or ``None`` when not signed in.

    Returns:
        A ``pandas.DataFrame`` with ``MANAGE_COLUMNS`` as headers. It is
        empty when the user cannot be resolved. Rows are ordered by the
        ``submitted_at_utc`` value, descending.
    """
    user = auth.resolve_user(profile)
    if user is None:
        return pd.DataFrame(columns=MANAGE_COLUMNS)

    # Keep only the ledger entries owned by this user.
    mine = [
        [
            entry["submission_id"],
            entry["method_name"],
            entry["metrics"]["Overall"],
            entry.get("is_selected", False),
            entry.get("submitted_at_utc", ""),
        ]
        for entry in ledger.list_submissions()
        if entry.get("hf_user_id") == user
    ]
    # Index 4 is the submitted_at_utc cell.
    newest_first = sorted(mine, key=lambda record: record[4], reverse=True)
    return pd.DataFrame(newest_first, columns=MANAGE_COLUMNS)


def set_my_selected(submission_id, profile: gr.OAuthProfile | None):
    """Mark one of the caller's submissions as their selected entry.

    Args:
        submission_id: Identifier typed by the user; surrounding whitespace
            is ignored.
        profile: OAuth profile of the caller, or ``None`` when not signed in.

    Returns:
        A ``(message, table)`` pair: a Markdown status string and the
        refreshed result of ``load_my_submissions(profile)``. ``ValueError``
        and ``PermissionError`` from the ledger are reported in the message
        rather than raised.
    """
    user = auth.resolve_user(profile)
    if user is None:
        message = "**Error:** sign in first."
    else:
        # Normalize once so the ledger call and the message agree.
        wanted = (submission_id or "").strip()
        if not wanted:
            message = "**Error:** paste a submission_id."
        else:
            try:
                ledger.set_selected(wanted, user)
            except (ValueError, PermissionError) as exc:
                message = f"**Error:** {exc}"
            else:
                message = f"✅ `{wanted}` is now your selected entry."
    # Every outcome returns a freshly loaded table alongside the message.
    return message, load_my_submissions(profile)


# ---------------------------------------------------------------------------
# About
# ---------------------------------------------------------------------------

# Markdown shown in the "About" tab: benchmark summary, resources, submission
# format, license, and citation.
ABOUT_MD = """\
## EgoMemReason

**A Memory-driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding.**

EgoMemReason is a 500-question multiple-choice benchmark over week-long egocentric
videos (built on [EgoLife](https://egolife-ai.github.io/)). Models must answer
questions whose evidence is sparsely distributed across hours or days, exercising
three memory types:

- **Entity memory** — Cumulative State Tracking, Temporal Counting
- **Event memory** — Event Ordering, Event Linking
- **Behavior memory** — Spatial Preference Inference, Activity Pattern Inference

500 Qs · avg. 5.1 evidence segments / Q · avg. 25.9 h memory backtracking. The
strongest model in the paper reaches **39.6% Overall**.

### Resources

- 🌐 Project page: <https://egomemreason.github.io/>
- 📄 Paper: <https://arxiv.org/abs/2605.09874>
- 💻 Code & reference eval scripts: <https://github.com/Ziyang412/EgoMemReason>
- 📦 Public questions (no answers): <https://huggingface.co/datasets/Ted412/EgoMemReason>
- 🎬 EgoLife video frames: <https://egolife-ai.github.io/>

### Submission

Upload a JSON file with 500 entries:

```json
[
  {"example_id": 1, "predicted_answer": "A"},
  ...
]
```

Questions have 4-10 options (letters A-J) — `predicted_answer` must be a letter
that appears in that question's `options` dict. See
[SUBMISSION_FORMAT.md](https://github.com/Ziyang412/EgoMemReason/blob/main/SUBMISSION_FORMAT.md)
for the full spec.

### License

- **Annotations** (this Space + the public dataset): CC BY-NC 4.0.
- **Video frames**: governed by the [EgoLife data license](https://egolife-ai.github.io/) — you must accept their terms separately.

### Citation

```bibtex
@misc{wang2026egomemreasonmemorydrivenreasoningbenchmark,
      title={EgoMemReason: A Memory-Driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding},
      author={Ziyang Wang and Yue Zhang and Shoubin Yu and Ce Zhang and Zengqi Zhao and Jaehong Yoon and Hyunji Lee and Gedas Bertasius and Mohit Bansal},
      year={2026},
      eprint={2605.09874},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2605.09874},
}
```
"""


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

# Page layout: header, optional boot banner, login button, then four tabs
# (About, Leaderboard, Submit, Manage my submissions).
with gr.Blocks(title="EgoMemReason Leaderboard") as demo:
    gr.Markdown("# 🧠 EgoMemReason — Leaderboard")
    gr.Markdown(
        "*Memory-driven reasoning over week-long egocentric video. 500 MCQs · "
        "Entity / Event / Behavior memory.*"
    )
    # Surface a boot failure prominently instead of refusing to start.
    if BOOT_ERROR:
        gr.Markdown(f"⚠️ **Boot warning:** {BOOT_ERROR}\n\nSubmissions are disabled.")

    login_btn = gr.LoginButton()

    with gr.Tab("About"):
        gr.Markdown(ABOUT_MD)

    with gr.Tab("Leaderboard"):
        with gr.Row():
            show_all = gr.Checkbox(
                value=False,
                label="Show all submissions (not just each team's selected entry)",
            )
            refresh_btn = gr.Button("Refresh", size="sm")
        leaderboard_df = gr.Dataframe(
            value=load_leaderboard(False),
            headers=LEADERBOARD_COLUMNS,
            interactive=False,
            wrap=False,
        )
        show_all.change(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])
        refresh_btn.click(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])
        # The initial value above is computed once at import time; reload the
        # table on every page load so visitors see current results without
        # having to press Refresh.
        demo.load(load_leaderboard, inputs=[show_all], outputs=[leaderboard_df])

    with gr.Tab("Submit"):
        gr.Markdown("**Sign in with Hugging Face (button above) before submitting.** "
                    "Limit: 5 submissions per HF user per 24 h.")
        with gr.Row():
            team_name = gr.Textbox(label="Team name *", max_lines=1)
            method_name = gr.Textbox(label="Method name *", max_lines=1)
        with gr.Row():
            model_size = gr.Textbox(label="Model size (e.g. 8B, 32B, API)", max_lines=1)
            uses_external = gr.Radio(
                ["yes", "no"], label="Uses training data beyond EgoLife? *",
            )
        uses_frames = gr.Radio(
            ["frames-only", "video-only", "frames+audio", "captions-only", "other"],
            label="Video input modality *",
        )
        method_description = gr.Textbox(label="Method description", lines=3)
        with gr.Row():
            project_url = gr.Textbox(label="Project URL", max_lines=1)
            publication_url = gr.Textbox(label="Publication URL (arXiv/OpenReview)", max_lines=1)
        submission_file = gr.File(label="submission.json", file_types=[".json"])
        submit_btn = gr.Button("Score & log", variant="primary")
        result_md = gr.Markdown()
        # The input order must match handle_submission's positional
        # parameters; its trailing `profile` argument is not listed here.
        submit_btn.click(
            handle_submission,
            inputs=[submission_file, team_name, method_name, model_size,
                    uses_external, uses_frames, method_description,
                    project_url, publication_url],
            outputs=[result_md],
        )

    with gr.Tab("Manage my submissions"):
        gr.Markdown(
            "Toggle which of your past submissions is the official **selected** entry. "
            "Only your own submissions appear here. "
            "Only one entry per HF user can be selected at a time."
        )
        my_subs = gr.Dataframe(
            value=pd.DataFrame(columns=MANAGE_COLUMNS),
            headers=MANAGE_COLUMNS,
            interactive=False,
            wrap=False,
        )
        selected_id = gr.Textbox(label="submission_id to mark as selected", max_lines=1)
        select_btn = gr.Button("Mark as my selected entry")
        manage_msg = gr.Markdown()
        # Populate the caller's own table on page load.
        demo.load(load_my_submissions, outputs=[my_subs])
        select_btn.click(set_my_selected, inputs=[selected_id], outputs=[manage_msg, my_subs])


# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.queue().launch()