Leaderboard / miniapp_leaderboard.py
ha251's picture
Update miniapp_leaderboard.py
66be23a verified
import datetime
import io
import json
import logging
import os
import re
import uuid

import gradio as gr
import pandas as pd
import requests
from huggingface_hub import HfApi, hf_hub_download
# --- Application identity -------------------------------------------------
APP_NAME = "MiniApp"

# --- Hugging Face Hub storage (read from the environment) -----------------
HF_TOKEN = os.environ.get("HF_TOKEN")
LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", "").strip()

# Path prefixes inside the dataset repo.
PENDING_PREFIX = "pending/"
APPROVED_PREFIX = "approved/"

# --- Resend e-mail notification settings (all three must be non-empty) ----
RESEND_API_KEY = os.environ.get("RESEND_API_KEY", "").strip()
NOTIFY_EMAIL_TO = os.environ.get("NOTIFY_EMAIL_TO", "").strip()
NOTIFY_EMAIL_FROM = os.environ.get("NOTIFY_EMAIL_FROM", "").strip()

# --- Leaderboard table layout ---------------------------------------------
# Score columns that are parsed as numbers and rendered with two decimals.
NUMERIC_COLS = ["avg", "easy", "mid", "hard"]
# Columns shown in the table, in display order.
COLUMNS = ["model_name", "model_family", *NUMERIC_COLS, "submitted_at"]
def _api():
    """Create a Hub API client that authenticates with ``HF_TOKEN``."""
    client = HfApi(token=HF_TOKEN)
    return client
def _slug(s: str):
    """Turn free text into a lowercase, hyphen-separated token.

    Every run of characters outside ``a-z0-9`` (after lowercasing) becomes a
    single ``-``; leading/trailing hyphens are dropped. Falsy input, or input
    that reduces to nothing, yields ``"model"``.
    """
    lowered = s.lower() if s else ""
    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
    trimmed = hyphenated.strip("-")
    return trimmed if trimmed else "model"
def _load_df(prefix: str):
    """Build the leaderboard table from the JSON records stored under ``prefix``.

    Lists the files of the ``LEADERBOARD_DATASET`` dataset repo, downloads every
    ``.json`` file whose path starts with ``prefix`` and assembles the records
    into a DataFrame restricted to ``COLUMNS``. Rows are sorted by ``avg``
    (highest first) and the ``NUMERIC_COLS`` values are rendered as
    two-decimal strings (empty string when missing or not numeric).

    Args:
        prefix: Repo path prefix to read from, e.g. ``APPROVED_PREFIX``.

    Returns:
        A DataFrame with exactly the ``COLUMNS`` columns. It is empty when the
        token/dataset are not configured, the repo cannot be listed, or no
        record could be read.
    """
    log = logging.getLogger(__name__)

    if not HF_TOKEN or not LEADERBOARD_DATASET:
        return pd.DataFrame(columns=COLUMNS)

    try:
        repo_files = _api().list_repo_files(repo_id=LEADERBOARD_DATASET, repo_type="dataset")
    except Exception:
        # Best effort: the page should still render (empty) when the Hub is
        # unreachable, but leave a trace for whoever reads the logs.
        log.warning("Could not list files of dataset %r", LEADERBOARD_DATASET, exc_info=True)
        return pd.DataFrame(columns=COLUMNS)

    rows = []
    for name in repo_files:
        if not (name.startswith(prefix) and name.endswith(".json")):
            continue
        # "_submitted.json" files are per-user daily markers, not score
        # records; loading one would add a blank row to the table.
        if name.rsplit("/", 1)[-1] == "_submitted.json":
            continue
        try:
            path = hf_hub_download(
                repo_id=LEADERBOARD_DATASET,
                repo_type="dataset",
                filename=name,
                token=HF_TOKEN,
            )
            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(path, "r", encoding="utf-8") as fp:
                record = json.load(fp)
        except Exception:
            # One unreadable file must not hide the rest of the leaderboard.
            log.warning("Skipping unreadable record %r", name, exc_info=True)
            continue
        if not isinstance(record, dict):
            log.warning("Skipping %r: expected a JSON object", name)
            continue
        # Some stored records use "model name" / "model family" (with a
        # space) as keys; map them onto the column names so those rows are
        # not displayed with blank model information.
        for legacy, canonical in (("model name", "model_name"), ("model family", "model_family")):
            if canonical not in record and legacy in record:
                record[canonical] = record[legacy]
        rows.append(record)

    if not rows:
        return pd.DataFrame(columns=COLUMNS)

    df = pd.DataFrame(rows)
    # Guarantee every display column exists even if no record provided it.
    for col in COLUMNS:
        if col not in df.columns:
            df[col] = ""
    # Sort on real numbers first, then format for display.
    for col in NUMERIC_COLS:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.sort_values(by="avg", ascending=False)
    for col in NUMERIC_COLS:
        df[col] = df[col].map(lambda x: "" if pd.isna(x) else f"{x:.2f}")
    return df[COLUMNS]
def refresh():
    """Reload the table of approved results (records under ``APPROVED_PREFIX``)."""
    approved = _load_df(APPROVED_PREFIX)
    return approved
def _today_utc():
    """Return the current UTC date as an ISO-8601 string (``YYYY-MM-DD``)."""
    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in
    # UTC yields the same calendar date without the deprecation warning.
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()
def _send_email_resend(subject: str, text: str):
    """Send a plain-text notification e-mail through the Resend HTTP API.

    This is best effort: nothing is sent when any of ``RESEND_API_KEY``,
    ``NOTIFY_EMAIL_TO`` or ``NOTIFY_EMAIL_FROM`` is empty, and delivery
    problems are logged instead of raised so that a notification failure
    cannot break the caller.

    Args:
        subject: E-mail subject line.
        text: Plain-text e-mail body.

    Returns:
        None.
    """
    if not (RESEND_API_KEY and NOTIFY_EMAIL_TO and NOTIFY_EMAIL_FROM):
        return
    try:
        response = requests.post(
            "https://api.resend.com/emails",
            headers={"Authorization": f"Bearer {RESEND_API_KEY}", "Content-Type": "application/json"},
            json={"from": NOTIFY_EMAIL_FROM, "to": [NOTIFY_EMAIL_TO], "subject": subject, "text": text},
            timeout=20,
        )
        # Surface 4xx/5xx answers (bad key, unverified sender, ...) instead
        # of silently treating them as delivered.
        response.raise_for_status()
    except requests.RequestException as exc:
        logging.getLogger(__name__).warning("Resend notification failed: %s", exc)
def _already_submitted_today(api: HfApi, day: str, username: str) -> bool:
    """Tell whether ``username`` already has a submission marker for ``day``.

    The marker is the file ``<PENDING_PREFIX><day>/<username>/_submitted.json``
    in the ``LEADERBOARD_DATASET`` dataset repo.

    Returns:
        True when the marker is listed in the repo, or when the listing fails
        for any reason; False otherwise.
    """
    expected = f"{PENDING_PREFIX}{day}/{username}/_submitted.json"
    try:
        listing = api.list_repo_files(repo_id=LEADERBOARD_DATASET, repo_type="dataset")
        found = expected in listing
    except Exception:
        # Safer default: if we cannot check, treat it as already submitted.
        return True
    return found
# Changed here: the username is taken from the OAuth profile.
def submit(model_name, model_family, zip_file, profile: gr.OAuthProfile):
    """Validate a submission, store it under ``PENDING_PREFIX`` and notify reviewers.

    On success three files are uploaded to the ``LEADERBOARD_DATASET`` repo
    under ``<PENDING_PREFIX><day>/<username>/``: a JSON record (scores set to
    0), the uploaded zip, and a ``_submitted.json`` marker that enforces the
    one-submission-per-day limit. The marker is written last so that a failed
    upload does not consume the user's daily submission.

    Args:
        model_name: Display name of the submitted model.
        model_family: Model family label.
        zip_file: The uploaded archive; either a path string or an object
            whose ``name`` attribute is the local file path.
        profile: Profile of the signed-in user; ``profile.username``
            identifies the submitter.

    Returns:
        A ``(status_message, leaderboard_dataframe)`` tuple.
    """
    log = logging.getLogger(__name__)

    if profile is None or not getattr(profile, "username", None):
        return "Please sign in with Hugging Face first.", refresh()
    username = profile.username

    # Whitespace-only text is as good as missing.
    model_name = (model_name or "").strip()
    model_family = (model_family or "").strip()
    if not model_name or not model_family or zip_file is None:
        return "All fields are required.", refresh()

    # Work with the local path: it covers both a plain path string and a
    # tempfile-like wrapper, and it is a form upload_file() accepts.
    local_zip = getattr(zip_file, "name", None) or str(zip_file)
    if not local_zip.lower().endswith(".zip"):
        return "Please upload a .zip file.", refresh()

    if not HF_TOKEN or not LEADERBOARD_DATASET:
        return "Server is not configured (HF_TOKEN / LEADERBOARD_DATASET).", refresh()

    api = _api()
    day = _today_utc()
    if _already_submitted_today(api, day, username):
        return f"Limit: you can only submit once per day. (user={username}, day={day})", refresh()

    # Same "<naive ISO timestamp>Z" format as before, without the deprecated
    # datetime.utcnow().
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    safe_model = _slug(model_name)
    nonce = uuid.uuid4().hex[:6]  # keeps paths unique for identical model names

    base_dir = f"{PENDING_PREFIX}{day}/{username}/"
    json_path = f"{base_dir}{now}-{safe_model}-{nonce}.json"
    zip_path = f"{base_dir}{now}-{safe_model}-{nonce}.zip"
    marker_path = f"{base_dir}_submitted.json"

    # Keys match the leaderboard column names so that the record can be
    # displayed as-is once it has been scored and approved.
    payload = {
        "model_name": model_name,
        "model_family": model_family,
        "avg": 0, "easy": 0, "mid": 0, "hard": 0,
        "submitted_at": now,
        "username": username,
        "day": day,
    }
    marker = {"submitted_at": now, "username": username, "day": day}

    try:
        api.upload_file(
            repo_id=LEADERBOARD_DATASET, repo_type="dataset",
            path_or_fileobj=io.BytesIO(json.dumps(payload, indent=2).encode("utf-8")),
            path_in_repo=json_path,
            commit_message=f"submit {model_name} by {username}",
        )
        api.upload_file(
            repo_id=LEADERBOARD_DATASET, repo_type="dataset",
            path_or_fileobj=local_zip,
            path_in_repo=zip_path,
            commit_message=f"upload zip {model_name} by {username}",
        )
        api.upload_file(
            repo_id=LEADERBOARD_DATASET, repo_type="dataset",
            path_or_fileobj=io.BytesIO(json.dumps(marker, indent=2).encode("utf-8")),
            path_in_repo=marker_path,
            commit_message=f"marker {day} {username}",
        )
    except Exception:
        # Report the failure in the status area instead of a raw traceback.
        log.exception("Upload failed for user=%s day=%s", username, day)
        return "Upload failed. Please try again later.", refresh()

    # The submission is already stored at this point; a notification problem
    # must not be reported to the user as a failed submission.
    try:
        _send_email_resend(
            subject=f"[{APP_NAME}] New submission from {username} ({day})",
            text=f"user: {username}\nday: {day}\nmodel: {model_name}\nfamily: {model_family}\njson: {json_path}\nzip: {zip_path}\n",
        )
    except Exception:
        log.exception("Notification failed for user=%s day=%s", username, day)

    return "Submitted. Waiting for review.", refresh()
# Gradio page: benchmark description, leaderboard table, and submission form.
with gr.Blocks(title=f"{APP_NAME} leaderboard") as demo:
    gr.Markdown(f"# {APP_NAME} Leaderboard")
    # Static description of the benchmark and its dataset splits.
    gr.Markdown("""
## Data
MiniAppBench is the first comprehensive benchmark designed to evaluate principle-driven, interactive application generation. Unlike prior benchmarks that emphasize static UI layouts or isolated algorithmic code snippets, MiniAppBench targets **MiniApps**—HTML-based applications that require both faithful visual rendering and non-trivial interaction logic.
The dataset is split into two subsets: **validation (100 instances)** and **test (400 instances)**, and can be accessed at **[MiniAppBench dataset](https://huggingface.co/datasets/MiniAppBench/Dataset)**. The **validation** set includes publicly available **evaluation references** to support reproducible experiments, while the **test** set keeps the references hidden to enable unbiased evaluation.
""")
    gr.Markdown(
        """
## Leaderboard
All results shown on this leaderboard are evaluated on the **test split** of MiniAppBench.
""",
    )
    # Read-only results table; starts empty and is filled by refresh().
    leaderboard = gr.Dataframe(
        value=pd.DataFrame(columns=COLUMNS),  # do not access the Hub at startup
        interactive=False,
        wrap=True,
    )
    refresh_btn = gr.Button("Refresh")
    refresh_btn.click(refresh, outputs=[leaderboard])
    # Refresh once automatically when the page loads.
    demo.load(refresh, outputs=[leaderboard])
    gr.Markdown("## Submit")
    # Submission rules shown above the form.
    gr.Markdown(
        """
**Submission requirements**
- Please **sign in with Hugging Face** before submitting.
- **One submission per user per day (UTC)**.
- Upload a **.zip** file only.
- The `.zip` must contain the HTML outputs for the **test set queries**.
- Each file should be named using the query index: `<index>.html` (e.g., `1.html`, `2.html`, ...).
- We may contact you via email for verification and request additional materials. Please be prepared to provide:
- **Model access** (one of the following):
- Preferred: an **inference API endpoint** we can use to reproduce the results.
- Alternatively: **model checkpoints (ckpts)** plus clear **deployment / inference instructions** (environment, dependencies, and how to run).
- **A related paper**, if available (e.g., an **arXiv link** or a PDF).
- After you submit, we will update the results within **3 days**.
""",
    )
    # Form inputs, passed to submit() in this order.
    model_name = gr.Textbox(label="Model name", placeholder="e.g. MyModel v1")
    model_family = gr.Textbox(label="Model family", placeholder="e.g. Llama / Qwen / InternLM ...")
    zip_file = gr.File(label="Upload zip (.zip only)", file_types=[".zip"])
    # Sign-in and submit buttons side by side.
    with gr.Row():
        with gr.Column(scale=1, min_width=220):
            login_btn = gr.LoginButton()
        with gr.Column(scale=1, min_width=260):
            submit_btn = gr.Button("Submit", variant="primary")
    # Shows the message returned by submit().
    status = gr.Markdown()
    # submit() also takes a `profile` argument that is not listed in `inputs`;
    # presumably Gradio injects it from the gr.OAuthProfile annotation — confirm.
    submit_btn.click(submit, inputs=[model_name, model_family, zip_file], outputs=[status, leaderboard])
# Start the app when this module is executed.
demo.launch()