Spaces:

MiniAppBench
/

Leaderboard

Running

App Files Files Community

Leaderboard / miniapp_leaderboard.py

ha251

Update miniapp_leaderboard.py

66be23a verified about 1 month ago

raw

history blame contribute delete

8.42 kB

	import datetime
	import io
	import json
	import logging
	import os
	import re
	import uuid
	from typing import Optional

	import gradio as gr
	import pandas as pd
	import requests
	from huggingface_hub import HfApi, hf_hub_download

	# Display name used in the page title, the headings and the notification subject.
	APP_NAME = "MiniApp"

	# Hub access token and the dataset repo that stores submissions; both are read
	# from the environment. When either is missing the app shows an empty table
	# and refuses submissions.
	HF_TOKEN = os.environ.get("HF_TOKEN")
	LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", "").strip()

	# Path prefixes inside the dataset repo: new submissions are written under
	# PENDING_PREFIX, and the leaderboard table is built from APPROVED_PREFIX.
	PENDING_PREFIX = "pending/"
	APPROVED_PREFIX = "approved/"

	# Settings for the Resend e-mail notification; the notification is skipped
	# unless all three values are non-empty.
	RESEND_API_KEY = os.environ.get("RESEND_API_KEY", "").strip()
	NOTIFY_EMAIL_TO = os.environ.get("NOTIFY_EMAIL_TO", "").strip()
	NOTIFY_EMAIL_FROM = os.environ.get("NOTIFY_EMAIL_FROM", "").strip()

	# Columns of the leaderboard table, in display order.
	COLUMNS = ["model_name","model_family","avg","easy","mid","hard","submitted_at"]
	# Score columns: coerced to numbers for sorting, then shown with two decimals.
	NUMERIC_COLS = ["avg","easy","mid","hard"]

	def _api() -> HfApi:
	    """Create a Hub client that authenticates with the configured ``HF_TOKEN``."""
	    client = HfApi(token=HF_TOKEN)
	    return client

	def _slug(s: str) -> str:
	    """Turn *s* into a lowercase, dash-separated token usable in a file name.

	    Every run of characters other than ``a-z`` / ``0-9`` becomes a single
	    dash, and leading/trailing dashes are removed. Falls back to ``"model"``
	    when nothing is left (empty, ``None`` or symbol-only input).
	    """
	    lowered = s.lower() if s else ""
	    dashed = re.sub(r"[^a-z0-9]+", "-", lowered)
	    trimmed = dashed.strip("-")
	    if trimmed:
	        return trimmed
	    return "model"

	def _load_df(prefix: str) -> pd.DataFrame:
	    """Build the leaderboard table from the JSON records stored under *prefix*.

	    Every ``*.json`` file in ``LEADERBOARD_DATASET`` whose path starts with
	    *prefix* is downloaded and parsed into one row. Files that cannot be
	    fetched or parsed are skipped (with a logged warning) so that one bad
	    record never blanks the whole table.

	    Args:
	        prefix: Repo path prefix to read from, e.g. ``"approved/"``.

	    Returns:
	        A DataFrame restricted to ``COLUMNS``, sorted by ``avg`` descending,
	        with the score columns rendered as two-decimal strings (``""`` for
	        missing values). An empty frame with the same columns is returned
	        when the app is not configured, the listing fails, or nothing is found.
	    """
	    if not HF_TOKEN or not LEADERBOARD_DATASET:
	        return pd.DataFrame(columns=COLUMNS)

	    log = logging.getLogger(__name__)
	    try:
	        repo_files = _api().list_repo_files(repo_id=LEADERBOARD_DATASET, repo_type="dataset")
	    except Exception:
	        # Best effort: the page must still render when the Hub is unreachable.
	        log.warning("Could not list files of dataset %s", LEADERBOARD_DATASET, exc_info=True)
	        return pd.DataFrame(columns=COLUMNS)

	    # Older records used space-separated keys; map them onto the column names.
	    legacy_keys = (("model name", "model_name"), ("model family", "model_family"))

	    rows = []
	    for name in repo_files:
	        if not (name.startswith(prefix) and name.endswith(".json")):
	            continue
	        # Per-user rate-limit markers are bookkeeping files, not results.
	        if name.rsplit("/", 1)[-1] == "_submitted.json":
	            continue
	        try:
	            path = hf_hub_download(
	                repo_id=LEADERBOARD_DATASET,
	                repo_type="dataset",
	                filename=name,
	                token=HF_TOKEN,
	            )
	            # Records are written as UTF-8, so read them the same way
	            # regardless of the platform's default encoding.
	            with open(path, "r", encoding="utf-8") as fp:
	                record = json.load(fp)
	        except Exception:
	            log.warning("Skipping unreadable record %s", name, exc_info=True)
	            continue
	        if not isinstance(record, dict):
	            log.warning("Skipping %s: expected a JSON object", name)
	            continue
	        for legacy, current in legacy_keys:
	            if legacy in record and current not in record:
	                record[current] = record[legacy]
	        rows.append(record)

	    if not rows:
	        return pd.DataFrame(columns=COLUMNS)

	    df = pd.DataFrame(rows)
	    for col in COLUMNS:
	        if col not in df.columns:
	            df[col] = ""

	    # Sort on real numbers first, then format for display.
	    for col in NUMERIC_COLS:
	        df[col] = pd.to_numeric(df[col], errors="coerce")
	    df = df.sort_values(by="avg", ascending=False)
	    for col in NUMERIC_COLS:
	        df[col] = df[col].map(lambda x: "" if pd.isna(x) else f"{x:.2f}")

	    # A field present in only some records leaves NaN in the others; show blanks.
	    for col in COLUMNS:
	        if col not in NUMERIC_COLS:
	            df[col] = df[col].astype(object).where(df[col].notna(), "")

	    return df[COLUMNS]

	def refresh():
	    """Reload the leaderboard table from the approved submissions."""
	    approved_table = _load_df(APPROVED_PREFIX)
	    return approved_table

	def _today_utc() -> str:
	    """Return the current UTC calendar date as an ISO ``YYYY-MM-DD`` string."""
	    # datetime.utcnow() is deprecated since Python 3.12; asking for an aware
	    # UTC timestamp yields the same date without the deprecation warning.
	    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

	def _send_email_resend(subject: str, text: str) -> None:
	    """Send a plain-text notification through the Resend HTTP API (best effort).

	    Does nothing unless ``RESEND_API_KEY``, ``NOTIFY_EMAIL_TO`` and
	    ``NOTIFY_EMAIL_FROM`` are all set. Network errors and non-success HTTP
	    statuses are logged and swallowed: a failed notification must not make
	    the calling operation fail.

	    Args:
	        subject: Subject line of the e-mail.
	        text: Plain-text body of the e-mail.
	    """
	    if not (RESEND_API_KEY and NOTIFY_EMAIL_TO and NOTIFY_EMAIL_FROM):
	        return
	    try:
	        response = requests.post(
	            "https://api.resend.com/emails",
	            headers={"Authorization": f"Bearer {RESEND_API_KEY}", "Content-Type": "application/json"},
	            json={"from": NOTIFY_EMAIL_FROM, "to": [NOTIFY_EMAIL_TO], "subject": subject, "text": text},
	            timeout=20,
	        )
	        # Surface 4xx/5xx answers instead of silently ignoring them.
	        response.raise_for_status()
	    except requests.RequestException as exc:
	        logging.getLogger(__name__).warning("Resend notification failed: %s", exc)

	def _already_submitted_today(api: HfApi, day: str, username: str) -> bool:
	    """Tell whether *username* already has a submission marker for *day*.

	    The marker is ``<PENDING_PREFIX><day>/<username>/_submitted.json`` in
	    ``LEADERBOARD_DATASET``. The check fails closed: if the repo listing
	    cannot be obtained, the user is treated as having already submitted.
	    """
	    marker = f"{PENDING_PREFIX}{day}/{username}/_submitted.json"
	    try:
	        return marker in api.list_repo_files(repo_id=LEADERBOARD_DATASET, repo_type="dataset")
	    except Exception:
	        # Safer default: when the lookup fails, assume a submission exists.
	        return True

	# The username is taken from the OAuth `profile` argument.
	def submit(model_name, model_family, zip_file, profile: Optional[gr.OAuthProfile]):
	    """Validate a submission and store it in the pending area of the dataset.

	    On success three files are uploaded to ``LEADERBOARD_DATASET`` under
	    ``<PENDING_PREFIX><day>/<username>/``: a JSON record with zeroed scores,
	    the uploaded zip, and a ``_submitted.json`` marker used for the
	    one-submission-per-day limit. A notification e-mail is then attempted.

	    Args:
	        model_name: Display name of the model (required, non-blank).
	        model_family: Model family label (required, non-blank).
	        zip_file: Value of the file input; either a path string or an object
	            whose ``name`` attribute is the local path of the upload.
	        profile: OAuth profile of the signed-in user, or ``None`` when nobody
	            is signed in. The annotation is ``Optional`` so that a signed-out
	            user reaches the friendly message below.
	            NOTE(review): Gradio is expected to inject this value from the
	            annotation — confirm against the Gradio OAuth documentation.

	    Returns:
	        A ``(status_message, leaderboard_dataframe)`` tuple for the two
	        output components.
	    """
	    if profile is None or not getattr(profile, "username", None):
	        return "Please sign in with Hugging Face first.", refresh()
	    username = profile.username

	    # Whitespace-only names are as good as missing.
	    model_name = (model_name or "").strip()
	    model_family = (model_family or "").strip()
	    if not model_name or not model_family or zip_file is None:
	        return "All fields are required.", refresh()

	    # Accept both a plain path and a file-like wrapper exposing `.name`.
	    zip_local_path = str(getattr(zip_file, "name", zip_file))
	    if not zip_local_path.lower().endswith(".zip"):
	        return "Please upload a .zip file.", refresh()
	    if not HF_TOKEN or not LEADERBOARD_DATASET:
	        return "Server is not configured (HF_TOKEN / LEADERBOARD_DATASET).", refresh()

	    log = logging.getLogger(__name__)
	    api = _api()
	    day = _today_utc()

	    if _already_submitted_today(api, day, username):
	        return f"Limit: you can only submit once per day. (user={username}, day={day})", refresh()

	    # Same "<naive ISO timestamp>Z" format as before, without the deprecated utcnow().
	    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + "Z"
	    safe_model = _slug(model_name)
	    nonce = uuid.uuid4().hex[:6]

	    base_dir = f"{PENDING_PREFIX}{day}/{username}/"
	    json_path = f"{base_dir}{now}-{safe_model}-{nonce}.json"
	    zip_path = f"{base_dir}{now}-{safe_model}-{nonce}.zip"
	    marker_path = f"{base_dir}_submitted.json"

	    # Keys must match COLUMNS, otherwise the name/family cells stay empty
	    # once the record is shown on the leaderboard.
	    payload = {
	        "model_name": model_name,
	        "model_family": model_family,
	        "avg": 0, "easy": 0, "mid": 0, "hard": 0,
	        "submitted_at": now,
	        "username": username,
	        "day": day,
	    }
	    marker = {"submitted_at": now, "username": username, "day": day}

	    try:
	        api.upload_file(
	            repo_id=LEADERBOARD_DATASET, repo_type="dataset",
	            path_or_fileobj=io.BytesIO(json.dumps(payload, indent=2).encode("utf-8")),
	            path_in_repo=json_path,
	            commit_message=f"submit {model_name} by {username}",
	        )
	        api.upload_file(
	            repo_id=LEADERBOARD_DATASET, repo_type="dataset",
	            path_or_fileobj=zip_local_path,
	            path_in_repo=zip_path,
	            commit_message=f"upload zip {model_name} by {username}",
	        )
	        # The marker goes last: if an earlier upload fails, the user is not
	        # locked out for the day and can simply retry.
	        api.upload_file(
	            repo_id=LEADERBOARD_DATASET, repo_type="dataset",
	            path_or_fileobj=io.BytesIO(json.dumps(marker, indent=2).encode("utf-8")),
	            path_in_repo=marker_path,
	            commit_message=f"marker {day} {username}",
	        )
	    except Exception:
	        log.exception("Upload failed for user %s on %s", username, day)
	        return "Upload failed. Please try again later.", refresh()

	    # The submission is already stored at this point, so a notification
	    # problem must not be reported to the user as a failed submission.
	    try:
	        _send_email_resend(
	            subject=f"[{APP_NAME}] New submission from {username} ({day})",
	            text=f"user: {username}\nday: {day}\nmodel: {model_name}\nfamily: {model_family}\njson: {json_path}\nzip: {zip_path}\n",
	        )
	    except Exception:
	        log.warning("Notification e-mail failed for user %s", username, exc_info=True)

	    return "Submitted. Waiting for review.", refresh()

	# Page layout and event wiring for the leaderboard app.
	# NOTE(review): the indentation of this section is not visible in this copy,
	# so the exact nesting of the `with` blocks below should be confirmed.
	with gr.Blocks(title=f"{APP_NAME} leaderboard") as demo:
	gr.Markdown(f"# {APP_NAME} Leaderboard")
	# Static description of the benchmark and its dataset splits.
	gr.Markdown("""
	## Data

	MiniAppBench is the first comprehensive benchmark designed to evaluate principle-driven, interactive application generation. Unlike prior benchmarks that emphasize static UI layouts or isolated algorithmic code snippets, MiniAppBench targets MiniApps—HTML-based applications that require both faithful visual rendering and non-trivial interaction logic.

	The dataset is split into two subsets: validation (100 instances) and test (400 instances), and can be accessed at [MiniAppBench dataset](https://huggingface.co/datasets/MiniAppBench/Dataset). The validation set includes publicly available evaluation references to support reproducible experiments, while the test set keeps the references hidden to enable unbiased evaluation.
	""")

	gr.Markdown(
	"""
	## Leaderboard

	All results shown on this leaderboard are evaluated on the test split of MiniAppBench.
	""",
	)

	# Read-only results table; it starts empty and is filled by `refresh`.
	leaderboard = gr.Dataframe(
	value=pd.DataFrame(columns=COLUMNS), # start empty: do not access the Hub at startup
	interactive=False,
	wrap=True,
	)

	refresh_btn = gr.Button("Refresh")
	refresh_btn.click(refresh, outputs=[leaderboard])

	# Refresh once automatically when the page loads.
	demo.load(refresh, outputs=[leaderboard])

	gr.Markdown("## Submit")

	# Submission rules shown to the user above the form.
	gr.Markdown(
	"""
	Submission requirements
	- Please sign in with Hugging Face before submitting.
	- One submission per user per day (UTC).
	- Upload a .zip file only.
	- The `.zip` must contain the HTML outputs for the test set queries.
	- Each file should be named using the query index: `<index>.html` (e.g., `1.html`, `2.html`, ...).
	- We may contact you via email for verification and request additional materials. Please be prepared to provide:
	- Model access (one of the following):
	- Preferred: an inference API endpoint we can use to reproduce the results.
	- Alternatively: model checkpoints (ckpts) plus clear deployment / inference instructions (environment, dependencies, and how to run).
	- A related paper, if available (e.g., an arXiv link or a PDF).
	- After you submit, we will update the results within 3 days.
	""",
	)



	# Form inputs passed to `submit`.
	model_name = gr.Textbox(label="Model name", placeholder="e.g. MyModel v1")
	model_family = gr.Textbox(label="Model family", placeholder="e.g. Llama / Qwen / InternLM ...")
	zip_file = gr.File(label="Upload zip (.zip only)", file_types=[".zip"])

	# Sign-in button and submit button in two columns.
	with gr.Row():
	with gr.Column(scale=1, min_width=220):
	login_btn = gr.LoginButton()
	with gr.Column(scale=1, min_width=260):
	submit_btn = gr.Button("Submit", variant="primary")

	# Shows the message returned by `submit`.
	status = gr.Markdown()

	# `submit` also takes a `profile` parameter that is not listed in `inputs`.
	# NOTE(review): presumably Gradio injects it from the OAuth session based on
	# the parameter's type annotation — confirm against the Gradio OAuth docs.
	submit_btn.click(submit, inputs=[model_name, model_family, zip_file], outputs=[status, leaderboard])

	# Start the app.
	demo.launch()