Spaces:
Running
Running
Alex W. committed on
Commit ·
ba623bd
1
Parent(s): 5827b27
改动汇总
Browse files
core/plotter.py — _aggregate_by_layer() 改为两步:先按 (layer, kv_head) 分组取 median,再跨 kv_head 取 median/q25/q75
core/plotter_plotly.py — _agg() 同上
core/table_gen.py — 加 _pseudobulk(df, col) helper,所有 make_table 函数调用它而非直接 .median()
db/writer.py — 加 _pseudobulk_col(rows, col) 和 refresh_all_summaries(conn),_calc_summary_row() 和 update_model_summary() 全部改用伪重复安全的聚合方式
ui/tab_leaderboard.py — Refresh 按钮触发 load_leaderboard(),其中静默调用 refresh_all_summaries(),在展示排行榜前先把历史数据全部用新方式重算
- core/plotter.py +21 -7
- core/plotter_plotly.py +30 -8
- core/table_gen.py +57 -20
- db/writer.py +101 -50
- ui/tab_leaderboard.py +14 -6
core/plotter.py
CHANGED
|
@@ -73,19 +73,33 @@ BAND_COLORS = {
|
|
| 73 |
|
| 74 |
def _aggregate_by_layer(df: pd.DataFrame, col: str):
|
| 75 |
"""
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
| 78 |
"""
|
| 79 |
kv_cols = {"ssr_KV", "pearson_KV", "cosU_KV", "cosV_KV", "alpha_KV"}
|
| 80 |
if col in kv_cols:
|
| 81 |
df = df[df["kv_shared"] == 0] if "kv_shared" in df.columns else df
|
| 82 |
|
| 83 |
-
grp = df.groupby("layer")[col]
|
| 84 |
layers = np.array(sorted(df["layer"].unique()))
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
|
| 91 |
def _global_layers(df: pd.DataFrame):
|
|
|
|
| 73 |
|
| 74 |
def _aggregate_by_layer(df: pd.DataFrame, col: str):
|
| 75 |
"""
|
| 76 |
+
Pseudo-bulk two-step aggregation per layer (Nature Comms 2021).
|
| 77 |
+
Step 1: median across Q heads within each (layer, kv_head) group.
|
| 78 |
+
Step 2: median / q25 / q75 across kv_head groups per layer.
|
| 79 |
+
Avoids pseudoreplication bias in GQA models (e.g. 4Q:1K).
|
| 80 |
+
Excludes kv_shared rows for KV metrics (theoretical-value bias).
|
| 81 |
"""
|
| 82 |
kv_cols = {"ssr_KV", "pearson_KV", "cosU_KV", "cosV_KV", "alpha_KV"}
|
| 83 |
if col in kv_cols:
|
| 84 |
df = df[df["kv_shared"] == 0] if "kv_shared" in df.columns else df
|
| 85 |
|
|
|
|
| 86 |
layers = np.array(sorted(df["layer"].unique()))
|
| 87 |
+
med_vals, q25_vals, q75_vals = [], [], []
|
| 88 |
+
|
| 89 |
+
for layer in layers:
|
| 90 |
+
ldf = df[df["layer"] == layer]
|
| 91 |
+
# Step 1: median within each kv_head group
|
| 92 |
+
if "kv_head" in ldf.columns:
|
| 93 |
+
step1 = ldf.groupby("kv_head")[col].median().values
|
| 94 |
+
else:
|
| 95 |
+
step1 = ldf[col].dropna().values
|
| 96 |
+
step1 = step1[~np.isnan(step1)] if len(step1) > 0 else step1
|
| 97 |
+
# Step 2: statistics across kv_head medians
|
| 98 |
+
med_vals.append(float(np.median(step1)) if len(step1) > 0 else np.nan)
|
| 99 |
+
q25_vals.append(float(np.percentile(step1, 25)) if len(step1) > 0 else np.nan)
|
| 100 |
+
q75_vals.append(float(np.percentile(step1, 75)) if len(step1) > 0 else np.nan)
|
| 101 |
+
|
| 102 |
+
return layers, np.array(med_vals), np.array(q25_vals), np.array(q75_vals)
|
| 103 |
|
| 104 |
|
| 105 |
def _global_layers(df: pd.DataFrame):
|
core/plotter_plotly.py
CHANGED
|
@@ -63,15 +63,37 @@ TOTAL_HEIGHT = SUBPLOT_HEIGHT * len(PANELS) + 120 # +header
|
|
| 63 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 64 |
|
| 65 |
def _agg(df: pd.DataFrame, col: str):
|
| 66 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
kv_cols = {"ssr_KV", "pearson_KV", "cosU_KV", "cosV_KV", "alpha_KV"}
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
|
| 77 |
def _global_layers(df: pd.DataFrame) -> list[int]:
|
|
|
|
| 63 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 64 |
|
| 65 |
def _agg(df: pd.DataFrame, col: str):
|
| 66 |
+
"""
|
| 67 |
+
Pseudo-bulk two-step aggregation per layer (Nature Comms 2021).
|
| 68 |
+
Step 1: median across Q heads within each (layer, kv_head) group.
|
| 69 |
+
Step 2: median / q25 / q75 across kv_head groups per layer.
|
| 70 |
+
Avoids pseudoreplication bias in GQA models (e.g. 4Q:1K).
|
| 71 |
+
Excludes kv_shared rows for KV metrics (theoretical-value bias).
|
| 72 |
+
"""
|
| 73 |
kv_cols = {"ssr_KV", "pearson_KV", "cosU_KV", "cosV_KV", "alpha_KV"}
|
| 74 |
+
if col in kv_cols and "kv_shared" in df.columns:
|
| 75 |
+
df = df[df["kv_shared"] == 0]
|
| 76 |
+
|
| 77 |
+
layers = np.array(sorted(df["layer"].unique()), dtype=int)
|
| 78 |
+
med_vals, q25_vals, q75_vals = [], [], []
|
| 79 |
+
|
| 80 |
+
for layer in layers:
|
| 81 |
+
ldf = df[df["layer"] == layer]
|
| 82 |
+
# Step 1: median within each kv_head group
|
| 83 |
+
if "kv_head" in ldf.columns:
|
| 84 |
+
step1 = ldf.groupby("kv_head")[col].median().values
|
| 85 |
+
else:
|
| 86 |
+
step1 = ldf[col].dropna().values
|
| 87 |
+
step1 = step1[~np.isnan(step1.astype(float))] if len(step1) > 0 else step1
|
| 88 |
+
# Step 2: statistics across kv_head medians
|
| 89 |
+
med_vals.append(float(np.median(step1)) if len(step1) > 0 else np.nan)
|
| 90 |
+
q25_vals.append(float(np.percentile(step1, 25)) if len(step1) > 0 else np.nan)
|
| 91 |
+
q75_vals.append(float(np.percentile(step1, 75)) if len(step1) > 0 else np.nan)
|
| 92 |
+
|
| 93 |
+
return (layers,
|
| 94 |
+
np.array(med_vals, dtype=float),
|
| 95 |
+
np.array(q25_vals, dtype=float),
|
| 96 |
+
np.array(q75_vals, dtype=float))
|
| 97 |
|
| 98 |
|
| 99 |
def _global_layers(df: pd.DataFrame) -> list[int]:
|
core/table_gen.py
CHANGED
|
@@ -32,6 +32,43 @@ def _mean(series) -> Optional[float]:
|
|
| 32 |
return float(v.mean()) if len(v) > 0 else None
|
| 33 |
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
def _fmt(x, decimals=6) -> str:
|
| 36 |
if x is None or (isinstance(x, float) and np.isnan(x)):
|
| 37 |
return "—"
|
|
@@ -134,10 +171,10 @@ def make_table1(
|
|
| 134 |
"Model": _short(model_id),
|
| 135 |
"Std Layers": n_layers,
|
| 136 |
"Global Layers": n_global if n_global > 0 else "—",
|
| 137 |
-
"Median Pearson":_fmt(
|
| 138 |
-
"Mean Pearson": _fmt(
|
| 139 |
-
"Median SSR": _fmt(
|
| 140 |
-
"Mean SSR": _fmt(
|
| 141 |
})
|
| 142 |
return pd.DataFrame(rows)
|
| 143 |
|
|
@@ -166,13 +203,13 @@ def make_table2(
|
|
| 166 |
for lo, hi in group_bounds:
|
| 167 |
label = f"{lo}–{hi}"
|
| 168 |
grp_a = std_a[(std_a["layer"] >= lo) & (std_a["layer"] <= hi)]
|
| 169 |
-
ssr_a =
|
| 170 |
|
| 171 |
row = {"Layer Group": label, f"{_short(name_a)} SSR": _fmt(ssr_a, 6)}
|
| 172 |
|
| 173 |
if std_b is not None and name_b:
|
| 174 |
grp_b = std_b[(std_b["layer"] >= lo) & (std_b["layer"] <= hi)]
|
| 175 |
-
ssr_b =
|
| 176 |
row[f"{_short(name_b)} SSR"] = _fmt(ssr_b, 6)
|
| 177 |
if ssr_a and ssr_b and ssr_a > 0:
|
| 178 |
improvement = (ssr_a - ssr_b) / ssr_a * 100
|
|
@@ -207,9 +244,9 @@ def make_table3(
|
|
| 207 |
"Model": _short(model_id),
|
| 208 |
"d_h": head_dim,
|
| 209 |
"Random 1/√d_h": _fmt(baseline, 4),
|
| 210 |
-
"cosU(Q,K)": _fmt(
|
| 211 |
-
"cosU(Q,V)": _fmt(
|
| 212 |
-
"cosU(K,V)": _fmt(
|
| 213 |
})
|
| 214 |
return pd.DataFrame(rows)
|
| 215 |
|
|
@@ -237,9 +274,9 @@ def make_table4(
|
|
| 237 |
"Model": _short(model_id),
|
| 238 |
"d_model": d_model,
|
| 239 |
"Random 1/√D": _fmt(baseline, 4),
|
| 240 |
-
"cosV(Q,K)": _fmt(
|
| 241 |
-
"cosV(Q,V)": _fmt(
|
| 242 |
-
"cosV(K,V)": _fmt(
|
| 243 |
})
|
| 244 |
return pd.DataFrame(rows)
|
| 245 |
|
|
@@ -267,12 +304,12 @@ def make_table5(
|
|
| 267 |
deep = std[std["layer"] > std["layer"].min()]
|
| 268 |
rows.append({
|
| 269 |
"Model": _short(model_id),
|
| 270 |
-
"Median κ(Q) all": _fmt(
|
| 271 |
-
"Median κ(K) all": _fmt(
|
| 272 |
-
"κ(Q) Layer 0": _fmt(
|
| 273 |
-
"κ(K) Layer 0": _fmt(
|
| 274 |
-
"Median κ(Q) deep": _fmt(
|
| 275 |
-
"Median κ(K) deep": _fmt(
|
| 276 |
})
|
| 277 |
return pd.DataFrame(rows)
|
| 278 |
|
|
@@ -293,9 +330,9 @@ def make_table6(
|
|
| 293 |
std = _standard_only(df)
|
| 294 |
if std.empty:
|
| 295 |
continue
|
| 296 |
-
med_ssr =
|
| 297 |
wang_score = 1 - med_ssr if med_ssr is not None else None
|
| 298 |
-
med_pearson =
|
| 299 |
rows.append({
|
| 300 |
"Model": _short(model_id),
|
| 301 |
"Std Layers": std["layer"].nunique(),
|
|
|
|
| 32 |
return float(v.mean()) if len(v) > 0 else None
|
| 33 |
|
| 34 |
|
| 35 |
+
def _pseudobulk(df: pd.DataFrame, col: str) -> np.ndarray:
|
| 36 |
+
"""
|
| 37 |
+
Pseudo-bulk two-step aggregation (Nature Comms 2021).
|
| 38 |
+
Step 1: median across Q heads within each (layer, kv_head) group.
|
| 39 |
+
Step 2: median across kv_head groups per layer.
|
| 40 |
+
Returns 1-D array of per-layer medians.
|
| 41 |
+
For MHA models this equals a plain per-layer median.
|
| 42 |
+
"""
|
| 43 |
+
if df.empty or col not in df.columns:
|
| 44 |
+
return np.array([])
|
| 45 |
+
layers = sorted(df["layer"].unique())
|
| 46 |
+
per_layer = []
|
| 47 |
+
for layer in layers:
|
| 48 |
+
ldf = df[df["layer"] == layer]
|
| 49 |
+
if "kv_head" in ldf.columns:
|
| 50 |
+
step1 = ldf.groupby("kv_head")[col].median().values
|
| 51 |
+
else:
|
| 52 |
+
step1 = ldf[col].dropna().values
|
| 53 |
+
step1 = np.array(step1, dtype=float)
|
| 54 |
+
step1 = step1[~np.isnan(step1)]
|
| 55 |
+
if len(step1) > 0:
|
| 56 |
+
per_layer.append(float(np.median(step1)))
|
| 57 |
+
return np.array(per_layer, dtype=float)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _pb_med(df: pd.DataFrame, col: str) -> Optional[float]:
    """Return the median of the pseudo-bulk per-layer values, or None if there are none."""
    per_layer = _pseudobulk(df, col)
    if len(per_layer) > 0:
        return float(np.median(per_layer))
    return None
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _pb_mean(df: pd.DataFrame, col: str) -> Optional[float]:
    """Return the mean of the pseudo-bulk per-layer values, or None if there are none."""
    per_layer = _pseudobulk(df, col)
    if len(per_layer) > 0:
        return float(np.mean(per_layer))
    return None
|
| 70 |
+
|
| 71 |
+
|
| 72 |
def _fmt(x, decimals=6) -> str:
|
| 73 |
if x is None or (isinstance(x, float) and np.isnan(x)):
|
| 74 |
return "—"
|
|
|
|
| 171 |
"Model": _short(model_id),
|
| 172 |
"Std Layers": n_layers,
|
| 173 |
"Global Layers": n_global if n_global > 0 else "—",
|
| 174 |
+
"Median Pearson":_fmt(_pb_med(std, "pearson_QK"), 4),
|
| 175 |
+
"Mean Pearson": _fmt(_pb_mean(std, "pearson_QK"), 4),
|
| 176 |
+
"Median SSR": _fmt(_pb_med(std, "ssr_QK"), 6),
|
| 177 |
+
"Mean SSR": _fmt(_pb_mean(std, "ssr_QK"), 6),
|
| 178 |
})
|
| 179 |
return pd.DataFrame(rows)
|
| 180 |
|
|
|
|
| 203 |
for lo, hi in group_bounds:
|
| 204 |
label = f"{lo}–{hi}"
|
| 205 |
grp_a = std_a[(std_a["layer"] >= lo) & (std_a["layer"] <= hi)]
|
| 206 |
+
ssr_a = _pb_med(grp_a, "ssr_QK")
|
| 207 |
|
| 208 |
row = {"Layer Group": label, f"{_short(name_a)} SSR": _fmt(ssr_a, 6)}
|
| 209 |
|
| 210 |
if std_b is not None and name_b:
|
| 211 |
grp_b = std_b[(std_b["layer"] >= lo) & (std_b["layer"] <= hi)]
|
| 212 |
+
ssr_b = _pb_med(grp_b, "ssr_QK")
|
| 213 |
row[f"{_short(name_b)} SSR"] = _fmt(ssr_b, 6)
|
| 214 |
if ssr_a and ssr_b and ssr_a > 0:
|
| 215 |
improvement = (ssr_a - ssr_b) / ssr_a * 100
|
|
|
|
| 244 |
"Model": _short(model_id),
|
| 245 |
"d_h": head_dim,
|
| 246 |
"Random 1/√d_h": _fmt(baseline, 4),
|
| 247 |
+
"cosU(Q,K)": _fmt(_pb_med(std, "cosU_QK"), 4),
|
| 248 |
+
"cosU(Q,V)": _fmt(_pb_med(std, "cosU_QV"), 4),
|
| 249 |
+
"cosU(K,V)": _fmt(_pb_med(std, "cosU_KV"), 4),
|
| 250 |
})
|
| 251 |
return pd.DataFrame(rows)
|
| 252 |
|
|
|
|
| 274 |
"Model": _short(model_id),
|
| 275 |
"d_model": d_model,
|
| 276 |
"Random 1/√D": _fmt(baseline, 4),
|
| 277 |
+
"cosV(Q,K)": _fmt(_pb_med(std, "cosV_QK"), 4),
|
| 278 |
+
"cosV(Q,V)": _fmt(_pb_med(std, "cosV_QV"), 4),
|
| 279 |
+
"cosV(K,V)": _fmt(_pb_med(std, "cosV_KV"), 4),
|
| 280 |
})
|
| 281 |
return pd.DataFrame(rows)
|
| 282 |
|
|
|
|
| 304 |
deep = std[std["layer"] > std["layer"].min()]
|
| 305 |
rows.append({
|
| 306 |
"Model": _short(model_id),
|
| 307 |
+
"Median κ(Q) all": _fmt(_pb_med(std, "cond_Q"), 1),
|
| 308 |
+
"Median κ(K) all": _fmt(_pb_med(std, "cond_K"), 1),
|
| 309 |
+
"κ(Q) Layer 0": _fmt(_pb_med(l0, "cond_Q"), 1),
|
| 310 |
+
"κ(K) Layer 0": _fmt(_pb_med(l0, "cond_K"), 1),
|
| 311 |
+
"Median κ(Q) deep": _fmt(_pb_med(deep, "cond_Q"), 1),
|
| 312 |
+
"Median κ(K) deep": _fmt(_pb_med(deep, "cond_K"), 1),
|
| 313 |
})
|
| 314 |
return pd.DataFrame(rows)
|
| 315 |
|
|
|
|
| 330 |
std = _standard_only(df)
|
| 331 |
if std.empty:
|
| 332 |
continue
|
| 333 |
+
med_ssr = _pb_med(std, "ssr_QK")
|
| 334 |
wang_score = 1 - med_ssr if med_ssr is not None else None
|
| 335 |
+
med_pearson = _pb_med(std, "pearson_QK")
|
| 336 |
rows.append({
|
| 337 |
"Model": _short(model_id),
|
| 338 |
"Std Layers": std["layer"].nunique(),
|
db/writer.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"""
|
| 3 |
数据库写入模块
|
| 4 |
- 写入分析结果到 layer_head_metrics
|
| 5 |
-
- 计算并写入 model_summary
|
| 6 |
- 支持断点续传(以 prefix+layer 为粒度)
|
| 7 |
- 写入权限验证
|
| 8 |
"""
|
|
@@ -10,6 +10,7 @@
|
|
| 10 |
import os
|
| 11 |
import sqlite3
|
| 12 |
import numpy as np
|
|
|
|
| 13 |
from datetime import datetime
|
| 14 |
from db.schema import get_connection, init_db
|
| 15 |
|
|
@@ -19,21 +20,10 @@ from db.schema import get_connection, init_db
|
|
| 19 |
# ─────────────────────────────────────────────
|
| 20 |
|
| 21 |
def infer_layer_type(kv_shared: bool) -> str:
|
| 22 |
-
"""
|
| 23 |
-
从结构特征推断层类型
|
| 24 |
-
kv_shared=True → 'global' (K=V共享,如 Gemma 全局层)
|
| 25 |
-
kv_shared=False → 'standard'
|
| 26 |
-
"""
|
| 27 |
return "global" if kv_shared else "standard"
|
| 28 |
|
| 29 |
|
| 30 |
def infer_modality(prefix: str) -> str:
|
| 31 |
-
"""
|
| 32 |
-
从组件前缀推断模态
|
| 33 |
-
纯关键词匹配,不 hard coding 模型名
|
| 34 |
-
未匹配到任何关键词 → 默认 'language'
|
| 35 |
-
(覆盖纯语言模型,如 "model." 前缀的 LLaMA/Qwen)
|
| 36 |
-
"""
|
| 37 |
p = prefix.lower()
|
| 38 |
if "vision" in p or "visual" in p or "image" in p:
|
| 39 |
return "vision"
|
|
@@ -47,15 +37,6 @@ def infer_modality(prefix: str) -> str:
|
|
| 47 |
# ─────────────────────────────────────────────
|
| 48 |
|
| 49 |
def check_write_permission(admin_token: str) -> bool:
|
| 50 |
-
"""
|
| 51 |
-
验证管理员写入权限。
|
| 52 |
-
WRITE_TOKEN 存储在 HF Space Secrets(加密,不进入 git repo)。
|
| 53 |
-
运行时由 HF 注入为环境变量,只在服务端比对,不返回给前端。
|
| 54 |
-
|
| 55 |
-
返回:
|
| 56 |
-
True = 有写入权限
|
| 57 |
-
False = 只读模式(分析可以跑,结果不写库)
|
| 58 |
-
"""
|
| 59 |
server_token = os.environ.get("WRITE_TOKEN", "")
|
| 60 |
if not server_token:
|
| 61 |
return False
|
|
@@ -71,7 +52,6 @@ def get_analyzed_layers(
|
|
| 71 |
model_id: str,
|
| 72 |
prefix: str,
|
| 73 |
) -> set:
|
| 74 |
-
"""返回已完成分析的层号集合"""
|
| 75 |
cur = conn.cursor()
|
| 76 |
cur.execute(
|
| 77 |
"""SELECT DISTINCT layer FROM layer_head_metrics
|
|
@@ -88,7 +68,6 @@ def is_layer_complete(
|
|
| 88 |
layer: int,
|
| 89 |
expected_records: int,
|
| 90 |
) -> bool:
|
| 91 |
-
"""检查某层是否已完整写入"""
|
| 92 |
cur = conn.cursor()
|
| 93 |
cur.execute(
|
| 94 |
"""SELECT COUNT(*) FROM layer_head_metrics
|
|
@@ -226,12 +205,67 @@ def write_layer_records(
|
|
| 226 |
conn.commit()
|
| 227 |
|
| 228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
# ─────────────────────────────────────────────
|
| 230 |
# 计算并写入 model_summary
|
| 231 |
# ─────────────────────────────────────────────
|
| 232 |
|
| 233 |
def _calc_summary_row(
|
| 234 |
-
rows
|
| 235 |
model_id: str,
|
| 236 |
prefix: str,
|
| 237 |
layer_type: str,
|
|
@@ -239,14 +273,13 @@ def _calc_summary_row(
|
|
| 239 |
if not rows:
|
| 240 |
return None
|
| 241 |
|
| 242 |
-
def
|
| 243 |
-
|
| 244 |
-
return np.array(vals, dtype=float) if vals else np.array([])
|
| 245 |
|
| 246 |
def med(arr): return float(np.median(arr)) if len(arr) > 0 else None
|
| 247 |
def avg(arr): return float(np.mean(arr)) if len(arr) > 0 else None
|
| 248 |
|
| 249 |
-
ssr_qk =
|
| 250 |
wang_score = float(1 - np.median(ssr_qk)) if len(ssr_qk) > 0 else None
|
| 251 |
n_layers = len(set(r["layer"] for r in rows))
|
| 252 |
n_records = len(rows)
|
|
@@ -255,18 +288,18 @@ def _calc_summary_row(
|
|
| 255 |
"model_id": model_id,
|
| 256 |
"prefix": prefix,
|
| 257 |
"layer_type": layer_type,
|
| 258 |
-
"median_pearson_QK": med(
|
| 259 |
-
"mean_pearson_QK": avg(
|
| 260 |
"median_ssr_QK": med(ssr_qk),
|
| 261 |
"mean_ssr_QK": avg(ssr_qk),
|
| 262 |
-
"median_ssr_QV": med(
|
| 263 |
-
"mean_ssr_QV": avg(
|
| 264 |
-
"median_cond_Q": med(
|
| 265 |
-
"mean_cond_Q": avg(
|
| 266 |
-
"median_cosU_QK": med(
|
| 267 |
-
"median_cosU_QV": med(
|
| 268 |
-
"median_cosV_QK": med(
|
| 269 |
-
"median_cosV_QV": med(
|
| 270 |
"wang_score": wang_score,
|
| 271 |
"n_layers": n_layers,
|
| 272 |
"n_records": n_records,
|
|
@@ -280,21 +313,20 @@ def update_model_summary(
|
|
| 280 |
prefix: str,
|
| 281 |
):
|
| 282 |
"""
|
| 283 |
-
重新计算并写入 model_summary(all / standard / global 三行)
|
| 284 |
-
wang_score 统一用 standard 层计算
|
| 285 |
"""
|
| 286 |
cur = conn.cursor()
|
|
|
|
| 287 |
|
| 288 |
-
#
|
| 289 |
cur.execute(
|
| 290 |
-
"""SELECT ssr_QK FROM layer_head_metrics
|
| 291 |
-
WHERE model_id = ? AND prefix = ? AND layer_type = 'standard'
|
|
|
|
| 292 |
(model_id, prefix)
|
| 293 |
)
|
| 294 |
-
|
| 295 |
-
std_ssr = np.array(
|
| 296 |
-
[r[0] for r in std_ssr_rows if r[0] is not None], dtype=float
|
| 297 |
-
)
|
| 298 |
std_wang_score = float(1 - np.median(std_ssr)) if len(std_ssr) > 0 else None
|
| 299 |
|
| 300 |
for layer_type in ["all", "standard", "global"]:
|
|
@@ -315,8 +347,7 @@ def update_model_summary(
|
|
| 315 |
if summary is None:
|
| 316 |
continue
|
| 317 |
|
| 318 |
-
#
|
| 319 |
-
summary["wang_score"] = std_wang_score
|
| 320 |
|
| 321 |
conn.execute(
|
| 322 |
"""INSERT OR REPLACE INTO model_summary(
|
|
@@ -341,4 +372,24 @@ def update_model_summary(
|
|
| 341 |
summary
|
| 342 |
)
|
| 343 |
|
| 344 |
-
conn.commit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"""
|
| 3 |
数据库写入模块
|
| 4 |
- 写入分析结果到 layer_head_metrics
|
| 5 |
+
- 计算并写入 model_summary(pseudo-bulk 两步聚合,避免 GQA 伪重复)
|
| 6 |
- 支持断点续传(以 prefix+layer 为粒度)
|
| 7 |
- 写入权限验证
|
| 8 |
"""
|
|
|
|
| 10 |
import os
|
| 11 |
import sqlite3
|
| 12 |
import numpy as np
|
| 13 |
+
from collections import defaultdict
|
| 14 |
from datetime import datetime
|
| 15 |
from db.schema import get_connection, init_db
|
| 16 |
|
|
|
|
| 20 |
# ─────────────────────────────────────────────
|
| 21 |
|
| 22 |
def infer_layer_type(kv_shared: bool) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
return "global" if kv_shared else "standard"
|
| 24 |
|
| 25 |
|
| 26 |
def infer_modality(prefix: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
p = prefix.lower()
|
| 28 |
if "vision" in p or "visual" in p or "image" in p:
|
| 29 |
return "vision"
|
|
|
|
| 37 |
# ─────────────────────────────────────────────
|
| 38 |
|
| 39 |
def check_write_permission(admin_token: str) -> bool:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
server_token = os.environ.get("WRITE_TOKEN", "")
|
| 41 |
if not server_token:
|
| 42 |
return False
|
|
|
|
| 52 |
model_id: str,
|
| 53 |
prefix: str,
|
| 54 |
) -> set:
|
|
|
|
| 55 |
cur = conn.cursor()
|
| 56 |
cur.execute(
|
| 57 |
"""SELECT DISTINCT layer FROM layer_head_metrics
|
|
|
|
| 68 |
layer: int,
|
| 69 |
expected_records: int,
|
| 70 |
) -> bool:
|
|
|
|
| 71 |
cur = conn.cursor()
|
| 72 |
cur.execute(
|
| 73 |
"""SELECT COUNT(*) FROM layer_head_metrics
|
|
|
|
| 205 |
conn.commit()
|
| 206 |
|
| 207 |
|
| 208 |
+
# ─────────────────────────────────────────────
|
| 209 |
+
# Pseudo-bulk 聚合核心函数
|
| 210 |
+
# ─────────────────────────────────────────────
|
| 211 |
+
|
| 212 |
+
def _pseudobulk(rows, col_name: str) -> np.ndarray:
|
| 213 |
+
"""
|
| 214 |
+
Pseudo-bulk two-step aggregation (Nature Comms 2021).
|
| 215 |
+
Avoids GQA pseudoreplication (e.g. 4Q:1K → 4 correlated records per KV head).
|
| 216 |
+
|
| 217 |
+
Step 1: median within each (layer, kv_head) group
|
| 218 |
+
→ one value per KV-head per layer
|
| 219 |
+
Step 2: return flat array of Step-1 values
|
| 220 |
+
→ caller computes final median / mean / quantile
|
| 221 |
+
|
| 222 |
+
Works with both sqlite3.Row objects and plain dicts.
|
| 223 |
+
"""
|
| 224 |
+
groups: dict[tuple, list] = defaultdict(list)
|
| 225 |
+
for r in rows:
|
| 226 |
+
try:
|
| 227 |
+
v = r["ssr_QK"] if col_name == "ssr_QK" else r[col_name]
|
| 228 |
+
layer = int(r["layer"])
|
| 229 |
+
kv_head = int(r["kv_head"]) if r["kv_head"] is not None else 0
|
| 230 |
+
except (KeyError, TypeError, IndexError):
|
| 231 |
+
continue
|
| 232 |
+
if v is None:
|
| 233 |
+
continue
|
| 234 |
+
groups[(layer, kv_head)].append(float(v))
|
| 235 |
+
|
| 236 |
+
if not groups:
|
| 237 |
+
return np.array([])
|
| 238 |
+
|
| 239 |
+
# Step 1: median within each (layer, kv_head) group
|
| 240 |
+
return np.array([float(np.median(vals)) for vals in groups.values()])
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def _pseudobulk_col(rows, col_name: str) -> np.ndarray:
|
| 244 |
+
"""Generic version of _pseudobulk for any column name."""
|
| 245 |
+
groups: dict[tuple, list] = defaultdict(list)
|
| 246 |
+
for r in rows:
|
| 247 |
+
try:
|
| 248 |
+
v = r[col_name]
|
| 249 |
+
layer = int(r["layer"])
|
| 250 |
+
kv_head = int(r["kv_head"]) if r["kv_head"] is not None else 0
|
| 251 |
+
except (KeyError, TypeError, IndexError):
|
| 252 |
+
continue
|
| 253 |
+
if v is None:
|
| 254 |
+
continue
|
| 255 |
+
groups[(layer, kv_head)].append(float(v))
|
| 256 |
+
|
| 257 |
+
if not groups:
|
| 258 |
+
return np.array([])
|
| 259 |
+
|
| 260 |
+
return np.array([float(np.median(vals)) for vals in groups.values()])
|
| 261 |
+
|
| 262 |
+
|
| 263 |
# ─────────────────────────────────────────────
|
| 264 |
# 计算并写入 model_summary
|
| 265 |
# ─────────────────────────────────────────────
|
| 266 |
|
| 267 |
def _calc_summary_row(
|
| 268 |
+
rows,
|
| 269 |
model_id: str,
|
| 270 |
prefix: str,
|
| 271 |
layer_type: str,
|
|
|
|
| 273 |
if not rows:
|
| 274 |
return None
|
| 275 |
|
| 276 |
+
def pb(col):
|
| 277 |
+
return _pseudobulk_col(rows, col)
|
|
|
|
| 278 |
|
| 279 |
def med(arr): return float(np.median(arr)) if len(arr) > 0 else None
|
| 280 |
def avg(arr): return float(np.mean(arr)) if len(arr) > 0 else None
|
| 281 |
|
| 282 |
+
ssr_qk = pb("ssr_QK")
|
| 283 |
wang_score = float(1 - np.median(ssr_qk)) if len(ssr_qk) > 0 else None
|
| 284 |
n_layers = len(set(r["layer"] for r in rows))
|
| 285 |
n_records = len(rows)
|
|
|
|
| 288 |
"model_id": model_id,
|
| 289 |
"prefix": prefix,
|
| 290 |
"layer_type": layer_type,
|
| 291 |
+
"median_pearson_QK": med(pb("pearson_QK")),
|
| 292 |
+
"mean_pearson_QK": avg(pb("pearson_QK")),
|
| 293 |
"median_ssr_QK": med(ssr_qk),
|
| 294 |
"mean_ssr_QK": avg(ssr_qk),
|
| 295 |
+
"median_ssr_QV": med(pb("ssr_QV")),
|
| 296 |
+
"mean_ssr_QV": avg(pb("ssr_QV")),
|
| 297 |
+
"median_cond_Q": med(pb("cond_Q")),
|
| 298 |
+
"mean_cond_Q": avg(pb("cond_Q")),
|
| 299 |
+
"median_cosU_QK": med(pb("cosU_QK")),
|
| 300 |
+
"median_cosU_QV": med(pb("cosU_QV")),
|
| 301 |
+
"median_cosV_QK": med(pb("cosV_QK")),
|
| 302 |
+
"median_cosV_QV": med(pb("cosV_QV")),
|
| 303 |
"wang_score": wang_score,
|
| 304 |
"n_layers": n_layers,
|
| 305 |
"n_records": n_records,
|
|
|
|
| 313 |
prefix: str,
|
| 314 |
):
|
| 315 |
"""
|
| 316 |
+
重新计算并写入 model_summary(all / standard / global 三行)。
|
| 317 |
+
wang_score 统一用 standard 层 pseudo-bulk median(SSR_QK) 计算。
|
| 318 |
"""
|
| 319 |
cur = conn.cursor()
|
| 320 |
+
cur.row_factory = sqlite3.Row
|
| 321 |
|
| 322 |
+
# ── Wang Score: standard 层 pseudo-bulk ──────────────────────────────
|
| 323 |
cur.execute(
|
| 324 |
+
"""SELECT layer, kv_head, ssr_QK FROM layer_head_metrics
|
| 325 |
+
WHERE model_id = ? AND prefix = ? AND layer_type = 'standard'
|
| 326 |
+
AND kv_shared = 0""",
|
| 327 |
(model_id, prefix)
|
| 328 |
)
|
| 329 |
+
std_ssr = _pseudobulk_col(cur.fetchall(), "ssr_QK")
|
|
|
|
|
|
|
|
|
|
| 330 |
std_wang_score = float(1 - np.median(std_ssr)) if len(std_ssr) > 0 else None
|
| 331 |
|
| 332 |
for layer_type in ["all", "standard", "global"]:
|
|
|
|
| 347 |
if summary is None:
|
| 348 |
continue
|
| 349 |
|
| 350 |
+
summary["wang_score"] = std_wang_score # always from standard pseudo-bulk
|
|
|
|
| 351 |
|
| 352 |
conn.execute(
|
| 353 |
"""INSERT OR REPLACE INTO model_summary(
|
|
|
|
| 372 |
summary
|
| 373 |
)
|
| 374 |
|
| 375 |
+
conn.commit()
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
# ─────────────────────────────────────────────
|
| 379 |
+
# 批量刷新所有模型的 model_summary
|
| 380 |
+
# ─────────────────────────────────────────────
|
| 381 |
+
|
| 382 |
+
def refresh_all_summaries(conn: sqlite3.Connection) -> int:
    """
    Rebuild model_summary for every distinct (model_id, prefix) pair found
    in layer_head_metrics by calling update_model_summary() on each.

    Used by the Tab 3 Refresh button to migrate historical data to the
    pseudo-bulk aggregation.

    Args:
        conn: Open SQLite connection to the metrics database.

    Returns:
        Number of (model_id, prefix) pairs that were refreshed.
    """
    cursor = conn.cursor()
    cursor.execute("SELECT DISTINCT model_id, prefix FROM layer_head_metrics")
    model_prefix_pairs = cursor.fetchall()

    for pair in model_prefix_pairs:
        model_id, prefix = pair
        update_model_summary(conn, model_id, prefix)

    return len(model_prefix_pairs)
|
ui/tab_leaderboard.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
# ui/tab_leaderboard.py
|
| 2 |
"""
|
| 3 |
Tab3: Wang's Five Laws Leaderboard
|
| 4 |
-
- Ranked by wang_score (= 1 − median SSR_QK, standard layers only)
|
|
|
|
| 5 |
- Filter by modality (default: language)
|
| 6 |
- Filter by layer_type (default: standard)
|
| 7 |
"""
|
|
@@ -12,6 +13,7 @@ import numpy as np
|
|
| 12 |
|
| 13 |
from db.schema import init_db
|
| 14 |
from db.reader import get_leaderboard
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
def _format_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -47,8 +49,12 @@ def load_leaderboard(
|
|
| 47 |
layer_type: str,
|
| 48 |
) -> tuple[pd.DataFrame, str]:
|
| 49 |
conn = init_db()
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
df = get_leaderboard(conn, modality=mod, layer_type=lt, limit=100)
|
| 54 |
|
|
@@ -62,7 +68,8 @@ def load_leaderboard(
|
|
| 62 |
formatted = _format_leaderboard(df)
|
| 63 |
status = (
|
| 64 |
f"✅ {len(formatted)} entries "
|
| 65 |
-
f"| modality={mod} layer_type={lt}"
|
|
|
|
| 66 |
)
|
| 67 |
return formatted, status
|
| 68 |
|
|
@@ -72,11 +79,12 @@ def build_tab_leaderboard():
|
|
| 72 |
gr.Markdown(r"""
|
| 73 |
## Wang's Five Laws — Model Leaderboard
|
| 74 |
|
| 75 |
-
**Wang Score = 1 − median(SSR\_QK)** Higher is better. Theoretical max = 1.
|
| 76 |
Computed from `standard` layers only (global/KV-shared layers excluded).
|
|
|
|
| 77 |
|
| 78 |
> 王氏评分 = 1 − median(SSR_QK),越高越好,理论极值=1。
|
| 79 |
-
> 仅基于 standard 层计算
|
| 80 |
""")
|
| 81 |
|
| 82 |
with gr.Row():
|
|
|
|
| 1 |
# ui/tab_leaderboard.py
|
| 2 |
"""
|
| 3 |
Tab3: Wang's Five Laws Leaderboard
|
| 4 |
+
- Ranked by wang_score (= 1 − pseudo-bulk median SSR_QK, standard layers only)
|
| 5 |
+
- On Refresh: silently re-computes all model_summary rows (pseudo-bulk migration)
|
| 6 |
- Filter by modality (default: language)
|
| 7 |
- Filter by layer_type (default: standard)
|
| 8 |
"""
|
|
|
|
| 13 |
|
| 14 |
from db.schema import init_db
|
| 15 |
from db.reader import get_leaderboard
|
| 16 |
+
from db.writer import refresh_all_summaries
|
| 17 |
|
| 18 |
|
| 19 |
def _format_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
|
| 49 |
layer_type: str,
|
| 50 |
) -> tuple[pd.DataFrame, str]:
|
| 51 |
conn = init_db()
|
| 52 |
+
|
| 53 |
+
# ── Silently refresh all summaries (pseudo-bulk migration) ────────────
|
| 54 |
+
n_refreshed = refresh_all_summaries(conn)
|
| 55 |
+
|
| 56 |
+
lt = layer_type if layer_type != "all" else "standard"
|
| 57 |
+
mod = modality
|
| 58 |
|
| 59 |
df = get_leaderboard(conn, modality=mod, layer_type=lt, limit=100)
|
| 60 |
|
|
|
|
| 68 |
formatted = _format_leaderboard(df)
|
| 69 |
status = (
|
| 70 |
f"✅ {len(formatted)} entries "
|
| 71 |
+
f"| modality={mod} layer_type={lt} "
|
| 72 |
+
f"| summaries refreshed: {n_refreshed}"
|
| 73 |
)
|
| 74 |
return formatted, status
|
| 75 |
|
|
|
|
| 79 |
gr.Markdown(r"""
|
| 80 |
## Wang's Five Laws — Model Leaderboard
|
| 81 |
|
| 82 |
+
**Wang Score = 1 − median(SSR\_QK)** Higher is better. Theoretical max = 1.
|
| 83 |
Computed from `standard` layers only (global/KV-shared layers excluded).
|
| 84 |
+
Metrics use **pseudo-bulk aggregation** (Nature Comms 2021) to avoid GQA pseudoreplication.
|
| 85 |
|
| 86 |
> 王氏评分 = 1 − median(SSR_QK),越高越好,理论极值=1。
|
| 87 |
+
> 仅基于 standard 层计算。采用 pseudo-bulk 两步聚合避免 GQA 伪重复计数。
|
| 88 |
""")
|
| 89 |
|
| 90 |
with gr.Row():
|