Spaces:
Running
Running
Alex W. committed on
Commit ·
667e8f1
1
Parent(s): b342230
debug for gemma-4-31b-it layer 0, KV head 0
Browse files
- core/metrics.py +40 -13
core/metrics.py
CHANGED
|
@@ -82,9 +82,17 @@ def analyze_layer(
|
|
| 82 |
records: list[dict] = []
|
| 83 |
lines: list[str] = []
|
| 84 |
|
| 85 |
-
|
| 86 |
lines.append(
|
| 87 |
f"\n{'─'*80}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
f"[{profile.prefix}] Layer {profile.layer_idx:3d}{kv_tag} "
|
| 89 |
f"n_q={n_q} n_kv={n_kv} group={group} "
|
| 90 |
f"d_head={d_head}({profile.head_dim_source})\n"
|
|
@@ -107,6 +115,16 @@ def analyze_layer(
|
|
| 107 |
smxk, smnk, cond_k = sigma_stats(s_k)
|
| 108 |
smxv, smnv, cond_v = sigma_stats(s_v)
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
# KV 指标
|
| 111 |
if kv_shared:
|
| 112 |
ssr_kv = 0.0
|
|
@@ -116,11 +134,11 @@ def analyze_layer(
|
|
| 116 |
alpha_kv = 1.0
|
| 117 |
res_kv = 0.0
|
| 118 |
else:
|
| 119 |
-
n_kv_sv
|
| 120 |
-
ssr_kv
|
| 121 |
-
pkv
|
| 122 |
-
cosU_KV
|
| 123 |
-
cosV_KV
|
| 124 |
alpha_kv, res_kv = svr(s_k, s_v)
|
| 125 |
|
| 126 |
for q_off in range(group):
|
|
@@ -130,6 +148,13 @@ def analyze_layer(
|
|
| 130 |
|
| 131 |
smxq, smnq, cond_q = sigma_stats(s_q)
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
nqk = min(len(s_q), len(s_k))
|
| 134 |
nqv = min(len(s_q), len(s_v))
|
| 135 |
|
|
@@ -148,37 +173,40 @@ def analyze_layer(
|
|
| 148 |
cU_QV = cos_U(U_q, U_v)
|
| 149 |
cV_QV = cos_V(Vt_q, Vt_v)
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
records.append({
|
| 152 |
"prefix": profile.prefix,
|
| 153 |
"layer": profile.layer_idx,
|
| 154 |
"kv_head": kv_h,
|
| 155 |
"q_head": h,
|
| 156 |
"kv_shared": kv_shared,
|
| 157 |
-
# 第一定律
|
| 158 |
"pearson_QK": round(pqk, 6),
|
| 159 |
"spearman_QK": round(spqk, 6),
|
| 160 |
"pearson_QV": round(pqv, 6),
|
| 161 |
"pearson_KV": round(pkv, 6),
|
| 162 |
-
# 第二定律
|
| 163 |
"ssr_QK": round(ssr_qk, 8),
|
| 164 |
"ssr_QV": round(ssr_qv, 8),
|
| 165 |
"ssr_KV": round(ssr_kv, 8),
|
| 166 |
-
# 第四定律
|
| 167 |
"cosU_QK": round(cU_QK, 6),
|
| 168 |
"cosU_QV": round(cU_QV, 6),
|
| 169 |
"cosU_KV": round(cosU_KV, 6),
|
| 170 |
-
# 第五定律
|
| 171 |
"cosV_QK": round(cV_QK, 6),
|
| 172 |
"cosV_QV": round(cV_QV, 6),
|
| 173 |
"cosV_KV": round(cosV_KV, 6),
|
| 174 |
-
# 尺度因子 + 最小二乘残差
|
| 175 |
"alpha_QK": round(a_qk, 4),
|
| 176 |
"alpha_QV": round(a_qv, 4),
|
| 177 |
"alpha_KV": round(alpha_kv,4),
|
| 178 |
"alpha_res_QK": round(r_qk, 6),
|
| 179 |
"alpha_res_QV": round(r_qv, 6),
|
| 180 |
"alpha_res_KV": round(res_kv, 6),
|
| 181 |
-
# 第三定律:奇异值范围 + 条件数
|
| 182 |
"sigma_max_Q": round(smxq, 4),
|
| 183 |
"sigma_min_Q": round(smnq, 4),
|
| 184 |
"sigma_max_K": round(smxk, 4),
|
|
@@ -188,7 +216,6 @@ def analyze_layer(
|
|
| 188 |
"cond_Q": round(cond_q, 2),
|
| 189 |
"cond_K": round(cond_k, 2),
|
| 190 |
"cond_V": round(cond_v, 2),
|
| 191 |
-
# 维度信息
|
| 192 |
"head_dim": d_head,
|
| 193 |
"d_model": profile.d_model,
|
| 194 |
"n_q_heads": n_q,
|
|
|
|
| 82 |
records: list[dict] = []
|
| 83 |
lines: list[str] = []
|
| 84 |
|
| 85 |
+
# ── 调试:打印整体 shape ──────────────────────
|
| 86 |
lines.append(
|
| 87 |
f"\n{'─'*80}\n"
|
| 88 |
+
f"[DEBUG] W_q={list(W_q.shape)} W_k={list(W_k.shape)} "
|
| 89 |
+
f"W_v={list(W_v.shape)}\n"
|
| 90 |
+
f"[DEBUG] n_q={n_q} n_kv={n_kv} group={group} "
|
| 91 |
+
f"d_head={d_head} source={profile.head_dim_source}\n"
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
kv_tag = " [K=V共享]" if kv_shared else ""
|
| 95 |
+
lines.append(
|
| 96 |
f"[{profile.prefix}] Layer {profile.layer_idx:3d}{kv_tag} "
|
| 97 |
f"n_q={n_q} n_kv={n_kv} group={group} "
|
| 98 |
f"d_head={d_head}({profile.head_dim_source})\n"
|
|
|
|
| 115 |
smxk, smnk, cond_k = sigma_stats(s_k)
|
| 116 |
smxv, smnv, cond_v = sigma_stats(s_v)
|
| 117 |
|
| 118 |
+
# ── 调试:打印每个 KV 头的切片和奇异值 ──────
|
| 119 |
+
lines.append(
|
| 120 |
+
f"[DEBUG] KV头{kv_h}: "
|
| 121 |
+
f"k_t={list(k_t.shape)} "
|
| 122 |
+
f"s_k前5={s_k[:5].tolist()}\n"
|
| 123 |
+
f"[DEBUG] KV头{kv_h}: "
|
| 124 |
+
f"v_t={list(v_t.shape)} "
|
| 125 |
+
f"s_v前5={s_v[:5].tolist()}\n"
|
| 126 |
+
)
|
| 127 |
+
|
| 128 |
# KV 指标
|
| 129 |
if kv_shared:
|
| 130 |
ssr_kv = 0.0
|
|
|
|
| 134 |
alpha_kv = 1.0
|
| 135 |
res_kv = 0.0
|
| 136 |
else:
|
| 137 |
+
n_kv_sv = min(len(s_k), len(s_v))
|
| 138 |
+
ssr_kv = ssr(s_k, s_v)
|
| 139 |
+
pkv = pearson(s_k[:n_kv_sv], s_v[:n_kv_sv])
|
| 140 |
+
cosU_KV = cos_U(U_k, U_v)
|
| 141 |
+
cosV_KV = cos_V(Vt_k, Vt_v)
|
| 142 |
alpha_kv, res_kv = svr(s_k, s_v)
|
| 143 |
|
| 144 |
for q_off in range(group):
|
|
|
|
| 148 |
|
| 149 |
smxq, smnq, cond_q = sigma_stats(s_q)
|
| 150 |
|
| 151 |
+
# ── 调试:打印每个 Q 头的切片和奇异值 ────
|
| 152 |
+
lines.append(
|
| 153 |
+
f"[DEBUG] Q头{h}: "
|
| 154 |
+
f"q_t={list(q_t.shape)} "
|
| 155 |
+
f"s_q前5={s_q[:5].tolist()}\n"
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
nqk = min(len(s_q), len(s_k))
|
| 159 |
nqv = min(len(s_q), len(s_v))
|
| 160 |
|
|
|
|
| 173 |
cU_QV = cos_U(U_q, U_v)
|
| 174 |
cV_QV = cos_V(Vt_q, Vt_v)
|
| 175 |
|
| 176 |
+
# ── 调试:打印关键指标 ────────────────────
|
| 177 |
+
lines.append(
|
| 178 |
+
f"[DEBUG] Q头{h}: "
|
| 179 |
+
f"pearson={pqk:+.4f} "
|
| 180 |
+
f"alpha_QK={a_qk:.4f} "
|
| 181 |
+
f"s_q[0]={s_q[0]:.4f} "
|
| 182 |
+
f"s_k[0]={s_k[0]:.4f}\n"
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
records.append({
|
| 186 |
"prefix": profile.prefix,
|
| 187 |
"layer": profile.layer_idx,
|
| 188 |
"kv_head": kv_h,
|
| 189 |
"q_head": h,
|
| 190 |
"kv_shared": kv_shared,
|
|
|
|
| 191 |
"pearson_QK": round(pqk, 6),
|
| 192 |
"spearman_QK": round(spqk, 6),
|
| 193 |
"pearson_QV": round(pqv, 6),
|
| 194 |
"pearson_KV": round(pkv, 6),
|
|
|
|
| 195 |
"ssr_QK": round(ssr_qk, 8),
|
| 196 |
"ssr_QV": round(ssr_qv, 8),
|
| 197 |
"ssr_KV": round(ssr_kv, 8),
|
|
|
|
| 198 |
"cosU_QK": round(cU_QK, 6),
|
| 199 |
"cosU_QV": round(cU_QV, 6),
|
| 200 |
"cosU_KV": round(cosU_KV, 6),
|
|
|
|
| 201 |
"cosV_QK": round(cV_QK, 6),
|
| 202 |
"cosV_QV": round(cV_QV, 6),
|
| 203 |
"cosV_KV": round(cosV_KV, 6),
|
|
|
|
| 204 |
"alpha_QK": round(a_qk, 4),
|
| 205 |
"alpha_QV": round(a_qv, 4),
|
| 206 |
"alpha_KV": round(alpha_kv,4),
|
| 207 |
"alpha_res_QK": round(r_qk, 6),
|
| 208 |
"alpha_res_QV": round(r_qv, 6),
|
| 209 |
"alpha_res_KV": round(res_kv, 6),
|
|
|
|
| 210 |
"sigma_max_Q": round(smxq, 4),
|
| 211 |
"sigma_min_Q": round(smnq, 4),
|
| 212 |
"sigma_max_K": round(smxk, 4),
|
|
|
|
| 216 |
"cond_Q": round(cond_q, 2),
|
| 217 |
"cond_K": round(cond_k, 2),
|
| 218 |
"cond_V": round(cond_v, 2),
|
|
|
|
| 219 |
"head_dim": d_head,
|
| 220 |
"d_model": profile.d_model,
|
| 221 |
"n_q_heads": n_q,
|