Spaces:
Running
Running
Alex W. committed on
Commit ·
a2dfa0f
1
Parent(s): f02f9b7
debug gemma-4-31b-it 2 code ,2 result issue
Browse files
- core/metrics.py +18 -46
core/metrics.py
CHANGED
|
@@ -82,17 +82,22 @@ def analyze_layer(
|
|
| 82 |
records: list[dict] = []
|
| 83 |
lines: list[str] = []
|
| 84 |
|
| 85 |
-
# ── 调试：打印整体
|
| 86 |
lines.append(
|
| 87 |
-
f"\n
|
| 88 |
-
f"[DEBUG]
|
| 89 |
-
f"
|
| 90 |
-
f"[DEBUG]
|
| 91 |
-
f"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
)
|
| 93 |
|
| 94 |
kv_tag = " [K=V共享]" if kv_shared else ""
|
| 95 |
lines.append(
|
|
|
|
| 96 |
f"[{profile.prefix}] Layer {profile.layer_idx:3d}{kv_tag} "
|
| 97 |
f"n_q={n_q} n_kv={n_kv} group={group} "
|
| 98 |
f"d_head={d_head}({profile.head_dim_source})\n"
|
|
@@ -105,29 +110,6 @@ def analyze_layer(
|
|
| 105 |
f" {'α_QK':>7} {'α_QV':>7} {'α_KV':>7}\n"
|
| 106 |
)
|
| 107 |
|
| 108 |
-
# ๆๅฐ W_k ๆฏไธช d_head ๅ็ L2 norm๏ผ็่ฝ้ๅๅธ
|
| 109 |
-
lines.append(f"[DEBUG] W_k ๅๅคด่ฝ้๏ผ่กๅ L2 norm๏ผ:\n")
|
| 110 |
-
for i in range(n_kv):
|
| 111 |
-
block = W_k[i * d_head:(i + 1) * d_head, :]
|
| 112 |
-
norm = float(block.norm())
|
| 113 |
-
# ๅๆถๆๅฐ่ฏฅๅ็ๆๅคงๅฅๅผๅผ
|
| 114 |
-
s_tmp = torch.linalg.svd(block, full_matrices=False)[1]
|
| 115 |
-
lines.append(
|
| 116 |
-
f" KVๅคด{i:2d}: block_norm={norm:.2f} "
|
| 117 |
-
f"sigma_max={float(s_tmp[0]):.4f}\n"
|
| 118 |
-
)
|
| 119 |
-
|
| 120 |
-
lines.append(f"[DEBUG] W_q ๅๅคด่ฝ้:\n")
|
| 121 |
-
for i in range(n_q):
|
| 122 |
-
block = W_q[i * d_head:(i + 1) * d_head, :]
|
| 123 |
-
norm = float(block.norm())
|
| 124 |
-
s_tmp = torch.linalg.svd(block, full_matrices=False)[1]
|
| 125 |
-
lines.append(
|
| 126 |
-
f" Qๅคด{i:2d}: block_norm={norm:.2f} "
|
| 127 |
-
f"sigma_max={float(s_tmp[0]):.4f}\n"
|
| 128 |
-
)
|
| 129 |
-
|
| 130 |
-
|
| 131 |
for kv_h in range(n_kv):
|
| 132 |
k_t = W_k[kv_h * d_head:(kv_h + 1) * d_head, :]
|
| 133 |
v_t = W_v[kv_h * d_head:(kv_h + 1) * d_head, :]
|
|
@@ -138,14 +120,13 @@ def analyze_layer(
|
|
| 138 |
smxk, smnk, cond_k = sigma_stats(s_k)
|
| 139 |
smxv, smnv, cond_v = sigma_stats(s_v)
|
| 140 |
|
| 141 |
-
# โโ ่ฐ่ฏ๏ผ
|
| 142 |
lines.append(
|
| 143 |
f"[DEBUG] KVๅคด{kv_h}: "
|
| 144 |
f"k_t={list(k_t.shape)} "
|
| 145 |
-
f"s_kๅ5={s_k[:5].tolist()}\n"
|
| 146 |
f"[DEBUG] KVๅคด{kv_h}: "
|
| 147 |
-
f"
|
| 148 |
-
f"s_vๅ5={s_v[:5].tolist()}\n"
|
| 149 |
)
|
| 150 |
|
| 151 |
# KV ๆๆ
|
|
@@ -171,17 +152,18 @@ def analyze_layer(
|
|
| 171 |
|
| 172 |
smxq, smnq, cond_q = sigma_stats(s_q)
|
| 173 |
|
| 174 |
-
# โโ ่ฐ่ฏ๏ผ
|
| 175 |
lines.append(
|
| 176 |
f"[DEBUG] Qๅคด{h}: "
|
| 177 |
f"q_t={list(q_t.shape)} "
|
| 178 |
-
f"s_qๅ5={s_q[:5].tolist()}\n"
|
|
|
|
|
|
|
| 179 |
)
|
| 180 |
|
| 181 |
nqk = min(len(s_q), len(s_k))
|
| 182 |
nqv = min(len(s_q), len(s_v))
|
| 183 |
|
| 184 |
-
# QK
|
| 185 |
pqk = pearson(s_q[:nqk], s_k[:nqk])
|
| 186 |
spqk = spearman_r(s_q[:nqk], s_k[:nqk])
|
| 187 |
ssr_qk = ssr(s_q, s_k)
|
|
@@ -189,22 +171,12 @@ def analyze_layer(
|
|
| 189 |
cU_QK = cos_U(U_q, U_k)
|
| 190 |
cV_QK = cos_V(Vt_q, Vt_k)
|
| 191 |
|
| 192 |
-
# QV
|
| 193 |
pqv = pearson(s_q[:nqv], s_v[:nqv])
|
| 194 |
ssr_qv = ssr(s_q, s_v)
|
| 195 |
a_qv, r_qv = svr(s_q, s_v)
|
| 196 |
cU_QV = cos_U(U_q, U_v)
|
| 197 |
cV_QV = cos_V(Vt_q, Vt_v)
|
| 198 |
|
| 199 |
-
# โโ ่ฐ่ฏ๏ผๆๅฐๅ
ณ้ฎๆๆ โโโโโโโโโโโโโโโโโโโโ
|
| 200 |
-
lines.append(
|
| 201 |
-
f"[DEBUG] Qๅคด{h}: "
|
| 202 |
-
f"pearson={pqk:+.4f} "
|
| 203 |
-
f"alpha_QK={a_qk:.4f} "
|
| 204 |
-
f"s_q[0]={s_q[0]:.4f} "
|
| 205 |
-
f"s_k[0]={s_k[0]:.4f}\n"
|
| 206 |
-
)
|
| 207 |
-
|
| 208 |
records.append({
|
| 209 |
"prefix": profile.prefix,
|
| 210 |
"layer": profile.layer_idx,
|
|
|
|
| 82 |
records: list[dict] = []
|
| 83 |
lines: list[str] = []
|
| 84 |
|
| 85 |
+
# ── 调试：打印整体信息 + 原始权重首行 ──────────
|
| 86 |
lines.append(
|
| 87 |
+
f"\n[DEBUG] โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n"
|
| 88 |
+
f"[DEBUG] key_q = {profile.q.key}\n"
|
| 89 |
+
f"[DEBUG] key_k = {profile.k.key}\n"
|
| 90 |
+
f"[DEBUG] key_v = {profile.v.key if profile.v else 'K=V shared'}\n"
|
| 91 |
+
f"[DEBUG] W_q={list(W_q.shape)} W_k={list(W_k.shape)} W_v={list(W_v.shape)}\n"
|
| 92 |
+
f"[DEBUG] n_q={n_q} n_kv={n_kv} group={group} d_head={d_head}\n"
|
| 93 |
+
f"[DEBUG] W_k[0, :10] = {W_k[0, :10].tolist()}\n"
|
| 94 |
+
f"[DEBUG] W_q[0, :10] = {W_q[0, :10].tolist()}\n"
|
| 95 |
+
f"[DEBUG] โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n"
|
| 96 |
)
|
| 97 |
|
| 98 |
kv_tag = " [K=V共享]" if kv_shared else ""
|
| 99 |
lines.append(
|
| 100 |
+
f"\n{'โ'*80}\n"
|
| 101 |
f"[{profile.prefix}] Layer {profile.layer_idx:3d}{kv_tag} "
|
| 102 |
f"n_q={n_q} n_kv={n_kv} group={group} "
|
| 103 |
f"d_head={d_head}({profile.head_dim_source})\n"
|
|
|
|
| 110 |
f" {'α_QK':>7} {'α_QV':>7} {'α_KV':>7}\n"
|
| 111 |
)
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
for kv_h in range(n_kv):
|
| 114 |
k_t = W_k[kv_h * d_head:(kv_h + 1) * d_head, :]
|
| 115 |
v_t = W_v[kv_h * d_head:(kv_h + 1) * d_head, :]
|
|
|
|
| 120 |
smxk, smnk, cond_k = sigma_stats(s_k)
|
| 121 |
smxv, smnv, cond_v = sigma_stats(s_v)
|
| 122 |
|
| 123 |
+
# โโ ่ฐ่ฏ๏ผKVๅคดๅ็้ฆ่กๅๅงๆ้ โโโโโโโโโโโโโโ
|
| 124 |
lines.append(
|
| 125 |
f"[DEBUG] KVๅคด{kv_h}: "
|
| 126 |
f"k_t={list(k_t.shape)} "
|
| 127 |
+
f"s_kๅ5={[round(x,4) for x in s_k[:5].tolist()]}\n"
|
| 128 |
f"[DEBUG] KVๅคด{kv_h}: "
|
| 129 |
+
f"k_t[0,:10]={k_t[0, :10].tolist()}\n"
|
|
|
|
| 130 |
)
|
| 131 |
|
| 132 |
# KV ๆๆ
|
|
|
|
| 152 |
|
| 153 |
smxq, smnq, cond_q = sigma_stats(s_q)
|
| 154 |
|
| 155 |
+
# โโ ่ฐ่ฏ๏ผQๅคดๅ็้ฆ่กๅๅงๆ้ โโโโโโโโโโโโ
|
| 156 |
lines.append(
|
| 157 |
f"[DEBUG] Qๅคด{h}: "
|
| 158 |
f"q_t={list(q_t.shape)} "
|
| 159 |
+
f"s_qๅ5={[round(x,4) for x in s_q[:5].tolist()]}\n"
|
| 160 |
+
f"[DEBUG] Qๅคด{h}: "
|
| 161 |
+
f"q_t[0,:10]={q_t[0, :10].tolist()}\n"
|
| 162 |
)
|
| 163 |
|
| 164 |
nqk = min(len(s_q), len(s_k))
|
| 165 |
nqv = min(len(s_q), len(s_v))
|
| 166 |
|
|
|
|
| 167 |
pqk = pearson(s_q[:nqk], s_k[:nqk])
|
| 168 |
spqk = spearman_r(s_q[:nqk], s_k[:nqk])
|
| 169 |
ssr_qk = ssr(s_q, s_k)
|
|
|
|
| 171 |
cU_QK = cos_U(U_q, U_k)
|
| 172 |
cV_QK = cos_V(Vt_q, Vt_k)
|
| 173 |
|
|
|
|
| 174 |
pqv = pearson(s_q[:nqv], s_v[:nqv])
|
| 175 |
ssr_qv = ssr(s_q, s_v)
|
| 176 |
a_qv, r_qv = svr(s_q, s_v)
|
| 177 |
cU_QV = cos_U(U_q, U_v)
|
| 178 |
cV_QV = cos_V(Vt_q, Vt_v)
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
records.append({
|
| 181 |
"prefix": profile.prefix,
|
| 182 |
"layer": profile.layer_idx,
|