Alex W. committed on
Commit
a2dfa0f
·
1 Parent(s): f02f9b7

debug gemma-4-31b-it 2 code, 2 result issue

Browse files
Files changed (1) hide show
  1. core/metrics.py +18 -46
core/metrics.py CHANGED
@@ -82,17 +82,22 @@ def analyze_layer(
82
  records: list[dict] = []
83
  lines: list[str] = []
84
 
85
- # โ”€โ”€ ่ฐƒ่ฏ•๏ผšๆ‰“ๅฐๆ•ดไฝ“ shape โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
86
  lines.append(
87
- f"\n{'โ”€'*80}\n"
88
- f"[DEBUG] W_q={list(W_q.shape)} W_k={list(W_k.shape)} "
89
- f"W_v={list(W_v.shape)}\n"
90
- f"[DEBUG] n_q={n_q} n_kv={n_kv} group={group} "
91
- f"d_head={d_head} source={profile.head_dim_source}\n"
 
 
 
 
92
  )
93
 
94
  kv_tag = " [K=Vๅ…ฑไบซ]" if kv_shared else ""
95
  lines.append(
 
96
  f"[{profile.prefix}] Layer {profile.layer_idx:3d}{kv_tag} "
97
  f"n_q={n_q} n_kv={n_kv} group={group} "
98
  f"d_head={d_head}({profile.head_dim_source})\n"
@@ -105,29 +110,6 @@ def analyze_layer(
105
  f" {'ฮฑ_QK':>7} {'ฮฑ_QV':>7} {'ฮฑ_KV':>7}\n"
106
  )
107
 
108
- # 打印 W_k 每个 d_head 块的 L2 norm，看能量分布
109
- lines.append(f"[DEBUG] W_k ๅ„ๅคด่ƒฝ้‡๏ผˆ่กŒๅ— L2 norm๏ผ‰:\n")
110
- for i in range(n_kv):
111
- block = W_k[i * d_head:(i + 1) * d_head, :]
112
- norm = float(block.norm())
113
- # ๅŒๆ—ถๆ‰“ๅฐ่ฏฅๅ—็š„ๆœ€ๅคงๅฅ‡ๅผ‚ๅ€ผ
114
- s_tmp = torch.linalg.svd(block, full_matrices=False)[1]
115
- lines.append(
116
- f" KVๅคด{i:2d}: block_norm={norm:.2f} "
117
- f"sigma_max={float(s_tmp[0]):.4f}\n"
118
- )
119
-
120
- lines.append(f"[DEBUG] W_q ๅ„ๅคด่ƒฝ้‡:\n")
121
- for i in range(n_q):
122
- block = W_q[i * d_head:(i + 1) * d_head, :]
123
- norm = float(block.norm())
124
- s_tmp = torch.linalg.svd(block, full_matrices=False)[1]
125
- lines.append(
126
- f" Qๅคด{i:2d}: block_norm={norm:.2f} "
127
- f"sigma_max={float(s_tmp[0]):.4f}\n"
128
- )
129
-
130
-
131
  for kv_h in range(n_kv):
132
  k_t = W_k[kv_h * d_head:(kv_h + 1) * d_head, :]
133
  v_t = W_v[kv_h * d_head:(kv_h + 1) * d_head, :]
@@ -138,14 +120,13 @@ def analyze_layer(
138
  smxk, smnk, cond_k = sigma_stats(s_k)
139
  smxv, smnv, cond_v = sigma_stats(s_v)
140
 
141
- # ── 调试：打印每个 KV 头的切片和奇异值 ──────
142
  lines.append(
143
  f"[DEBUG] KVๅคด{kv_h}: "
144
  f"k_t={list(k_t.shape)} "
145
- f"s_kๅ‰5={s_k[:5].tolist()}\n"
146
  f"[DEBUG] KVๅคด{kv_h}: "
147
- f"v_t={list(v_t.shape)} "
148
- f"s_vๅ‰5={s_v[:5].tolist()}\n"
149
  )
150
 
151
  # KV 指标
@@ -171,17 +152,18 @@ def analyze_layer(
171
 
172
  smxq, smnq, cond_q = sigma_stats(s_q)
173
 
174
- # ── 调试：打印每个 Q 头的切片和奇异值 ────
175
  lines.append(
176
  f"[DEBUG] Qๅคด{h}: "
177
  f"q_t={list(q_t.shape)} "
178
- f"s_qๅ‰5={s_q[:5].tolist()}\n"
 
 
179
  )
180
 
181
  nqk = min(len(s_q), len(s_k))
182
  nqv = min(len(s_q), len(s_v))
183
 
184
- # QK
185
  pqk = pearson(s_q[:nqk], s_k[:nqk])
186
  spqk = spearman_r(s_q[:nqk], s_k[:nqk])
187
  ssr_qk = ssr(s_q, s_k)
@@ -189,22 +171,12 @@ def analyze_layer(
189
  cU_QK = cos_U(U_q, U_k)
190
  cV_QK = cos_V(Vt_q, Vt_k)
191
 
192
- # QV
193
  pqv = pearson(s_q[:nqv], s_v[:nqv])
194
  ssr_qv = ssr(s_q, s_v)
195
  a_qv, r_qv = svr(s_q, s_v)
196
  cU_QV = cos_U(U_q, U_v)
197
  cV_QV = cos_V(Vt_q, Vt_v)
198
 
199
- # ── 调试：打印关键指标 ────────────────────
200
- lines.append(
201
- f"[DEBUG] Qๅคด{h}: "
202
- f"pearson={pqk:+.4f} "
203
- f"alpha_QK={a_qk:.4f} "
204
- f"s_q[0]={s_q[0]:.4f} "
205
- f"s_k[0]={s_k[0]:.4f}\n"
206
- )
207
-
208
  records.append({
209
  "prefix": profile.prefix,
210
  "layer": profile.layer_idx,
 
82
  records: list[dict] = []
83
  lines: list[str] = []
84
 
85
+ # ── 调试：打印整体信息 + 原始权重首行 ──────────
86
  lines.append(
87
+ f"\n[DEBUG] โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•\n"
88
+ f"[DEBUG] key_q = {profile.q.key}\n"
89
+ f"[DEBUG] key_k = {profile.k.key}\n"
90
+ f"[DEBUG] key_v = {profile.v.key if profile.v else 'K=V shared'}\n"
91
+ f"[DEBUG] W_q={list(W_q.shape)} W_k={list(W_k.shape)} W_v={list(W_v.shape)}\n"
92
+ f"[DEBUG] n_q={n_q} n_kv={n_kv} group={group} d_head={d_head}\n"
93
+ f"[DEBUG] W_k[0, :10] = {W_k[0, :10].tolist()}\n"
94
+ f"[DEBUG] W_q[0, :10] = {W_q[0, :10].tolist()}\n"
95
+ f"[DEBUG] โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•\n"
96
  )
97
 
98
  kv_tag = " [K=Vๅ…ฑไบซ]" if kv_shared else ""
99
  lines.append(
100
+ f"\n{'โ”€'*80}\n"
101
  f"[{profile.prefix}] Layer {profile.layer_idx:3d}{kv_tag} "
102
  f"n_q={n_q} n_kv={n_kv} group={group} "
103
  f"d_head={d_head}({profile.head_dim_source})\n"
 
110
  f" {'ฮฑ_QK':>7} {'ฮฑ_QV':>7} {'ฮฑ_KV':>7}\n"
111
  )
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  for kv_h in range(n_kv):
114
  k_t = W_k[kv_h * d_head:(kv_h + 1) * d_head, :]
115
  v_t = W_v[kv_h * d_head:(kv_h + 1) * d_head, :]
 
120
  smxk, smnk, cond_k = sigma_stats(s_k)
121
  smxv, smnv, cond_v = sigma_stats(s_v)
122
 
123
+ # โ”€โ”€ ่ฐƒ่ฏ•๏ผšKVๅคดๅˆ‡็‰‡้ฆ–่กŒๅŽŸๅง‹ๆƒ้‡ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
124
  lines.append(
125
  f"[DEBUG] KVๅคด{kv_h}: "
126
  f"k_t={list(k_t.shape)} "
127
+ f"s_kๅ‰5={[round(x,4) for x in s_k[:5].tolist()]}\n"
128
  f"[DEBUG] KVๅคด{kv_h}: "
129
+ f"k_t[0,:10]={k_t[0, :10].tolist()}\n"
 
130
  )
131
 
132
  # KV 指标
 
152
 
153
  smxq, smnq, cond_q = sigma_stats(s_q)
154
 
155
+ # ── 调试：Q头切片首行原始权重 ────────────
156
  lines.append(
157
  f"[DEBUG] Qๅคด{h}: "
158
  f"q_t={list(q_t.shape)} "
159
+ f"s_qๅ‰5={[round(x,4) for x in s_q[:5].tolist()]}\n"
160
+ f"[DEBUG] Qๅคด{h}: "
161
+ f"q_t[0,:10]={q_t[0, :10].tolist()}\n"
162
  )
163
 
164
  nqk = min(len(s_q), len(s_k))
165
  nqv = min(len(s_q), len(s_v))
166
 
 
167
  pqk = pearson(s_q[:nqk], s_k[:nqk])
168
  spqk = spearman_r(s_q[:nqk], s_k[:nqk])
169
  ssr_qk = ssr(s_q, s_k)
 
171
  cU_QK = cos_U(U_q, U_k)
172
  cV_QK = cos_V(Vt_q, Vt_k)
173
 
 
174
  pqv = pearson(s_q[:nqv], s_v[:nqv])
175
  ssr_qv = ssr(s_q, s_v)
176
  a_qv, r_qv = svr(s_q, s_v)
177
  cU_QV = cos_U(U_q, U_v)
178
  cV_QV = cos_V(Vt_q, Vt_v)
179
 
 
 
 
 
 
 
 
 
 
180
  records.append({
181
  "prefix": profile.prefix,
182
  "layer": profile.layer_idx,