Alex W. committed on
Commit
4e18dab
·
1 Parent(s): a69ce56

关键事实:

Browse files

Gemma-3/4 视觉层的 key 命名是 model.vision_tower.vision_model.encoder.layers.{N}.self_attn.q_proj.weight —— 我的视觉关键词 vision/visual/vit 能识别到。
但是! Gemma-3 还有一种特殊情况:multi-modal projector 或某些层用 layers.{N} 但属于视觉部分,且有时视觉层 key 不含明显视觉关键词。
更关键:当前你的日志显示 Layer 11 是 1024×1024(视觉),Layer 12 是 1536(文本)—— 说明视觉层和文本层的层号是独立编号的,但合在一起后被当成了"同一个模型的连续层"。
真正的 bug:

Python

m = re.search(r'layers\.(\d+)\.', key)
这个正则会把视觉层和文本层的 layers.N 都抓出来。如果某个视觉层 key 没有被 is_vision_key 识别(比如 model.vision_tower.encoder.layers.0.self_attn.q_proj.weight 这类 key 的路径稍有变化时),它就会被混进文本层里。

但从你的日志看,视觉层 356 个 key 已经被正确跳过了。所以真正的问题是:

Layer 11 的形状 (1024, 1024) 不对劲 —— 这不应该出现在文本层中。说明 is_vision_key 漏判了某些视觉层。

真正的修复方案
核心思路:不能只靠 key 名关键词判断模态,应该用 hidden_size 维度(输入维度)作为模态指纹:

文本层:W_q.shape[1] == text_hidden_size(比如 1536, 2048, 5120 ...)
视觉层:W_q.shape[1] == vision_hidden_size(比如 1024, 1152, 768 ...)
当 K 和 V 来自不同模态(一个文本一个视觉),它们的输入维度不同,SVD 的 Vt 形状就不同 → 就出现了你的报错。

Files changed (1) hide show
  1. app.py +302 -248
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import requests
3
  import struct
4
  import json
5
- import re # [改动1] 新增:用于多模态层名过滤
6
  import numpy as np
7
  import torch
8
  from scipy.stats import pearsonr, spearmanr
@@ -32,11 +32,12 @@ except AttributeError:
32
  UNSUPPORTED_SVD_DTYPES = {"I8", "U8", "I32", "I64", "F8_E4M3", "F8_E5M2"}
33
  QUANTIZED_KEY_SIGNATURES = ["qweight", "qzeros", "scales", "g_idx", "packed_weight"]
34
 
35
- # [改动1] 多模态视觉层关键词 → 跳过这些层
36
  VISION_KEY_PATTERNS = [
37
  "vision", "visual", "image_encoder",
38
  "img_encoder", "patch_embed", "vit",
39
- "vision_tower", "mm_projector",
 
40
  ]
41
 
42
 
@@ -60,7 +61,6 @@ def read_safetensors_header(url: str, token: str = None) -> tuple[dict, int]:
60
  )
61
  r.raise_for_status()
62
  raw = json.loads(r.content)
63
- # 过滤 __metadata__
64
  raw.pop("__metadata__", None)
65
  return raw, header_size
66
 
@@ -126,14 +126,114 @@ def _http_error_msg(e: requests.exceptions.HTTPError, model_id: str) -> str:
126
  return f"❌ HTTP {code}:{e}"
127
 
128
 
129
- # [改动1] 判断一个 key 是否属于视觉模态层
130
  def is_vision_key(key: str) -> bool:
131
  key_lower = key.lower()
132
  return any(pat in key_lower for pat in VISION_KEY_PATTERNS)
133
 
134
 
135
  # ─────────────────────────────────────────────
136
- # 量化三重检测(不变)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  # ─────────────────────────────────────────────
138
 
139
  def check_quantization(model_id: str, token: str = None) -> tuple[bool, str]:
@@ -200,24 +300,28 @@ def check_quantization(model_id: str, token: str = None) -> tuple[bool, str]:
200
 
201
 
202
  # ─────────────────────────────────────────────
203
- # GQA 参数自动推断(不变)
204
  # ─────────────────────────────────────────────
205
 
206
  def infer_gqa_params(
207
  W_q: torch.Tensor,
208
  W_k: torch.Tensor,
209
- config: dict | None
 
210
  ) -> tuple[int,int,int]:
211
  q_rows = W_q.shape[0]
212
  k_rows = W_k.shape[0]
213
 
214
  d_head = None
215
- if config:
216
- d_head = (
217
- config.get("head_dim") or
218
- config.get("kv_channels") or
219
- config.get("hidden_size", 0) // max(config.get("num_attention_heads", 1), 1)
220
- )
 
 
 
221
  if d_head == 0:
222
  d_head = None
223
 
@@ -243,7 +347,7 @@ def infer_gqa_params(
243
 
244
 
245
  # ─────────────────────────────────────────────
246
- # [改动2] 指标计算函数:新增右奇异向量对齐
247
  # ─────────────────────────────────────────────
248
 
249
  def compute_pearson_corr(s_a: torch.Tensor, s_b: torch.Tensor) -> float:
@@ -281,12 +385,9 @@ def compute_ssr(s_a: torch.Tensor, s_b: torch.Tensor) -> float:
281
  def compute_left_vector_alignment(
282
  U_a: torch.Tensor, U_b: torch.Tensor
283
  ) -> float:
284
- """
285
- 左奇异向量(输出子空间)对齐度:
286
- cosU = mean_i |<u_a_i, u_b_i>|
287
- 对应第四定律:cos(Uq,Uk) ≈ 1/√d_head(随机正交)
288
- cos(Uq,Uv) < 1/√d_head(超正交)
289
- """
290
  min_c = min(U_a.shape[1], U_b.shape[1])
291
  Ua = U_a[:, :min_c]
292
  Ub = U_b[:, :min_c]
@@ -295,16 +396,12 @@ def compute_left_vector_alignment(
295
  return float(torch.diag(torch.abs(Ua_n.T @ Ub_n)).mean())
296
 
297
 
298
- # [改动2] 新增:右奇异向量(输入子空间)对齐度
299
  def compute_right_vector_alignment(
300
  Vt_a: torch.Tensor, Vt_b: torch.Tensor
301
  ) -> float:
302
- """
303
- 右奇异向量(输入子空间)对齐度:
304
- cosV = mean_i |<v_a_i, v_b_i>|
305
- 对应第五定律:所有对之间 ≈ 1/√d_model(全局随机正交)
306
- 注意:SVD 返回 Vt(转置),每行是一个右奇异向量
307
- """
308
  min_r = min(Vt_a.shape[0], Vt_b.shape[0])
309
  Va_n = Vt_a[:min_r, :]
310
  Vb_n = Vt_b[:min_r, :]
@@ -314,37 +411,38 @@ def compute_right_vector_alignment(
314
 
315
 
316
  # ─────────────────────────────────────────────
317
- # [改动3] 逐头分析:Q-K + Q-V + K-V 全指标
318
  # ─────────────────────────────────────────────
319
 
320
  def analyze_layer_heads(
321
  W_q: torch.Tensor,
322
  W_k: torch.Tensor,
323
- W_v: torch.Tensor, # [改动3] 新增 W_v 输入
324
  layer_idx: int,
325
  n_q_heads: int,
326
  n_kv_heads: int,
327
  d_head: int,
328
- modality: str = "text", # [改动4] 新增 modality 标记
329
  ) -> tuple[list[dict], str]:
330
- """
331
- GQA 逐头全指标分析:
332
- 对每个 KV 头:
333
- - 计算 K-V 对的全部指标(只算一次)
334
- - 对组内每个 Q 头:计算 Q-K、Q-V 全部指标
335
- """
 
 
336
  group_size = n_q_heads // n_kv_heads
337
  records = []
338
  log_lines = []
339
 
340
  log_lines.append(
341
  f"\n{'─'*80}\n"
342
- f"Layer {layer_idx:3d} [{modality}] " # [改动4] 显示模态
343
  f"n_q={n_q_heads} n_kv={n_kv_heads} "
344
  f"group={group_size} d_head={d_head}\n"
345
  f"{'─'*80}\n"
346
  )
347
- # 表头
348
  log_lines.append(
349
  f" {'KV':>3} {'Q':>3} │"
350
  f" {'P_QK':>7} {'Sp_QK':>7} {'SSR_QK':>8} │"
@@ -355,19 +453,16 @@ def analyze_layer_heads(
355
  )
356
 
357
  for kv_h in range(n_kv_heads):
358
-
359
- # ── 提取 K / V 头矩阵 ─────────────────────────
360
  k_tensor = W_k[kv_h * d_head : (kv_h + 1) * d_head, :]
361
- v_tensor = W_v[kv_h * d_head : (kv_h + 1) * d_head, :] # [改动3]
362
 
363
  U_k, s_k, Vt_k = torch.linalg.svd(k_tensor, full_matrices=False)
364
- U_v, s_v, Vt_v = torch.linalg.svd(v_tensor, full_matrices=False) # [改动3]
365
 
366
- # ── K-V 指标(每个 KV 头只算一次)─────────────
367
- alpha_kv, alpha_res_kv = compute_singular_value_ratio(s_k, s_v)
368
- cosU_KV = compute_left_vector_alignment(U_k, U_v)
369
- cosV_KV = compute_right_vector_alignment(Vt_k, Vt_v) # [改动2]
370
- ssr_kv = compute_ssr(s_k, s_v)
371
  pearson_kv = compute_pearson_corr(
372
  s_k[:min(s_k.shape[0], s_v.shape[0])],
373
  s_v[:min(s_k.shape[0], s_v.shape[0])]
@@ -381,25 +476,22 @@ def analyze_layer_heads(
381
  min_qk = min(s_q.shape[0], s_k.shape[0])
382
  min_qv = min(s_q.shape[0], s_v.shape[0])
383
 
384
- # ── Q-K 指标 ──────────────────────────────
385
  pearson_qk = compute_pearson_corr(s_q[:min_qk], s_k[:min_qk])
386
  spearman_qk = float(spearmanr(
387
  s_q[:min_qk].cpu().numpy(),
388
  s_k[:min_qk].cpu().numpy()
389
  )[0])
390
- ssr_qk = compute_ssr(s_q, s_k)
391
- alpha_qk, alpha_res_qk = compute_singular_value_ratio(s_q, s_k)
392
- cosU_QK = compute_left_vector_alignment(U_q, U_k)
393
- cosV_QK = compute_right_vector_alignment(Vt_q, Vt_k) # [改动2]
394
 
395
- # ── Q-V 指标 ────────────────────────────── [改动3]
396
  pearson_qv = compute_pearson_corr(s_q[:min_qv], s_v[:min_qv])
397
  ssr_qv = compute_ssr(s_q, s_v)
398
- alpha_qv, alpha_res_qv = compute_singular_value_ratio(s_q, s_v)
399
- cosU_QV = compute_left_vector_alignment(U_q, U_v)
400
- cosV_QV = compute_right_vector_alignment(Vt_q, Vt_v) # [改动2]
401
 
402
- # ── 奇异值范围 ───────────────────────────── [改动3]
403
  sig_max_q = float(s_q.max())
404
  sig_min_q = float(s_q[s_q > 1e-10].min()) if (s_q > 1e-10).any() else 0.0
405
  sig_max_k = float(s_k.max())
@@ -407,52 +499,43 @@ def analyze_layer_heads(
407
  sig_max_v = float(s_v.max())
408
  sig_min_v = float(s_v[s_v > 1e-10].min()) if (s_v > 1e-10).any() else 0.0
409
 
410
- # 条件数(第三定律)
411
  cond_q = sig_max_q / (sig_min_q + 1e-10)
412
  cond_k = sig_max_k / (sig_min_k + 1e-10)
413
  cond_v = sig_max_v / (sig_min_v + 1e-10)
414
 
415
  records.append({
416
- # 位置信息
417
- "layer": layer_idx,
418
- "modality": modality, # [改动4]
419
- "kv_head": kv_h,
420
- "q_head": h_idx,
421
- # 第一定律:谱线性对齐
422
- "pearson_QK": round(pearson_qk, 6),
423
- "spearman_QK": round(spearman_qk, 6),
424
- "pearson_QV": round(pearson_qv, 6), # [改动3]
425
- "pearson_KV": round(pearson_kv, 6), # [改动3]
426
- # 第二定律:SSR
427
- "ssr_QK": round(ssr_qk, 8),
428
- "ssr_QV": round(ssr_qv, 8), # [改动3]
429
- "ssr_KV": round(ssr_kv, 8), # [改动3]
430
- # 第四定律:左奇异向量(输出子空间)
431
- "cosU_QK": round(cosU_QK, 6),
432
- "cosU_QV": round(cosU_QV, 6), # [改动3]
433
- "cosU_KV": round(cosU_KV, 6), # [改动3]
434
- # 第五定律:右奇异向量(输入子空间)[改动2]
435
- "cosV_QK": round(cosV_QK, 6),
436
- "cosV_QV": round(cosV_QV, 6),
437
- "cosV_KV": round(cosV_KV, 6),
438
- # 尺度因子
439
- "alpha_QK": round(alpha_qk, 4),
440
- "alpha_QV": round(alpha_qv, 4), # [改动3]
441
- "alpha_KV": round(alpha_kv, 4), # [改动3]
442
- "alpha_res_QK": round(alpha_res_qk, 6),
443
- "alpha_res_QV": round(alpha_res_qv, 6), # [改动3]
444
- "alpha_res_KV": round(alpha_res_kv, 6), # [改动3]
445
- # 奇异值范围 [改动3]
446
- "sigma_max_Q": round(sig_max_q, 4),
447
- "sigma_min_Q": round(sig_min_q, 4),
448
- "sigma_max_K": round(sig_max_k, 4),
449
- "sigma_min_K": round(sig_min_k, 4),
450
- "sigma_max_V": round(sig_max_v, 4),
451
- "sigma_min_V": round(sig_min_v, 4),
452
- # 条件数(第三定律)[改动3]
453
- "cond_Q": round(cond_q, 2),
454
- "cond_K": round(cond_k, 2),
455
- "cond_V": round(cond_v, 2),
456
  })
457
 
458
  log_lines.append(
@@ -492,7 +575,7 @@ def analyze_model(
492
  return "".join(log_lines), None
493
 
494
  # ── config.json ───────────────────────────────
495
- config = None
496
  try:
497
  r = requests.get(
498
  f"https://huggingface.co/{model_id}/resolve/main/config.json",
@@ -500,162 +583,151 @@ def analyze_model(
500
  timeout=15
501
  )
502
  if r.status_code == 200:
503
- config = r.json()
 
504
  log_lines.append(
505
  f"📋 config.json:\n"
506
- f" model_type = {config.get('model_type')}\n"
507
- f" hidden_size = {config.get('hidden_size')}\n"
508
- f" num_attention_heads = {config.get('num_attention_heads')}\n"
509
- f" num_key_value_heads = {config.get('num_key_value_heads')}\n"
510
- f" head_dim = {config.get('head_dim')}\n"
511
  f"{'─'*80}\n"
512
  )
513
  except Exception:
514
  log_lines.append("⚠️ 无法读取 config.json,将从 weight shape 自动推断\n")
515
 
516
- # ── 分片索引 ──────────────────────────────────
517
  progress(0.05, desc="读取模型索引...")
518
  try:
519
- index_data = find_index_file(model_id, token)
520
- shard_headers: dict[str, tuple[dict, int]] = {}
521
-
522
  if index_data:
523
- weight_map = index_data["weight_map"]
524
  log_lines.append(
525
- f"📦 分片模型,共 {len(set(weight_map.values()))} 个 shard\n"
526
  )
527
  else:
528
- sf_files = get_safetensor_files(model_id, token)
529
- weight_map = None
530
- log_lines.append(f"📦 单文件:{sf_files}\n")
531
  except requests.exceptions.HTTPError as e:
532
  return _http_error_msg(e, model_id), None
533
 
534
- # ── 探测第一个 shard ──────────────────────────
535
- progress(0.08, desc="识别层结构...")
536
- try:
537
- if index_data:
538
- first_shard = sorted(set(index_data["weight_map"].values()))[0]
539
- else:
540
- first_shard = sf_files[0]
541
- first_url = get_file_url(model_id, first_shard)
542
- first_header, first_hsize = read_safetensors_header(first_url, token)
543
- shard_headers[first_shard] = (first_header, first_hsize)
544
- all_keys = list(first_header.keys())
545
- except Exception as e:
546
- return f"❌ 读取 shard header 失败:{e}", None
547
-
548
- # [改动1] 区分文本 key 和视觉层 key
549
- text_keys = [k for k in all_keys if not is_vision_key(k)]
550
- vision_keys = [k for k in all_keys if is_vision_key(k)]
551
- log_lines.append(
552
- f"🔑 key 数:{len(all_keys)} "
553
- f"(文本层:{len(text_keys)},视觉层跳过:{len(vision_keys)})\n"
554
- )
555
-
556
- # 识别 Q/K/V key 命名规则(只在文本 key 中识别)
557
- q_candidates = [k for k in text_keys if any(
558
- p in k for p in ["q_proj.weight","query.weight","q.weight","wq.weight"]
559
- )]
560
- if not q_candidates:
561
- sample = "\n".join(text_keys[:30])
562
- return f"⚠️ 无法识别文本层 Q/K/V key,前 30 个文本 key:\n{sample}", None
563
-
564
- sample_q = q_candidates[0]
565
- if "q_proj" in sample_q: q_sfx, k_sfx, v_sfx = "self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight"
566
- elif "query" in sample_q: q_sfx, k_sfx, v_sfx = "attention.query.weight", "attention.key.weight", "attention.value.weight"
567
- elif "wq" in sample_q: q_sfx, k_sfx, v_sfx = "attention.wq.weight", "attention.wk.weight", "attention.wv.weight"
568
- else:
569
- q_sfx = sample_q.split("layers.0.")[-1]
570
- k_sfx = q_sfx.replace("q.", "k.")
571
- v_sfx = q_sfx.replace("q.", "v.")
 
 
 
 
572
 
573
- log_lines.append(f"🔑 Q suffix:{q_sfx}\n")
574
- log_lines.append(f"🔑 K suffix:{k_sfx}\n")
575
- log_lines.append(f"🔑 V suffix:{v_sfx}\n") # [改动3]
576
  log_lines.append(f"{'═'*80}\n")
577
 
578
- # ── 辅助:查找 key 所在 shard ─────────────────
579
- def get_shard_for_key(key: str) -> str | None:
580
- if index_data:
581
- return index_data["weight_map"].get(key)
582
- for sf in sf_files:
583
- if sf not in shard_headers:
584
- h, hs = read_safetensors_header(get_file_url(model_id, sf), token)
585
- shard_headers[sf] = (h, hs)
586
- if key in shard_headers[sf][0]:
587
- return sf
588
- return None
589
-
590
  # ── 逐层分析 ─────────────────────────────────
591
- gqa_logged = False
 
 
 
 
 
 
592
 
593
- for layer_idx in range(int(max_layers)):
594
  progress(
595
- 0.10 + 0.85 * layer_idx / int(max_layers),
596
  desc=f"第 {layer_idx} 层..."
597
  )
598
 
599
- q_key = f"model.layers.{layer_idx}.{q_sfx}"
600
- k_key = f"model.layers.{layer_idx}.{k_sfx}"
601
- v_key = f"model.layers.{layer_idx}.{v_sfx}" # [改动3]
602
-
603
- q_shard = get_shard_for_key(q_key)
604
- k_shard = get_shard_for_key(k_key)
605
- v_shard = get_shard_for_key(v_key) # [改动3]
606
-
607
- if q_shard is None or k_shard is None:
608
- log_lines.append(
609
- f"\nLayer {layer_idx}: Q/K 未找到,分析结束(共 {layer_idx} 层)\n"
610
- )
611
- break
612
-
613
- # [改动3] V 找不到时降级处理(不阻断整体分析)
614
- if v_shard is None:
615
  log_lines.append(
616
- f"Layer {layer_idx}: ⚠️ V 未找到,跳过该层\n"
617
  )
618
  continue
619
 
620
- for shard in {q_shard, k_shard, v_shard}:
621
- if shard not in shard_headers:
622
- h, hs = read_safetensors_header(get_file_url(model_id, shard), token)
623
- shard_headers[shard] = (h, hs)
624
 
625
  try:
626
  W_q = load_tensor_remote(
627
  get_file_url(model_id, q_shard), q_key,
628
- *shard_headers[q_shard], token
629
  )
630
  W_k = load_tensor_remote(
631
  get_file_url(model_id, k_shard), k_key,
632
- *shard_headers[k_shard], token
633
  )
634
- W_v = load_tensor_remote( # [改动3]
635
  get_file_url(model_id, v_shard), v_key,
636
- *shard_headers[v_shard], token
637
  )
638
  except ValueError as e:
639
  log_lines.append(f"Layer {layer_idx}: ⚠️ 跳过({e})\n")
 
 
 
 
 
640
  continue
641
 
642
  if W_q is None or W_k is None or W_v is None:
643
  log_lines.append(f"Layer {layer_idx}: ⚠️ tensor 为 None,跳过\n")
 
644
  continue
645
 
646
- # [改动1] 判断该层是文本层还是视觉层
647
- modality = "vision" if is_vision_key(q_key) else "text"
648
- if modality == "vision":
649
- log_lines.append(f"Layer {layer_idx}: 🖼视觉层,跳过\n")
 
 
 
650
  del W_q, W_k, W_v
 
651
  continue
652
 
653
- # GQA 推断
654
  try:
655
- n_q_heads, n_kv_heads, d_head = infer_gqa_params(W_q, W_k, config)
 
 
656
  except ValueError as e:
657
  log_lines.append(f"Layer {layer_idx}: ❌ GQA 推断失败:{e}\n")
658
  del W_q, W_k, W_v
 
659
  continue
660
 
661
  if not gqa_logged:
@@ -663,29 +735,31 @@ def analyze_model(
663
  f"🧠 GQA 结构:n_q={n_q_heads} n_kv={n_kv_heads} "
664
  f"group={n_q_heads//n_kv_heads} d_head={d_head}\n"
665
  f" W_q={list(W_q.shape)} W_k={list(W_k.shape)} "
666
- f"W_v={list(W_v.shape)}\n" # [改动3]
667
  f"{'═'*80}\n"
668
  )
669
  gqa_logged = True
670
 
671
- # 逐头全指标计算
672
  records, layer_log = analyze_layer_heads(
673
- W_q, W_k, W_v, # [改动3]
674
  layer_idx,
675
  n_q_heads, n_kv_heads, d_head,
676
- modality=modality # [改动4]
677
  )
678
  all_records.extend(records)
679
  log_lines.append(layer_log)
680
 
681
  del W_q, W_k, W_v
 
682
 
683
- # ── 全局汇总 ──────────────────────────────────
684
  if all_records:
685
  df = pd.DataFrame(all_records)
686
 
687
- # [改动5] 分模态统计
688
  def stat_block(arr: np.ndarray, name: str) -> str:
 
 
 
689
  return (
690
  f" {name:<14}"
691
  f" Median={np.median(arr):.6f}"
@@ -706,25 +780,25 @@ def analyze_model(
706
 
707
  f"【第一定律 — Pearson r(→ 1)】\n",
708
  stat_block(text_df["pearson_QK"].values, "Q-K:"),
709
- stat_block(text_df["pearson_QV"].values, "Q-V:"), # [改动3]
710
- stat_block(text_df["pearson_KV"].values, "K-V:"), # [改动3]
711
 
712
  f"\n【第二定律 — SSR(→ 0)】\n",
713
  stat_block(text_df["ssr_QK"].values, "Q-K:"),
714
- stat_block(text_df["ssr_QV"].values, "Q-V:"), # [改动3]
715
- stat_block(text_df["ssr_KV"].values, "K-V:"), # [改动3]
716
 
717
- f"\n【第四定律 — cosU 输出子空间(Q-K≈1/√d,Q-V<1/√d 超正交)】\n",
718
  stat_block(text_df["cosU_QK"].values, "cosU Q-K:"),
719
- stat_block(text_df["cosU_QV"].values, "cosU Q-V:"), # [改动3]
720
- stat_block(text_df["cosU_KV"].values, "cosU K-V:"), # [改动3]
721
 
722
- f"\n【第五定律 — cosV 输入子空间(≈1/√d_model 全局随机正交)】\n", # [改动2]
723
  stat_block(text_df["cosV_QK"].values, "cosV Q-K:"),
724
  stat_block(text_df["cosV_QV"].values, "cosV Q-V:"),
725
  stat_block(text_df["cosV_KV"].values, "cosV K-V:"),
726
 
727
- f"\n【第三定律 — 条件数(越小越稳定)】\n", # [改动3]
728
  stat_block(text_df["cond_Q"].values, "cond Q:"),
729
  stat_block(text_df["cond_K"].values, "cond K:"),
730
  stat_block(text_df["cond_V"].values, "cond V:"),
@@ -740,7 +814,7 @@ def analyze_model(
740
 
741
 
742
  # ─────────────────────────────────────────────
743
- # Gradio UI
744
  # ─────────────────────────────────────────────
745
 
746
  with gr.Blocks(title="Wang's Five Laws — LLM Spectral Analyzer") as demo:
@@ -750,18 +824,6 @@ with gr.Blocks(title="Wang's Five Laws — LLM Spectral Analyzer") as demo:
750
  **Mathematical Foundations of Large Language Models (MF-LLM)**
751
 
752
  通过 **HTTP Range Request** 直接读取 HF 权重,**无需下载整个模型**。
753
- 支持 GQA + 多模态(自动跳过视觉层)。逐头计算全部五定律指标:
754
-
755
- | 定律 | 指标 | 理论极值 | 对象 |
756
- |------|------|---------|------|
757
- | 第一定律 | Pearson r / Spearman r | → 1 | Q-K |
758
- | 第二定律 | SSR | → 0 | Q-K, Q-V, K-V |
759
- | 第三定律 | 条件数 κ | 越小越好 | Q, K, V |
760
- | 第四定律 | cosU(Uq,Uk) | ≈1/√d_head;cosU(Uq,Uv)<1/√d_head | Q-K, Q-V, K-V |
761
- | 第五定律 | cosV(Vq,Vk) | ≈1/√d_model(随机正交) | Q-K, Q-V, K-V |
762
-
763
- [![DOI](https://img.shields.io/badge/DOI-10.5281%2Fzenodo.19707844-blue)](https://doi.org/10.5281/zenodo.19707844)
764
- [![HAL](https://img.shields.io/badge/HAL-hal--05609398-red)](https://hal.science/hal-05609398)
765
  """)
766
 
767
  with gr.Row():
@@ -782,33 +844,25 @@ with gr.Blocks(title="Wang's Five Laws — LLM Spectral Analyzer") as demo:
782
  )
783
  analyze_btn = gr.Button("🚀 开始分析", variant="primary")
784
 
785
- # [改动6] 更新推荐模型列表
786
  with gr.Column(scale=1):
787
  gr.Markdown("""
788
  ### ✅ 推荐模型
789
  ```
790
- Qwen/Qwen2.5-14B-Instruct (GQA 8Q/2K)
791
- meta-llama/Llama-3-8B (GQA)
792
- google/gemma-4-e2b (MHA 多模态)
793
- google/gemma-4-e4b-it (MHA 多模态)
794
  deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
795
  ```
796
- ### GQA 典型结构
797
- | | Q头 | KV头 | 每组 |
798
- |------|-----|------|------|
799
- | Qwen2.5-7B | 28 | 4 | 7 |
800
- | LLaMA-3-8B | 32 | 8 | 4 |
801
- | Qwen2.5-14B | 40 | 8 | 5 |
802
- | Gemma-4-E2B | 8 | 4 | 2 |
803
-
804
- ### 🖼️ 多模态说明
805
- - 视觉层自动跳过
806
- - 仅分析文本 Transformer 层
807
- - 跳过关键词:`vision / visual / vit / patch_embed`
808
  """)
809
 
810
  log_output = gr.Textbox(
811
- label="分析日志(逐头详情)",
812
  lines=35, max_lines=100
813
  )
814
 
 
2
  import requests
3
  import struct
4
  import json
5
+ import re
6
  import numpy as np
7
  import torch
8
  from scipy.stats import pearsonr, spearmanr
 
32
  UNSUPPORTED_SVD_DTYPES = {"I8", "U8", "I32", "I64", "F8_E4M3", "F8_E5M2"}
33
  QUANTIZED_KEY_SIGNATURES = ["qweight", "qzeros", "scales", "g_idx", "packed_weight"]
34
 
35
+ # 视觉层关键词(扩充)
36
  VISION_KEY_PATTERNS = [
37
  "vision", "visual", "image_encoder",
38
  "img_encoder", "patch_embed", "vit",
39
+ "vision_tower", "vision_model", # ★ 补充 gemma 的命名
40
+ "mm_projector", "multi_modal",
41
  ]
42
 
43
 
 
61
  )
62
  r.raise_for_status()
63
  raw = json.loads(r.content)
 
64
  raw.pop("__metadata__", None)
65
  return raw, header_size
66
 
 
126
  return f"❌ HTTP {code}:{e}"
127
 
128
 
 
129
  def is_vision_key(key: str) -> bool:
130
  key_lower = key.lower()
131
  return any(pat in key_lower for pat in VISION_KEY_PATTERNS)
132
 
133
 
134
  # ─────────────────────────────────────────────
135
+ # ★ 修复1:发现层时记录 key 完整路径,并区分模态
136
+ # ─────────────────────────────────────────────
137
+
138
+ def discover_layer_qkv_keys(all_shard_headers: dict) -> dict:
139
+ """
140
+ 遍历所有 shard 的全部 keys,为每层归类 Q/K/V key。
141
+
142
+ 返回结构:
143
+ {
144
+ (modality, prefix, layer_idx): {
145
+ "q": (shard, key),
146
+ "k": (shard, key),
147
+ "v": (shard, key),
148
+ }
149
+ }
150
+ 其中 prefix 是 layers.{N} 之前的部分(如 "language_model.model."),
151
+ 用来区分同时存在多套 layer 编号的情况(如 vision tower + language model)。
152
+ """
153
+ layer_map: dict[tuple, dict] = {}
154
+
155
+ for shard_name, (header, _) in all_shard_headers.items():
156
+ for key in header.keys():
157
+ # 必须是 weight,不要 bias / norm
158
+ if not key.endswith(".weight"):
159
+ continue
160
+
161
+ # 提取 layers.{N} 的位置
162
+ m = re.search(r'(.*?)layers\.(\d+)\.(.*)', key)
163
+ if not m:
164
+ continue
165
+ prefix = m.group(1) # e.g. "language_model.model."
166
+ layer_idx = int(m.group(2))
167
+ suffix = m.group(3) # e.g. "self_attn.q_proj.weight"
168
+
169
+ # ★ 关键:模态判断基于 prefix(不是整个 key)
170
+ modality = "vision" if is_vision_key(prefix) else "text"
171
+
172
+ # 识别 Q/K/V
173
+ qkv = None
174
+ if any(p in suffix for p in [
175
+ "q_proj.weight", "wq.weight",
176
+ "attention.query.weight",
177
+ "self_attn.q.weight", "attn.q.weight",
178
+ ]):
179
+ qkv = "q"
180
+ elif any(p in suffix for p in [
181
+ "k_proj.weight", "wk.weight",
182
+ "attention.key.weight",
183
+ "self_attn.k.weight", "attn.k.weight",
184
+ ]):
185
+ qkv = "k"
186
+ elif any(p in suffix for p in [
187
+ "v_proj.weight", "wv.weight",
188
+ "attention.value.weight",
189
+ "self_attn.v.weight", "attn.v.weight",
190
+ ]):
191
+ qkv = "v"
192
+ else:
193
+ continue
194
+
195
+ # ★ 用 (modality, prefix, layer_idx) 作为唯一键
196
+ uid = (modality, prefix, layer_idx)
197
+ if uid not in layer_map:
198
+ layer_map[uid] = {"q": None, "k": None, "v": None}
199
+
200
+ if layer_map[uid][qkv] is None:
201
+ layer_map[uid][qkv] = (shard_name, key)
202
+
203
+ return layer_map
204
+
205
+
206
+ # ─────────────────────────────────────────────
207
+ # Gemma4 等 config 兼容
208
+ # ─────────────────────────────────────────────
209
+
210
+ def extract_config_params(config: dict) -> dict:
211
+ if config is None:
212
+ return {}
213
+
214
+ text_cfg = config.get("text_config", {}) or {}
215
+
216
+ def get_field(*keys):
217
+ for k in keys:
218
+ v = config.get(k)
219
+ if v is not None:
220
+ return v
221
+ v = text_cfg.get(k)
222
+ if v is not None:
223
+ return v
224
+ return None
225
+
226
+ return {
227
+ "hidden_size": get_field("hidden_size"),
228
+ "num_attention_heads": get_field("num_attention_heads"),
229
+ "num_key_value_heads": get_field("num_key_value_heads"),
230
+ "head_dim": get_field("head_dim"),
231
+ "model_type": get_field("model_type"),
232
+ }
233
+
234
+
235
+ # ─────────────────────────────────────────────
236
+ # 量化检测(不变)
237
  # ─────────────────────────────────────────────
238
 
239
  def check_quantization(model_id: str, token: str = None) -> tuple[bool, str]:
 
300
 
301
 
302
  # ─────────────────────────────────────────────
303
+ # GQA 推断
304
  # ─────────────────────────────────────────────
305
 
306
  def infer_gqa_params(
307
  W_q: torch.Tensor,
308
  W_k: torch.Tensor,
309
+ config_params: dict | None,
310
+ modality: str = "text",
311
  ) -> tuple[int,int,int]:
312
  q_rows = W_q.shape[0]
313
  k_rows = W_k.shape[0]
314
 
315
  d_head = None
316
+
317
+ # 视觉层不要用文本层的 head_dim
318
+ if config_params and modality == "text":
319
+ d_head = config_params.get("head_dim")
320
+ if not d_head:
321
+ nh = config_params.get("num_attention_heads") or 1
322
+ hs = config_params.get("hidden_size") or 0
323
+ if hs and nh:
324
+ d_head = hs // nh
325
  if d_head == 0:
326
  d_head = None
327
 
 
347
 
348
 
349
  # ─────────────────────────────────────────────
350
+ # 指标计算
351
  # ─────────────────────────────────────────────
352
 
353
  def compute_pearson_corr(s_a: torch.Tensor, s_b: torch.Tensor) -> float:
 
385
  def compute_left_vector_alignment(
386
  U_a: torch.Tensor, U_b: torch.Tensor
387
  ) -> float:
388
+ # ★ 安全:行数(输出维度 d_head)必须相同才有意义
389
+ if U_a.shape[0] != U_b.shape[0]:
390
+ return float('nan')
 
 
 
391
  min_c = min(U_a.shape[1], U_b.shape[1])
392
  Ua = U_a[:, :min_c]
393
  Ub = U_b[:, :min_c]
 
396
  return float(torch.diag(torch.abs(Ua_n.T @ Ub_n)).mean())
397
 
398
 
 
399
  def compute_right_vector_alignment(
400
  Vt_a: torch.Tensor, Vt_b: torch.Tensor
401
  ) -> float:
402
+ # ★ 安全:列数(输入维度 d_model)必须相同才有意义
403
+ if Vt_a.shape[1] != Vt_b.shape[1]:
404
+ return float('nan')
 
 
 
405
  min_r = min(Vt_a.shape[0], Vt_b.shape[0])
406
  Va_n = Vt_a[:min_r, :]
407
  Vb_n = Vt_b[:min_r, :]
 
411
 
412
 
413
  # ─────────────────────────────────────────────
414
+ # 逐头分析
415
  # ─────────────────────────────────────────────
416
 
417
  def analyze_layer_heads(
418
  W_q: torch.Tensor,
419
  W_k: torch.Tensor,
420
+ W_v: torch.Tensor,
421
  layer_idx: int,
422
  n_q_heads: int,
423
  n_kv_heads: int,
424
  d_head: int,
425
+ modality: str = "text",
426
  ) -> tuple[list[dict], str]:
427
+ # ★ 强一致性检查:Q/K/V 的输入维度必须一致
428
+ if W_q.shape[1] != W_k.shape[1] or W_k.shape[1] != W_v.shape[1]:
429
+ return [], (
430
+ f"\nLayer {layer_idx} [{modality}]: "
431
+ f"⚠️ Q/K/V 输入维度不一致 "
432
+ f"({W_q.shape}, {W_k.shape}, {W_v.shape}),跳过\n"
433
+ )
434
+
435
  group_size = n_q_heads // n_kv_heads
436
  records = []
437
  log_lines = []
438
 
439
  log_lines.append(
440
  f"\n{'─'*80}\n"
441
+ f"Layer {layer_idx:3d} [{modality}] "
442
  f"n_q={n_q_heads} n_kv={n_kv_heads} "
443
  f"group={group_size} d_head={d_head}\n"
444
  f"{'─'*80}\n"
445
  )
 
446
  log_lines.append(
447
  f" {'KV':>3} {'Q':>3} │"
448
  f" {'P_QK':>7} {'Sp_QK':>7} {'SSR_QK':>8} │"
 
453
  )
454
 
455
  for kv_h in range(n_kv_heads):
 
 
456
  k_tensor = W_k[kv_h * d_head : (kv_h + 1) * d_head, :]
457
+ v_tensor = W_v[kv_h * d_head : (kv_h + 1) * d_head, :]
458
 
459
  U_k, s_k, Vt_k = torch.linalg.svd(k_tensor, full_matrices=False)
460
+ U_v, s_v, Vt_v = torch.linalg.svd(v_tensor, full_matrices=False)
461
 
462
+ alpha_kv, alpha_res_kv = compute_singular_value_ratio(s_k, s_v)
463
+ cosU_KV = compute_left_vector_alignment(U_k, U_v)
464
+ cosV_KV = compute_right_vector_alignment(Vt_k, Vt_v)
465
+ ssr_kv = compute_ssr(s_k, s_v)
 
466
  pearson_kv = compute_pearson_corr(
467
  s_k[:min(s_k.shape[0], s_v.shape[0])],
468
  s_v[:min(s_k.shape[0], s_v.shape[0])]
 
476
  min_qk = min(s_q.shape[0], s_k.shape[0])
477
  min_qv = min(s_q.shape[0], s_v.shape[0])
478
 
 
479
  pearson_qk = compute_pearson_corr(s_q[:min_qk], s_k[:min_qk])
480
  spearman_qk = float(spearmanr(
481
  s_q[:min_qk].cpu().numpy(),
482
  s_k[:min_qk].cpu().numpy()
483
  )[0])
484
+ ssr_qk = compute_ssr(s_q, s_k)
485
+ alpha_qk, alpha_res_qk = compute_singular_value_ratio(s_q, s_k)
486
+ cosU_QK = compute_left_vector_alignment(U_q, U_k)
487
+ cosV_QK = compute_right_vector_alignment(Vt_q, Vt_k)
488
 
 
489
  pearson_qv = compute_pearson_corr(s_q[:min_qv], s_v[:min_qv])
490
  ssr_qv = compute_ssr(s_q, s_v)
491
+ alpha_qv, alpha_res_qv = compute_singular_value_ratio(s_q, s_v)
492
+ cosU_QV = compute_left_vector_alignment(U_q, U_v)
493
+ cosV_QV = compute_right_vector_alignment(Vt_q, Vt_v)
494
 
 
495
  sig_max_q = float(s_q.max())
496
  sig_min_q = float(s_q[s_q > 1e-10].min()) if (s_q > 1e-10).any() else 0.0
497
  sig_max_k = float(s_k.max())
 
499
  sig_max_v = float(s_v.max())
500
  sig_min_v = float(s_v[s_v > 1e-10].min()) if (s_v > 1e-10).any() else 0.0
501
 
 
502
  cond_q = sig_max_q / (sig_min_q + 1e-10)
503
  cond_k = sig_max_k / (sig_min_k + 1e-10)
504
  cond_v = sig_max_v / (sig_min_v + 1e-10)
505
 
506
  records.append({
507
+ "layer": layer_idx,
508
+ "modality": modality,
509
+ "kv_head": kv_h,
510
+ "q_head": h_idx,
511
+ "pearson_QK": round(pearson_qk, 6),
512
+ "spearman_QK": round(spearman_qk, 6),
513
+ "pearson_QV": round(pearson_qv, 6),
514
+ "pearson_KV": round(pearson_kv, 6),
515
+ "ssr_QK": round(ssr_qk, 8),
516
+ "ssr_QV": round(ssr_qv, 8),
517
+ "ssr_KV": round(ssr_kv, 8),
518
+ "cosU_QK": round(cosU_QK, 6),
519
+ "cosU_QV": round(cosU_QV, 6),
520
+ "cosU_KV": round(cosU_KV, 6),
521
+ "cosV_QK": round(cosV_QK, 6),
522
+ "cosV_QV": round(cosV_QV, 6),
523
+ "cosV_KV": round(cosV_KV, 6),
524
+ "alpha_QK": round(alpha_qk, 4),
525
+ "alpha_QV": round(alpha_qv, 4),
526
+ "alpha_KV": round(alpha_kv, 4),
527
+ "alpha_res_QK": round(alpha_res_qk, 6),
528
+ "alpha_res_QV": round(alpha_res_qv, 6),
529
+ "alpha_res_KV": round(alpha_res_kv, 6),
530
+ "sigma_max_Q": round(sig_max_q, 4),
531
+ "sigma_min_Q": round(sig_min_q, 4),
532
+ "sigma_max_K": round(sig_max_k, 4),
533
+ "sigma_min_K": round(sig_min_k, 4),
534
+ "sigma_max_V": round(sig_max_v, 4),
535
+ "sigma_min_V": round(sig_min_v, 4),
536
+ "cond_Q": round(cond_q, 2),
537
+ "cond_K": round(cond_k, 2),
538
+ "cond_V": round(cond_v, 2),
 
 
 
 
 
 
 
 
539
  })
540
 
541
  log_lines.append(
 
575
  return "".join(log_lines), None
576
 
577
  # ── config.json ───────────────────────────────
578
+ config_params = {}
579
  try:
580
  r = requests.get(
581
  f"https://huggingface.co/{model_id}/resolve/main/config.json",
 
583
  timeout=15
584
  )
585
  if r.status_code == 200:
586
+ raw_config = r.json()
587
+ config_params = extract_config_params(raw_config)
588
  log_lines.append(
589
  f"📋 config.json:\n"
590
+ f" model_type = {config_params.get('model_type')}\n"
591
+ f" hidden_size (text) = {config_params.get('hidden_size')}\n"
592
+ f" num_attention_heads = {config_params.get('num_attention_heads')}\n"
593
+ f" num_key_value_heads = {config_params.get('num_key_value_heads')}\n"
594
+ f" head_dim = {config_params.get('head_dim')}\n"
595
  f"{'─'*80}\n"
596
  )
597
  except Exception:
598
  log_lines.append("⚠️ 无法读取 config.json,将从 weight shape 自动推断\n")
599
 
600
+ # ── shard 列表 ────────────────────────────────
601
  progress(0.05, desc="读取模型索引...")
602
  try:
603
+ index_data = find_index_file(model_id, token)
 
 
604
  if index_data:
605
+ shard_files = sorted(set(index_data["weight_map"].values()))
606
  log_lines.append(
607
+ f"📦 分片模型,共 {len(shard_files)} 个 shard\n"
608
  )
609
  else:
610
+ shard_files = get_safetensor_files(model_id, token)
611
+ log_lines.append(f"📦 单/多文件:{shard_files}\n")
 
612
  except requests.exceptions.HTTPError as e:
613
  return _http_error_msg(e, model_id), None
614
 
615
+ # ── 读取所有 shard headers ────────────────────
616
+ progress(0.08, desc="读取所有 shard headers...")
617
+ all_shard_headers: dict[str, tuple[dict, int]] = {}
618
+ total_keys = 0
619
+ for shard in shard_files:
620
+ try:
621
+ url = get_file_url(model_id, shard)
622
+ h, hs = read_safetensors_header(url, token)
623
+ all_shard_headers[shard] = (h, hs)
624
+ total_keys += len(h)
625
+ except Exception as e:
626
+ log_lines.append(f"⚠️ 读取 {shard} header 失败:{e}\n")
627
+
628
+ # ── 发现层(区分模态)─────────────────────────
629
+ progress(0.12, desc="识别结构...")
630
+ layer_map = discover_layer_qkv_keys(all_shard_headers)
631
+
632
+ # ★ 统计每个 (modality, prefix) 的层数
633
+ groups: dict[tuple, list[int]] = {}
634
+ for (modality, prefix, layer_idx), _ in layer_map.items():
635
+ groups.setdefault((modality, prefix), []).append(layer_idx)
636
+
637
+ log_lines.append(f"🔑 key 数:{total_keys}\n")
638
+ log_lines.append(f"📐 发现层组:\n")
639
+ for (modality, prefix), layers in sorted(groups.items()):
640
+ log_lines.append(
641
+ f" [{modality:6s}] prefix='{prefix}' "
642
+ f"层数={len(layers)} 范围={min(layers)}~{max(layers)}\n"
643
+ )
644
+ log_lines.append(f"{'─'*80}\n")
645
+
646
+ # 只分析 text 模态(视觉层暂不分析)
647
+ text_layers = sorted([
648
+ (uid, info) for uid, info in layer_map.items()
649
+ if uid[0] == "text"
650
+ ], key=lambda x: x[0][2]) # 按 layer_idx 排序
651
+
652
+ if not text_layers:
653
+ return (
654
+ "".join(log_lines) +
655
+ "❌ 未发现任何文本层\n", None
656
+ )
657
 
658
+ log_lines.append(f"🔵 将分析 {len(text_layers)} 个文本层(前 {max_layers} 层)\n")
 
 
659
  log_lines.append(f"{'═'*80}\n")
660
 
 
 
 
 
 
 
 
 
 
 
 
 
661
  # ── 逐层分析 ─────────────────────────────────
662
+ gqa_logged = False
663
+ layers_done = 0
664
+ max_layers_i = int(max_layers)
665
+
666
+ for (modality, prefix, layer_idx), qkv in text_layers:
667
+ if layers_done >= max_layers_i:
668
+ break
669
 
 
670
  progress(
671
+ 0.15 + 0.80 * layers_done / max(max_layers_i, 1),
672
  desc=f"第 {layer_idx} 层..."
673
  )
674
 
675
+ if qkv["q"] is None or qkv["k"] is None or qkv["v"] is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
  log_lines.append(
677
+ f"Layer {layer_idx} [{modality}]: ⚠️ Q/K/V 不完整,跳过\n"
678
  )
679
  continue
680
 
681
+ q_shard, q_key = qkv["q"]
682
+ k_shard, k_key = qkv["k"]
683
+ v_shard, v_key = qkv["v"]
 
684
 
685
  try:
686
  W_q = load_tensor_remote(
687
  get_file_url(model_id, q_shard), q_key,
688
+ *all_shard_headers[q_shard], token
689
  )
690
  W_k = load_tensor_remote(
691
  get_file_url(model_id, k_shard), k_key,
692
+ *all_shard_headers[k_shard], token
693
  )
694
+ W_v = load_tensor_remote(
695
  get_file_url(model_id, v_shard), v_key,
696
+ *all_shard_headers[v_shard], token
697
  )
698
  except ValueError as e:
699
  log_lines.append(f"Layer {layer_idx}: ⚠️ 跳过({e})\n")
700
+ layers_done += 1
701
+ continue
702
+ except Exception as e:
703
+ log_lines.append(f"Layer {layer_idx}: ❌ 加载失败({e})\n")
704
+ layers_done += 1
705
  continue
706
 
707
  if W_q is None or W_k is None or W_v is None:
708
  log_lines.append(f"Layer {layer_idx}: ⚠️ tensor 为 None,跳过\n")
709
+ layers_done += 1
710
  continue
711
 
712
+ # 一致性校验
713
+ if W_q.shape[1] != W_k.shape[1] or W_k.shape[1] != W_v.shape[1]:
714
+ log_lines.append(
715
+ f"Layer {layer_idx}: Q/K/V 输入维度不一致 "
716
+ f"Wq={list(W_q.shape)} Wk={list(W_k.shape)} "
717
+ f"Wv={list(W_v.shape)},跳过\n"
718
+ )
719
  del W_q, W_k, W_v
720
+ layers_done += 1
721
  continue
722
 
 
723
  try:
724
+ n_q_heads, n_kv_heads, d_head = infer_gqa_params(
725
+ W_q, W_k, config_params, modality=modality
726
+ )
727
  except ValueError as e:
728
  log_lines.append(f"Layer {layer_idx}: ❌ GQA 推断失败:{e}\n")
729
  del W_q, W_k, W_v
730
+ layers_done += 1
731
  continue
732
 
733
  if not gqa_logged:
 
735
  f"🧠 GQA 结构:n_q={n_q_heads} n_kv={n_kv_heads} "
736
  f"group={n_q_heads//n_kv_heads} d_head={d_head}\n"
737
  f" W_q={list(W_q.shape)} W_k={list(W_k.shape)} "
738
+ f"W_v={list(W_v.shape)}\n"
739
  f"{'═'*80}\n"
740
  )
741
  gqa_logged = True
742
 
 
743
  records, layer_log = analyze_layer_heads(
744
+ W_q, W_k, W_v,
745
  layer_idx,
746
  n_q_heads, n_kv_heads, d_head,
747
+ modality=modality
748
  )
749
  all_records.extend(records)
750
  log_lines.append(layer_log)
751
 
752
  del W_q, W_k, W_v
753
+ layers_done += 1
754
 
755
+ # ── 汇总 ─────────────────────────────────────
756
  if all_records:
757
  df = pd.DataFrame(all_records)
758
 
 
759
  def stat_block(arr: np.ndarray, name: str) -> str:
760
+ arr = arr[~np.isnan(arr)]
761
+ if len(arr) == 0:
762
+ return f" {name:<14} (无数据)\n"
763
  return (
764
  f" {name:<14}"
765
  f" Median={np.median(arr):.6f}"
 
780
 
781
  f"【第一定律 — Pearson r(→ 1)】\n",
782
  stat_block(text_df["pearson_QK"].values, "Q-K:"),
783
+ stat_block(text_df["pearson_QV"].values, "Q-V:"),
784
+ stat_block(text_df["pearson_KV"].values, "K-V:"),
785
 
786
  f"\n【第二定律 — SSR(→ 0)】\n",
787
  stat_block(text_df["ssr_QK"].values, "Q-K:"),
788
+ stat_block(text_df["ssr_QV"].values, "Q-V:"),
789
+ stat_block(text_df["ssr_KV"].values, "K-V:"),
790
 
791
+ f"\n【第四定律 — cosU 输出子空间】\n",
792
  stat_block(text_df["cosU_QK"].values, "cosU Q-K:"),
793
+ stat_block(text_df["cosU_QV"].values, "cosU Q-V:"),
794
+ stat_block(text_df["cosU_KV"].values, "cosU K-V:"),
795
 
796
+ f"\n【第五定律 — cosV 输入子空间】\n",
797
  stat_block(text_df["cosV_QK"].values, "cosV Q-K:"),
798
  stat_block(text_df["cosV_QV"].values, "cosV Q-V:"),
799
  stat_block(text_df["cosV_KV"].values, "cosV K-V:"),
800
 
801
+ f"\n【第三定律 — 条件数】\n",
802
  stat_block(text_df["cond_Q"].values, "cond Q:"),
803
  stat_block(text_df["cond_K"].values, "cond K:"),
804
  stat_block(text_df["cond_V"].values, "cond V:"),
 
814
 
815
 
816
  # ─────────────────────────────────────────────
817
+ # Gradio UI(不变)
818
  # ─────────────────────────────────────────────
819
 
820
  with gr.Blocks(title="Wang's Five Laws — LLM Spectral Analyzer") as demo:
 
824
  **Mathematical Foundations of Large Language Models (MF-LLM)**
825
 
826
  通过 **HTTP Range Request** 直接读取 HF 权重,**无需下载整个模型**。
 
 
 
 
 
 
 
 
 
 
 
 
827
  """)
828
 
829
  with gr.Row():
 
844
  )
845
  analyze_btn = gr.Button("🚀 开始分析", variant="primary")
846
 
 
847
  with gr.Column(scale=1):
848
  gr.Markdown("""
849
  ### ✅ 推荐模型
850
  ```
851
+ Qwen/Qwen2.5-14B-Instruct
852
+ meta-llama/Llama-3-8B
853
+ google/gemma-4-e2b
854
+ google/gemma-4-31b-it
855
  deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
856
  ```
857
+ ### 🔑 关键修复
858
+ - 模态判断基于 prefix 路径
859
+ - ✅ 视觉/文本层分组独立编号
860
+ - Q/K/V 输入维度一致性校验
861
+ - 视觉层不复用文本 head_dim
 
 
 
 
 
 
 
862
  """)
863
 
864
  log_output = gr.Textbox(
865
+ label="分析日志",
866
  lines=35, max_lines=100
867
  )
868