Spaces:
Running
关键事实:
Browse files
Gemma-3/4 视觉层的 key 命名是 model.vision_tower.vision_model.encoder.layers.{N}.self_attn.q_proj.weight —— 我的视觉关键词 vision/visual/vit 能识别到。
但是! Gemma-3 还有一种特殊情况:multi-modal projector 或某些层用 layers.{N} 但属于视觉部分,且有时视觉层 key 不含明显视觉关键词。
更关键:当前你的日志显示 Layer 11 是 1024×1024(视觉),Layer 12 是 1536(文本)—— 说明 视觉层和文本层的层号是独立编号的,但合在一起后被当成了"同一个模型的连续层"。
真正的 bug:
Python
m = re.search(r'layers\.(\d+)\.', key)
这个正则会把视觉层和文本层的 layers.N 都抓出来;如果某个视觉层 key 没有被 is_vision_key 识别(比如 model.vision_tower.encoder.layers.0.self_attn.q_proj.weight 这类路径稍有变化、不再包含任何视觉关键词时),它就会被当作文本层混进来。
但从你的日志看,视觉层 356 个 key 已经被正确跳过了。所以真正的问题是:
Layer 11 的形状 (1024, 1024) 不对劲 —— 这不应该出现在文本层中。说明 is_vision_key 漏判了某些视觉层。
真正的修复方案
核心思路:不能只靠 key 名关键词判断模态,应该用 hidden_size 维度(输入维度) 作为模态指纹:
文本层:W_q.shape[1] == text_hidden_size(比如 1536, 2048, 5120 ...)
视觉层:W_q.shape[1] == vision_hidden_size(比如 1024, 1152, 768 ...)
当 K 和 V 来自不同模态(一个文本一个视觉),它们的输入维度不同,SVD 的 Vt 形状就不同 → 就出现了你的报错。
|
@@ -2,7 +2,7 @@ import gradio as gr
|
|
| 2 |
import requests
|
| 3 |
import struct
|
| 4 |
import json
|
| 5 |
-
import re
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
| 8 |
from scipy.stats import pearsonr, spearmanr
|
|
@@ -32,11 +32,12 @@ except AttributeError:
|
|
| 32 |
UNSUPPORTED_SVD_DTYPES = {"I8", "U8", "I32", "I64", "F8_E4M3", "F8_E5M2"}
|
| 33 |
QUANTIZED_KEY_SIGNATURES = ["qweight", "qzeros", "scales", "g_idx", "packed_weight"]
|
| 34 |
|
| 35 |
-
#
|
| 36 |
VISION_KEY_PATTERNS = [
|
| 37 |
"vision", "visual", "image_encoder",
|
| 38 |
"img_encoder", "patch_embed", "vit",
|
| 39 |
-
"vision_tower", "
|
|
|
|
| 40 |
]
|
| 41 |
|
| 42 |
|
|
@@ -60,7 +61,6 @@ def read_safetensors_header(url: str, token: str = None) -> tuple[dict, int]:
|
|
| 60 |
)
|
| 61 |
r.raise_for_status()
|
| 62 |
raw = json.loads(r.content)
|
| 63 |
-
# 过滤 __metadata__
|
| 64 |
raw.pop("__metadata__", None)
|
| 65 |
return raw, header_size
|
| 66 |
|
|
@@ -126,14 +126,114 @@ def _http_error_msg(e: requests.exceptions.HTTPError, model_id: str) -> str:
|
|
| 126 |
return f"❌ HTTP {code}:{e}"
|
| 127 |
|
| 128 |
|
| 129 |
-
# [改动1] 判断一个 key 是否属于视觉模态层
|
| 130 |
def is_vision_key(key: str) -> bool:
|
| 131 |
key_lower = key.lower()
|
| 132 |
return any(pat in key_lower for pat in VISION_KEY_PATTERNS)
|
| 133 |
|
| 134 |
|
| 135 |
# ─────────────────────────────────────────────
|
| 136 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
# ─────────────────────────────────────────────
|
| 138 |
|
| 139 |
def check_quantization(model_id: str, token: str = None) -> tuple[bool, str]:
|
|
@@ -200,24 +300,28 @@ def check_quantization(model_id: str, token: str = None) -> tuple[bool, str]:
|
|
| 200 |
|
| 201 |
|
| 202 |
# ─────────────────────────────────────────────
|
| 203 |
-
# GQA
|
| 204 |
# ─────────────────────────────────────────────
|
| 205 |
|
| 206 |
def infer_gqa_params(
|
| 207 |
W_q: torch.Tensor,
|
| 208 |
W_k: torch.Tensor,
|
| 209 |
-
|
|
|
|
| 210 |
) -> tuple[int,int,int]:
|
| 211 |
q_rows = W_q.shape[0]
|
| 212 |
k_rows = W_k.shape[0]
|
| 213 |
|
| 214 |
d_head = None
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
| 221 |
if d_head == 0:
|
| 222 |
d_head = None
|
| 223 |
|
|
@@ -243,7 +347,7 @@ def infer_gqa_params(
|
|
| 243 |
|
| 244 |
|
| 245 |
# ─────────────────────────────────────────────
|
| 246 |
-
#
|
| 247 |
# ─────────────────────────────────────────────
|
| 248 |
|
| 249 |
def compute_pearson_corr(s_a: torch.Tensor, s_b: torch.Tensor) -> float:
|
|
@@ -281,12 +385,9 @@ def compute_ssr(s_a: torch.Tensor, s_b: torch.Tensor) -> float:
|
|
| 281 |
def compute_left_vector_alignment(
|
| 282 |
U_a: torch.Tensor, U_b: torch.Tensor
|
| 283 |
) -> float:
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
对应第四定律:cos(Uq,Uk) ≈ 1/√d_head(随机正交)
|
| 288 |
-
cos(Uq,Uv) < 1/√d_head(超正交)
|
| 289 |
-
"""
|
| 290 |
min_c = min(U_a.shape[1], U_b.shape[1])
|
| 291 |
Ua = U_a[:, :min_c]
|
| 292 |
Ub = U_b[:, :min_c]
|
|
@@ -295,16 +396,12 @@ def compute_left_vector_alignment(
|
|
| 295 |
return float(torch.diag(torch.abs(Ua_n.T @ Ub_n)).mean())
|
| 296 |
|
| 297 |
|
| 298 |
-
# [改动2] 新增:右奇异向量(输入子空间)对齐度
|
| 299 |
def compute_right_vector_alignment(
|
| 300 |
Vt_a: torch.Tensor, Vt_b: torch.Tensor
|
| 301 |
) -> float:
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
对应第五定律:所有对之间 ≈ 1/√d_model(全局随机正交)
|
| 306 |
-
注意:SVD 返回 Vt(转置),每行是一个右奇异向量
|
| 307 |
-
"""
|
| 308 |
min_r = min(Vt_a.shape[0], Vt_b.shape[0])
|
| 309 |
Va_n = Vt_a[:min_r, :]
|
| 310 |
Vb_n = Vt_b[:min_r, :]
|
|
@@ -314,37 +411,38 @@ def compute_right_vector_alignment(
|
|
| 314 |
|
| 315 |
|
| 316 |
# ─────────────────────────────────────────────
|
| 317 |
-
#
|
| 318 |
# ─────────────────────────────────────────────
|
| 319 |
|
| 320 |
def analyze_layer_heads(
|
| 321 |
W_q: torch.Tensor,
|
| 322 |
W_k: torch.Tensor,
|
| 323 |
-
W_v: torch.Tensor,
|
| 324 |
layer_idx: int,
|
| 325 |
n_q_heads: int,
|
| 326 |
n_kv_heads: int,
|
| 327 |
d_head: int,
|
| 328 |
-
modality: str = "text",
|
| 329 |
) -> tuple[list[dict], str]:
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
|
|
|
|
|
| 336 |
group_size = n_q_heads // n_kv_heads
|
| 337 |
records = []
|
| 338 |
log_lines = []
|
| 339 |
|
| 340 |
log_lines.append(
|
| 341 |
f"\n{'─'*80}\n"
|
| 342 |
-
f"Layer {layer_idx:3d} [{modality}] "
|
| 343 |
f"n_q={n_q_heads} n_kv={n_kv_heads} "
|
| 344 |
f"group={group_size} d_head={d_head}\n"
|
| 345 |
f"{'─'*80}\n"
|
| 346 |
)
|
| 347 |
-
# 表头
|
| 348 |
log_lines.append(
|
| 349 |
f" {'KV':>3} {'Q':>3} │"
|
| 350 |
f" {'P_QK':>7} {'Sp_QK':>7} {'SSR_QK':>8} │"
|
|
@@ -355,19 +453,16 @@ def analyze_layer_heads(
|
|
| 355 |
)
|
| 356 |
|
| 357 |
for kv_h in range(n_kv_heads):
|
| 358 |
-
|
| 359 |
-
# ── 提取 K / V 头矩阵 ─────────────────────────
|
| 360 |
k_tensor = W_k[kv_h * d_head : (kv_h + 1) * d_head, :]
|
| 361 |
-
v_tensor = W_v[kv_h * d_head : (kv_h + 1) * d_head, :]
|
| 362 |
|
| 363 |
U_k, s_k, Vt_k = torch.linalg.svd(k_tensor, full_matrices=False)
|
| 364 |
-
U_v, s_v, Vt_v = torch.linalg.svd(v_tensor, full_matrices=False)
|
| 365 |
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
ssr_kv = compute_ssr(s_k, s_v)
|
| 371 |
pearson_kv = compute_pearson_corr(
|
| 372 |
s_k[:min(s_k.shape[0], s_v.shape[0])],
|
| 373 |
s_v[:min(s_k.shape[0], s_v.shape[0])]
|
|
@@ -381,25 +476,22 @@ def analyze_layer_heads(
|
|
| 381 |
min_qk = min(s_q.shape[0], s_k.shape[0])
|
| 382 |
min_qv = min(s_q.shape[0], s_v.shape[0])
|
| 383 |
|
| 384 |
-
# ── Q-K 指标 ──────────────────────────────
|
| 385 |
pearson_qk = compute_pearson_corr(s_q[:min_qk], s_k[:min_qk])
|
| 386 |
spearman_qk = float(spearmanr(
|
| 387 |
s_q[:min_qk].cpu().numpy(),
|
| 388 |
s_k[:min_qk].cpu().numpy()
|
| 389 |
)[0])
|
| 390 |
-
ssr_qk
|
| 391 |
-
alpha_qk,
|
| 392 |
-
cosU_QK
|
| 393 |
-
cosV_QK
|
| 394 |
|
| 395 |
-
# ── Q-V 指标 ────────────────────────────── [改动3]
|
| 396 |
pearson_qv = compute_pearson_corr(s_q[:min_qv], s_v[:min_qv])
|
| 397 |
ssr_qv = compute_ssr(s_q, s_v)
|
| 398 |
-
alpha_qv,
|
| 399 |
-
cosU_QV
|
| 400 |
-
cosV_QV
|
| 401 |
|
| 402 |
-
# ── 奇异值范围 ───────────────────────────── [改动3]
|
| 403 |
sig_max_q = float(s_q.max())
|
| 404 |
sig_min_q = float(s_q[s_q > 1e-10].min()) if (s_q > 1e-10).any() else 0.0
|
| 405 |
sig_max_k = float(s_k.max())
|
|
@@ -407,52 +499,43 @@ def analyze_layer_heads(
|
|
| 407 |
sig_max_v = float(s_v.max())
|
| 408 |
sig_min_v = float(s_v[s_v > 1e-10].min()) if (s_v > 1e-10).any() else 0.0
|
| 409 |
|
| 410 |
-
# 条件数(第三定律)
|
| 411 |
cond_q = sig_max_q / (sig_min_q + 1e-10)
|
| 412 |
cond_k = sig_max_k / (sig_min_k + 1e-10)
|
| 413 |
cond_v = sig_max_v / (sig_min_v + 1e-10)
|
| 414 |
|
| 415 |
records.append({
|
| 416 |
-
|
| 417 |
-
"
|
| 418 |
-
"
|
| 419 |
-
"
|
| 420 |
-
"
|
| 421 |
-
|
| 422 |
-
"
|
| 423 |
-
"
|
| 424 |
-
"
|
| 425 |
-
"
|
| 426 |
-
|
| 427 |
-
"
|
| 428 |
-
"
|
| 429 |
-
"
|
| 430 |
-
|
| 431 |
-
"
|
| 432 |
-
"
|
| 433 |
-
"
|
| 434 |
-
|
| 435 |
-
"
|
| 436 |
-
"
|
| 437 |
-
"
|
| 438 |
-
|
| 439 |
-
"
|
| 440 |
-
"
|
| 441 |
-
"
|
| 442 |
-
"
|
| 443 |
-
"
|
| 444 |
-
"
|
| 445 |
-
|
| 446 |
-
"
|
| 447 |
-
"
|
| 448 |
-
"sigma_max_K": round(sig_max_k, 4),
|
| 449 |
-
"sigma_min_K": round(sig_min_k, 4),
|
| 450 |
-
"sigma_max_V": round(sig_max_v, 4),
|
| 451 |
-
"sigma_min_V": round(sig_min_v, 4),
|
| 452 |
-
# 条件数(第三定律)[改动3]
|
| 453 |
-
"cond_Q": round(cond_q, 2),
|
| 454 |
-
"cond_K": round(cond_k, 2),
|
| 455 |
-
"cond_V": round(cond_v, 2),
|
| 456 |
})
|
| 457 |
|
| 458 |
log_lines.append(
|
|
@@ -492,7 +575,7 @@ def analyze_model(
|
|
| 492 |
return "".join(log_lines), None
|
| 493 |
|
| 494 |
# ── config.json ───────────────────────────────
|
| 495 |
-
|
| 496 |
try:
|
| 497 |
r = requests.get(
|
| 498 |
f"https://huggingface.co/{model_id}/resolve/main/config.json",
|
|
@@ -500,162 +583,151 @@ def analyze_model(
|
|
| 500 |
timeout=15
|
| 501 |
)
|
| 502 |
if r.status_code == 200:
|
| 503 |
-
|
|
|
|
| 504 |
log_lines.append(
|
| 505 |
f"📋 config.json:\n"
|
| 506 |
-
f" model_type = {
|
| 507 |
-
f" hidden_size
|
| 508 |
-
f" num_attention_heads = {
|
| 509 |
-
f" num_key_value_heads = {
|
| 510 |
-
f" head_dim = {
|
| 511 |
f"{'─'*80}\n"
|
| 512 |
)
|
| 513 |
except Exception:
|
| 514 |
log_lines.append("⚠️ 无法读取 config.json,将从 weight shape 自动推断\n")
|
| 515 |
|
| 516 |
-
# ──
|
| 517 |
progress(0.05, desc="读取模型索引...")
|
| 518 |
try:
|
| 519 |
-
index_data
|
| 520 |
-
shard_headers: dict[str, tuple[dict, int]] = {}
|
| 521 |
-
|
| 522 |
if index_data:
|
| 523 |
-
|
| 524 |
log_lines.append(
|
| 525 |
-
f"📦 分片模型,共 {len(
|
| 526 |
)
|
| 527 |
else:
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
log_lines.append(f"📦 单文件:{sf_files}\n")
|
| 531 |
except requests.exceptions.HTTPError as e:
|
| 532 |
return _http_error_msg(e, model_id), None
|
| 533 |
|
| 534 |
-
# ──
|
| 535 |
-
progress(0.08, desc="
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
|
| 573 |
-
log_lines.append(f"
|
| 574 |
-
log_lines.append(f"🔑 K suffix:{k_sfx}\n")
|
| 575 |
-
log_lines.append(f"🔑 V suffix:{v_sfx}\n") # [改动3]
|
| 576 |
log_lines.append(f"{'═'*80}\n")
|
| 577 |
|
| 578 |
-
# ── 辅助:查找 key 所在 shard ─────────────────
|
| 579 |
-
def get_shard_for_key(key: str) -> str | None:
|
| 580 |
-
if index_data:
|
| 581 |
-
return index_data["weight_map"].get(key)
|
| 582 |
-
for sf in sf_files:
|
| 583 |
-
if sf not in shard_headers:
|
| 584 |
-
h, hs = read_safetensors_header(get_file_url(model_id, sf), token)
|
| 585 |
-
shard_headers[sf] = (h, hs)
|
| 586 |
-
if key in shard_headers[sf][0]:
|
| 587 |
-
return sf
|
| 588 |
-
return None
|
| 589 |
-
|
| 590 |
# ── 逐层分析 ─────────────────────────────────
|
| 591 |
-
gqa_logged
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
|
| 593 |
-
for layer_idx in range(int(max_layers)):
|
| 594 |
progress(
|
| 595 |
-
0.
|
| 596 |
desc=f"第 {layer_idx} 层..."
|
| 597 |
)
|
| 598 |
|
| 599 |
-
|
| 600 |
-
k_key = f"model.layers.{layer_idx}.{k_sfx}"
|
| 601 |
-
v_key = f"model.layers.{layer_idx}.{v_sfx}" # [改动3]
|
| 602 |
-
|
| 603 |
-
q_shard = get_shard_for_key(q_key)
|
| 604 |
-
k_shard = get_shard_for_key(k_key)
|
| 605 |
-
v_shard = get_shard_for_key(v_key) # [改动3]
|
| 606 |
-
|
| 607 |
-
if q_shard is None or k_shard is None:
|
| 608 |
-
log_lines.append(
|
| 609 |
-
f"\nLayer {layer_idx}: Q/K 未找到,分析结束(共 {layer_idx} 层)\n"
|
| 610 |
-
)
|
| 611 |
-
break
|
| 612 |
-
|
| 613 |
-
# [改动3] V 找不到时降级处理(不阻断整体分析)
|
| 614 |
-
if v_shard is None:
|
| 615 |
log_lines.append(
|
| 616 |
-
f"Layer {layer_idx}: ⚠️ V
|
| 617 |
)
|
| 618 |
continue
|
| 619 |
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
shard_headers[shard] = (h, hs)
|
| 624 |
|
| 625 |
try:
|
| 626 |
W_q = load_tensor_remote(
|
| 627 |
get_file_url(model_id, q_shard), q_key,
|
| 628 |
-
*
|
| 629 |
)
|
| 630 |
W_k = load_tensor_remote(
|
| 631 |
get_file_url(model_id, k_shard), k_key,
|
| 632 |
-
*
|
| 633 |
)
|
| 634 |
-
W_v = load_tensor_remote(
|
| 635 |
get_file_url(model_id, v_shard), v_key,
|
| 636 |
-
*
|
| 637 |
)
|
| 638 |
except ValueError as e:
|
| 639 |
log_lines.append(f"Layer {layer_idx}: ⚠️ 跳过({e})\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
continue
|
| 641 |
|
| 642 |
if W_q is None or W_k is None or W_v is None:
|
| 643 |
log_lines.append(f"Layer {layer_idx}: ⚠️ tensor 为 None,跳过\n")
|
|
|
|
| 644 |
continue
|
| 645 |
|
| 646 |
-
#
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
|
|
|
|
|
|
|
|
|
| 650 |
del W_q, W_k, W_v
|
|
|
|
| 651 |
continue
|
| 652 |
|
| 653 |
-
# GQA 推断
|
| 654 |
try:
|
| 655 |
-
n_q_heads, n_kv_heads, d_head = infer_gqa_params(
|
|
|
|
|
|
|
| 656 |
except ValueError as e:
|
| 657 |
log_lines.append(f"Layer {layer_idx}: ❌ GQA 推断失败:{e}\n")
|
| 658 |
del W_q, W_k, W_v
|
|
|
|
| 659 |
continue
|
| 660 |
|
| 661 |
if not gqa_logged:
|
|
@@ -663,29 +735,31 @@ def analyze_model(
|
|
| 663 |
f"🧠 GQA 结构:n_q={n_q_heads} n_kv={n_kv_heads} "
|
| 664 |
f"group={n_q_heads//n_kv_heads} d_head={d_head}\n"
|
| 665 |
f" W_q={list(W_q.shape)} W_k={list(W_k.shape)} "
|
| 666 |
-
f"W_v={list(W_v.shape)}\n"
|
| 667 |
f"{'═'*80}\n"
|
| 668 |
)
|
| 669 |
gqa_logged = True
|
| 670 |
|
| 671 |
-
# 逐头全指标计算
|
| 672 |
records, layer_log = analyze_layer_heads(
|
| 673 |
-
W_q, W_k, W_v,
|
| 674 |
layer_idx,
|
| 675 |
n_q_heads, n_kv_heads, d_head,
|
| 676 |
-
modality=modality
|
| 677 |
)
|
| 678 |
all_records.extend(records)
|
| 679 |
log_lines.append(layer_log)
|
| 680 |
|
| 681 |
del W_q, W_k, W_v
|
|
|
|
| 682 |
|
| 683 |
-
# ──
|
| 684 |
if all_records:
|
| 685 |
df = pd.DataFrame(all_records)
|
| 686 |
|
| 687 |
-
# [改动5] 分模态统计
|
| 688 |
def stat_block(arr: np.ndarray, name: str) -> str:
|
|
|
|
|
|
|
|
|
|
| 689 |
return (
|
| 690 |
f" {name:<14}"
|
| 691 |
f" Median={np.median(arr):.6f}"
|
|
@@ -706,25 +780,25 @@ def analyze_model(
|
|
| 706 |
|
| 707 |
f"【第一定律 — Pearson r(→ 1)】\n",
|
| 708 |
stat_block(text_df["pearson_QK"].values, "Q-K:"),
|
| 709 |
-
stat_block(text_df["pearson_QV"].values, "Q-V:"),
|
| 710 |
-
stat_block(text_df["pearson_KV"].values, "K-V:"),
|
| 711 |
|
| 712 |
f"\n【第二定律 — SSR(→ 0)】\n",
|
| 713 |
stat_block(text_df["ssr_QK"].values, "Q-K:"),
|
| 714 |
-
stat_block(text_df["ssr_QV"].values, "Q-V:"),
|
| 715 |
-
stat_block(text_df["ssr_KV"].values, "K-V:"),
|
| 716 |
|
| 717 |
-
f"\n【第四定律 — cosU 输出子空间
|
| 718 |
stat_block(text_df["cosU_QK"].values, "cosU Q-K:"),
|
| 719 |
-
stat_block(text_df["cosU_QV"].values, "cosU Q-V:"),
|
| 720 |
-
stat_block(text_df["cosU_KV"].values, "cosU K-V:"),
|
| 721 |
|
| 722 |
-
f"\n【第五定律 — cosV 输入子空间
|
| 723 |
stat_block(text_df["cosV_QK"].values, "cosV Q-K:"),
|
| 724 |
stat_block(text_df["cosV_QV"].values, "cosV Q-V:"),
|
| 725 |
stat_block(text_df["cosV_KV"].values, "cosV K-V:"),
|
| 726 |
|
| 727 |
-
f"\n【第三定律 — 条件数
|
| 728 |
stat_block(text_df["cond_Q"].values, "cond Q:"),
|
| 729 |
stat_block(text_df["cond_K"].values, "cond K:"),
|
| 730 |
stat_block(text_df["cond_V"].values, "cond V:"),
|
|
@@ -740,7 +814,7 @@ def analyze_model(
|
|
| 740 |
|
| 741 |
|
| 742 |
# ─────────────────────────────────────────────
|
| 743 |
-
# Gradio UI
|
| 744 |
# ─────────────────────────────────────────────
|
| 745 |
|
| 746 |
with gr.Blocks(title="Wang's Five Laws — LLM Spectral Analyzer") as demo:
|
|
@@ -750,18 +824,6 @@ with gr.Blocks(title="Wang's Five Laws — LLM Spectral Analyzer") as demo:
|
|
| 750 |
**Mathematical Foundations of Large Language Models (MF-LLM)**
|
| 751 |
|
| 752 |
通过 **HTTP Range Request** 直接读取 HF 权重,**无需下载整个模型**。
|
| 753 |
-
支持 GQA + 多模态(自动跳过视觉层)。逐头计算全部五定律指标:
|
| 754 |
-
|
| 755 |
-
| 定律 | 指标 | 理论极值 | 对象 |
|
| 756 |
-
|------|------|---------|------|
|
| 757 |
-
| 第一定律 | Pearson r / Spearman r | → 1 | Q-K |
|
| 758 |
-
| 第二定律 | SSR | → 0 | Q-K, Q-V, K-V |
|
| 759 |
-
| 第三定律 | 条件数 κ | 越小越好 | Q, K, V |
|
| 760 |
-
| 第四定律 | cosU(Uq,Uk) | ≈1/√d_head;cosU(Uq,Uv)<1/√d_head | Q-K, Q-V, K-V |
|
| 761 |
-
| 第五定律 | cosV(Vq,Vk) | ≈1/√d_model(随机正交) | Q-K, Q-V, K-V |
|
| 762 |
-
|
| 763 |
-
[](https://doi.org/10.5281/zenodo.19707844)
|
| 764 |
-
[](https://hal.science/hal-05609398)
|
| 765 |
""")
|
| 766 |
|
| 767 |
with gr.Row():
|
|
@@ -782,33 +844,25 @@ with gr.Blocks(title="Wang's Five Laws — LLM Spectral Analyzer") as demo:
|
|
| 782 |
)
|
| 783 |
analyze_btn = gr.Button("🚀 开始分析", variant="primary")
|
| 784 |
|
| 785 |
-
# [改动6] 更新推荐模型列表
|
| 786 |
with gr.Column(scale=1):
|
| 787 |
gr.Markdown("""
|
| 788 |
### ✅ 推荐模型
|
| 789 |
```
|
| 790 |
-
Qwen/Qwen2.5-14B-Instruct
|
| 791 |
-
meta-llama/Llama-3-8B
|
| 792 |
-
google/gemma-4-e2b
|
| 793 |
-
google/gemma-4-
|
| 794 |
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
|
| 795 |
```
|
| 796 |
-
###
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
| Qwen2.5-14B | 40 | 8 | 5 |
|
| 802 |
-
| Gemma-4-E2B | 8 | 4 | 2 |
|
| 803 |
-
|
| 804 |
-
### 🖼️ 多模态说明
|
| 805 |
-
- 视觉层自动跳过
|
| 806 |
-
- 仅分析文本 Transformer 层
|
| 807 |
-
- 跳过关键词:`vision / visual / vit / patch_embed`
|
| 808 |
""")
|
| 809 |
|
| 810 |
log_output = gr.Textbox(
|
| 811 |
-
label="分析日志
|
| 812 |
lines=35, max_lines=100
|
| 813 |
)
|
| 814 |
|
|
|
|
| 2 |
import requests
|
| 3 |
import struct
|
| 4 |
import json
|
| 5 |
+
import re
|
| 6 |
import numpy as np
|
| 7 |
import torch
|
| 8 |
from scipy.stats import pearsonr, spearmanr
|
|
|
|
| 32 |
UNSUPPORTED_SVD_DTYPES = {"I8", "U8", "I32", "I64", "F8_E4M3", "F8_E5M2"}
|
| 33 |
QUANTIZED_KEY_SIGNATURES = ["qweight", "qzeros", "scales", "g_idx", "packed_weight"]
|
| 34 |
|
| 35 |
+
# 视觉层关键词(扩充)
|
| 36 |
VISION_KEY_PATTERNS = [
|
| 37 |
"vision", "visual", "image_encoder",
|
| 38 |
"img_encoder", "patch_embed", "vit",
|
| 39 |
+
"vision_tower", "vision_model", # ★ 补充 gemma 的命名
|
| 40 |
+
"mm_projector", "multi_modal",
|
| 41 |
]
|
| 42 |
|
| 43 |
|
|
|
|
| 61 |
)
|
| 62 |
r.raise_for_status()
|
| 63 |
raw = json.loads(r.content)
|
|
|
|
| 64 |
raw.pop("__metadata__", None)
|
| 65 |
return raw, header_size
|
| 66 |
|
|
|
|
| 126 |
return f"❌ HTTP {code}:{e}"
|
| 127 |
|
| 128 |
|
|
|
|
| 129 |
def is_vision_key(key: str) -> bool:
    """Return True if the lower-cased key contains any vision keyword.

    The keywords come from the module-level ``VISION_KEY_PATTERNS`` list and
    are matched as plain substrings.
    """
    lowered = key.lower()
    for pattern in VISION_KEY_PATTERNS:
        if pattern in lowered:
            return True
    return False
|
| 132 |
|
| 133 |
|
| 134 |
# ─────────────────────────────────────────────
|
| 135 |
+
# ★ 修复1:发现层时记录 key 完整路径,并区分模态
|
| 136 |
+
# ─────────────────────────────────────────────
|
| 137 |
+
|
| 138 |
+
def discover_layer_qkv_keys(all_shard_headers: dict) -> dict:
    """Group the Q/K/V projection weight keys of every attention layer.

    Walks every tensor name in every shard header and buckets the attention
    projection weights by the layer stack and layer number they belong to.

    Args:
        all_shard_headers: Maps a shard file name to a
            ``(header, header_size)`` pair. Only the header's keys (the
            tensor names) are inspected here.

    Returns:
        A dict keyed by ``(modality, prefix, layer_idx)``:

        * ``modality`` is ``"vision"`` when ``is_vision_key(prefix)`` is
          true, otherwise ``"text"``.
        * ``prefix`` is everything in front of the first ``layers.{N}.``
          in the tensor name (e.g. ``"language_model.model."``). It keeps
          independently numbered stacks, such as a vision tower and a
          language model, apart.
        * ``layer_idx`` is ``N`` as an int.

        Each value is ``{"q": ..., "k": ..., "v": ...}`` where a slot holds
        ``(shard_name, key)`` for the first matching tensor seen, or
        ``None`` if no such projection was found for that layer.
    """
    # Same pattern as before, compiled once instead of once per key.
    layer_pattern = re.compile(r'(.*?)layers\.(\d+)\.(.*)')

    # Projection names are anchored at the end of the tensor name. The short
    # names (q_proj / wq ...) must additionally start a dotted component, so
    # that e.g. "conv_proj.weight" is no longer mistaken for "v_proj.weight".
    role_patterns = (
        ("q", re.compile(
            r"(?:(?:^|\.)(?:q_proj|wq)|attention\.query|attn\.q)\.weight$")),
        ("k", re.compile(
            r"(?:(?:^|\.)(?:k_proj|wk)|attention\.key|attn\.k)\.weight$")),
        ("v", re.compile(
            r"(?:(?:^|\.)(?:v_proj|wv)|attention\.value|attn\.v)\.weight$")),
    )

    layer_map: dict[tuple, dict] = {}

    for shard_name, (header, _) in all_shard_headers.items():
        for key in header:
            # Only weight tensors are of interest (no biases etc.).
            if not key.endswith(".weight"):
                continue

            m = layer_pattern.search(key)
            if m is None:
                continue
            prefix, layer_no, suffix = m.groups()

            role = next(
                (name for name, pat in role_patterns if pat.search(suffix)),
                None,
            )
            if role is None:
                continue

            # Modality is decided from the prefix only, not the whole key.
            modality = "vision" if is_vision_key(prefix) else "text"

            uid = (modality, prefix, int(layer_no))
            slots = layer_map.setdefault(
                uid, {"q": None, "k": None, "v": None}
            )
            # First match wins; later duplicates are ignored.
            if slots[role] is None:
                slots[role] = (shard_name, key)

    return layer_map
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# ─────────────────────────────────────────────
|
| 207 |
+
# Gemma4 等 config 兼容
|
| 208 |
+
# ─────────────────────────────────────────────
|
| 209 |
+
|
| 210 |
+
def extract_config_params(config: dict) -> dict:
|
| 211 |
+
if config is None:
|
| 212 |
+
return {}
|
| 213 |
+
|
| 214 |
+
text_cfg = config.get("text_config", {}) or {}
|
| 215 |
+
|
| 216 |
+
def get_field(*keys):
|
| 217 |
+
for k in keys:
|
| 218 |
+
v = config.get(k)
|
| 219 |
+
if v is not None:
|
| 220 |
+
return v
|
| 221 |
+
v = text_cfg.get(k)
|
| 222 |
+
if v is not None:
|
| 223 |
+
return v
|
| 224 |
+
return None
|
| 225 |
+
|
| 226 |
+
return {
|
| 227 |
+
"hidden_size": get_field("hidden_size"),
|
| 228 |
+
"num_attention_heads": get_field("num_attention_heads"),
|
| 229 |
+
"num_key_value_heads": get_field("num_key_value_heads"),
|
| 230 |
+
"head_dim": get_field("head_dim"),
|
| 231 |
+
"model_type": get_field("model_type"),
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
# ─────────────────────────────────────────────
|
| 236 |
+
# 量化检测(不变)
|
| 237 |
# ─────────────────────────────────────────────
|
| 238 |
|
| 239 |
def check_quantization(model_id: str, token: str = None) -> tuple[bool, str]:
|
|
|
|
| 300 |
|
| 301 |
|
| 302 |
# ─────────────────────────────────────────────
|
| 303 |
+
# GQA 推断
|
| 304 |
# ─────────────────────────────────────────────
|
| 305 |
|
| 306 |
def infer_gqa_params(
|
| 307 |
W_q: torch.Tensor,
|
| 308 |
W_k: torch.Tensor,
|
| 309 |
+
config_params: dict | None,
|
| 310 |
+
modality: str = "text",
|
| 311 |
) -> tuple[int,int,int]:
|
| 312 |
q_rows = W_q.shape[0]
|
| 313 |
k_rows = W_k.shape[0]
|
| 314 |
|
| 315 |
d_head = None
|
| 316 |
+
|
| 317 |
+
# ★ 视觉层不要用文本层的 head_dim
|
| 318 |
+
if config_params and modality == "text":
|
| 319 |
+
d_head = config_params.get("head_dim")
|
| 320 |
+
if not d_head:
|
| 321 |
+
nh = config_params.get("num_attention_heads") or 1
|
| 322 |
+
hs = config_params.get("hidden_size") or 0
|
| 323 |
+
if hs and nh:
|
| 324 |
+
d_head = hs // nh
|
| 325 |
if d_head == 0:
|
| 326 |
d_head = None
|
| 327 |
|
|
|
|
| 347 |
|
| 348 |
|
| 349 |
# ─────────────────────────────────────────────
|
| 350 |
+
# 指标计算
|
| 351 |
# ─────────────────────────────────────────────
|
| 352 |
|
| 353 |
def compute_pearson_corr(s_a: torch.Tensor, s_b: torch.Tensor) -> float:
|
|
|
|
| 385 |
def compute_left_vector_alignment(
|
| 386 |
U_a: torch.Tensor, U_b: torch.Tensor
|
| 387 |
) -> float:
|
| 388 |
+
# ★ 安全:行数(输出维度 d_head)必须相同才有意义
|
| 389 |
+
if U_a.shape[0] != U_b.shape[0]:
|
| 390 |
+
return float('nan')
|
|
|
|
|
|
|
|
|
|
| 391 |
min_c = min(U_a.shape[1], U_b.shape[1])
|
| 392 |
Ua = U_a[:, :min_c]
|
| 393 |
Ub = U_b[:, :min_c]
|
|
|
|
| 396 |
return float(torch.diag(torch.abs(Ua_n.T @ Ub_n)).mean())
|
| 397 |
|
| 398 |
|
|
|
|
| 399 |
def compute_right_vector_alignment(
|
| 400 |
Vt_a: torch.Tensor, Vt_b: torch.Tensor
|
| 401 |
) -> float:
|
| 402 |
+
# ★ 安全:列数(输入维度 d_model)必须相同才有意义
|
| 403 |
+
if Vt_a.shape[1] != Vt_b.shape[1]:
|
| 404 |
+
return float('nan')
|
|
|
|
|
|
|
|
|
|
| 405 |
min_r = min(Vt_a.shape[0], Vt_b.shape[0])
|
| 406 |
Va_n = Vt_a[:min_r, :]
|
| 407 |
Vb_n = Vt_b[:min_r, :]
|
|
|
|
| 411 |
|
| 412 |
|
| 413 |
# ─────────────────────────────────────────────
|
| 414 |
+
# 逐头分析
|
| 415 |
# ─────────────────────────────────────────────
|
| 416 |
|
| 417 |
def analyze_layer_heads(
|
| 418 |
W_q: torch.Tensor,
|
| 419 |
W_k: torch.Tensor,
|
| 420 |
+
W_v: torch.Tensor,
|
| 421 |
layer_idx: int,
|
| 422 |
n_q_heads: int,
|
| 423 |
n_kv_heads: int,
|
| 424 |
d_head: int,
|
| 425 |
+
modality: str = "text",
|
| 426 |
) -> tuple[list[dict], str]:
|
| 427 |
+
# ★ 强一致性检查:Q/K/V 的输入维度必须一致
|
| 428 |
+
if W_q.shape[1] != W_k.shape[1] or W_k.shape[1] != W_v.shape[1]:
|
| 429 |
+
return [], (
|
| 430 |
+
f"\nLayer {layer_idx} [{modality}]: "
|
| 431 |
+
f"⚠️ Q/K/V 输入维度不一致 "
|
| 432 |
+
f"({W_q.shape}, {W_k.shape}, {W_v.shape}),跳过\n"
|
| 433 |
+
)
|
| 434 |
+
|
| 435 |
group_size = n_q_heads // n_kv_heads
|
| 436 |
records = []
|
| 437 |
log_lines = []
|
| 438 |
|
| 439 |
log_lines.append(
|
| 440 |
f"\n{'─'*80}\n"
|
| 441 |
+
f"Layer {layer_idx:3d} [{modality}] "
|
| 442 |
f"n_q={n_q_heads} n_kv={n_kv_heads} "
|
| 443 |
f"group={group_size} d_head={d_head}\n"
|
| 444 |
f"{'─'*80}\n"
|
| 445 |
)
|
|
|
|
| 446 |
log_lines.append(
|
| 447 |
f" {'KV':>3} {'Q':>3} │"
|
| 448 |
f" {'P_QK':>7} {'Sp_QK':>7} {'SSR_QK':>8} │"
|
|
|
|
| 453 |
)
|
| 454 |
|
| 455 |
for kv_h in range(n_kv_heads):
|
|
|
|
|
|
|
| 456 |
k_tensor = W_k[kv_h * d_head : (kv_h + 1) * d_head, :]
|
| 457 |
+
v_tensor = W_v[kv_h * d_head : (kv_h + 1) * d_head, :]
|
| 458 |
|
| 459 |
U_k, s_k, Vt_k = torch.linalg.svd(k_tensor, full_matrices=False)
|
| 460 |
+
U_v, s_v, Vt_v = torch.linalg.svd(v_tensor, full_matrices=False)
|
| 461 |
|
| 462 |
+
alpha_kv, alpha_res_kv = compute_singular_value_ratio(s_k, s_v)
|
| 463 |
+
cosU_KV = compute_left_vector_alignment(U_k, U_v)
|
| 464 |
+
cosV_KV = compute_right_vector_alignment(Vt_k, Vt_v)
|
| 465 |
+
ssr_kv = compute_ssr(s_k, s_v)
|
|
|
|
| 466 |
pearson_kv = compute_pearson_corr(
|
| 467 |
s_k[:min(s_k.shape[0], s_v.shape[0])],
|
| 468 |
s_v[:min(s_k.shape[0], s_v.shape[0])]
|
|
|
|
| 476 |
min_qk = min(s_q.shape[0], s_k.shape[0])
|
| 477 |
min_qv = min(s_q.shape[0], s_v.shape[0])
|
| 478 |
|
|
|
|
| 479 |
pearson_qk = compute_pearson_corr(s_q[:min_qk], s_k[:min_qk])
|
| 480 |
spearman_qk = float(spearmanr(
|
| 481 |
s_q[:min_qk].cpu().numpy(),
|
| 482 |
s_k[:min_qk].cpu().numpy()
|
| 483 |
)[0])
|
| 484 |
+
ssr_qk = compute_ssr(s_q, s_k)
|
| 485 |
+
alpha_qk, alpha_res_qk = compute_singular_value_ratio(s_q, s_k)
|
| 486 |
+
cosU_QK = compute_left_vector_alignment(U_q, U_k)
|
| 487 |
+
cosV_QK = compute_right_vector_alignment(Vt_q, Vt_k)
|
| 488 |
|
|
|
|
| 489 |
pearson_qv = compute_pearson_corr(s_q[:min_qv], s_v[:min_qv])
|
| 490 |
ssr_qv = compute_ssr(s_q, s_v)
|
| 491 |
+
alpha_qv, alpha_res_qv = compute_singular_value_ratio(s_q, s_v)
|
| 492 |
+
cosU_QV = compute_left_vector_alignment(U_q, U_v)
|
| 493 |
+
cosV_QV = compute_right_vector_alignment(Vt_q, Vt_v)
|
| 494 |
|
|
|
|
| 495 |
sig_max_q = float(s_q.max())
|
| 496 |
sig_min_q = float(s_q[s_q > 1e-10].min()) if (s_q > 1e-10).any() else 0.0
|
| 497 |
sig_max_k = float(s_k.max())
|
|
|
|
| 499 |
sig_max_v = float(s_v.max())
|
| 500 |
sig_min_v = float(s_v[s_v > 1e-10].min()) if (s_v > 1e-10).any() else 0.0
|
| 501 |
|
|
|
|
| 502 |
cond_q = sig_max_q / (sig_min_q + 1e-10)
|
| 503 |
cond_k = sig_max_k / (sig_min_k + 1e-10)
|
| 504 |
cond_v = sig_max_v / (sig_min_v + 1e-10)
|
| 505 |
|
| 506 |
records.append({
|
| 507 |
+
"layer": layer_idx,
|
| 508 |
+
"modality": modality,
|
| 509 |
+
"kv_head": kv_h,
|
| 510 |
+
"q_head": h_idx,
|
| 511 |
+
"pearson_QK": round(pearson_qk, 6),
|
| 512 |
+
"spearman_QK": round(spearman_qk, 6),
|
| 513 |
+
"pearson_QV": round(pearson_qv, 6),
|
| 514 |
+
"pearson_KV": round(pearson_kv, 6),
|
| 515 |
+
"ssr_QK": round(ssr_qk, 8),
|
| 516 |
+
"ssr_QV": round(ssr_qv, 8),
|
| 517 |
+
"ssr_KV": round(ssr_kv, 8),
|
| 518 |
+
"cosU_QK": round(cosU_QK, 6),
|
| 519 |
+
"cosU_QV": round(cosU_QV, 6),
|
| 520 |
+
"cosU_KV": round(cosU_KV, 6),
|
| 521 |
+
"cosV_QK": round(cosV_QK, 6),
|
| 522 |
+
"cosV_QV": round(cosV_QV, 6),
|
| 523 |
+
"cosV_KV": round(cosV_KV, 6),
|
| 524 |
+
"alpha_QK": round(alpha_qk, 4),
|
| 525 |
+
"alpha_QV": round(alpha_qv, 4),
|
| 526 |
+
"alpha_KV": round(alpha_kv, 4),
|
| 527 |
+
"alpha_res_QK": round(alpha_res_qk, 6),
|
| 528 |
+
"alpha_res_QV": round(alpha_res_qv, 6),
|
| 529 |
+
"alpha_res_KV": round(alpha_res_kv, 6),
|
| 530 |
+
"sigma_max_Q": round(sig_max_q, 4),
|
| 531 |
+
"sigma_min_Q": round(sig_min_q, 4),
|
| 532 |
+
"sigma_max_K": round(sig_max_k, 4),
|
| 533 |
+
"sigma_min_K": round(sig_min_k, 4),
|
| 534 |
+
"sigma_max_V": round(sig_max_v, 4),
|
| 535 |
+
"sigma_min_V": round(sig_min_v, 4),
|
| 536 |
+
"cond_Q": round(cond_q, 2),
|
| 537 |
+
"cond_K": round(cond_k, 2),
|
| 538 |
+
"cond_V": round(cond_v, 2),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
})
|
| 540 |
|
| 541 |
log_lines.append(
|
|
|
|
| 575 |
return "".join(log_lines), None
|
| 576 |
|
| 577 |
# ── config.json ───────────────────────────────
|
| 578 |
+
config_params = {}
|
| 579 |
try:
|
| 580 |
r = requests.get(
|
| 581 |
f"https://huggingface.co/{model_id}/resolve/main/config.json",
|
|
|
|
| 583 |
timeout=15
|
| 584 |
)
|
| 585 |
if r.status_code == 200:
|
| 586 |
+
raw_config = r.json()
|
| 587 |
+
config_params = extract_config_params(raw_config)
|
| 588 |
log_lines.append(
|
| 589 |
f"📋 config.json:\n"
|
| 590 |
+
f" model_type = {config_params.get('model_type')}\n"
|
| 591 |
+
f" hidden_size (text) = {config_params.get('hidden_size')}\n"
|
| 592 |
+
f" num_attention_heads = {config_params.get('num_attention_heads')}\n"
|
| 593 |
+
f" num_key_value_heads = {config_params.get('num_key_value_heads')}\n"
|
| 594 |
+
f" head_dim = {config_params.get('head_dim')}\n"
|
| 595 |
f"{'─'*80}\n"
|
| 596 |
)
|
| 597 |
except Exception:
|
| 598 |
log_lines.append("⚠️ 无法读取 config.json,将从 weight shape 自动推断\n")
|
| 599 |
|
| 600 |
+
# ── shard 列表 ────────────────────────────────
|
| 601 |
progress(0.05, desc="读取模型索引...")
|
| 602 |
try:
|
| 603 |
+
index_data = find_index_file(model_id, token)
|
|
|
|
|
|
|
| 604 |
if index_data:
|
| 605 |
+
shard_files = sorted(set(index_data["weight_map"].values()))
|
| 606 |
log_lines.append(
|
| 607 |
+
f"📦 分片模型,共 {len(shard_files)} 个 shard\n"
|
| 608 |
)
|
| 609 |
else:
|
| 610 |
+
shard_files = get_safetensor_files(model_id, token)
|
| 611 |
+
log_lines.append(f"📦 单/多文件:{shard_files}\n")
|
|
|
|
| 612 |
except requests.exceptions.HTTPError as e:
|
| 613 |
return _http_error_msg(e, model_id), None
|
| 614 |
|
| 615 |
+
# ── 读取所有 shard headers ────────────────────
|
| 616 |
+
progress(0.08, desc="读取所有 shard headers...")
|
| 617 |
+
all_shard_headers: dict[str, tuple[dict, int]] = {}
|
| 618 |
+
total_keys = 0
|
| 619 |
+
for shard in shard_files:
|
| 620 |
+
try:
|
| 621 |
+
url = get_file_url(model_id, shard)
|
| 622 |
+
h, hs = read_safetensors_header(url, token)
|
| 623 |
+
all_shard_headers[shard] = (h, hs)
|
| 624 |
+
total_keys += len(h)
|
| 625 |
+
except Exception as e:
|
| 626 |
+
log_lines.append(f"⚠️ 读取 {shard} header 失败:{e}\n")
|
| 627 |
+
|
| 628 |
+
# ── 发现层(区分模态)─────────────────────────
|
| 629 |
+
progress(0.12, desc="识别层结构...")
|
| 630 |
+
layer_map = discover_layer_qkv_keys(all_shard_headers)
|
| 631 |
+
|
| 632 |
+
# ★ 统计每个 (modality, prefix) 的层数
|
| 633 |
+
groups: dict[tuple, list[int]] = {}
|
| 634 |
+
for (modality, prefix, layer_idx), _ in layer_map.items():
|
| 635 |
+
groups.setdefault((modality, prefix), []).append(layer_idx)
|
| 636 |
+
|
| 637 |
+
log_lines.append(f"🔑 总 key 数:{total_keys}\n")
|
| 638 |
+
log_lines.append(f"📐 发现层组:\n")
|
| 639 |
+
for (modality, prefix), layers in sorted(groups.items()):
|
| 640 |
+
log_lines.append(
|
| 641 |
+
f" [{modality:6s}] prefix='{prefix}' "
|
| 642 |
+
f"层数={len(layers)} 范围={min(layers)}~{max(layers)}\n"
|
| 643 |
+
)
|
| 644 |
+
log_lines.append(f"{'─'*80}\n")
|
| 645 |
+
|
| 646 |
+
# ★ 只分析 text 模态(视觉层暂不分析)
|
| 647 |
+
text_layers = sorted([
|
| 648 |
+
(uid, info) for uid, info in layer_map.items()
|
| 649 |
+
if uid[0] == "text"
|
| 650 |
+
], key=lambda x: x[0][2]) # 按 layer_idx 排序
|
| 651 |
+
|
| 652 |
+
if not text_layers:
|
| 653 |
+
return (
|
| 654 |
+
"".join(log_lines) +
|
| 655 |
+
"❌ 未发现任何文本层\n", None
|
| 656 |
+
)
|
| 657 |
|
| 658 |
+
log_lines.append(f"🔵 将分析 {len(text_layers)} 个文本层(前 {max_layers} 层)\n")
|
|
|
|
|
|
|
| 659 |
log_lines.append(f"{'═'*80}\n")
|
| 660 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
# ── 逐层分析 ─────────────────────────────────
|
| 662 |
+
gqa_logged = False
|
| 663 |
+
layers_done = 0
|
| 664 |
+
max_layers_i = int(max_layers)
|
| 665 |
+
|
| 666 |
+
for (modality, prefix, layer_idx), qkv in text_layers:
|
| 667 |
+
if layers_done >= max_layers_i:
|
| 668 |
+
break
|
| 669 |
|
|
|
|
| 670 |
progress(
|
| 671 |
+
0.15 + 0.80 * layers_done / max(max_layers_i, 1),
|
| 672 |
desc=f"第 {layer_idx} 层..."
|
| 673 |
)
|
| 674 |
|
| 675 |
+
if qkv["q"] is None or qkv["k"] is None or qkv["v"] is None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 676 |
log_lines.append(
|
| 677 |
+
f"Layer {layer_idx} [{modality}]: ⚠️ Q/K/V 不完整,跳过\n"
|
| 678 |
)
|
| 679 |
continue
|
| 680 |
|
| 681 |
+
q_shard, q_key = qkv["q"]
|
| 682 |
+
k_shard, k_key = qkv["k"]
|
| 683 |
+
v_shard, v_key = qkv["v"]
|
|
|
|
| 684 |
|
| 685 |
try:
|
| 686 |
W_q = load_tensor_remote(
|
| 687 |
get_file_url(model_id, q_shard), q_key,
|
| 688 |
+
*all_shard_headers[q_shard], token
|
| 689 |
)
|
| 690 |
W_k = load_tensor_remote(
|
| 691 |
get_file_url(model_id, k_shard), k_key,
|
| 692 |
+
*all_shard_headers[k_shard], token
|
| 693 |
)
|
| 694 |
+
W_v = load_tensor_remote(
|
| 695 |
get_file_url(model_id, v_shard), v_key,
|
| 696 |
+
*all_shard_headers[v_shard], token
|
| 697 |
)
|
| 698 |
except ValueError as e:
|
| 699 |
log_lines.append(f"Layer {layer_idx}: ⚠️ 跳过({e})\n")
|
| 700 |
+
layers_done += 1
|
| 701 |
+
continue
|
| 702 |
+
except Exception as e:
|
| 703 |
+
log_lines.append(f"Layer {layer_idx}: ❌ 加载失败({e})\n")
|
| 704 |
+
layers_done += 1
|
| 705 |
continue
|
| 706 |
|
| 707 |
if W_q is None or W_k is None or W_v is None:
|
| 708 |
log_lines.append(f"Layer {layer_idx}: ⚠️ tensor 为 None,跳过\n")
|
| 709 |
+
layers_done += 1
|
| 710 |
continue
|
| 711 |
|
| 712 |
+
# ★ 一致性校验
|
| 713 |
+
if W_q.shape[1] != W_k.shape[1] or W_k.shape[1] != W_v.shape[1]:
|
| 714 |
+
log_lines.append(
|
| 715 |
+
f"Layer {layer_idx}: ⚠️ Q/K/V 输入维度不一致 "
|
| 716 |
+
f"Wq={list(W_q.shape)} Wk={list(W_k.shape)} "
|
| 717 |
+
f"Wv={list(W_v.shape)},跳过\n"
|
| 718 |
+
)
|
| 719 |
del W_q, W_k, W_v
|
| 720 |
+
layers_done += 1
|
| 721 |
continue
|
| 722 |
|
|
|
|
| 723 |
try:
|
| 724 |
+
n_q_heads, n_kv_heads, d_head = infer_gqa_params(
|
| 725 |
+
W_q, W_k, config_params, modality=modality
|
| 726 |
+
)
|
| 727 |
except ValueError as e:
|
| 728 |
log_lines.append(f"Layer {layer_idx}: ❌ GQA 推断失败:{e}\n")
|
| 729 |
del W_q, W_k, W_v
|
| 730 |
+
layers_done += 1
|
| 731 |
continue
|
| 732 |
|
| 733 |
if not gqa_logged:
|
|
|
|
| 735 |
f"🧠 GQA 结构:n_q={n_q_heads} n_kv={n_kv_heads} "
|
| 736 |
f"group={n_q_heads//n_kv_heads} d_head={d_head}\n"
|
| 737 |
f" W_q={list(W_q.shape)} W_k={list(W_k.shape)} "
|
| 738 |
+
f"W_v={list(W_v.shape)}\n"
|
| 739 |
f"{'═'*80}\n"
|
| 740 |
)
|
| 741 |
gqa_logged = True
|
| 742 |
|
|
|
|
| 743 |
records, layer_log = analyze_layer_heads(
|
| 744 |
+
W_q, W_k, W_v,
|
| 745 |
layer_idx,
|
| 746 |
n_q_heads, n_kv_heads, d_head,
|
| 747 |
+
modality=modality
|
| 748 |
)
|
| 749 |
all_records.extend(records)
|
| 750 |
log_lines.append(layer_log)
|
| 751 |
|
| 752 |
del W_q, W_k, W_v
|
| 753 |
+
layers_done += 1
|
| 754 |
|
| 755 |
+
# ── 汇总 ─────────────────────────────────────
|
| 756 |
if all_records:
|
| 757 |
df = pd.DataFrame(all_records)
|
| 758 |
|
|
|
|
| 759 |
def stat_block(arr: np.ndarray, name: str) -> str:
|
| 760 |
+
arr = arr[~np.isnan(arr)]
|
| 761 |
+
if len(arr) == 0:
|
| 762 |
+
return f" {name:<14} (无数据)\n"
|
| 763 |
return (
|
| 764 |
f" {name:<14}"
|
| 765 |
f" Median={np.median(arr):.6f}"
|
|
|
|
| 780 |
|
| 781 |
f"【第一定律 — Pearson r(→ 1)】\n",
|
| 782 |
stat_block(text_df["pearson_QK"].values, "Q-K:"),
|
| 783 |
+
stat_block(text_df["pearson_QV"].values, "Q-V:"),
|
| 784 |
+
stat_block(text_df["pearson_KV"].values, "K-V:"),
|
| 785 |
|
| 786 |
f"\n【第二定律 — SSR(→ 0)】\n",
|
| 787 |
stat_block(text_df["ssr_QK"].values, "Q-K:"),
|
| 788 |
+
stat_block(text_df["ssr_QV"].values, "Q-V:"),
|
| 789 |
+
stat_block(text_df["ssr_KV"].values, "K-V:"),
|
| 790 |
|
| 791 |
+
f"\n【第四定律 — cosU 输出子空间】\n",
|
| 792 |
stat_block(text_df["cosU_QK"].values, "cosU Q-K:"),
|
| 793 |
+
stat_block(text_df["cosU_QV"].values, "cosU Q-V:"),
|
| 794 |
+
stat_block(text_df["cosU_KV"].values, "cosU K-V:"),
|
| 795 |
|
| 796 |
+
f"\n【第五定律 — cosV 输入子空间】\n",
|
| 797 |
stat_block(text_df["cosV_QK"].values, "cosV Q-K:"),
|
| 798 |
stat_block(text_df["cosV_QV"].values, "cosV Q-V:"),
|
| 799 |
stat_block(text_df["cosV_KV"].values, "cosV K-V:"),
|
| 800 |
|
| 801 |
+
f"\n【第三定律 — 条件数】\n",
|
| 802 |
stat_block(text_df["cond_Q"].values, "cond Q:"),
|
| 803 |
stat_block(text_df["cond_K"].values, "cond K:"),
|
| 804 |
stat_block(text_df["cond_V"].values, "cond V:"),
|
|
|
|
| 814 |
|
| 815 |
|
| 816 |
# ─────────────────────────────────────────────
|
| 817 |
+
# Gradio UI(不变)
|
| 818 |
# ─────────────────────────────────────────────
|
| 819 |
|
| 820 |
with gr.Blocks(title="Wang's Five Laws — LLM Spectral Analyzer") as demo:
|
|
|
|
| 824 |
**Mathematical Foundations of Large Language Models (MF-LLM)**
|
| 825 |
|
| 826 |
通过 **HTTP Range Request** 直接读取 HF 权重,**无需下载整个模型**。
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 827 |
""")
|
| 828 |
|
| 829 |
with gr.Row():
|
|
|
|
| 844 |
)
|
| 845 |
analyze_btn = gr.Button("🚀 开始分析", variant="primary")
|
| 846 |
|
|
|
|
| 847 |
with gr.Column(scale=1):
|
| 848 |
gr.Markdown("""
|
| 849 |
### ✅ 推荐模型
|
| 850 |
```
|
| 851 |
+
Qwen/Qwen2.5-14B-Instruct
|
| 852 |
+
meta-llama/Llama-3-8B
|
| 853 |
+
google/gemma-4-e2b
|
| 854 |
+
google/gemma-4-31b-it
|
| 855 |
deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
|
| 856 |
```
|
| 857 |
+
### 🔑 关键修复
|
| 858 |
+
- ✅ 模态判断基于 prefix 路径
|
| 859 |
+
- ✅ 视觉/文本层分组独立编号
|
| 860 |
+
- ✅ Q/K/V 输入维度一致性校验
|
| 861 |
+
- ✅ 视觉层不复用文本 head_dim
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 862 |
""")
|
| 863 |
|
| 864 |
log_output = gr.Textbox(
|
| 865 |
+
label="分析日志",
|
| 866 |
lines=35, max_lines=100
|
| 867 |
)
|
| 868 |
|