Alex W. committed on
Commit
fe55b90
·
1 Parent(s): eb4a470

Add Wang's Five Laws spectral analyzer

Browse files
Files changed (2) hide show
  1. app.py +395 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import struct
4
+ import json
5
+ import numpy as np
6
+ import torch
7
+ from scipy import stats
8
+ from huggingface_hub import list_repo_files
9
+
10
+ # ─────────────────────────────────────────────
11
+ # 核心:HTTP Range Request 读取单个 tensor
12
+ # ─────────────────────────────────────────────
13
+
14
# Lookup table from the dtype tag stored in a safetensors header to a pair
# (torch dtype, size of one element in bytes).  Tags that are absent here are
# rejected by load_tensor_remote with a ValueError.
DTYPE_MAP = {
    "F32": (torch.float32, 4),
    "F16": (torch.float16, 2),
    "BF16": (torch.bfloat16, 2),
    "F64": (torch.float64, 8),
    "I32": (torch.int32, 4),
    "I64": (torch.int64, 8),
}
22
+
23
def get_file_url(model_id: str, filename: str, revision: str = "main") -> str:
    """Build the direct "resolve" URL of a file in a Hugging Face model repo.

    Args:
        model_id: Repository id, e.g. ``"org/name"``; inserted verbatim.
        filename: Path of the file inside the repository.  Sub-folder slashes
            are kept; other unsafe characters are percent-encoded.
        revision: Branch, tag or commit to resolve against.  Defaults to
            ``"main"``, which is what the original hard-coded.

    Returns:
        The ``https://huggingface.co/<model_id>/resolve/<revision>/<filename>`` URL.
    """
    # Function-scope import so the block stays self-contained.
    from urllib.parse import quote

    # A revision such as "refs/pr/1" must be a single path segment, so "/" is
    # escaped there; in the filename "/" separates folders and is preserved.
    safe_revision = quote(revision, safe="")
    safe_filename = quote(filename)
    return f"https://huggingface.co/{model_id}/resolve/{safe_revision}/{safe_filename}"
26
+
27
def read_safetensors_header(url: str, token: str = None) -> tuple:
    """Fetch only the JSON header of a remote safetensors file.

    Two HTTP Range requests are issued: one for the 8-byte little-endian
    length prefix, one for the JSON header that follows it.

    Args:
        url: Direct URL of the ``.safetensors`` file.
        token: Optional bearer token sent in the ``Authorization`` header.

    Returns:
        A ``(header, header_size)`` tuple: the parsed JSON header (tensor name
        -> ``dtype`` / ``shape`` / ``data_offsets``) and the header length in
        bytes, which callers need to locate the data section.

    Raises:
        requests.exceptions.HTTPError: on a non-success HTTP status.
        ValueError: if the response is too short, the declared header size is
            implausible, or the header is not a JSON object.
    """
    auth = {"Authorization": f"Bearer {token}"} if token else {}

    # Step 1: the first 8 bytes hold the header length (unsigned 64-bit LE).
    r = requests.get(url, headers={**auth, "Range": "bytes=0-7"}, timeout=30)
    r.raise_for_status()
    prefix = r.content[:8]  # slice in case the server ignored Range and sent more
    if len(prefix) < 8:
        raise ValueError(
            f"safetensors 文件头读取失败:期望 8 字节,实际 {len(prefix)} 字节"
        )
    header_size = struct.unpack("<Q", prefix)[0]

    # A file that is not safetensors (HTML error page, LFS pointer, ...) yields
    # a garbage length; refuse it before requesting a huge range.
    # NOTE(review): 100 MB is believed to match the cap of the reference
    # safetensors implementation — confirm.
    max_header_size = 100_000_000
    if header_size == 0 or header_size > max_header_size:
        raise ValueError(f"safetensors header 大小异常:{header_size}")

    # Step 2: the JSON header itself.
    r = requests.get(
        url,
        headers={**auth, "Range": f"bytes=8-{8 + header_size - 1}"},
        timeout=30,
    )
    r.raise_for_status()
    # 206 means the body starts at byte 8; any other success status means the
    # Range header was ignored and the body starts at byte 0 of the file.
    skip = 0 if r.status_code == 206 else 8
    body = r.content[skip:skip + header_size]
    if len(body) < header_size:
        raise ValueError(
            f"safetensors header 不完整:期望 {header_size} 字节,实际 {len(body)} 字节"
        )

    header = json.loads(body)
    if not isinstance(header, dict):
        raise ValueError("safetensors header 不是 JSON 对象")
    return header, header_size
47
+
48
def load_tensor_remote(url: str, tensor_name: str, header: dict,
                       header_size: int, token: str = None) -> torch.Tensor:
    """Download a single tensor from a remote safetensors file via a Range request.

    Only the bytes of the requested tensor are transferred.

    Args:
        url: Direct URL of the ``.safetensors`` file.
        tensor_name: Key of the tensor in ``header``.
        header: Parsed safetensors JSON header for this file.
        header_size: Length in bytes of that JSON header.
        token: Optional bearer token sent in the ``Authorization`` header.

    Returns:
        The tensor reshaped to its declared shape and converted to float32,
        or ``None`` when ``tensor_name`` is not present in ``header``.

    Raises:
        ValueError: for a dtype missing from ``DTYPE_MAP``, for offsets that
            disagree with shape and dtype, or for a short/oversized download.
        requests.exceptions.HTTPError: on a non-success HTTP status.
    """
    if tensor_name not in header:
        return None

    info = header[tensor_name]
    dtype_str = info["dtype"]
    shape = info["shape"]
    start, end = info["data_offsets"]  # [start, end) relative to the data section

    if dtype_str not in DTYPE_MAP:
        raise ValueError(f"不支持的 dtype: {dtype_str}")
    torch_dtype, itemsize = DTYPE_MAP[dtype_str]

    # Cross-check the header against itself before touching the network.
    numel = 1
    for dim in shape:
        numel *= dim
    expected_bytes = end - start
    if expected_bytes != numel * itemsize:
        raise ValueError(
            f"tensor {tensor_name} 的 data_offsets 与 shape/dtype 不一致"
        )

    # An empty tensor would produce the malformed range "bytes=N-(N-1)".
    if expected_bytes == 0:
        return torch.empty(shape, dtype=torch.float32)

    # File layout: 8-byte length prefix + JSON header + data section.
    data_base = 8 + header_size
    req_headers = {"Range": f"bytes={data_base + start}-{data_base + end - 1}"}
    if token:
        req_headers["Authorization"] = f"Bearer {token}"

    r = requests.get(url, headers=req_headers, timeout=120)
    r.raise_for_status()

    raw = r.content
    # A server that ignores Range returns the whole file; decoding that would
    # silently yield wrong values, so fail loudly instead.
    if len(raw) != expected_bytes:
        raise ValueError(
            f"tensor {tensor_name} 下载不完整:期望 {expected_bytes} 字节,实际 {len(raw)} 字节"
        )

    # bytearray gives a writable copy of the immutable response bytes
    # (presumably to avoid torch.frombuffer's read-only-buffer warning — confirm).
    buffer = bytearray(raw)
    if torch_dtype == torch.bfloat16:
        # Read the 16-bit words as int16, then reinterpret the bits as bfloat16.
        tensor = torch.frombuffer(buffer, dtype=torch.int16).view(torch.bfloat16)
    else:
        tensor = torch.frombuffer(buffer, dtype=torch_dtype)

    # Everything is promoted to float32 for the SVD done downstream.
    return tensor.reshape(shape).float()
87
+
88
+ # ─────────────────────────────────────────────
89
+ # 查找模型的 safetensors 文件列表
90
+ # ─────────────────────────────────────────────
91
+
92
def get_safetensor_files(model_id: str, token: str = None) -> list:
    """Return the sorted names of every ``.safetensors`` file in a model repo.

    The token is forwarded to ``list_repo_files`` only when it is non-empty.
    """
    extra = {}
    if token:
        extra["token"] = token
    repo_files = list_repo_files(model_id, **extra)
    return sorted(name for name in repo_files if name.endswith(".safetensors"))
98
+
99
def find_index_file(model_id: str, token: str = None):
    """Fetch ``model.safetensors.index.json`` (present for sharded models).

    Args:
        model_id: Repository id, e.g. ``"org/name"``.
        token: Optional bearer token sent in the ``Authorization`` header.

    Returns:
        The parsed index JSON when the server answers 200 with valid JSON,
        otherwise ``None``.  This is a best-effort probe: any other status or
        an unparsable body means "no usable index", never an exception, so the
        caller can fall back to listing the repository files.
    """
    # Reuse the shared URL builder instead of repeating the URL scheme here.
    url = get_file_url(model_id, "model.safetensors.index.json")
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    r = requests.get(url, headers=headers, timeout=15)
    if r.status_code != 200:
        return None
    try:
        return r.json()
    except ValueError:
        # A 200 response whose body is not JSON is treated as "no index".
        return None
107
+
108
+ # ─────────────────────────────────────────────
109
+ # 王氏五定律计算核心
110
+ # ─────────────────────────────────────────────
111
+
112
def compute_svd_metrics(W_q: torch.Tensor, W_k: torch.Tensor):
    """Compare the singular-value spectra of one layer's Q and K matrices.

    Args:
        W_q: Query projection weight matrix.
        W_k: Key projection weight matrix.

    Returns:
        ``(pearson_r, ssr)`` as Python floats.  ``pearson_r`` is the Pearson
        correlation between the two spectra (the "first law" metric); ``ssr``
        is the mean absolute difference between the L2-normalised spectra
        (the "second law" spectral shape residual).

    Raises:
        ValueError: if fewer than two singular values are available to compare.
    """
    # Only the singular values are used, so skip computing U and V.
    sq = torch.linalg.svdvals(W_q).numpy()
    sk = torch.linalg.svdvals(W_k).numpy()

    # With grouped-query attention W_k has fewer rows than W_q, so the spectra
    # have different lengths and pearsonr would raise.  Compare the leading
    # min(len) singular values (both spectra are sorted in descending order).
    # NOTE(review): truncation to the common rank is an assumption about the
    # intended definition for unequal shapes — confirm with the method's author.
    common = min(sq.shape[0], sk.shape[0])
    if common < 2:
        raise ValueError("奇异值数量不足,无法计算 Pearson r")
    sq = sq[:common]
    sk = sk[:common]

    # First law: Pearson r between the spectra.
    r, _ = stats.pearsonr(sq, sk)

    # Second law: SSR; the epsilon guards against an all-zero spectrum.
    sq_norm = sq / (np.linalg.norm(sq) + 1e-10)
    sk_norm = sk / (np.linalg.norm(sk) + 1e-10)
    ssr = np.mean(np.abs(sq_norm - sk_norm))

    return float(r), float(ssr)
129
+
130
+ # ─────────────────────────────────────────────
131
+ # 主分析函数:扫描所有层
132
+ # ─────────────────────────────────────────────
133
+
134
# Name of the query-projection module -> name of the matching key projection.
_Q_TO_K_TOKEN = {"q_proj": "k_proj", "query": "key", "wq": "wk", "q": "k"}


def _is_q_weight_key(key):
    """Return True when *key* ends in ``<query module>.weight``."""
    parts = key.split(".")
    return len(parts) >= 2 and parts[-1] == "weight" and parts[-2] in _Q_TO_K_TOKEN


def _derive_qk_layout(q_keys):
    """Split a sample Q key into (prefix parts, Q tail parts, K tail parts).

    The first purely numeric segment is taken as the layer index.  Keys under
    ``model.layers.`` are preferred as the sample.  Returns ``None`` when the
    sample key has no numeric segment.
    """
    preferred = [k for k in q_keys if k.startswith("model.layers.")]
    parts = (preferred or q_keys)[0].split(".")
    layer_pos = next((i for i, seg in enumerate(parts) if seg.isdigit()), None)
    if layer_pos is None:
        return None
    q_tail = parts[layer_pos + 1:]
    k_tail = q_tail[:-2] + [_Q_TO_K_TOKEN[q_tail[-2]], q_tail[-1]]
    return parts[:layer_pos], q_tail, k_tail


def _get_shard_header(model_id, shard, token, shard_headers):
    """Return ``(header, header_size)`` for *shard*, fetching it at most once."""
    if shard not in shard_headers:
        shard_headers[shard] = read_safetensors_header(
            get_file_url(model_id, shard), token
        )
    return shard_headers[shard]


def _find_shard_for_key(key, model_id, token, weight_map, sf_files, shard_headers):
    """Return the file that stores tensor *key*, or ``None`` if no file has it."""
    if weight_map is not None:
        return weight_map.get(key)
    # No index file: look the key up in each file's header.
    for sf in sf_files:
        header, _ = _get_shard_header(model_id, sf, token, shard_headers)
        if key in header:
            return sf
    return None


def _format_summary(model_id, pearson_list, ssr_list):
    """Render the aggregate statistics block appended to the log."""
    return (
        f"\n{'='*50}\n"
        f"📊 王氏五定律分析结果 — {model_id}\n"
        f"{'='*50}\n"
        f"总层数分析: {len(pearson_list)} 层\n\n"
        f"【第一定律 - 谱线性对齐 Pearson r】\n"
        f" Median: {np.median(pearson_list):.4f} "
        f" Mean: {np.mean(pearson_list):.4f}\n"
        f" Min: {np.min(pearson_list):.4f} "
        f" Max: {np.max(pearson_list):.4f}\n\n"
        f"【第二定律 - 谱形状保真 SSR】\n"
        f" Median: {np.median(ssr_list):.6f} "
        f" Mean: {np.mean(ssr_list):.6f}\n"
        f" Min: {np.min(ssr_list):.6f} "
        f" Max: {np.max(ssr_list):.6f}\n\n"
        f"⚡ 理论值:Pearson r → 1,SSR → 0\n"
        f"{'='*50}\n"
    )


def analyze_model(model_id: str, hf_token: str, max_layers: int, progress=gr.Progress()):
    """Scan a model layer by layer and report Pearson r / SSR for its Q/K weights.

    Steps:
        1. Locate the safetensors files (sharded index, or a repo file listing).
        2. Detect the Q/K tensor naming from the available tensor names.
        3. For each layer, download only the Q and K tensors via Range
           requests and compute the SVD-based metrics.
        4. Append summary statistics to the log.

    Args:
        model_id: Hugging Face repository id, e.g. ``"org/name"``.
        hf_token: Access token; empty or ``None`` for public models.
        max_layers: Upper bound on the number of layers to process.
        progress: Gradio progress reporter.

    Returns:
        ``(log_text, dataframe)``.  On any failure the first element is an
        error message and the second is ``None``.
    """
    # Strip once and use the stripped id everywhere; the original validated the
    # stripped value but built URLs from the raw one.
    model_id = (model_id or "").strip()
    if not model_id:
        return "❌ 请输入模型 ID,例如:Qwen/Qwen2.5-14B-Instruct", None

    token = (hf_token or "").strip() or None
    results = []
    log_lines = [f"🔍 分析模型:{model_id}\n"]

    try:
        # Step 1: map tensor names to files.
        progress(0.05, desc="读取模型索引...")
        index_data = find_index_file(model_id, token)
        shard_headers = {}  # filename -> (header dict, header size)

        if index_data:
            weight_map = index_data.get("weight_map", {})
            sf_files = []
            log_lines.append(f"📦 分片模型,共 {len(set(weight_map.values()))} 个 shard 文件\n")
        else:
            weight_map = None
            sf_files = get_safetensor_files(model_id, token)
            if not sf_files:
                return "❌ 未找到 .safetensors 文件,请检查模型 ID 或 token", None
            for f in sf_files:
                log_lines.append(f"📦 单文件模型:{f}\n")

        # Step 2: detect the Q/K naming scheme.
        progress(0.1, desc="检测层结构...")
        if weight_map is not None:
            # The index lists every tensor name, so no shard has to be probed
            # (probing an arbitrary shard can hit one that holds no Q weights).
            all_keys = list(weight_map)
        else:
            all_keys = []
            for sf in sf_files:
                header, _ = _get_shard_header(model_id, sf, token, shard_headers)
                all_keys.extend(header)
                if any(_is_q_weight_key(k) for k in header):
                    break

        q_keys_sample = [k for k in all_keys if _is_q_weight_key(k)]
        layout = _derive_qk_layout(q_keys_sample) if q_keys_sample else None
        if layout is None:
            # Show some keys so the user can see the model's naming.
            sample_keys = "\n".join(all_keys[:30])
            return f"⚠️ 无法自动识别 Q/K key,前30个 key:\n{sample_keys}", None

        prefix, q_tail, k_tail = layout
        q_pattern = ".".join(q_tail)
        k_pattern = ".".join(k_tail)
        log_lines.append(f"🔑 Q key 模式:{q_pattern}\n")
        log_lines.append(f"🔑 K key 模式:{k_pattern}\n\n")

        # Step 3: per-layer metrics.
        max_layers = int(max_layers)
        pearson_list = []
        ssr_list = []

        for layer_idx in range(max_layers):
            progress(0.1 + 0.85 * layer_idx / max_layers,
                     desc=f"处理第 {layer_idx} 层...")

            q_key = ".".join(prefix + [str(layer_idx)] + q_tail)
            k_key = ".".join(prefix + [str(layer_idx)] + k_tail)

            q_shard = _find_shard_for_key(
                q_key, model_id, token, weight_map, sf_files, shard_headers
            )
            k_shard = _find_shard_for_key(
                k_key, model_id, token, weight_map, sf_files, shard_headers
            )
            if q_shard is None or k_shard is None:
                log_lines.append(f"Layer {layer_idx}: ⚠️ 未找到 Q/K,停止\n")
                break

            q_header, q_hsize = _get_shard_header(model_id, q_shard, token, shard_headers)
            k_header, k_hsize = _get_shard_header(model_id, k_shard, token, shard_headers)

            # Range requests fetch only the Q and K tensors.
            W_q = load_tensor_remote(
                get_file_url(model_id, q_shard), q_key, q_header, q_hsize, token
            )
            W_k = load_tensor_remote(
                get_file_url(model_id, k_shard), k_key, k_header, k_hsize, token
            )
            if W_q is None or W_k is None:
                log_lines.append(f"Layer {layer_idx}: ⚠️ tensor 读取失败\n")
                break

            r, ssr = compute_svd_metrics(W_q, W_k)
            pearson_list.append(r)
            ssr_list.append(ssr)
            results.append({
                "Layer": layer_idx,
                "Pearson_r": round(r, 6),
                "SSR": round(ssr, 6),
            })
            log_lines.append(
                f"Layer {layer_idx:3d} | Q shape: {list(W_q.shape)} "
                f"| Pearson r = {r:.4f} | SSR = {ssr:.6f}\n"
            )

            # Drop the weights before the next download to keep peak memory low.
            del W_q, W_k

        # Step 4: summary.
        if pearson_list:
            log_lines.append(_format_summary(model_id, pearson_list, ssr_list))

        import pandas as pd
        # Explicit columns keep the table header stable when there are no rows.
        df = pd.DataFrame(results, columns=["Layer", "Pearson_r", "SSR"])

        return "".join(log_lines), df

    except requests.exceptions.HTTPError as e:
        # e.response can be None, e.g. when the error was raised without one.
        status = e.response.status_code if e.response is not None else None
        if status == 401:
            return "❌ 401 未授权:该模型需要 HF Token,请填写 Access Token", None
        if status == 403:
            return "❌ 403 禁止访问:请确认已在 HF 接受该模型的使用协议", None
        if status == 404:
            return f"❌ 404 未找到:模型 {model_id} 不存在或文件路径错误", None
        return f"❌ HTTP 错误:{e}", None
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        return f"❌ 错误:{str(e)}", None
325
+
326
+ # ─────────────────────────────────────────────
327
+ # Gradio UI
328
+ # ─────────────────────────────────────────────
329
+
330
# Page header shown above the controls.
_INTRO_MARKDOWN = """
# 🔬 Wang's Five Laws — LLM Spectral Analyzer
**Mathematical Foundations of Large Language Models (MF-LLM)**

通过 HTTP Range Request 直接读取 HuggingFace 模型的 Q/K 权重 tensor,
**无需下载完整模型**,计算王氏五定律的核心指标:
- 📐 **第一定律**:Pearson r → 1(谱线性对齐)
- 📏 **第二定律**:SSR → 0(谱形状保真)

[![DOI](https://img.shields.io/badge/DOI-10.5281%2Fzenodo.19707844-blue)](https://doi.org/10.5281/zenodo.19707844)
"""

# Side panel: suggested models and runtime notes.
_SIDEBAR_MARKDOWN = """
### 💡 快速测试模型
- `meta-llama/Llama-3.2-1B`
- `Qwen/Qwen2.5-7B-Instruct`
- `google/gemma-2-2b`
- `deepseek-ai/DeepSeek-R1-Distill-Qwen-14B`

### ⚙️ 运行环境
- CPU Only(无 GPU)
- 每层约 5-30 秒(取决于网速和矩阵大小)
- **零缓存**:仅下载 Q/K tensor 字节
"""

with gr.Blocks(title="Wang's Five Laws — LLM Spectral Analyzer") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Top row: inputs on the left (wider), hints on the right.
    with gr.Row():
        with gr.Column(scale=2):
            model_input = gr.Textbox(
                value="Qwen/Qwen2.5-14B-Instruct",
                label="HuggingFace 模型 ID",
                placeholder="例如:Qwen/Qwen2.5-14B-Instruct",
            )
            token_input = gr.Textbox(
                type="password",
                label="HF Access Token(公开模型可留空)",
                placeholder="hf_xxxxxxxxxxxx",
            )
            max_layers_input = gr.Slider(
                minimum=1,
                maximum=100,
                step=1,
                value=32,
                label="最大分析层数",
            )
            analyze_btn = gr.Button("🚀 开始分析", variant="primary")

        with gr.Column(scale=1):
            gr.Markdown(_SIDEBAR_MARKDOWN)

    # Text log of the run.
    with gr.Row():
        log_output = gr.Textbox(label="分析日志", lines=25, max_lines=50)

    # Per-layer results table.
    with gr.Row():
        table_output = gr.Dataframe(
            headers=["Layer", "Pearson_r", "SSR"],
            label="逐层结果(Pearson r & SSR)",
        )

    # The button runs the analysis and fills both outputs.
    analyze_btn.click(
        fn=analyze_model,
        inputs=[model_input, token_input, max_layers_input],
        outputs=[log_output, table_output],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ requests
3
+ numpy
4
+ scipy
5
+ torch
6
+ huggingface_hub