Spaces:

wehe1pwe
/

math-under-llm

Running

Alex W. committed 12 days ago

Commit

0105df7

1 Parent(s): e1ce951

feat: add global debug switch and unified debug logging system

- Global configuration module
- DEBUG = False (default: silent)
- Single line to toggle all debug output across entire codebase

- Unified debug output utilities
- dlog(lines, msg): appends [DEBUG] msg to UI log list (for metrics/analyze)
- dprint(msg): prints to stdout (for fetcher, no access to lines)
- Both functions are no-ops when DEBUG=False (message arguments are still built at the call site, so the overhead is small but not zero)

- Replaced all print() with dprint()
- Debug info covered:
- tensor_name, shape, dtype
- data_offsets (raw and absolute)
- expected_bytes vs actual_bytes check ✅/❌
- first 8 bytes hex (for cross-validation with local file reader)
- result[0,:5] (first row sanity check)
- All controlled by DEBUG switch, zero output in production

- Replaced all log.append("[DEBUG]...") with dlog(lines, ...)
- Debug info covered:
- key_q / key_k / key_v (full key names)
- W_q / W_k / W_v shapes
- n_q / n_kv / group / d_head / head_dim_source
- W_k[0,:10] / W_q[0,:10] raw weights (for cross-validation)
- Per KV head: k_t shape, s_k前5, k_t[0,:10]
- Per Q head: q_t shape, s_q前5, q_t[0,:10]
- Per Q head: pearson, alpha_QK, s_q[0], s_k[0]

- Added dlog() for shard/key/offset info before tensor loading
- Debug info covered:
- q/k/v shard filename
- q/k/v full key name
- k_header_size
- k_offsets (raw data_offsets)
- k_abs_start (= 8 + header_size + offset, the actual HTTP Range start)

During cross-validation of gemma-4-31b-it against reference implementation:

Reference code result: K head0 sigma_max = 393.07 (wrong)
Our result: K head0 sigma_max = 4.40 (correct)

Root cause found via debug output:
Reference code bug in load_tensor_from_file():
f.seek(start) # ❌ offset relative to data section
f.seek(8 + header_len + start) # ✅ correct absolute file offset

gemma-4-31b-it header_size ≈ 136KB
→ seek error = 136KB = ~13 rows of BF16 data
→ KV head0 first row completely wrong
→ sigma_max inflated from 4.40 to 393.07

Smaller models (gemma-4-e2b, Qwen2.5, LLaMA-3) not affected because
their early tensor offsets start near 0, masking the seek error.

Our HTTP Range Request implementation was correct throughout:
abs_start = 8 + header_size + offsets[0] ✅

# Enable debug (cross-validation, new model investigation):
# core/config.py
DEBUG = True

# Disable debug (production):
DEBUG = False

One line change, all debug output across fetcher/metrics/analyze
synchronized instantly.

Files changed (5) hide show

core/config.py +21 -0
core/debug.py +26 -0
core/fetcher.py +5 -3
core/metrics.py +28 -23
ui/tab_analyze.py +11 -0

core/config.py ADDED Viewed

	@@ -0,0 +1,21 @@

+# core/config.py
+"""
+全局配置开关
+"""
+# ─────────────────────────────────────────────
+# Debug 开关
+# True  → 打印详细调试信息到日志
+# False → 静默运行，只输出结果
+# ─────────────────────────────────────────────
+DEBUG = False# core/config.py
+"""
+全局配置开关
+"""
+# ─────────────────────────────────────────────
+# Debug 开关
+# True  → 打印详细调试信息到日志
+# False → 静默运行，只输出结果
+# ─────────────────────────────────────────────
+DEBUG = False

core/debug.py ADDED Viewed

	@@ -0,0 +1,26 @@

+# core/debug.py
+"""
+调试输出工具
+所有调试信息统一走这里，受 DEBUG 开关控制
+"""
+from core.config import DEBUG
+def dlog(lines: list[str], msg: str):
+    """
+    向 lines 追加调试信息（仅 DEBUG=True 时）
+    lines: 日志行列表（传引用，直接 append）
+    msg:   调试信息字符串
+    """
+    if DEBUG:
+        lines.append(f"[DEBUG] {msg}\n")
+def dprint(msg: str):
+    """
+    打印到 stdout（仅 DEBUG=True 时）
+    用于 fetcher.py 等无法访问 lines 的地方
+    """
+    if DEBUG:
+        print(f"[DEBUG] {msg}")

core/fetcher.py CHANGED Viewed

@@ -9,6 +9,8 @@ import json
 import requests
 import torch
 from huggingface_hub import list_repo_files
 # ─────────────────────────────────────────────
 # dtype 映射
@@ -102,7 +104,7 @@ def load_tensor_remote(
     expected_elems = 1
     for d in shape:
         expected_elems *= d
-    print(
         f"[FETCH] {tensor_name}\n"
         f"  shape={shape} dtype={dtype_str}\n"
         f"  data_offsets={offsets}\n"
@@ -123,7 +125,7 @@ def load_tensor_remote(
     # ── 调试：打印实际收到的字节数 ────────────────
     actual_bytes = len(r.content)
-    print(
         f"  actual_bytes={actual_bytes} "
         f"{'✅' if actual_bytes == expected_bytes else '❌ 字节数不匹配!'}\n"
         f"  前8字节(hex)={r.content[:8].hex()}\n"
@@ -139,7 +141,7 @@ def load_tensor_remote(
     result = tensor.reshape(shape).float()
     # ── 调试：打印结果首行 ────────────────────────
-    print(f"  result[0,:5]={result[0,:5].tolist()}\n")
     return result

 import requests
 import torch
 from huggingface_hub import list_repo_files
+from core.debug import dprint
 # ─────────────────────────────────────────────
 # dtype 映射
     expected_elems = 1
     for d in shape:
         expected_elems *= d
+    dprint(
         f"[FETCH] {tensor_name}\n"
         f"  shape={shape} dtype={dtype_str}\n"
         f"  data_offsets={offsets}\n"
     # ── 调试：打印实际收到的字节数 ────────────────
     actual_bytes = len(r.content)
+    dprint(
         f"  actual_bytes={actual_bytes} "
         f"{'✅' if actual_bytes == expected_bytes else '❌ 字节数不匹配!'}\n"
         f"  前8字节(hex)={r.content[:8].hex()}\n"
     result = tensor.reshape(shape).float()
     # ── 调试：打印结果首行 ────────────────────────
+    dprint(f"  result[0,:5]={result[0,:5].tolist()}\n")
     return result

core/metrics.py CHANGED Viewed

@@ -3,6 +3,7 @@ import torch
 import numpy as np
 from scipy.stats import spearmanr
 from core.layer_profile import LayerProfile
 def pearson(a: torch.Tensor, b: torch.Tensor) -> float:
@@ -83,17 +84,16 @@ def analyze_layer(
     lines:   list[str]  = []
     # ── 调试：打印整体信息 + 原始权重首行 ──────────
-    lines.append(
-        f"\n[DEBUG] ═══════════════════════════════\n"
-        f"[DEBUG] key_q = {profile.q.key}\n"
-        f"[DEBUG] key_k = {profile.k.key}\n"
-        f"[DEBUG] key_v = {profile.v.key if profile.v else 'K=V shared'}\n"
-        f"[DEBUG] W_q={list(W_q.shape)} W_k={list(W_k.shape)} W_v={list(W_v.shape)}\n"
-        f"[DEBUG] n_q={n_q} n_kv={n_kv} group={group} d_head={d_head}\n"
-        f"[DEBUG] W_k[0, :10] = {W_k[0, :10].tolist()}\n"
-        f"[DEBUG] W_q[0, :10] = {W_q[0, :10].tolist()}\n"
-        f"[DEBUG] ═══════════════════════════════\n"
-    )
     kv_tag = " [K=V共享]" if kv_shared else ""
     lines.append(
@@ -121,13 +121,12 @@ def analyze_layer(
         smxv, smnv, cond_v = sigma_stats(s_v)
         # ── 调试：KV头切片首行原始权重 ──────────────
-        lines.append(
-            f"[DEBUG] KV头{kv_h}: "
-            f"k_t={list(k_t.shape)} "
-            f"s_k前5={[round(x,4) for x in s_k[:5].tolist()]}\n"
-            f"[DEBUG] KV头{kv_h}: "
-            f"k_t[0,:10]={k_t[0, :10].tolist()}\n"
         )
         # KV 指标
         if kv_shared:
@@ -153,13 +152,12 @@ def analyze_layer(
             smxq, smnq, cond_q = sigma_stats(s_q)
             # ── 调试：Q头切片首行原始权重 ────────────
-            lines.append(
-                f"[DEBUG]   Q头{h}: "
-                f"q_t={list(q_t.shape)} "
-                f"s_q前5={[round(x,4) for x in s_q[:5].tolist()]}\n"
-                f"[DEBUG]   Q头{h}: "
-                f"q_t[0,:10]={q_t[0, :10].tolist()}\n"
             )
             nqk = min(len(s_q), len(s_k))
             nqv = min(len(s_q), len(s_v))
@@ -177,6 +175,13 @@ def analyze_layer(
             cU_QV      = cos_U(U_q, U_v)
             cV_QV      = cos_V(Vt_q, Vt_v)
             records.append({
                 "prefix":        profile.prefix,
                 "layer":         profile.layer_idx,

 import numpy as np
 from scipy.stats import spearmanr
 from core.layer_profile import LayerProfile
+from core.debug import dlog
 def pearson(a: torch.Tensor, b: torch.Tensor) -> float:
     lines:   list[str]  = []
     # ── 调试：打印整体信息 + 原始权重首行 ──────────
+    # ── Debug：整体信息 ───────────────────────
+    dlog(lines, f"═══════════════════════════════")
+    dlog(lines, f"key_q = {profile.q.key}")
+    dlog(lines, f"key_k = {profile.k.key}")
+    dlog(lines, f"key_v = {profile.v.key if profile.v else 'K=V shared'}")
+    dlog(lines, f"W_q={list(W_q.shape)} W_k={list(W_k.shape)} W_v={list(W_v.shape)}")
+    dlog(lines, f"n_q={n_q} n_kv={n_kv} group={group} d_head={d_head} source={profile.head_dim_source}")
+    dlog(lines, f"W_k[0,:10] = {W_k[0, :10].tolist()}")
+    dlog(lines, f"W_q[0,:10] = {W_q[0, :10].tolist()}")
+    dlog(lines, f"═══════════════════════════════")
     kv_tag = " [K=V共享]" if kv_shared else ""
     lines.append(
         smxv, smnv, cond_v = sigma_stats(s_v)
         # ── 调试：KV头切片首行原始权重 ──────────────
+        # ── Debug：KV 头 ──────────────────────
+        dlog(lines,
+            f"KV头{kv_h}: k_t={list(k_t.shape)} "
+            f"s_k前5={[round(x,4) for x in s_k[:5].tolist()]}"
         )
+        dlog(lines, f"KV头{kv_h}: k_t[0,:10]={k_t[0, :10].tolist()}")
         # KV 指标
         if kv_shared:
             smxq, smnq, cond_q = sigma_stats(s_q)
             # ── 调试：Q头切片首行原始权重 ────────────
+            # ── Debug：Q 头 ───────────────────
+            dlog(lines,
+                f"  Q头{h}: q_t={list(q_t.shape)} "
+                f"s_q前5={[round(x,4) for x in s_q[:5].tolist()]}"
             )
+            dlog(lines, f"  Q头{h}: q_t[0,:10]={q_t[0, :10].tolist()}")
             nqk = min(len(s_q), len(s_k))
             nqv = min(len(s_q), len(s_v))
             cU_QV      = cos_U(U_q, U_v)
             cV_QV      = cos_V(Vt_q, Vt_v)
+            # ── Debug：关键指标 ───────────────
+            dlog(lines,
+                f"  Q头{h}: pearson={pqk:+.4f} "
+                f"alpha_QK={a_qk:.4f} "
+                f"s_q[0]={s_q[0]:.4f} s_k[0]={s_k[0]:.4f}"
+            )
             records.append({
                 "prefix":        profile.prefix,
                 "layer":         profile.layer_idx,

ui/tab_analyze.py CHANGED Viewed

@@ -11,6 +11,7 @@ import gradio as gr
 import requests
 import pandas as pd
 import numpy as np
 from core.fetcher import (
     load_all_shard_headers,
@@ -175,6 +176,16 @@ def run_analysis(
             q_hdr, q_hs = all_headers[prof.q.shard]
             k_hdr, k_hs = all_headers[prof.k.shard]
             W_q = load_tensor_remote(q_url, prof.q.key, q_hdr, q_hs, token)
             W_k = load_tensor_remote(k_url, prof.k.key, k_hdr, k_hs, token)

 import requests
 import pandas as pd
 import numpy as np
+from core.debug import dlog
 from core.fetcher import (
     load_all_shard_headers,
             q_hdr, q_hs = all_headers[prof.q.shard]
             k_hdr, k_hs = all_headers[prof.k.shard]
+            dlog(log,
+                f"Layer {idx}:\n"
+                f"  q: {prof.q.shard} → {prof.q.key}\n"
+                f"  k: {prof.k.shard} → {prof.k.key}\n"
+                f"  v: {prof.v.shard + ' → ' + prof.v.key if prof.v else 'K=V shared'}\n"
+                f"  k_header_size={k_hs}\n"
+                f"  k_offsets={k_hdr[prof.k.key]['data_offsets']}\n"
+                f"  k_abs_start={8 + k_hs + k_hdr[prof.k.key]['data_offsets'][0]}"
+            )
             W_q = load_tensor_remote(q_url, prof.q.key, q_hdr, q_hs, token)
             W_k = load_tensor_remote(k_url, prof.k.key, k_hdr, k_hs, token)