# db/schema.py """ 数据库表结构定义与初始化 SQLite 存储在 /data/wang_laws.db(HF Space bucket 持久化) """ import sqlite3 import os from datetime import datetime # ───────────────────────────────────────────── # 数据库路径 # /data 是 HF Space bucket 挂载点,重启后数据不丢失 # 本地开发时自动回退到当前目录 # ───────────────────────────────────────────── def get_db_path() -> str: if os.path.exists("/data"): return "/data/wang_laws.db" return "wang_laws.db" def get_connection() -> sqlite3.Connection: """获取数据库连接,启用 WAL 模式提升并发性能""" conn = sqlite3.connect(get_db_path(), check_same_thread=False) conn.row_factory = sqlite3.Row # 支持按列名访问 conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA foreign_keys=ON") return conn # ───────────────────────────────────────────── # 建表 SQL # ───────────────────────────────────────────── SQL_CREATE_MODELS = """ CREATE TABLE IF NOT EXISTS models ( model_id TEXT PRIMARY KEY, model_type TEXT, -- gemma4 / llama / qwen2 等 analyzed_at TIMESTAMP, analyze_sec REAL, -- 分析耗时(秒) notes TEXT -- 备注 ); """ SQL_CREATE_COMPONENTS = """ CREATE TABLE IF NOT EXISTS components ( id INTEGER PRIMARY KEY AUTOINCREMENT, model_id TEXT NOT NULL, prefix TEXT NOT NULL, -- 如 model.language_model. n_layers INTEGER, -- 该组件完整层数 head_dim_min INTEGER, -- 最小 head_dim(异构层用) head_dim_max INTEGER, -- 最大 head_dim has_kv_shared INTEGER DEFAULT 0, -- 是否有 K=V 共享层(全局层) has_global INTEGER DEFAULT 0, -- 是否有 global 层 d_model INTEGER, -- 输入维度 UNIQUE(model_id, prefix), FOREIGN KEY(model_id) REFERENCES models(model_id) ); """ SQL_CREATE_LAYER_HEAD_METRICS = """ CREATE TABLE IF NOT EXISTS layer_head_metrics ( id INTEGER PRIMARY KEY AUTOINCREMENT, model_id TEXT NOT NULL, prefix TEXT NOT NULL, layer INTEGER NOT NULL, layer_type TEXT DEFAULT 'standard', -- standard / global kv_head INTEGER NOT NULL, q_head INTEGER NOT NULL, kv_shared INTEGER DEFAULT 0, -- 1=K=V共享(理论值),0=独立V head_dim INTEGER, d_model INTEGER, n_q_heads INTEGER, n_kv_heads INTEGER, -- 第一定律:谱线性对齐 pearson_QK REAL, spearman_QK REAL, pearson_QV REAL, pearson_KV REAL, -- 第二定律:谱形状残差 ssr_QK REAL, ssr_QV REAL, ssr_KV REAL, -- 第三定律:条件数 sigma_max_Q REAL, sigma_min_Q REAL, cond_Q REAL, sigma_max_K REAL, sigma_min_K REAL, cond_K REAL, sigma_max_V REAL, sigma_min_V REAL, cond_V REAL, -- 第四定律:左奇异向量对齐(输出子空间) cosU_QK REAL, cosU_QV REAL, cosU_KV REAL, -- 第五定律:右奇异向量对齐(输入子空间) cosV_QK REAL, cosV_QV REAL, cosV_KV REAL, -- 尺度因子 + 最小二乘残差 alpha_QK REAL, alpha_res_QK REAL, alpha_QV REAL, alpha_res_QV REAL, alpha_KV REAL, alpha_res_KV REAL, UNIQUE(model_id, prefix, layer, kv_head, q_head), FOREIGN KEY(model_id) REFERENCES models(model_id) ); """ SQL_CREATE_MODEL_SUMMARY = """ CREATE TABLE IF NOT EXISTS model_summary ( model_id TEXT NOT NULL, prefix TEXT NOT NULL, layer_type TEXT NOT NULL DEFAULT 'all', -- all / standard / global -- 第一定律 median_pearson_QK REAL, mean_pearson_QK REAL, -- 第二定律(王氏评分核心) median_ssr_QK REAL, mean_ssr_QK REAL, median_ssr_QV REAL, mean_ssr_QV REAL, -- 第三定律 median_cond_Q REAL, mean_cond_Q REAL, -- 第四定律 median_cosU_QK REAL, median_cosU_QV REAL, -- 第五定律 median_cosV_QK REAL, median_cosV_QV REAL, -- 王氏评分(暂时 = 1 - median_ssr_QK,基于 standard 层) wang_score REAL, -- 统计范围 n_layers INTEGER, -- 参与统计的层数 n_records INTEGER, -- 参与统计的记录数 updated_at TIMESTAMP, PRIMARY KEY(model_id, prefix, layer_type), FOREIGN KEY(model_id) REFERENCES models(model_id) ); """ # 索引:加速常用查询 SQL_CREATE_INDEXES = [ # 按模型+组件查询层数据 """CREATE INDEX IF NOT EXISTS idx_metrics_model_prefix ON layer_head_metrics(model_id, prefix)""", # 按层号范围查询 """CREATE INDEX IF NOT EXISTS idx_metrics_layer ON layer_head_metrics(model_id, prefix, layer)""", # 排行榜查询 """CREATE INDEX IF NOT EXISTS idx_summary_wang_score ON model_summary(wang_score DESC)""", # 断点续传:快速判断某层是否已分析 """CREATE INDEX IF NOT EXISTS idx_metrics_resume ON layer_head_metrics(model_id, prefix, layer, kv_head, q_head)""", ] # ───────────────────────────────────────────── # 初始化函数 # ───────────────────────────────────────────── def init_db() -> sqlite3.Connection: """ 初始化数据库:建表 + 建索引 幂等操作,重复调用安全 返回数据库连接 """ conn = get_connection() cur = conn.cursor() cur.execute(SQL_CREATE_MODELS) cur.execute(SQL_CREATE_COMPONENTS) cur.execute(SQL_CREATE_LAYER_HEAD_METRICS) cur.execute(SQL_CREATE_MODEL_SUMMARY) for sql in SQL_CREATE_INDEXES: cur.execute(sql) conn.commit() return conn def get_db_stats(conn: sqlite3.Connection) -> dict: """获取数据库统计信息""" cur = conn.cursor() stats = {} for table in ["models", "components", "layer_head_metrics", "model_summary"]: cur.execute(f"SELECT COUNT(*) FROM {table}") stats[table] = cur.fetchone()[0] # 数据库文件大小 db_path = get_db_path() if os.path.exists(db_path): stats["db_size_mb"] = round(os.path.getsize(db_path) / 1024 / 1024, 2) else: stats["db_size_mb"] = 0 return stats