math-under-llm / db /schema.py
Alex W.
Bug ๅŽŸๅ›  ไฟฎๅค
f38ed3a
# db/schema.py
"""
ๆ•ฐๆฎๅบ“่กจ็ป“ๆž„ๅฎšไน‰ไธŽๅˆๅง‹ๅŒ–
SQLite ๅญ˜ๅ‚จๅœจ /data/wang_laws.db๏ผˆHF Space bucket ๆŒไน…ๅŒ–๏ผ‰
"""
import sqlite3
import os
from datetime import datetime
def get_db_path() -> str:
if os.path.exists("/data"):
return "/data/wang_laws.db"
return "wang_laws.db"
def get_connection() -> sqlite3.Connection:
conn = sqlite3.connect(get_db_path(), check_same_thread=False)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA foreign_keys=ON")
return conn
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ๅปบ่กจ SQL
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
SQL_CREATE_MODELS = """
CREATE TABLE IF NOT EXISTS models (
model_id TEXT PRIMARY KEY,
model_type TEXT,
analyzed_at TIMESTAMP,
analyze_sec REAL,
notes TEXT
);
"""
SQL_CREATE_COMPONENTS = """
CREATE TABLE IF NOT EXISTS components (
id INTEGER PRIMARY KEY AUTOINCREMENT,
model_id TEXT NOT NULL,
prefix TEXT NOT NULL,
modality TEXT DEFAULT 'language', -- language/vision/audio
n_layers INTEGER,
head_dim_min INTEGER,
head_dim_max INTEGER,
has_kv_shared INTEGER DEFAULT 0,
has_global INTEGER DEFAULT 0,
d_model INTEGER,
UNIQUE(model_id, prefix),
FOREIGN KEY(model_id) REFERENCES models(model_id)
);
"""
SQL_CREATE_LAYER_HEAD_METRICS = """
CREATE TABLE IF NOT EXISTS layer_head_metrics (
id INTEGER PRIMARY KEY AUTOINCREMENT,
model_id TEXT NOT NULL,
prefix TEXT NOT NULL,
layer INTEGER NOT NULL,
layer_type TEXT DEFAULT 'standard', -- standard/global
modality TEXT DEFAULT 'language', -- language/vision/audio
kv_head INTEGER NOT NULL,
q_head INTEGER NOT NULL,
kv_shared INTEGER DEFAULT 0,
head_dim INTEGER,
d_model INTEGER,
n_q_heads INTEGER,
n_kv_heads INTEGER,
-- ็ฌฌไธ€ๅฎšๅพ‹
pearson_QK REAL, spearman_QK REAL,
pearson_QV REAL, pearson_KV REAL,
-- ็ฌฌไบŒๅฎšๅพ‹
ssr_QK REAL, ssr_QV REAL, ssr_KV REAL,
-- ็ฌฌไธ‰ๅฎšๅพ‹
sigma_max_Q REAL, sigma_min_Q REAL, cond_Q REAL,
sigma_max_K REAL, sigma_min_K REAL, cond_K REAL,
sigma_max_V REAL, sigma_min_V REAL, cond_V REAL,
-- ็ฌฌๅ››ๅฎšๅพ‹
cosU_QK REAL, cosU_QV REAL, cosU_KV REAL,
-- ็ฌฌไบ”ๅฎšๅพ‹
cosV_QK REAL, cosV_QV REAL, cosV_KV REAL,
-- ๅฐบๅบฆๅ› ๅญ
alpha_QK REAL, alpha_res_QK REAL,
alpha_QV REAL, alpha_res_QV REAL,
alpha_KV REAL, alpha_res_KV REAL,
UNIQUE(model_id, prefix, layer, kv_head, q_head),
FOREIGN KEY(model_id) REFERENCES models(model_id)
);
"""
SQL_CREATE_MODEL_SUMMARY = """
CREATE TABLE IF NOT EXISTS model_summary (
model_id TEXT NOT NULL,
prefix TEXT NOT NULL,
layer_type TEXT NOT NULL DEFAULT 'all',
-- ็ฌฌไธ€ๅฎšๅพ‹
median_pearson_QK REAL, mean_pearson_QK REAL,
-- ็ฌฌไบŒๅฎšๅพ‹
median_ssr_QK REAL, mean_ssr_QK REAL,
median_ssr_QV REAL, mean_ssr_QV REAL,
-- ็ฌฌไธ‰ๅฎšๅพ‹
median_cond_Q REAL, mean_cond_Q REAL,
-- ็ฌฌๅ››ๅฎšๅพ‹
median_cosU_QK REAL, median_cosU_QV REAL,
-- ็ฌฌไบ”ๅฎšๅพ‹
median_cosV_QK REAL, median_cosV_QV REAL,
-- ็Ž‹ๆฐ่ฏ„ๅˆ†
wang_score REAL,
-- ็ปŸ่ฎก่Œƒๅ›ด
n_layers INTEGER,
n_records INTEGER,
updated_at TIMESTAMP,
PRIMARY KEY(model_id, prefix, layer_type),
FOREIGN KEY(model_id) REFERENCES models(model_id)
);
"""
SQL_CREATE_INDEXES = [
"""CREATE INDEX IF NOT EXISTS idx_metrics_model_prefix
ON layer_head_metrics(model_id, prefix)""",
"""CREATE INDEX IF NOT EXISTS idx_metrics_layer
ON layer_head_metrics(model_id, prefix, layer)""",
"""CREATE INDEX IF NOT EXISTS idx_metrics_modality
ON layer_head_metrics(model_id, modality)""",
"""CREATE INDEX IF NOT EXISTS idx_summary_wang_score
ON model_summary(wang_score DESC)""",
"""CREATE INDEX IF NOT EXISTS idx_metrics_resume
ON layer_head_metrics(model_id, prefix, layer, kv_head, q_head)""",
"""CREATE INDEX IF NOT EXISTS idx_components_modality
ON components(model_id, modality)""",
]
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ่ฟ็งป๏ผšไธบๆ—งๆ•ฐๆฎๅบ“ๅŠ  modality ๅˆ—
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def _migrate_add_modality(conn: sqlite3.Connection):
"""
ๅน‚็ญ‰่ฟ็งป๏ผš็ป™ๆ—ง่กจๅŠ  modality ๅˆ—ๅนถๅ›žๅกซๆ•ฐๆฎใ€‚
ๆ–ฐๅปบๆ•ฐๆฎๅบ“ๆ—ถ่ฟ™ไบ›ๅˆ—ๅทฒๅœจๅปบ่กจSQLไธญ๏ผŒPRAGMAไผšๆฃ€ๆต‹ๅˆฐ็›ดๆŽฅ่ทณ่ฟ‡ใ€‚
"""
cur = conn.cursor()
# โ”€โ”€ layer_head_metrics โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
cur.execute("PRAGMA table_info(layer_head_metrics)")
lhm_cols = [row[1] for row in cur.fetchall()]
if "modality" not in lhm_cols:
cur.execute(
"ALTER TABLE layer_head_metrics "
"ADD COLUMN modality TEXT DEFAULT 'language'"
)
# ๅ›žๅกซ vision
cur.execute(
"""UPDATE layer_head_metrics SET modality = 'vision'
WHERE prefix LIKE '%vision%'
OR prefix LIKE '%visual%'
OR prefix LIKE '%image%'"""
)
# ๅ›žๅกซ audio
cur.execute(
"""UPDATE layer_head_metrics SET modality = 'audio'
WHERE prefix LIKE '%audio%'
OR prefix LIKE '%speech%'
OR prefix LIKE '%acoustic%'"""
)
# language ๅทฒ็”ฑ DEFAULT 'language' ่ฆ†็›–๏ผŒๆ— ้œ€้ขๅค–ๆ›ดๆ–ฐ
# โ”€โ”€ components โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
cur.execute("PRAGMA table_info(components)")
comp_cols = [row[1] for row in cur.fetchall()]
if "modality" not in comp_cols:
cur.execute(
"ALTER TABLE components "
"ADD COLUMN modality TEXT DEFAULT 'language'"
)
cur.execute(
"""UPDATE components SET modality = 'vision'
WHERE prefix LIKE '%vision%'
OR prefix LIKE '%visual%'
OR prefix LIKE '%image%'"""
)
cur.execute(
"""UPDATE components SET modality = 'audio'
WHERE prefix LIKE '%audio%'
OR prefix LIKE '%speech%'
OR prefix LIKE '%acoustic%'"""
)
conn.commit()
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ๅˆๅง‹ๅŒ–
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def init_db() -> sqlite3.Connection:
conn = get_connection()
cur = conn.cursor()
# ็ฌฌไธ€ๆญฅ๏ผšๅปบ่กจ
cur.execute(SQL_CREATE_MODELS)
cur.execute(SQL_CREATE_COMPONENTS)
cur.execute(SQL_CREATE_LAYER_HEAD_METRICS)
cur.execute(SQL_CREATE_MODEL_SUMMARY)
conn.commit()
# ็ฌฌไบŒๆญฅ๏ผš่ฟ็งปๆ—งๆ•ฐๆฎ๏ผˆๅŠ  modality ๅˆ—๏ผ‰โ† ๅฟ…้กปๅœจๅปบ็ดขๅผ•ไน‹ๅ‰
_migrate_add_modality(conn)
# ็ฌฌไธ‰ๆญฅ๏ผšๅปบ็ดขๅผ•๏ผˆๆญคๆ—ถ modality ๅˆ—ๅทฒ็กฎไฟๅญ˜ๅœจ๏ผ‰
cur = conn.cursor()
for sql in SQL_CREATE_INDEXES:
cur.execute(sql)
conn.commit()
return conn
def get_db_stats(conn: sqlite3.Connection) -> dict:
cur = conn.cursor()
stats = {}
for table in ["models", "components", "layer_head_metrics", "model_summary"]:
cur.execute(f"SELECT COUNT(*) FROM {table}")
stats[table] = cur.fetchone()[0]
db_path = get_db_path()
if os.path.exists(db_path):
stats["db_size_mb"] = round(os.path.getsize(db_path) / 1024 / 1024, 2)
else:
stats["db_size_mb"] = 0
return stats