math-under-llm / db /reader.py
Alex W.
feat๏ผšwrite 5 laws's data into sqlite.
38fc6ed
raw
history blame
5.6 kB
# db/reader.py
"""
ๆ•ฐๆฎๅบ“ๆŸฅ่ฏขๆจกๅ—
- ๆŽ’่กŒๆฆœๆŸฅ่ฏข
- ๆจกๅž‹่ฏฆๆƒ…ๆŸฅ่ฏข
- ๆ–ญ็‚น็ปญไผ ็Šถๆ€ๆŸฅ่ฏข
"""
import sqlite3
import pandas as pd
from db.schema import get_connection, init_db
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ๆŽ’่กŒๆฆœ
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def get_leaderboard(
conn: sqlite3.Connection,
prefix_filter: str = None, # ๅช็œ‹ๆŸไธช็ป„ไปถ๏ผŒNone=ๅ…จ้ƒจ
layer_type: str = "standard",
limit: int = 50,
) -> pd.DataFrame:
"""
ๆŽ’่กŒๆฆœๆŸฅ่ฏข
ๆŒ‰ wang_score ้™ๅบๆŽ’ๅˆ—
"""
sql = """
SELECT
s.model_id,
s.prefix,
s.layer_type,
s.wang_score,
s.median_pearson_QK,
s.median_ssr_QK,
s.mean_ssr_QK,
s.median_cosU_QK,
s.median_cosU_QV,
s.median_cosV_QK,
s.median_cond_Q,
s.n_layers,
s.n_records,
s.updated_at,
-- ็ป„ไปถไฟกๆฏ
c.head_dim_min,
c.head_dim_max,
c.has_kv_shared,
c.has_global,
c.d_model
FROM model_summary s
LEFT JOIN components c
ON s.model_id = c.model_id AND s.prefix = c.prefix
WHERE s.layer_type = ?
"""
params = [layer_type]
if prefix_filter:
sql += " AND s.prefix LIKE ?"
params.append(f"%{prefix_filter}%")
sql += " ORDER BY s.wang_score DESC LIMIT ?"
params.append(limit)
cur = conn.cursor()
cur.execute(sql, params)
rows = cur.fetchall()
if not rows:
return pd.DataFrame()
cols = [d[0] for d in cur.description]
return pd.DataFrame([dict(zip(cols, row)) for row in rows])
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ๆจกๅž‹่ฏฆๆƒ…
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def get_model_summary(
conn: sqlite3.Connection,
model_id: str,
) -> pd.DataFrame:
"""่Žทๅ–ๆŸๆจกๅž‹ๆ‰€ๆœ‰็ป„ไปถ็š„ๆฑ‡ๆ€ป็ปŸ่ฎก"""
cur = conn.cursor()
cur.execute(
"""
SELECT * FROM model_summary
WHERE model_id = ?
ORDER BY prefix, layer_type
""",
(model_id,)
)
rows = cur.fetchall()
if not rows:
return pd.DataFrame()
cols = [d[0] for d in cur.description]
return pd.DataFrame([dict(zip(cols, row)) for row in rows])
def get_layer_metrics(
conn: sqlite3.Connection,
model_id: str,
prefix: str = None,
layer_type: str = None,
start_layer:int = None,
end_layer: int = None,
) -> pd.DataFrame:
"""
ๆŸฅ่ฏข้€ๅคดๅŽŸๅง‹ๆ•ฐๆฎ
ๆ”ฏๆŒๆŒ‰ prefix / layer_type / ๅฑ‚ๅท่Œƒๅ›ด่ฟ‡ๆปค
"""
sql = "SELECT * FROM layer_head_metrics WHERE model_id = ?"
params = [model_id]
if prefix:
sql += " AND prefix = ?"
params.append(prefix)
if layer_type:
sql += " AND layer_type = ?"
params.append(layer_type)
if start_layer is not None:
sql += " AND layer >= ?"
params.append(start_layer)
if end_layer is not None:
sql += " AND layer <= ?"
params.append(end_layer)
sql += " ORDER BY prefix, layer, kv_head, q_head"
cur = conn.cursor()
cur.execute(sql, params)
rows = cur.fetchall()
if not rows:
return pd.DataFrame()
cols = [d[0] for d in cur.description]
return pd.DataFrame([dict(zip(cols, row)) for row in rows])
def get_analyzed_models(conn: sqlite3.Connection) -> pd.DataFrame:
"""่Žทๅ–ๆ‰€ๆœ‰ๅทฒๅˆ†ๆžๆจกๅž‹ๅˆ—่กจ"""
cur = conn.cursor()
cur.execute(
"""
SELECT
m.model_id,
m.model_type,
m.analyzed_at,
m.analyze_sec,
COUNT(DISTINCT c.prefix) as n_components,
SUM(c.n_layers) as total_layers
FROM models m
LEFT JOIN components c ON m.model_id = c.model_id
GROUP BY m.model_id
ORDER BY m.analyzed_at DESC
"""
)
rows = cur.fetchall()
if not rows:
return pd.DataFrame()
cols = [d[0] for d in cur.description]
return pd.DataFrame([dict(zip(cols, row)) for row in rows])
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ๆ–ญ็‚น็ปญไผ ็Šถๆ€
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def get_resume_status(
conn: sqlite3.Connection,
model_id: str,
prefix: str,
) -> dict:
"""
ๆŸฅ่ฏขๆŸ (model_id, prefix) ็š„ๆ–ญ็‚น็ปญไผ ็Šถๆ€
่ฟ”ๅ›žๅทฒๅฎŒๆˆ็š„ๅฑ‚ๅท้›†ๅˆๅ’Œ็ปŸ่ฎกไฟกๆฏ
"""
cur = conn.cursor()
# ๅทฒๅฎŒๆˆ็š„ๅฑ‚
cur.execute(
"""
SELECT DISTINCT layer, COUNT(*) as n_heads
FROM layer_head_metrics
WHERE model_id = ? AND prefix = ?
GROUP BY layer
ORDER BY layer
""",
(model_id, prefix)
)
rows = cur.fetchall()
done_layers = {r[0]: r[1] for r in rows}
return {
"done_layers": set(done_layers.keys()),
"layer_detail": done_layers, # layer โ†’ n_heads
"total_done": len(done_layers),
}