Add phd_research_os/db.py

Browse files

Files changed (1) hide show

phd_research_os/db.py +521 -0

phd_research_os/db.py ADDED Viewed

	@@ -0,0 +1,521 @@

+"""
+PhD Research OS — Core Data Layer (Phase 0)
+============================================
+SQLite database with CRUD operations for Claim, Source, Goal, Conflict,
+Decision, Override, and Experiment objects.
+All objects carry schema_version tags. Fixed-point confidence math.
+"""
+import sqlite3
+import json
+import uuid
+import os
+from datetime import datetime, timezone
+from typing import Optional
+from dataclasses import dataclass, field, asdict
+SCHEMA_VERSION = "1.0"
+DB_PATH = os.environ.get("RESEARCH_OS_DB", "data/research_os.db")
+def get_db(db_path: str = None) -> sqlite3.Connection:
+    """Get database connection with WAL mode and foreign keys."""
+    path = db_path or DB_PATH
+    os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
+    conn = sqlite3.connect(path)
+    conn.row_factory = sqlite3.Row
+    conn.execute("PRAGMA journal_mode=WAL")
+    conn.execute("PRAGMA foreign_keys=ON")
+    return conn
+def init_db(db_path: str = None):
+    """Initialize all tables."""
+    conn = get_db(db_path)
+    conn.executescript("""
+    CREATE TABLE IF NOT EXISTS claims (
+        claim_id TEXT PRIMARY KEY,
+        text TEXT NOT NULL,
+        status TEXT NOT NULL CHECK(status IN ('Complete', 'Incomplete')),
+        epistemic_tag TEXT NOT NULL CHECK(epistemic_tag IN ('Fact', 'Interpretation', 'Hypothesis', 'Conflict_Hypothesis')),
+        confidence INTEGER NOT NULL,  -- Fixed-point: actual * 1000
+        evidence_strength INTEGER,    -- Fixed-point: actual * 1000
+        study_type TEXT,
+        study_quality_weight INTEGER, -- Fixed-point: actual * 1000
+        journal_tier_weight INTEGER,  -- Fixed-point: actual * 1000
+        completeness_penalty INTEGER, -- Fixed-point: actual * 1000
+        source_doi TEXT,
+        source_type TEXT,
+        missing_fields TEXT,          -- JSON array
+        parameters TEXT,              -- JSON dict
+        is_canonical INTEGER DEFAULT 0,
+        expert_override TEXT,         -- JSON object if set
+        schema_version TEXT NOT NULL DEFAULT '1.0',
+        created_at TEXT NOT NULL,
+        updated_at TEXT NOT NULL
+    );
+    CREATE TABLE IF NOT EXISTS sources (
+        doi TEXT PRIMARY KEY,
+        title TEXT NOT NULL,
+        authors TEXT,                 -- JSON array
+        year INTEGER,
+        journal TEXT,
+        journal_tier INTEGER CHECK(journal_tier IN (1, 2, 3)),
+        study_type TEXT,
+        is_canonical INTEGER DEFAULT 0,
+        source_type TEXT,
+        schema_version TEXT NOT NULL DEFAULT '1.0',
+        created_at TEXT NOT NULL
+    );
+    CREATE TABLE IF NOT EXISTS goals (
+        goal_id TEXT PRIMARY KEY,
+        description TEXT NOT NULL,
+        priority TEXT NOT NULL CHECK(priority IN ('high', 'medium', 'low')),
+        status TEXT NOT NULL CHECK(status IN ('Active', 'Paused', 'Completed')),
+        linked_claim_ids TEXT,        -- JSON array
+        schema_version TEXT NOT NULL DEFAULT '1.0',
+        created_at TEXT NOT NULL,
+        updated_at TEXT NOT NULL
+    );
+    CREATE TABLE IF NOT EXISTS conflicts (
+        conflict_id TEXT PRIMARY KEY,
+        claim_a_id TEXT NOT NULL,
+        claim_b_id TEXT NOT NULL,
+        conflict_type TEXT NOT NULL CHECK(conflict_type IN ('value_mismatch', 'methodology_difference', 'scope_difference', 'no_conflict')),
+        generated_hypothesis TEXT,
+        hypothesis_confidence TEXT DEFAULT 'low',
+        resolution_status TEXT NOT NULL DEFAULT 'Unresolved' CHECK(resolution_status IN ('Unresolved', 'Human_Resolved')),
+        resolution_notes TEXT,
+        key_differences TEXT,         -- JSON array
+        schema_version TEXT NOT NULL DEFAULT '1.0',
+        created_at TEXT NOT NULL,
+        resolved_at TEXT,
+        FOREIGN KEY(claim_a_id) REFERENCES claims(claim_id),
+        FOREIGN KEY(claim_b_id) REFERENCES claims(claim_id)
+    );
+    CREATE TABLE IF NOT EXISTS decisions (
+        decision_id TEXT PRIMARY KEY,
+        linked_claim_ids TEXT,        -- JSON array
+        recommended_action TEXT NOT NULL,
+        action_description TEXT,
+        expected_information_gain INTEGER,  -- Fixed-point * 1000
+        linked_goal_id TEXT,
+        status TEXT NOT NULL DEFAULT 'Proposed' CHECK(status IN ('Proposed', 'Accepted', 'Rejected', 'Completed')),
+        why_not_log TEXT,             -- Reason if rejected
+        priority TEXT DEFAULT 'medium',
+        estimated_effort TEXT,
+        schema_version TEXT NOT NULL DEFAULT '1.0',
+        created_at TEXT NOT NULL,
+        FOREIGN KEY(linked_goal_id) REFERENCES goals(goal_id)
+    );
+    CREATE TABLE IF NOT EXISTS overrides (
+        override_id TEXT PRIMARY KEY,
+        claim_id TEXT NOT NULL,
+        who TEXT NOT NULL,
+        rationale TEXT NOT NULL,
+        authoritative_source TEXT,
+        original_confidence INTEGER,  -- Fixed-point * 1000
+        new_confidence INTEGER,       -- Fixed-point * 1000
+        status TEXT DEFAULT 'Provisional',
+        schema_version TEXT NOT NULL DEFAULT '1.0',
+        created_at TEXT NOT NULL,
+        FOREIGN KEY(claim_id) REFERENCES claims(claim_id)
+    );
+    CREATE TABLE IF NOT EXISTS experiments (
+        experiment_id TEXT PRIMARY KEY,
+        date TEXT NOT NULL,
+        instrument TEXT,
+        measurement_type TEXT,
+        raw_file_path TEXT,
+        summary_statistics TEXT,      -- JSON dict
+        linked_claim_ids TEXT,        -- JSON array
+        approved INTEGER DEFAULT 0,
+        schema_version TEXT NOT NULL DEFAULT '1.0',
+        created_at TEXT NOT NULL
+    );
+    CREATE TABLE IF NOT EXISTS api_usage_log (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        timestamp TEXT NOT NULL,
+        model TEXT NOT NULL,
+        tokens_in INTEGER NOT NULL,
+        tokens_out INTEGER NOT NULL,
+        cost_usd REAL NOT NULL,
+        task_type TEXT
+    );
+    CREATE TABLE IF NOT EXISTS calibration_log (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        conflict_id TEXT,
+        system_predicted_confidence INTEGER,  -- Fixed-point * 1000
+        human_actual_judgment TEXT,
+        timestamp TEXT NOT NULL
+    );
+    CREATE TABLE IF NOT EXISTS embedding_cache (
+        text_hash TEXT PRIMARY KEY,
+        embedding BLOB NOT NULL,
+        model TEXT NOT NULL,
+        created_at TEXT NOT NULL
+    );
+    """)
+    conn.commit()
+    conn.close()
+# ============================================================
+# Fixed-Point Math Utilities
+# ============================================================
+def to_fixed(value: float) -> int:
+    """Convert float to fixed-point integer (×1000).
+    Research OS Rule 5: All probability/confidence use scaled integers."""
+    return round(value * 1000)
+def from_fixed(value: int) -> float:
+    """Convert fixed-point integer back to float."""
+    return value / 1000.0
+# ============================================================
+# CRUD Operations
+# ============================================================
+def now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+def gen_id(prefix: str) -> str:
+    return f"{prefix}_{uuid.uuid4().hex[:8].upper()}"
+# --- Claims ---
+def create_claim(conn: sqlite3.Connection, text: str, epistemic_tag: str,
+                 confidence: float, source_doi: str = None,
+                 evidence_strength: float = None, study_type: str = None,
+                 study_quality_weight: float = None, journal_tier_weight: float = None,
+                 completeness_penalty: float = None, missing_fields: list = None,
+                 parameters: dict = None, is_canonical: bool = False) -> str:
+    """Create a new Claim Object. Returns claim_id."""
+    claim_id = gen_id("CLM")
+    status = "Incomplete" if missing_fields else "Complete"
+    now = now_iso()
+    conn.execute("""
+        INSERT INTO claims (claim_id, text, status, epistemic_tag, confidence,
+            evidence_strength, study_type, study_quality_weight, journal_tier_weight,
+            completeness_penalty, source_doi, missing_fields, parameters,
+            is_canonical, schema_version, created_at, updated_at)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+    """, (claim_id, text, status, epistemic_tag, to_fixed(confidence),
+          to_fixed(evidence_strength) if evidence_strength else None,
+          study_type,
+          to_fixed(study_quality_weight) if study_quality_weight else None,
+          to_fixed(journal_tier_weight) if journal_tier_weight else None,
+          to_fixed(completeness_penalty) if completeness_penalty else None,
+          source_doi,
+          json.dumps(missing_fields or []),
+          json.dumps(parameters or {}),
+          int(is_canonical),
+          SCHEMA_VERSION, now, now))
+    conn.commit()
+    return claim_id
+def get_claim(conn: sqlite3.Connection, claim_id: str) -> Optional[dict]:
+    """Get a single claim by ID."""
+    row = conn.execute("SELECT * FROM claims WHERE claim_id = ?", (claim_id,)).fetchone()
+    if row is None:
+        return None
+    d = dict(row)
+    # Convert fixed-point back to float
+    for key in ['confidence', 'evidence_strength', 'study_quality_weight',
+                'journal_tier_weight', 'completeness_penalty']:
+        if d.get(key) is not None:
+            d[key] = from_fixed(d[key])
+    d['missing_fields'] = json.loads(d.get('missing_fields', '[]'))
+    d['parameters'] = json.loads(d.get('parameters', '{}'))
+    if d.get('expert_override'):
+        d['expert_override'] = json.loads(d['expert_override'])
+    return d
+def update_claim(conn: sqlite3.Connection, claim_id: str, **kwargs) -> bool:
+    """Update claim fields. Fixed-point conversion applied automatically."""
+    fixed_fields = {'confidence', 'evidence_strength', 'study_quality_weight',
+                    'journal_tier_weight', 'completeness_penalty'}
+    json_fields = {'missing_fields', 'parameters', 'expert_override'}
+    updates = []
+    values = []
+    for key, val in kwargs.items():
+        if key in fixed_fields and val is not None:
+            updates.append(f"{key} = ?")
+            values.append(to_fixed(val))
+        elif key in json_fields and val is not None:
+            updates.append(f"{key} = ?")
+            values.append(json.dumps(val))
+        else:
+            updates.append(f"{key} = ?")
+            values.append(val)
+    updates.append("updated_at = ?")
+    values.append(now_iso())
+    values.append(claim_id)
+    if updates:
+        conn.execute(f"UPDATE claims SET {', '.join(updates)} WHERE claim_id = ?", values)
+        conn.commit()
+        return True
+    return False
+def search_claims(conn: sqlite3.Connection, query: str = None,
+                  epistemic_tag: str = None, min_confidence: float = None,
+                  source_doi: str = None, limit: int = 50) -> list:
+    """Search claims with optional filters."""
+    conditions = []
+    params = []
+    if query:
+        conditions.append("text LIKE ?")
+        params.append(f"%{query}%")
+    if epistemic_tag:
+        conditions.append("epistemic_tag = ?")
+        params.append(epistemic_tag)
+    if min_confidence is not None:
+        conditions.append("confidence >= ?")
+        params.append(to_fixed(min_confidence))
+    if source_doi:
+        conditions.append("source_doi = ?")
+        params.append(source_doi)
+    where = " AND ".join(conditions) if conditions else "1=1"
+    rows = conn.execute(
+        f"SELECT * FROM claims WHERE {where} ORDER BY confidence DESC LIMIT ?",
+        params + [limit]
+    ).fetchall()
+    results = []
+    for row in rows:
+        d = dict(row)
+        for key in ['confidence', 'evidence_strength', 'study_quality_weight',
+                     'journal_tier_weight', 'completeness_penalty']:
+            if d.get(key) is not None:
+                d[key] = from_fixed(d[key])
+        d['missing_fields'] = json.loads(d.get('missing_fields', '[]'))
+        d['parameters'] = json.loads(d.get('parameters', '{}'))
+        results.append(d)
+    return results
+# --- Sources ---
+def create_source(conn: sqlite3.Connection, doi: str, title: str,
+                  authors: list = None, year: int = None, journal: str = None,
+                  journal_tier: int = None, study_type: str = None,
+                  is_canonical: bool = False) -> str:
+    """Create a Source Object."""
+    conn.execute("""
+        INSERT OR REPLACE INTO sources (doi, title, authors, year, journal, journal_tier,
+            study_type, is_canonical, schema_version, created_at)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+    """, (doi, title, json.dumps(authors or []), year, journal, journal_tier,
+          study_type, int(is_canonical), SCHEMA_VERSION, now_iso()))
+    conn.commit()
+    return doi
+def get_source(conn: sqlite3.Connection, doi: str) -> Optional[dict]:
+    """Get source by DOI."""
+    row = conn.execute("SELECT * FROM sources WHERE doi = ?", (doi,)).fetchone()
+    if row is None:
+        return None
+    d = dict(row)
+    d['authors'] = json.loads(d.get('authors', '[]'))
+    return d
+# --- Goals ---
+def create_goal(conn: sqlite3.Connection, description: str, priority: str,
+                linked_claim_ids: list = None) -> str:
+    """Create a Goal Object."""
+    goal_id = gen_id("GOAL")
+    now = now_iso()
+    conn.execute("""
+        INSERT INTO goals (goal_id, description, priority, status, linked_claim_ids,
+            schema_version, created_at, updated_at)
+        VALUES (?, ?, ?, 'Active', ?, ?, ?, ?)
+    """, (goal_id, description, priority, json.dumps(linked_claim_ids or []),
+          SCHEMA_VERSION, now, now))
+    conn.commit()
+    return goal_id
+def get_goals_by_priority(conn: sqlite3.Connection, status: str = "Active") -> list:
+    """Get goals sorted by priority."""
+    priority_order = {"high": 1, "medium": 2, "low": 3}
+    rows = conn.execute(
+        "SELECT * FROM goals WHERE status = ? ORDER BY priority", (status,)
+    ).fetchall()
+    results = [dict(r) for r in rows]
+    for r in results:
+        r['linked_claim_ids'] = json.loads(r.get('linked_claim_ids', '[]'))
+    results.sort(key=lambda x: priority_order.get(x['priority'], 99))
+    return results
+# --- Conflicts ---
+def create_conflict(conn: sqlite3.Connection, claim_a_id: str, claim_b_id: str,
+                    conflict_type: str, generated_hypothesis: str = None,
+                    key_differences: list = None) -> str:
+    """Create a Conflict Resolution Object."""
+    conflict_id = gen_id("CONF")
+    conn.execute("""
+        INSERT INTO conflicts (conflict_id, claim_a_id, claim_b_id, conflict_type,
+            generated_hypothesis, hypothesis_confidence, key_differences,
+            schema_version, created_at)
+        VALUES (?, ?, ?, ?, ?, 'low', ?, ?, ?)
+    """, (conflict_id, claim_a_id, claim_b_id, conflict_type,
+          generated_hypothesis, json.dumps(key_differences or []),
+          SCHEMA_VERSION, now_iso()))
+    conn.commit()
+    return conflict_id
+# --- Decisions ---
+def create_decision(conn: sqlite3.Connection, recommended_action: str,
+                    action_description: str, expected_info_gain: float,
+                    linked_goal_id: str = None, linked_claim_ids: list = None,
+                    priority: str = "medium", estimated_effort: str = None) -> str:
+    """Create a Decision Object."""
+    dec_id = gen_id("DEC")
+    conn.execute("""
+        INSERT INTO decisions (decision_id, linked_claim_ids, recommended_action,
+            action_description, expected_information_gain, linked_goal_id,
+            priority, estimated_effort, schema_version, created_at)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+    """, (dec_id, json.dumps(linked_claim_ids or []), recommended_action,
+          action_description, to_fixed(expected_info_gain), linked_goal_id,
+          priority, estimated_effort, SCHEMA_VERSION, now_iso()))
+    conn.commit()
+    return dec_id
+# --- Overrides ---
+def create_override(conn: sqlite3.Connection, claim_id: str, who: str,
+                    rationale: str, new_confidence: float,
+                    authoritative_source: str = None) -> str:
+    """Create an expert override. Locks claim confidence."""
+    override_id = gen_id("OVR")
+    # Get original confidence
+    claim = get_claim(conn, claim_id)
+    if not claim:
+        raise ValueError(f"Claim {claim_id} not found")
+    conn.execute("""
+        INSERT INTO overrides (override_id, claim_id, who, rationale,
+            authoritative_source, original_confidence, new_confidence,
+            schema_version, created_at)
+        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+    """, (override_id, claim_id, who, rationale, authoritative_source,
+          to_fixed(claim['confidence']), to_fixed(new_confidence),
+          SCHEMA_VERSION, now_iso()))
+    # Update the claim with override
+    update_claim(conn, claim_id, confidence=new_confidence,
+                 expert_override={"override_id": override_id, "who": who,
+                                  "rationale": rationale})
+    conn.commit()
+    return override_id
+# --- API Usage Logging ---
+def log_api_usage(conn: sqlite3.Connection, model: str, tokens_in: int,
+                  tokens_out: int, cost_usd: float, task_type: str = None):
+    """Log API usage for cost tracking."""
+    conn.execute("""
+        INSERT INTO api_usage_log (timestamp, model, tokens_in, tokens_out, cost_usd, task_type)
+        VALUES (?, ?, ?, ?, ?, ?)
+    """, (now_iso(), model, tokens_in, tokens_out, cost_usd, task_type))
+    conn.commit()
+def get_cost_summary(conn: sqlite3.Connection, days: int = 7) -> dict:
+    """Get API cost summary for the last N days."""
+    row = conn.execute("""
+        SELECT SUM(cost_usd) as total_cost, SUM(tokens_in) as total_in,
+               SUM(tokens_out) as total_out, COUNT(*) as num_calls
+        FROM api_usage_log
+        WHERE timestamp >= datetime('now', ?)
+    """, (f"-{days} days",)).fetchone()
+    return dict(row) if row else {"total_cost": 0, "total_in": 0, "total_out": 0, "num_calls": 0}
+if __name__ == "__main__":
+    # Self-test
+    init_db("test_research_os.db")
+    conn = get_db("test_research_os.db")
+    # Test claim CRUD
+    cid = create_claim(conn, "Test claim about graphene", "Fact", 0.85,
+                       evidence_strength=0.9, study_quality_weight=1.0,
+                       journal_tier_weight=1.0, completeness_penalty=1.0)
+    claim = get_claim(conn, cid)
+    assert claim is not None
+    assert claim['confidence'] == 0.85
+    assert claim['status'] == 'Complete'
+    # Test with missing fields
+    cid2 = create_claim(conn, "Incomplete claim", "Hypothesis", 0.35,
+                        missing_fields=["temperature", "pH"])
+    claim2 = get_claim(conn, cid2)
+    assert claim2['status'] == 'Incomplete'
+    assert len(claim2['missing_fields']) == 2
+    # Test source
+    create_source(conn, "10.1234/test", "Test Paper", ["Author A"], 2024,
+                  "Nature", 1)
+    src = get_source(conn, "10.1234/test")
+    assert src is not None
+    # Test goal
+    gid = create_goal(conn, "Achieve LOD < 1 fM", "high", [cid])
+    goals = get_goals_by_priority(conn)
+    assert len(goals) == 1
+    # Test conflict
+    conf_id = create_conflict(conn, cid, cid2, "value_mismatch",
+                              "Different measurement conditions")
+    # Test override
+    ovr_id = create_override(conn, cid, "Dr. Smith", "Expert knowledge", 0.95)
+    claim_updated = get_claim(conn, cid)
+    assert claim_updated['confidence'] == 0.95
+    assert claim_updated['expert_override'] is not None
+    # Test search
+    results = search_claims(conn, query="graphene")
+    assert len(results) >= 1
+    # Test fixed-point math
+    assert to_fixed(0.85) == 850
+    assert from_fixed(850) == 0.85
+    assert to_fixed(0.123) == 123
+    conn.close()
+    os.remove("test_research_os.db")
+    print("All database tests passed! ✓")