nkshirsa committed on
Commit
d555e10
·
verified ·
1 Parent(s): d45d708

v2.0: phd_research_os_v2/core/database.py

Browse files
Files changed (1) hide show
  1. phd_research_os_v2/core/database.py +444 -0
phd_research_os_v2/core/database.py ADDED
@@ -0,0 +1,444 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhD Research OS v2.0 β€” Core Database
3
+ ======================================
4
+ SQLite database supporting all 7 layers. Fixed-point math throughout.
5
+ Every table has schema_version. Every record has created_at.
6
+ """
7
+
8
+ import sqlite3
9
+ import json
10
+ import uuid
11
+ import os
12
+ import hashlib
13
+ from datetime import datetime, timezone
14
+ from typing import Optional
15
+
16
+ SCHEMA_VERSION = "2.0"
17
+ PIPELINE_VERSION = "2.1.0"
18
+ DB_PATH = os.environ.get("RESEARCH_OS_DB", "data/research_os_v2.db")
19
+
20
+
21
def to_fixed(value: float) -> Optional[int]:
    """Convert a float to a fixed-point integer (×1000). Research OS Rule 5.

    Mirrors ``from_fixed``: a ``None`` input passes through as ``None`` so
    values read from nullable fixed-point DB columns can round-trip without
    crashing. (Previously ``to_fixed(None)`` raised ``TypeError``.)

    Args:
        value: The float to scale, or ``None``.

    Returns:
        ``value * 1000`` rounded to the nearest integer (Python's built-in
        ``round``, i.e. banker's rounding on exact halves), or ``None`` when
        the input is ``None``.
    """
    if value is None:
        return None
    return round(value * 1000)
24
+
25
def from_fixed(value: int) -> float:
    """Convert a fixed-point integer (×1000) back to a float.

    A ``None`` input (e.g. a NULL database column) maps to ``0.0``.
    """
    if value is None:
        return 0.0
    return value / 1000.0
28
+
29
def now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    utc_now = datetime.now(timezone.utc)
    return utc_now.isoformat()
31
+
32
def gen_id(prefix: str) -> str:
    """Generate a readable unique id: ``<prefix>_`` + 8 uppercase hex chars."""
    token = uuid.uuid4().hex[:8].upper()
    return f"{prefix}_{token}"
34
+
35
def hash_text(text: str) -> str:
    """Return a short content fingerprint: first 16 hex chars of SHA-256."""
    digest = hashlib.sha256(text.encode()).hexdigest()
    return digest[:16]
37
+
38
+
39
def get_db(db_path: str = None) -> sqlite3.Connection:
    """Open a SQLite connection with the project's standard settings.

    Ensures the database's parent directory exists, enables WAL journaling
    and foreign-key enforcement, and returns rows as ``sqlite3.Row`` so
    columns are addressable by name.

    Args:
        db_path: Path to the database file; defaults to ``DB_PATH``.

    Returns:
        An open ``sqlite3.Connection``.
    """
    path = db_path or DB_PATH
    parent = os.path.dirname(path) or "."
    os.makedirs(parent, exist_ok=True)
    conn = sqlite3.connect(path)
    conn.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        conn.execute(pragma)
    return conn
47
+
48
+
49
def init_db(db_path: str = None):
    """Initialize ALL tables for the complete 7-layer system.

    Idempotent: every statement uses ``IF NOT EXISTS`` / ``INSERT OR
    IGNORE``, so calling this on an existing database is safe.

    Args:
        db_path: Optional database path; defaults to ``DB_PATH`` via get_db.
    """
    conn = get_db(db_path)
    # One script creates the full schema: layers 0-7 plus cross-cutting
    # tables. Fixed-point INTEGER columns store floats ×1000 (Rule 5).
    conn.executescript("""
    -- ═══════════════════════════════════════════════════════════
    -- LAYER 0: Structural Ingestion
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS documents (
        doc_id TEXT PRIMARY KEY,
        file_path TEXT NOT NULL,
        doc_type TEXT NOT NULL DEFAULT 'main', -- main, supplement, dataset, code_repo
        title TEXT,
        doi TEXT,
        arxiv_id TEXT,
        vor_status TEXT DEFAULT 'unknown', -- preprint, vor, erratum, retracted
        lineage_parent TEXT, -- doc_id of parent version
        parse_method TEXT,
        parse_quality_avg INTEGER, -- Fixed-point ×1000
        total_regions INTEGER DEFAULT 0,
        total_pages INTEGER DEFAULT 0,
        ingestion_status TEXT DEFAULT 'pending', -- pending, processing, complete, failed
        metadata TEXT, -- JSON
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS regions (
        region_id TEXT PRIMARY KEY,
        doc_id TEXT NOT NULL,
        page INTEGER NOT NULL,
        bbox TEXT, -- JSON: [x1, y1, x2, y2]
        region_type TEXT NOT NULL, -- body_text, table, figure, equation, caption, header, reference, footnote
        section TEXT,
        subsection TEXT,
        content_text TEXT,
        content_markdown TEXT,
        parse_method TEXT,
        parse_confidence INTEGER, -- Fixed-point ×1000
        ocr_source INTEGER DEFAULT 0,
        extraction_status TEXT DEFAULT 'extractable', -- extractable, low_confidence, unextractable
        quality_flags TEXT, -- JSON array
        cross_refs TEXT, -- JSON array of {ref_text, ref_type, resolved_to, verified}
        figure_type TEXT, -- scatter_plot, bar_chart, diagram, micrograph, schematic, null
        digitized_data TEXT, -- JSON: recovered data points from plot
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        FOREIGN KEY(doc_id) REFERENCES documents(doc_id)
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 1: Entity Resolution
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS entities (
        entity_id TEXT PRIMARY KEY,
        canonical_name TEXT NOT NULL,
        entity_type TEXT NOT NULL, -- gene, protein, chemical, assay, disease, instrument, method
        aliases TEXT, -- JSON array of alternative names
        external_ids TEXT, -- JSON: {uniprot: ..., pubchem: ..., mesh: ...}
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS citation_chains (
        chain_id TEXT PRIMARY KEY,
        citing_doc TEXT NOT NULL,
        cited_doi TEXT,
        cited_title TEXT,
        in_text_ref TEXT, -- e.g., "[32]"
        is_in_knowledge_base INTEGER DEFAULT 0,
        resolved_doc_id TEXT,
        chain_type TEXT DEFAULT 'direct', -- direct, inherited, self_cite
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        FOREIGN KEY(citing_doc) REFERENCES documents(doc_id)
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 2: Qualified Extraction (Claims)
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS claims (
        claim_id TEXT PRIMARY KEY,
        canonical_id TEXT, -- Layer 3: canonical claim this maps to
        text TEXT NOT NULL,
        epistemic_tag TEXT NOT NULL CHECK(epistemic_tag IN
            ('Fact','Interpretation','Hypothesis','Conflict_Hypothesis')),

        -- Confidence components (all fixed-point ×1000)
        evidence_strength INTEGER,
        study_quality_weight INTEGER,
        journal_tier_weight INTEGER,
        completeness_penalty INTEGER,
        section_modifier INTEGER,
        qualifier_penalty INTEGER,

        -- Computed scores (code-computed, NOT LLM-stated)
        evidence_quality INTEGER, -- Fixed-point ×1000
        truth_likelihood INTEGER, -- Fixed-point ×1000
        qualifier_strength_score INTEGER, -- Fixed-point ×1000
        composite_confidence INTEGER, -- Fixed-point ×1000

        status TEXT NOT NULL CHECK(status IN ('Complete','Incomplete','Unextractable')),
        is_null_result INTEGER DEFAULT 0,
        is_inherited_citation INTEGER DEFAULT 0,
        causal_direction TEXT DEFAULT 'unspecified', -- observed_correlation, causal_claim, unspecified
        practical_significance INTEGER DEFAULT 1,

        -- Qualifiers
        qualifiers TEXT, -- JSON array
        missing_fields TEXT, -- JSON array

        -- Statistical evidence
        stat_p_value REAL,
        stat_effect_size REAL,
        stat_effect_type TEXT,
        stat_sample_size INTEGER,
        stat_ci_lower REAL,
        stat_ci_upper REAL,

        -- Source provenance
        source_quote TEXT,
        source_page INTEGER,
        source_bbox TEXT, -- JSON: [x1, y1, x2, y2]
        source_section TEXT,
        source_region_id TEXT,
        source_doc_id TEXT,
        source_doi TEXT,

        -- Council provenance
        council_votes TEXT, -- JSON: {member: {tag, reasoning}}

        -- Granularity
        granularity TEXT DEFAULT 'atomic', -- atomic, aggregate
        parent_claim_id TEXT,

        -- Version tracking
        ontology_version TEXT,
        pipeline_version TEXT DEFAULT '2.1.0',
        taxonomy_version TEXT,
        extraction_timestamp TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 3: Canonicalization
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS canonical_claims (
        canonical_id TEXT PRIMARY KEY,
        representative_text TEXT NOT NULL,
        epistemic_tag TEXT NOT NULL,
        composite_confidence INTEGER, -- Aggregated across sources
        evidence_count INTEGER DEFAULT 1,
        source_dois TEXT, -- JSON array
        aliases TEXT, -- JSON array of claim_ids
        version_history TEXT, -- JSON array of {version, source, confidence, date}
        current_version INTEGER DEFAULT 1,
        supersedes TEXT,
        superseded_by TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 4: Knowledge Graph
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS graph_nodes (
        node_id TEXT PRIMARY KEY,
        node_type TEXT NOT NULL, -- claim, entity, method, condition, lab
        label TEXT NOT NULL,
        properties TEXT, -- JSON
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS graph_edges (
        edge_id TEXT PRIMARY KEY,
        source_node TEXT NOT NULL,
        target_node TEXT NOT NULL,
        edge_type TEXT NOT NULL, -- supports, refutes, extends, depends_on, supersedes, blocks, investigative_hypothesis
        confidence INTEGER NOT NULL, -- Fixed-point ×1000
        evidence_sources TEXT, -- JSON array of DOIs
        is_inferred INTEGER DEFAULT 0,
        inference_chain TEXT, -- JSON
        method_compatible INTEGER, -- NULL, 0, 1
        resolution_id TEXT,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL,
        FOREIGN KEY(source_node) REFERENCES graph_nodes(node_id),
        FOREIGN KEY(target_node) REFERENCES graph_nodes(node_id)
    );

    CREATE INDEX IF NOT EXISTS idx_edges_source ON graph_edges(source_node);
    CREATE INDEX IF NOT EXISTS idx_edges_target ON graph_edges(target_node);
    CREATE INDEX IF NOT EXISTS idx_edges_type ON graph_edges(edge_type);

    CREATE TABLE IF NOT EXISTS conflicts (
        conflict_id TEXT PRIMARY KEY,
        claim_a_id TEXT NOT NULL,
        claim_b_id TEXT NOT NULL,
        conflict_type TEXT NOT NULL,
        generated_hypothesis TEXT,
        hypothesis_confidence TEXT DEFAULT 'low',
        resolution_status TEXT DEFAULT 'Unresolved',
        resolution_id TEXT,
        comparability_confidence INTEGER, -- Fixed-point ×1000
        method_comparison TEXT, -- JSON
        case_file_id TEXT,
        key_differences TEXT, -- JSON
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        resolved_at TEXT
    );

    CREATE TABLE IF NOT EXISTS case_files (
        case_file_id TEXT PRIMARY KEY,
        root_cause TEXT NOT NULL,
        conflict_ids TEXT NOT NULL, -- JSON array
        status TEXT DEFAULT 'open',
        resolution_summary TEXT,
        created_at TEXT NOT NULL,
        resolved_at TEXT
    );

    CREATE TABLE IF NOT EXISTS resolutions (
        resolution_id TEXT PRIMARY KEY,
        conflict_id TEXT,
        case_file_id TEXT,
        resolved_by TEXT NOT NULL,
        resolution_type TEXT NOT NULL,
        rationale TEXT NOT NULL,
        evidence_cited TEXT, -- JSON array
        status TEXT DEFAULT 'active', -- active, under_review, superseded
        superseded_by TEXT,
        downstream_resolutions TEXT, -- JSON array
        created_at TEXT NOT NULL,
        reopened_at TEXT,
        reopen_reason TEXT
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 5: Scoring (calibration tracking)
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS calibration_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        claim_id TEXT,
        system_confidence INTEGER, -- Fixed-point ×1000
        human_judgment TEXT, -- correct, partially_correct, incorrect
        brier_contribution REAL,
        timestamp TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 6: Evaluation
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS eval_runs (
        run_id TEXT PRIMARY KEY,
        run_type TEXT NOT NULL, -- regression, llm_judge, calibration, holdout
        metrics TEXT NOT NULL, -- JSON
        passed INTEGER,
        pipeline_version TEXT,
        model_checkpoint TEXT,
        prompt_hash TEXT,
        created_at TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 7: Provenance
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS pipeline_lineage (
        lineage_id TEXT PRIMARY KEY,
        claim_id TEXT NOT NULL,
        pipeline_version TEXT NOT NULL,
        model_checkpoint TEXT,
        parser_version TEXT,
        taxonomy_version TEXT,
        prompt_hash TEXT,
        extraction_timestamp TEXT NOT NULL,
        FOREIGN KEY(claim_id) REFERENCES claims(claim_id)
    );

    -- ═══════════════════════════════════════════════════════════
    -- CROSS-CUTTING: Sources, Goals, Taxonomy, API Logging
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS sources (
        doi TEXT PRIMARY KEY,
        title TEXT,
        authors TEXT, -- JSON array
        year INTEGER,
        journal TEXT,
        journal_tier INTEGER,
        study_type TEXT,
        is_canonical INTEGER DEFAULT 0,
        taxonomy_version TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS goals (
        goal_id TEXT PRIMARY KEY,
        description TEXT NOT NULL,
        priority TEXT NOT NULL CHECK(priority IN ('high','medium','low')),
        status TEXT NOT NULL DEFAULT 'Active',
        linked_claim_ids TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS decisions (
        decision_id TEXT PRIMARY KEY,
        recommended_action TEXT NOT NULL,
        action_description TEXT,
        expected_information_gain INTEGER,
        linked_goal_id TEXT,
        linked_claim_ids TEXT,
        status TEXT DEFAULT 'Proposed',
        priority TEXT DEFAULT 'medium',
        estimated_effort TEXT,
        source TEXT DEFAULT 'gap_analysis', -- gap_analysis, meta_improver, human
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS api_usage_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT NOT NULL,
        model TEXT NOT NULL,
        tokens_in INTEGER NOT NULL,
        tokens_out INTEGER NOT NULL,
        cost_usd REAL NOT NULL,
        task_type TEXT,
        layer TEXT
    );

    CREATE TABLE IF NOT EXISTS system_state (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );
    """)

    # Set initial system state. INSERT OR IGNORE seeds defaults without
    # overwriting values a previously initialized database may have updated.
    for key, value in [
        ("schema_version", SCHEMA_VERSION),
        ("pipeline_version", PIPELINE_VERSION),
        ("setup_phase", "0"),
        ("total_papers_ingested", "0"),
        ("total_claims_extracted", "0"),
    ]:
        conn.execute(
            "INSERT OR IGNORE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)",
            (key, value, now_iso())
        )

    conn.commit()
    conn.close()
415
+
416
+
417
def get_state(db_path: str = None, key: str = None) -> Optional[str]:
    """Look up a single value in the ``system_state`` table.

    Args:
        db_path: Optional database path override.
        key: State key to read.

    Returns:
        The stored value, or ``None`` when the key is absent.
    """
    conn = get_db(db_path)
    query = "SELECT value FROM system_state WHERE key = ?"
    row = conn.execute(query, (key,)).fetchone()
    conn.close()
    if row is None:
        return None
    return row[0]
422
+
423
+
424
def set_state(db_path: str = None, key: str = None, value: str = None):
    """Upsert a key/value pair into ``system_state``, stamping ``updated_at``.

    Args:
        db_path: Optional database path override.
        key: State key to write.
        value: Value to store for the key.
    """
    conn = get_db(db_path)
    params = (key, value, now_iso())
    conn.execute(
        "INSERT OR REPLACE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)",
        params,
    )
    conn.commit()
    conn.close()
432
+
433
+
434
def get_stats(db_path: str = None) -> dict:
    """Return row counts for the main tables.

    Tables that do not exist yet (e.g. before ``init_db`` has run) report
    0 instead of raising.

    Args:
        db_path: Optional database path override.

    Returns:
        Mapping of table name -> row count.
    """
    conn = get_db(db_path)
    stats = {}
    try:
        for table in ["documents", "regions", "claims", "canonical_claims",
                      "graph_nodes", "graph_edges", "conflicts", "sources", "goals"]:
            try:
                # Table names come from the fixed list above, so the f-string
                # interpolation is not an injection risk.
                stats[table] = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
            except sqlite3.Error:
                # Was a bare `except:` — narrow to SQLite errors so real bugs
                # (and KeyboardInterrupt) are no longer silently swallowed.
                stats[table] = 0
    finally:
        # Close the connection even if an unexpected error escapes the loop.
        conn.close()
    return stats