""" PhD Research OS v2.0 — Core Database ====================================== SQLite database supporting all 7 layers. Fixed-point math throughout. Every table has schema_version. Every record has created_at. """ import sqlite3 import json import uuid import os import hashlib from datetime import datetime, timezone from typing import Optional SCHEMA_VERSION = "2.0" PIPELINE_VERSION = "2.1.0" DB_PATH = os.environ.get("RESEARCH_OS_DB", "data/research_os_v2.db") def to_fixed(value: float) -> int: """Float → fixed-point integer (×1000). Research OS Rule 5.""" return round(value * 1000) def from_fixed(value: int) -> float: """Fixed-point integer → float.""" return value / 1000.0 if value is not None else 0.0 def now_iso() -> str: return datetime.now(timezone.utc).isoformat() def gen_id(prefix: str) -> str: return f"{prefix}_{uuid.uuid4().hex[:8].upper()}" def hash_text(text: str) -> str: return hashlib.sha256(text.encode()).hexdigest()[:16] def get_db(db_path: str = None) -> sqlite3.Connection: path = db_path or DB_PATH os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True) conn = sqlite3.connect(path) conn.row_factory = sqlite3.Row conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA foreign_keys=ON") return conn def init_db(db_path: str = None): """Initialize ALL tables for the complete 7-layer system.""" conn = get_db(db_path) conn.executescript(""" -- ═══════════════════════════════════════════════════════════ -- LAYER 0: Structural Ingestion -- ═══════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS documents ( doc_id TEXT PRIMARY KEY, file_path TEXT NOT NULL, doc_type TEXT NOT NULL DEFAULT 'main', -- main, supplement, dataset, code_repo title TEXT, doi TEXT, arxiv_id TEXT, vor_status TEXT DEFAULT 'unknown', -- preprint, vor, erratum, retracted lineage_parent TEXT, -- doc_id of parent version parse_method TEXT, parse_quality_avg INTEGER, -- Fixed-point ×1000 total_regions INTEGER DEFAULT 0, total_pages INTEGER DEFAULT 0, ingestion_status TEXT DEFAULT 'pending', -- pending, processing, complete, failed metadata TEXT, -- JSON schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL ); CREATE TABLE IF NOT EXISTS regions ( region_id TEXT PRIMARY KEY, doc_id TEXT NOT NULL, page INTEGER NOT NULL, bbox TEXT, -- JSON: [x1, y1, x2, y2] region_type TEXT NOT NULL, -- body_text, table, figure, equation, caption, header, reference, footnote section TEXT, subsection TEXT, content_text TEXT, content_markdown TEXT, parse_method TEXT, parse_confidence INTEGER, -- Fixed-point ×1000 ocr_source INTEGER DEFAULT 0, extraction_status TEXT DEFAULT 'extractable', -- extractable, low_confidence, unextractable quality_flags TEXT, -- JSON array cross_refs TEXT, -- JSON array of {ref_text, ref_type, resolved_to, verified} figure_type TEXT, -- scatter_plot, bar_chart, diagram, micrograph, schematic, null digitized_data TEXT, -- JSON: recovered data points from plot schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL, FOREIGN KEY(doc_id) REFERENCES documents(doc_id) ); -- ═══════════════════════════════════════════════════════════ -- LAYER 1: Entity Resolution -- ═══════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS entities ( entity_id TEXT PRIMARY KEY, canonical_name TEXT NOT NULL, entity_type TEXT NOT NULL, -- gene, protein, chemical, assay, disease, instrument, method aliases TEXT, -- JSON array of alternative names external_ids TEXT, -- JSON: {uniprot: ..., pubchem: ..., mesh: ...} schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL ); CREATE TABLE IF NOT EXISTS citation_chains ( chain_id TEXT PRIMARY KEY, citing_doc TEXT NOT NULL, cited_doi TEXT, cited_title TEXT, in_text_ref TEXT, -- e.g., "[32]" is_in_knowledge_base INTEGER DEFAULT 0, resolved_doc_id TEXT, chain_type TEXT DEFAULT 'direct', -- direct, inherited, self_cite schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL, FOREIGN KEY(citing_doc) REFERENCES documents(doc_id) ); -- ═══════════════════════════════════════════════════════════ -- LAYER 2: Qualified Extraction (Claims) -- ═══════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS claims ( claim_id TEXT PRIMARY KEY, canonical_id TEXT, -- Layer 3: canonical claim this maps to text TEXT NOT NULL, epistemic_tag TEXT NOT NULL CHECK(epistemic_tag IN ('Fact','Interpretation','Hypothesis','Conflict_Hypothesis')), -- Confidence components (all fixed-point ×1000) evidence_strength INTEGER, study_quality_weight INTEGER, journal_tier_weight INTEGER, completeness_penalty INTEGER, section_modifier INTEGER, qualifier_penalty INTEGER, -- Computed scores (code-computed, NOT LLM-stated) evidence_quality INTEGER, -- Fixed-point ×1000 truth_likelihood INTEGER, -- Fixed-point ×1000 qualifier_strength_score INTEGER, -- Fixed-point ×1000 composite_confidence INTEGER, -- Fixed-point ×1000 status TEXT NOT NULL CHECK(status IN ('Complete','Incomplete','Unextractable')), is_null_result INTEGER DEFAULT 0, is_inherited_citation INTEGER DEFAULT 0, causal_direction TEXT DEFAULT 'unspecified', -- observed_correlation, causal_claim, unspecified practical_significance INTEGER DEFAULT 1, -- Qualifiers qualifiers TEXT, -- JSON array missing_fields TEXT, -- JSON array -- Statistical evidence stat_p_value REAL, stat_effect_size REAL, stat_effect_type TEXT, stat_sample_size INTEGER, stat_ci_lower REAL, stat_ci_upper REAL, -- Source provenance source_quote TEXT, source_page INTEGER, source_bbox TEXT, -- JSON: [x1, y1, x2, y2] source_section TEXT, source_region_id TEXT, source_doc_id TEXT, source_doi TEXT, -- Council provenance council_votes TEXT, -- JSON: {member: {tag, reasoning}} -- Granularity granularity TEXT DEFAULT 'atomic', -- atomic, aggregate parent_claim_id TEXT, -- Version tracking ontology_version TEXT, pipeline_version TEXT DEFAULT '2.1.0', taxonomy_version TEXT, extraction_timestamp TEXT, schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL, updated_at TEXT NOT NULL ); -- ═══════════════════════════════════════════════════════════ -- LAYER 3: Canonicalization -- ═══════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS canonical_claims ( canonical_id TEXT PRIMARY KEY, representative_text TEXT NOT NULL, epistemic_tag TEXT NOT NULL, composite_confidence INTEGER, -- Aggregated across sources evidence_count INTEGER DEFAULT 1, source_dois TEXT, -- JSON array aliases TEXT, -- JSON array of claim_ids version_history TEXT, -- JSON array of {version, source, confidence, date} current_version INTEGER DEFAULT 1, supersedes TEXT, superseded_by TEXT, schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL, updated_at TEXT NOT NULL ); -- ═══════════════════════════════════════════════════════════ -- LAYER 4: Knowledge Graph -- ═══════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS graph_nodes ( node_id TEXT PRIMARY KEY, node_type TEXT NOT NULL, -- claim, entity, method, condition, lab label TEXT NOT NULL, properties TEXT, -- JSON created_at TEXT NOT NULL ); CREATE TABLE IF NOT EXISTS graph_edges ( edge_id TEXT PRIMARY KEY, source_node TEXT NOT NULL, target_node TEXT NOT NULL, edge_type TEXT NOT NULL, -- supports, refutes, extends, depends_on, supersedes, blocks, investigative_hypothesis confidence INTEGER NOT NULL, -- Fixed-point ×1000 evidence_sources TEXT, -- JSON array of DOIs is_inferred INTEGER DEFAULT 0, inference_chain TEXT, -- JSON method_compatible INTEGER, -- NULL, 0, 1 resolution_id TEXT, created_at TEXT NOT NULL, updated_at TEXT NOT NULL, FOREIGN KEY(source_node) REFERENCES graph_nodes(node_id), FOREIGN KEY(target_node) REFERENCES graph_nodes(node_id) ); CREATE INDEX IF NOT EXISTS idx_edges_source ON graph_edges(source_node); CREATE INDEX IF NOT EXISTS idx_edges_target ON graph_edges(target_node); CREATE INDEX IF NOT EXISTS idx_edges_type ON graph_edges(edge_type); CREATE TABLE IF NOT EXISTS conflicts ( conflict_id TEXT PRIMARY KEY, claim_a_id TEXT NOT NULL, claim_b_id TEXT NOT NULL, conflict_type TEXT NOT NULL, generated_hypothesis TEXT, hypothesis_confidence TEXT DEFAULT 'low', resolution_status TEXT DEFAULT 'Unresolved', resolution_id TEXT, comparability_confidence INTEGER, -- Fixed-point ×1000 method_comparison TEXT, -- JSON case_file_id TEXT, key_differences TEXT, -- JSON schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL, resolved_at TEXT ); CREATE TABLE IF NOT EXISTS case_files ( case_file_id TEXT PRIMARY KEY, root_cause TEXT NOT NULL, conflict_ids TEXT NOT NULL, -- JSON array status TEXT DEFAULT 'open', resolution_summary TEXT, created_at TEXT NOT NULL, resolved_at TEXT ); CREATE TABLE IF NOT EXISTS resolutions ( resolution_id TEXT PRIMARY KEY, conflict_id TEXT, case_file_id TEXT, resolved_by TEXT NOT NULL, resolution_type TEXT NOT NULL, rationale TEXT NOT NULL, evidence_cited TEXT, -- JSON array status TEXT DEFAULT 'active', -- active, under_review, superseded superseded_by TEXT, downstream_resolutions TEXT, -- JSON array created_at TEXT NOT NULL, reopened_at TEXT, reopen_reason TEXT ); -- ═══════════════════════════════════════════════════════════ -- LAYER 5: Scoring (calibration tracking) -- ═══════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS calibration_log ( id INTEGER PRIMARY KEY AUTOINCREMENT, claim_id TEXT, system_confidence INTEGER, -- Fixed-point ×1000 human_judgment TEXT, -- correct, partially_correct, incorrect brier_contribution REAL, timestamp TEXT NOT NULL ); -- ═══════════════════════════════════════════════════════════ -- LAYER 6: Evaluation -- ═══════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS eval_runs ( run_id TEXT PRIMARY KEY, run_type TEXT NOT NULL, -- regression, llm_judge, calibration, holdout metrics TEXT NOT NULL, -- JSON passed INTEGER, pipeline_version TEXT, model_checkpoint TEXT, prompt_hash TEXT, created_at TEXT NOT NULL ); -- ═══════════════════════════════════════════════════════════ -- LAYER 7: Provenance -- ═══════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS pipeline_lineage ( lineage_id TEXT PRIMARY KEY, claim_id TEXT NOT NULL, pipeline_version TEXT NOT NULL, model_checkpoint TEXT, parser_version TEXT, taxonomy_version TEXT, prompt_hash TEXT, extraction_timestamp TEXT NOT NULL, FOREIGN KEY(claim_id) REFERENCES claims(claim_id) ); -- ═══════════════════════════════════════════════════════════ -- CROSS-CUTTING: Sources, Goals, Taxonomy, API Logging -- ═══════════════════════════════════════════════════════════ CREATE TABLE IF NOT EXISTS sources ( doi TEXT PRIMARY KEY, title TEXT, authors TEXT, -- JSON array year INTEGER, journal TEXT, journal_tier INTEGER, study_type TEXT, is_canonical INTEGER DEFAULT 0, taxonomy_version TEXT, schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL ); CREATE TABLE IF NOT EXISTS goals ( goal_id TEXT PRIMARY KEY, description TEXT NOT NULL, priority TEXT NOT NULL CHECK(priority IN ('high','medium','low')), status TEXT NOT NULL DEFAULT 'Active', linked_claim_ids TEXT, schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL, updated_at TEXT NOT NULL ); CREATE TABLE IF NOT EXISTS decisions ( decision_id TEXT PRIMARY KEY, recommended_action TEXT NOT NULL, action_description TEXT, expected_information_gain INTEGER, linked_goal_id TEXT, linked_claim_ids TEXT, status TEXT DEFAULT 'Proposed', priority TEXT DEFAULT 'medium', estimated_effort TEXT, source TEXT DEFAULT 'gap_analysis', -- gap_analysis, meta_improver, human schema_version TEXT DEFAULT '2.0', created_at TEXT NOT NULL ); CREATE TABLE IF NOT EXISTS api_usage_log ( id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TEXT NOT NULL, model TEXT NOT NULL, tokens_in INTEGER NOT NULL, tokens_out INTEGER NOT NULL, cost_usd REAL NOT NULL, task_type TEXT, layer TEXT ); CREATE TABLE IF NOT EXISTS system_state ( key TEXT PRIMARY KEY, value TEXT NOT NULL, updated_at TEXT NOT NULL ); """) # Set initial system state for key, value in [ ("schema_version", SCHEMA_VERSION), ("pipeline_version", PIPELINE_VERSION), ("setup_phase", "0"), ("total_papers_ingested", "0"), ("total_claims_extracted", "0"), ]: conn.execute( "INSERT OR IGNORE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)", (key, value, now_iso()) ) conn.commit() conn.close() def get_state(db_path: str = None, key: str = None) -> Optional[str]: conn = get_db(db_path) row = conn.execute("SELECT value FROM system_state WHERE key = ?", (key,)).fetchone() conn.close() return row[0] if row else None def set_state(db_path: str = None, key: str = None, value: str = None): conn = get_db(db_path) conn.execute( "INSERT OR REPLACE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)", (key, value, now_iso()) ) conn.commit() conn.close() def get_stats(db_path: str = None) -> dict: conn = get_db(db_path) stats = {} for table in ["documents", "regions", "claims", "canonical_claims", "graph_nodes", "graph_edges", "conflicts", "sources", "goals"]: try: stats[table] = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] except: stats[table] = 0 conn.close() return stats