| """ |
| PhD Research OS v2.0 β Core Database |
| ====================================== |
| SQLite database supporting all 7 layers. Fixed-point math throughout. |
| Every table has schema_version. Every record has created_at. |
| """ |
|
|
| import sqlite3 |
| import json |
| import uuid |
| import os |
| import hashlib |
| from datetime import datetime, timezone |
| from typing import Optional |
|
|
| SCHEMA_VERSION = "2.0" |
| PIPELINE_VERSION = "2.1.0" |
| DB_PATH = os.environ.get("RESEARCH_OS_DB", "data/research_os_v2.db") |
|
|
|
|
| def to_fixed(value: float) -> int: |
| """Float β fixed-point integer (Γ1000). Research OS Rule 5.""" |
| return round(value * 1000) |
|
|
| def from_fixed(value: int) -> float: |
| """Fixed-point integer β float.""" |
| return value / 1000.0 if value is not None else 0.0 |
|
|
| def now_iso() -> str: |
| return datetime.now(timezone.utc).isoformat() |
|
|
| def gen_id(prefix: str) -> str: |
| return f"{prefix}_{uuid.uuid4().hex[:8].upper()}" |
|
|
| def hash_text(text: str) -> str: |
| return hashlib.sha256(text.encode()).hexdigest()[:16] |
|
|
|
|
| def get_db(db_path: str = None) -> sqlite3.Connection: |
| path = db_path or DB_PATH |
| os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True) |
| conn = sqlite3.connect(path) |
| conn.row_factory = sqlite3.Row |
| conn.execute("PRAGMA journal_mode=WAL") |
| conn.execute("PRAGMA foreign_keys=ON") |
| return conn |
|
|
|
|
| def init_db(db_path: str = None): |
| """Initialize ALL tables for the complete 7-layer system.""" |
| conn = get_db(db_path) |
| conn.executescript(""" |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| -- LAYER 0: Structural Ingestion |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| CREATE TABLE IF NOT EXISTS documents ( |
| doc_id TEXT PRIMARY KEY, |
| file_path TEXT NOT NULL, |
| doc_type TEXT NOT NULL DEFAULT 'main', -- main, supplement, dataset, code_repo |
| title TEXT, |
| doi TEXT, |
| arxiv_id TEXT, |
| vor_status TEXT DEFAULT 'unknown', -- preprint, vor, erratum, retracted |
| lineage_parent TEXT, -- doc_id of parent version |
| parse_method TEXT, |
| parse_quality_avg INTEGER, -- Fixed-point Γ1000 |
| total_regions INTEGER DEFAULT 0, |
| total_pages INTEGER DEFAULT 0, |
| ingestion_status TEXT DEFAULT 'pending', -- pending, processing, complete, failed |
| metadata TEXT, -- JSON |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL |
| ); |
| |
| CREATE TABLE IF NOT EXISTS regions ( |
| region_id TEXT PRIMARY KEY, |
| doc_id TEXT NOT NULL, |
| page INTEGER NOT NULL, |
| bbox TEXT, -- JSON: [x1, y1, x2, y2] |
| region_type TEXT NOT NULL, -- body_text, table, figure, equation, caption, header, reference, footnote |
| section TEXT, |
| subsection TEXT, |
| content_text TEXT, |
| content_markdown TEXT, |
| parse_method TEXT, |
| parse_confidence INTEGER, -- Fixed-point Γ1000 |
| ocr_source INTEGER DEFAULT 0, |
| extraction_status TEXT DEFAULT 'extractable', -- extractable, low_confidence, unextractable |
| quality_flags TEXT, -- JSON array |
| cross_refs TEXT, -- JSON array of {ref_text, ref_type, resolved_to, verified} |
| figure_type TEXT, -- scatter_plot, bar_chart, diagram, micrograph, schematic, null |
| digitized_data TEXT, -- JSON: recovered data points from plot |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL, |
| FOREIGN KEY(doc_id) REFERENCES documents(doc_id) |
| ); |
| |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| -- LAYER 1: Entity Resolution |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| CREATE TABLE IF NOT EXISTS entities ( |
| entity_id TEXT PRIMARY KEY, |
| canonical_name TEXT NOT NULL, |
| entity_type TEXT NOT NULL, -- gene, protein, chemical, assay, disease, instrument, method |
| aliases TEXT, -- JSON array of alternative names |
| external_ids TEXT, -- JSON: {uniprot: ..., pubchem: ..., mesh: ...} |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL |
| ); |
| |
| CREATE TABLE IF NOT EXISTS citation_chains ( |
| chain_id TEXT PRIMARY KEY, |
| citing_doc TEXT NOT NULL, |
| cited_doi TEXT, |
| cited_title TEXT, |
| in_text_ref TEXT, -- e.g., "[32]" |
| is_in_knowledge_base INTEGER DEFAULT 0, |
| resolved_doc_id TEXT, |
| chain_type TEXT DEFAULT 'direct', -- direct, inherited, self_cite |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL, |
| FOREIGN KEY(citing_doc) REFERENCES documents(doc_id) |
| ); |
| |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| -- LAYER 2: Qualified Extraction (Claims) |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| CREATE TABLE IF NOT EXISTS claims ( |
| claim_id TEXT PRIMARY KEY, |
| canonical_id TEXT, -- Layer 3: canonical claim this maps to |
| text TEXT NOT NULL, |
| epistemic_tag TEXT NOT NULL CHECK(epistemic_tag IN |
| ('Fact','Interpretation','Hypothesis','Conflict_Hypothesis')), |
| |
| -- Confidence components (all fixed-point Γ1000) |
| evidence_strength INTEGER, |
| study_quality_weight INTEGER, |
| journal_tier_weight INTEGER, |
| completeness_penalty INTEGER, |
| section_modifier INTEGER, |
| qualifier_penalty INTEGER, |
| |
| -- Computed scores (code-computed, NOT LLM-stated) |
| evidence_quality INTEGER, -- Fixed-point Γ1000 |
| truth_likelihood INTEGER, -- Fixed-point Γ1000 |
| qualifier_strength_score INTEGER, -- Fixed-point Γ1000 |
| composite_confidence INTEGER, -- Fixed-point Γ1000 |
| |
| status TEXT NOT NULL CHECK(status IN ('Complete','Incomplete','Unextractable')), |
| is_null_result INTEGER DEFAULT 0, |
| is_inherited_citation INTEGER DEFAULT 0, |
| causal_direction TEXT DEFAULT 'unspecified', -- observed_correlation, causal_claim, unspecified |
| practical_significance INTEGER DEFAULT 1, |
| |
| -- Qualifiers |
| qualifiers TEXT, -- JSON array |
| missing_fields TEXT, -- JSON array |
| |
| -- Statistical evidence |
| stat_p_value REAL, |
| stat_effect_size REAL, |
| stat_effect_type TEXT, |
| stat_sample_size INTEGER, |
| stat_ci_lower REAL, |
| stat_ci_upper REAL, |
| |
| -- Source provenance |
| source_quote TEXT, |
| source_page INTEGER, |
| source_bbox TEXT, -- JSON: [x1, y1, x2, y2] |
| source_section TEXT, |
| source_region_id TEXT, |
| source_doc_id TEXT, |
| source_doi TEXT, |
| |
| -- Council provenance |
| council_votes TEXT, -- JSON: {member: {tag, reasoning}} |
| |
| -- Granularity |
| granularity TEXT DEFAULT 'atomic', -- atomic, aggregate |
| parent_claim_id TEXT, |
| |
| -- Version tracking |
| ontology_version TEXT, |
| pipeline_version TEXT DEFAULT '2.1.0', |
| taxonomy_version TEXT, |
| extraction_timestamp TEXT, |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL, |
| updated_at TEXT NOT NULL |
| ); |
| |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| -- LAYER 3: Canonicalization |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| CREATE TABLE IF NOT EXISTS canonical_claims ( |
| canonical_id TEXT PRIMARY KEY, |
| representative_text TEXT NOT NULL, |
| epistemic_tag TEXT NOT NULL, |
| composite_confidence INTEGER, -- Aggregated across sources |
| evidence_count INTEGER DEFAULT 1, |
| source_dois TEXT, -- JSON array |
| aliases TEXT, -- JSON array of claim_ids |
| version_history TEXT, -- JSON array of {version, source, confidence, date} |
| current_version INTEGER DEFAULT 1, |
| supersedes TEXT, |
| superseded_by TEXT, |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL, |
| updated_at TEXT NOT NULL |
| ); |
| |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| -- LAYER 4: Knowledge Graph |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| CREATE TABLE IF NOT EXISTS graph_nodes ( |
| node_id TEXT PRIMARY KEY, |
| node_type TEXT NOT NULL, -- claim, entity, method, condition, lab |
| label TEXT NOT NULL, |
| properties TEXT, -- JSON |
| created_at TEXT NOT NULL |
| ); |
| |
| CREATE TABLE IF NOT EXISTS graph_edges ( |
| edge_id TEXT PRIMARY KEY, |
| source_node TEXT NOT NULL, |
| target_node TEXT NOT NULL, |
| edge_type TEXT NOT NULL, -- supports, refutes, extends, depends_on, supersedes, blocks, investigative_hypothesis |
| confidence INTEGER NOT NULL, -- Fixed-point Γ1000 |
| evidence_sources TEXT, -- JSON array of DOIs |
| is_inferred INTEGER DEFAULT 0, |
| inference_chain TEXT, -- JSON |
| method_compatible INTEGER, -- NULL, 0, 1 |
| resolution_id TEXT, |
| created_at TEXT NOT NULL, |
| updated_at TEXT NOT NULL, |
| FOREIGN KEY(source_node) REFERENCES graph_nodes(node_id), |
| FOREIGN KEY(target_node) REFERENCES graph_nodes(node_id) |
| ); |
| |
| CREATE INDEX IF NOT EXISTS idx_edges_source ON graph_edges(source_node); |
| CREATE INDEX IF NOT EXISTS idx_edges_target ON graph_edges(target_node); |
| CREATE INDEX IF NOT EXISTS idx_edges_type ON graph_edges(edge_type); |
| |
| CREATE TABLE IF NOT EXISTS conflicts ( |
| conflict_id TEXT PRIMARY KEY, |
| claim_a_id TEXT NOT NULL, |
| claim_b_id TEXT NOT NULL, |
| conflict_type TEXT NOT NULL, |
| generated_hypothesis TEXT, |
| hypothesis_confidence TEXT DEFAULT 'low', |
| resolution_status TEXT DEFAULT 'Unresolved', |
| resolution_id TEXT, |
| comparability_confidence INTEGER, -- Fixed-point Γ1000 |
| method_comparison TEXT, -- JSON |
| case_file_id TEXT, |
| key_differences TEXT, -- JSON |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL, |
| resolved_at TEXT |
| ); |
| |
| CREATE TABLE IF NOT EXISTS case_files ( |
| case_file_id TEXT PRIMARY KEY, |
| root_cause TEXT NOT NULL, |
| conflict_ids TEXT NOT NULL, -- JSON array |
| status TEXT DEFAULT 'open', |
| resolution_summary TEXT, |
| created_at TEXT NOT NULL, |
| resolved_at TEXT |
| ); |
| |
| CREATE TABLE IF NOT EXISTS resolutions ( |
| resolution_id TEXT PRIMARY KEY, |
| conflict_id TEXT, |
| case_file_id TEXT, |
| resolved_by TEXT NOT NULL, |
| resolution_type TEXT NOT NULL, |
| rationale TEXT NOT NULL, |
| evidence_cited TEXT, -- JSON array |
| status TEXT DEFAULT 'active', -- active, under_review, superseded |
| superseded_by TEXT, |
| downstream_resolutions TEXT, -- JSON array |
| created_at TEXT NOT NULL, |
| reopened_at TEXT, |
| reopen_reason TEXT |
| ); |
| |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| -- LAYER 5: Scoring (calibration tracking) |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| CREATE TABLE IF NOT EXISTS calibration_log ( |
| id INTEGER PRIMARY KEY AUTOINCREMENT, |
| claim_id TEXT, |
| system_confidence INTEGER, -- Fixed-point Γ1000 |
| human_judgment TEXT, -- correct, partially_correct, incorrect |
| brier_contribution REAL, |
| timestamp TEXT NOT NULL |
| ); |
| |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| -- LAYER 6: Evaluation |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| CREATE TABLE IF NOT EXISTS eval_runs ( |
| run_id TEXT PRIMARY KEY, |
| run_type TEXT NOT NULL, -- regression, llm_judge, calibration, holdout |
| metrics TEXT NOT NULL, -- JSON |
| passed INTEGER, |
| pipeline_version TEXT, |
| model_checkpoint TEXT, |
| prompt_hash TEXT, |
| created_at TEXT NOT NULL |
| ); |
| |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| -- LAYER 7: Provenance |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| CREATE TABLE IF NOT EXISTS pipeline_lineage ( |
| lineage_id TEXT PRIMARY KEY, |
| claim_id TEXT NOT NULL, |
| pipeline_version TEXT NOT NULL, |
| model_checkpoint TEXT, |
| parser_version TEXT, |
| taxonomy_version TEXT, |
| prompt_hash TEXT, |
| extraction_timestamp TEXT NOT NULL, |
| FOREIGN KEY(claim_id) REFERENCES claims(claim_id) |
| ); |
| |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| -- CROSS-CUTTING: Sources, Goals, Taxonomy, API Logging |
| -- βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
| |
| CREATE TABLE IF NOT EXISTS sources ( |
| doi TEXT PRIMARY KEY, |
| title TEXT, |
| authors TEXT, -- JSON array |
| year INTEGER, |
| journal TEXT, |
| journal_tier INTEGER, |
| study_type TEXT, |
| is_canonical INTEGER DEFAULT 0, |
| taxonomy_version TEXT, |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL |
| ); |
| |
| CREATE TABLE IF NOT EXISTS goals ( |
| goal_id TEXT PRIMARY KEY, |
| description TEXT NOT NULL, |
| priority TEXT NOT NULL CHECK(priority IN ('high','medium','low')), |
| status TEXT NOT NULL DEFAULT 'Active', |
| linked_claim_ids TEXT, |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL, |
| updated_at TEXT NOT NULL |
| ); |
| |
| CREATE TABLE IF NOT EXISTS decisions ( |
| decision_id TEXT PRIMARY KEY, |
| recommended_action TEXT NOT NULL, |
| action_description TEXT, |
| expected_information_gain INTEGER, |
| linked_goal_id TEXT, |
| linked_claim_ids TEXT, |
| status TEXT DEFAULT 'Proposed', |
| priority TEXT DEFAULT 'medium', |
| estimated_effort TEXT, |
| source TEXT DEFAULT 'gap_analysis', -- gap_analysis, meta_improver, human |
| schema_version TEXT DEFAULT '2.0', |
| created_at TEXT NOT NULL |
| ); |
| |
| CREATE TABLE IF NOT EXISTS api_usage_log ( |
| id INTEGER PRIMARY KEY AUTOINCREMENT, |
| timestamp TEXT NOT NULL, |
| model TEXT NOT NULL, |
| tokens_in INTEGER NOT NULL, |
| tokens_out INTEGER NOT NULL, |
| cost_usd REAL NOT NULL, |
| task_type TEXT, |
| layer TEXT |
| ); |
| |
| CREATE TABLE IF NOT EXISTS system_state ( |
| key TEXT PRIMARY KEY, |
| value TEXT NOT NULL, |
| updated_at TEXT NOT NULL |
| ); |
| """) |
| |
| |
| for key, value in [ |
| ("schema_version", SCHEMA_VERSION), |
| ("pipeline_version", PIPELINE_VERSION), |
| ("setup_phase", "0"), |
| ("total_papers_ingested", "0"), |
| ("total_claims_extracted", "0"), |
| ]: |
| conn.execute( |
| "INSERT OR IGNORE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)", |
| (key, value, now_iso()) |
| ) |
| |
| conn.commit() |
| conn.close() |
|
|
|
|
| def get_state(db_path: str = None, key: str = None) -> Optional[str]: |
| conn = get_db(db_path) |
| row = conn.execute("SELECT value FROM system_state WHERE key = ?", (key,)).fetchone() |
| conn.close() |
| return row[0] if row else None |
|
|
|
|
| def set_state(db_path: str = None, key: str = None, value: str = None): |
| conn = get_db(db_path) |
| conn.execute( |
| "INSERT OR REPLACE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)", |
| (key, value, now_iso()) |
| ) |
| conn.commit() |
| conn.close() |
|
|
|
|
| def get_stats(db_path: str = None) -> dict: |
| conn = get_db(db_path) |
| stats = {} |
| for table in ["documents", "regions", "claims", "canonical_claims", |
| "graph_nodes", "graph_edges", "conflicts", "sources", "goals"]: |
| try: |
| stats[table] = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0] |
| except: |
| stats[table] = 0 |
| conn.close() |
| return stats |
|
|