nkshirsa's picture
v2.0: phd_research_os_v2/core/database.py
d555e10 verified
"""
PhD Research OS v2.0 β€” Core Database
======================================
SQLite database supporting all 7 layers. Fixed-point math throughout.
Every table has schema_version. Every record has created_at.
"""
import sqlite3
import json
import uuid
import os
import hashlib
from datetime import datetime, timezone
from typing import Optional
SCHEMA_VERSION = "2.0"
PIPELINE_VERSION = "2.1.0"
DB_PATH = os.environ.get("RESEARCH_OS_DB", "data/research_os_v2.db")
def to_fixed(value: float) -> int:
"""Float β†’ fixed-point integer (Γ—1000). Research OS Rule 5."""
return round(value * 1000)
def from_fixed(value: int) -> float:
"""Fixed-point integer β†’ float."""
return value / 1000.0 if value is not None else 0.0
def now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def gen_id(prefix: str) -> str:
return f"{prefix}_{uuid.uuid4().hex[:8].upper()}"
def hash_text(text: str) -> str:
return hashlib.sha256(text.encode()).hexdigest()[:16]
def get_db(db_path: str = None) -> sqlite3.Connection:
path = db_path or DB_PATH
os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True)
conn = sqlite3.connect(path)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA foreign_keys=ON")
return conn
def init_db(db_path: str = None):
"""Initialize ALL tables for the complete 7-layer system."""
conn = get_db(db_path)
conn.executescript("""
-- ═══════════════════════════════════════════════════════════
-- LAYER 0: Structural Ingestion
-- ═══════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS documents (
doc_id TEXT PRIMARY KEY,
file_path TEXT NOT NULL,
doc_type TEXT NOT NULL DEFAULT 'main', -- main, supplement, dataset, code_repo
title TEXT,
doi TEXT,
arxiv_id TEXT,
vor_status TEXT DEFAULT 'unknown', -- preprint, vor, erratum, retracted
lineage_parent TEXT, -- doc_id of parent version
parse_method TEXT,
parse_quality_avg INTEGER, -- Fixed-point Γ—1000
total_regions INTEGER DEFAULT 0,
total_pages INTEGER DEFAULT 0,
ingestion_status TEXT DEFAULT 'pending', -- pending, processing, complete, failed
metadata TEXT, -- JSON
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS regions (
region_id TEXT PRIMARY KEY,
doc_id TEXT NOT NULL,
page INTEGER NOT NULL,
bbox TEXT, -- JSON: [x1, y1, x2, y2]
region_type TEXT NOT NULL, -- body_text, table, figure, equation, caption, header, reference, footnote
section TEXT,
subsection TEXT,
content_text TEXT,
content_markdown TEXT,
parse_method TEXT,
parse_confidence INTEGER, -- Fixed-point Γ—1000
ocr_source INTEGER DEFAULT 0,
extraction_status TEXT DEFAULT 'extractable', -- extractable, low_confidence, unextractable
quality_flags TEXT, -- JSON array
cross_refs TEXT, -- JSON array of {ref_text, ref_type, resolved_to, verified}
figure_type TEXT, -- scatter_plot, bar_chart, diagram, micrograph, schematic, null
digitized_data TEXT, -- JSON: recovered data points from plot
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL,
FOREIGN KEY(doc_id) REFERENCES documents(doc_id)
);
-- ═══════════════════════════════════════════════════════════
-- LAYER 1: Entity Resolution
-- ═══════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS entities (
entity_id TEXT PRIMARY KEY,
canonical_name TEXT NOT NULL,
entity_type TEXT NOT NULL, -- gene, protein, chemical, assay, disease, instrument, method
aliases TEXT, -- JSON array of alternative names
external_ids TEXT, -- JSON: {uniprot: ..., pubchem: ..., mesh: ...}
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS citation_chains (
chain_id TEXT PRIMARY KEY,
citing_doc TEXT NOT NULL,
cited_doi TEXT,
cited_title TEXT,
in_text_ref TEXT, -- e.g., "[32]"
is_in_knowledge_base INTEGER DEFAULT 0,
resolved_doc_id TEXT,
chain_type TEXT DEFAULT 'direct', -- direct, inherited, self_cite
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL,
FOREIGN KEY(citing_doc) REFERENCES documents(doc_id)
);
-- ═══════════════════════════════════════════════════════════
-- LAYER 2: Qualified Extraction (Claims)
-- ═══════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS claims (
claim_id TEXT PRIMARY KEY,
canonical_id TEXT, -- Layer 3: canonical claim this maps to
text TEXT NOT NULL,
epistemic_tag TEXT NOT NULL CHECK(epistemic_tag IN
('Fact','Interpretation','Hypothesis','Conflict_Hypothesis')),
-- Confidence components (all fixed-point Γ—1000)
evidence_strength INTEGER,
study_quality_weight INTEGER,
journal_tier_weight INTEGER,
completeness_penalty INTEGER,
section_modifier INTEGER,
qualifier_penalty INTEGER,
-- Computed scores (code-computed, NOT LLM-stated)
evidence_quality INTEGER, -- Fixed-point Γ—1000
truth_likelihood INTEGER, -- Fixed-point Γ—1000
qualifier_strength_score INTEGER, -- Fixed-point Γ—1000
composite_confidence INTEGER, -- Fixed-point Γ—1000
status TEXT NOT NULL CHECK(status IN ('Complete','Incomplete','Unextractable')),
is_null_result INTEGER DEFAULT 0,
is_inherited_citation INTEGER DEFAULT 0,
causal_direction TEXT DEFAULT 'unspecified', -- observed_correlation, causal_claim, unspecified
practical_significance INTEGER DEFAULT 1,
-- Qualifiers
qualifiers TEXT, -- JSON array
missing_fields TEXT, -- JSON array
-- Statistical evidence
stat_p_value REAL,
stat_effect_size REAL,
stat_effect_type TEXT,
stat_sample_size INTEGER,
stat_ci_lower REAL,
stat_ci_upper REAL,
-- Source provenance
source_quote TEXT,
source_page INTEGER,
source_bbox TEXT, -- JSON: [x1, y1, x2, y2]
source_section TEXT,
source_region_id TEXT,
source_doc_id TEXT,
source_doi TEXT,
-- Council provenance
council_votes TEXT, -- JSON: {member: {tag, reasoning}}
-- Granularity
granularity TEXT DEFAULT 'atomic', -- atomic, aggregate
parent_claim_id TEXT,
-- Version tracking
ontology_version TEXT,
pipeline_version TEXT DEFAULT '2.1.0',
taxonomy_version TEXT,
extraction_timestamp TEXT,
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
);
-- ═══════════════════════════════════════════════════════════
-- LAYER 3: Canonicalization
-- ═══════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS canonical_claims (
canonical_id TEXT PRIMARY KEY,
representative_text TEXT NOT NULL,
epistemic_tag TEXT NOT NULL,
composite_confidence INTEGER, -- Aggregated across sources
evidence_count INTEGER DEFAULT 1,
source_dois TEXT, -- JSON array
aliases TEXT, -- JSON array of claim_ids
version_history TEXT, -- JSON array of {version, source, confidence, date}
current_version INTEGER DEFAULT 1,
supersedes TEXT,
superseded_by TEXT,
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
);
-- ═══════════════════════════════════════════════════════════
-- LAYER 4: Knowledge Graph
-- ═══════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS graph_nodes (
node_id TEXT PRIMARY KEY,
node_type TEXT NOT NULL, -- claim, entity, method, condition, lab
label TEXT NOT NULL,
properties TEXT, -- JSON
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS graph_edges (
edge_id TEXT PRIMARY KEY,
source_node TEXT NOT NULL,
target_node TEXT NOT NULL,
edge_type TEXT NOT NULL, -- supports, refutes, extends, depends_on, supersedes, blocks, investigative_hypothesis
confidence INTEGER NOT NULL, -- Fixed-point Γ—1000
evidence_sources TEXT, -- JSON array of DOIs
is_inferred INTEGER DEFAULT 0,
inference_chain TEXT, -- JSON
method_compatible INTEGER, -- NULL, 0, 1
resolution_id TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
FOREIGN KEY(source_node) REFERENCES graph_nodes(node_id),
FOREIGN KEY(target_node) REFERENCES graph_nodes(node_id)
);
CREATE INDEX IF NOT EXISTS idx_edges_source ON graph_edges(source_node);
CREATE INDEX IF NOT EXISTS idx_edges_target ON graph_edges(target_node);
CREATE INDEX IF NOT EXISTS idx_edges_type ON graph_edges(edge_type);
CREATE TABLE IF NOT EXISTS conflicts (
conflict_id TEXT PRIMARY KEY,
claim_a_id TEXT NOT NULL,
claim_b_id TEXT NOT NULL,
conflict_type TEXT NOT NULL,
generated_hypothesis TEXT,
hypothesis_confidence TEXT DEFAULT 'low',
resolution_status TEXT DEFAULT 'Unresolved',
resolution_id TEXT,
comparability_confidence INTEGER, -- Fixed-point Γ—1000
method_comparison TEXT, -- JSON
case_file_id TEXT,
key_differences TEXT, -- JSON
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL,
resolved_at TEXT
);
CREATE TABLE IF NOT EXISTS case_files (
case_file_id TEXT PRIMARY KEY,
root_cause TEXT NOT NULL,
conflict_ids TEXT NOT NULL, -- JSON array
status TEXT DEFAULT 'open',
resolution_summary TEXT,
created_at TEXT NOT NULL,
resolved_at TEXT
);
CREATE TABLE IF NOT EXISTS resolutions (
resolution_id TEXT PRIMARY KEY,
conflict_id TEXT,
case_file_id TEXT,
resolved_by TEXT NOT NULL,
resolution_type TEXT NOT NULL,
rationale TEXT NOT NULL,
evidence_cited TEXT, -- JSON array
status TEXT DEFAULT 'active', -- active, under_review, superseded
superseded_by TEXT,
downstream_resolutions TEXT, -- JSON array
created_at TEXT NOT NULL,
reopened_at TEXT,
reopen_reason TEXT
);
-- ═══════════════════════════════════════════════════════════
-- LAYER 5: Scoring (calibration tracking)
-- ═══════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS calibration_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
claim_id TEXT,
system_confidence INTEGER, -- Fixed-point Γ—1000
human_judgment TEXT, -- correct, partially_correct, incorrect
brier_contribution REAL,
timestamp TEXT NOT NULL
);
-- ═══════════════════════════════════════════════════════════
-- LAYER 6: Evaluation
-- ═══════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS eval_runs (
run_id TEXT PRIMARY KEY,
run_type TEXT NOT NULL, -- regression, llm_judge, calibration, holdout
metrics TEXT NOT NULL, -- JSON
passed INTEGER,
pipeline_version TEXT,
model_checkpoint TEXT,
prompt_hash TEXT,
created_at TEXT NOT NULL
);
-- ═══════════════════════════════════════════════════════════
-- LAYER 7: Provenance
-- ═══════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS pipeline_lineage (
lineage_id TEXT PRIMARY KEY,
claim_id TEXT NOT NULL,
pipeline_version TEXT NOT NULL,
model_checkpoint TEXT,
parser_version TEXT,
taxonomy_version TEXT,
prompt_hash TEXT,
extraction_timestamp TEXT NOT NULL,
FOREIGN KEY(claim_id) REFERENCES claims(claim_id)
);
-- ═══════════════════════════════════════════════════════════
-- CROSS-CUTTING: Sources, Goals, Taxonomy, API Logging
-- ═══════════════════════════════════════════════════════════
CREATE TABLE IF NOT EXISTS sources (
doi TEXT PRIMARY KEY,
title TEXT,
authors TEXT, -- JSON array
year INTEGER,
journal TEXT,
journal_tier INTEGER,
study_type TEXT,
is_canonical INTEGER DEFAULT 0,
taxonomy_version TEXT,
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS goals (
goal_id TEXT PRIMARY KEY,
description TEXT NOT NULL,
priority TEXT NOT NULL CHECK(priority IN ('high','medium','low')),
status TEXT NOT NULL DEFAULT 'Active',
linked_claim_ids TEXT,
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS decisions (
decision_id TEXT PRIMARY KEY,
recommended_action TEXT NOT NULL,
action_description TEXT,
expected_information_gain INTEGER,
linked_goal_id TEXT,
linked_claim_ids TEXT,
status TEXT DEFAULT 'Proposed',
priority TEXT DEFAULT 'medium',
estimated_effort TEXT,
source TEXT DEFAULT 'gap_analysis', -- gap_analysis, meta_improver, human
schema_version TEXT DEFAULT '2.0',
created_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS api_usage_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT NOT NULL,
model TEXT NOT NULL,
tokens_in INTEGER NOT NULL,
tokens_out INTEGER NOT NULL,
cost_usd REAL NOT NULL,
task_type TEXT,
layer TEXT
);
CREATE TABLE IF NOT EXISTS system_state (
key TEXT PRIMARY KEY,
value TEXT NOT NULL,
updated_at TEXT NOT NULL
);
""")
# Set initial system state
for key, value in [
("schema_version", SCHEMA_VERSION),
("pipeline_version", PIPELINE_VERSION),
("setup_phase", "0"),
("total_papers_ingested", "0"),
("total_claims_extracted", "0"),
]:
conn.execute(
"INSERT OR IGNORE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)",
(key, value, now_iso())
)
conn.commit()
conn.close()
def get_state(db_path: str = None, key: str = None) -> Optional[str]:
conn = get_db(db_path)
row = conn.execute("SELECT value FROM system_state WHERE key = ?", (key,)).fetchone()
conn.close()
return row[0] if row else None
def set_state(db_path: str = None, key: str = None, value: str = None):
conn = get_db(db_path)
conn.execute(
"INSERT OR REPLACE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)",
(key, value, now_iso())
)
conn.commit()
conn.close()
def get_stats(db_path: str = None) -> dict:
conn = get_db(db_path)
stats = {}
for table in ["documents", "regions", "claims", "canonical_claims",
"graph_nodes", "graph_edges", "conflicts", "sources", "goals"]:
try:
stats[table] = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
except:
stats[table] = 0
conn.close()
return stats