| """ |
| PhD Research OS — Core Data Layer (Phase 0) |
| ============================================ |
| SQLite database with CRUD operations for Claim, Source, Goal, Conflict, |
| Decision, Override, and Experiment objects. |
| |
| All objects carry schema_version tags. Fixed-point confidence math. |
| """ |
|
|
| import sqlite3 |
| import json |
| import uuid |
| import os |
| from datetime import datetime, timezone |
| from typing import Optional |
| from dataclasses import dataclass, field, asdict |
|
|
# Schema version stamped onto every stored object (written to each table's
# schema_version column) so future migrations can detect old rows.
SCHEMA_VERSION = "1.0"
# Default database location; override with the RESEARCH_OS_DB env variable.
DB_PATH = os.environ.get("RESEARCH_OS_DB", "data/research_os.db")
|
|
|
|
| def get_db(db_path: str = None) -> sqlite3.Connection: |
| """Get database connection with WAL mode and foreign keys.""" |
| path = db_path or DB_PATH |
| os.makedirs(os.path.dirname(path) if os.path.dirname(path) else ".", exist_ok=True) |
| conn = sqlite3.connect(path) |
| conn.row_factory = sqlite3.Row |
| conn.execute("PRAGMA journal_mode=WAL") |
| conn.execute("PRAGMA foreign_keys=ON") |
| return conn |
|
|
|
|
def init_db(db_path: Optional[str] = None):
    """Create all tables if they do not already exist.

    Idempotent: safe to call on an existing database (CREATE TABLE IF NOT
    EXISTS throughout). Columns commented "Fixed-point" store probability/
    confidence values as integers scaled by 1000; list/dict columns store
    JSON text.

    Args:
        db_path: Optional database file path; defaults to DB_PATH.
    """
    conn = get_db(db_path)
    # One script builds the whole schema: the core research objects
    # (claims, sources, goals, conflicts, decisions, overrides, experiments)
    # plus operational tables (api_usage_log, calibration_log,
    # embedding_cache). The SQL text is left untouched.
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS claims (
        claim_id TEXT PRIMARY KEY,
        text TEXT NOT NULL,
        status TEXT NOT NULL CHECK(status IN ('Complete', 'Incomplete')),
        epistemic_tag TEXT NOT NULL CHECK(epistemic_tag IN ('Fact', 'Interpretation', 'Hypothesis', 'Conflict_Hypothesis')),
        confidence INTEGER NOT NULL,  -- Fixed-point: actual * 1000
        evidence_strength INTEGER,  -- Fixed-point: actual * 1000
        study_type TEXT,
        study_quality_weight INTEGER,  -- Fixed-point: actual * 1000
        journal_tier_weight INTEGER,  -- Fixed-point: actual * 1000
        completeness_penalty INTEGER,  -- Fixed-point: actual * 1000
        source_doi TEXT,
        source_type TEXT,
        missing_fields TEXT,  -- JSON array
        parameters TEXT,  -- JSON dict
        is_canonical INTEGER DEFAULT 0,
        expert_override TEXT,  -- JSON object if set
        schema_version TEXT NOT NULL DEFAULT '1.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS sources (
        doi TEXT PRIMARY KEY,
        title TEXT NOT NULL,
        authors TEXT,  -- JSON array
        year INTEGER,
        journal TEXT,
        journal_tier INTEGER CHECK(journal_tier IN (1, 2, 3)),
        study_type TEXT,
        is_canonical INTEGER DEFAULT 0,
        source_type TEXT,
        schema_version TEXT NOT NULL DEFAULT '1.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS goals (
        goal_id TEXT PRIMARY KEY,
        description TEXT NOT NULL,
        priority TEXT NOT NULL CHECK(priority IN ('high', 'medium', 'low')),
        status TEXT NOT NULL CHECK(status IN ('Active', 'Paused', 'Completed')),
        linked_claim_ids TEXT,  -- JSON array
        schema_version TEXT NOT NULL DEFAULT '1.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS conflicts (
        conflict_id TEXT PRIMARY KEY,
        claim_a_id TEXT NOT NULL,
        claim_b_id TEXT NOT NULL,
        conflict_type TEXT NOT NULL CHECK(conflict_type IN ('value_mismatch', 'methodology_difference', 'scope_difference', 'no_conflict')),
        generated_hypothesis TEXT,
        hypothesis_confidence TEXT DEFAULT 'low',
        resolution_status TEXT NOT NULL DEFAULT 'Unresolved' CHECK(resolution_status IN ('Unresolved', 'Human_Resolved')),
        resolution_notes TEXT,
        key_differences TEXT,  -- JSON array
        schema_version TEXT NOT NULL DEFAULT '1.0',
        created_at TEXT NOT NULL,
        resolved_at TEXT,
        FOREIGN KEY(claim_a_id) REFERENCES claims(claim_id),
        FOREIGN KEY(claim_b_id) REFERENCES claims(claim_id)
    );

    CREATE TABLE IF NOT EXISTS decisions (
        decision_id TEXT PRIMARY KEY,
        linked_claim_ids TEXT,  -- JSON array
        recommended_action TEXT NOT NULL,
        action_description TEXT,
        expected_information_gain INTEGER,  -- Fixed-point * 1000
        linked_goal_id TEXT,
        status TEXT NOT NULL DEFAULT 'Proposed' CHECK(status IN ('Proposed', 'Accepted', 'Rejected', 'Completed')),
        why_not_log TEXT,  -- Reason if rejected
        priority TEXT DEFAULT 'medium',
        estimated_effort TEXT,
        schema_version TEXT NOT NULL DEFAULT '1.0',
        created_at TEXT NOT NULL,
        FOREIGN KEY(linked_goal_id) REFERENCES goals(goal_id)
    );

    CREATE TABLE IF NOT EXISTS overrides (
        override_id TEXT PRIMARY KEY,
        claim_id TEXT NOT NULL,
        who TEXT NOT NULL,
        rationale TEXT NOT NULL,
        authoritative_source TEXT,
        original_confidence INTEGER,  -- Fixed-point * 1000
        new_confidence INTEGER,  -- Fixed-point * 1000
        status TEXT DEFAULT 'Provisional',
        schema_version TEXT NOT NULL DEFAULT '1.0',
        created_at TEXT NOT NULL,
        FOREIGN KEY(claim_id) REFERENCES claims(claim_id)
    );

    CREATE TABLE IF NOT EXISTS experiments (
        experiment_id TEXT PRIMARY KEY,
        date TEXT NOT NULL,
        instrument TEXT,
        measurement_type TEXT,
        raw_file_path TEXT,
        summary_statistics TEXT,  -- JSON dict
        linked_claim_ids TEXT,  -- JSON array
        approved INTEGER DEFAULT 0,
        schema_version TEXT NOT NULL DEFAULT '1.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS api_usage_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT NOT NULL,
        model TEXT NOT NULL,
        tokens_in INTEGER NOT NULL,
        tokens_out INTEGER NOT NULL,
        cost_usd REAL NOT NULL,
        task_type TEXT
    );

    CREATE TABLE IF NOT EXISTS calibration_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        conflict_id TEXT,
        system_predicted_confidence INTEGER,  -- Fixed-point * 1000
        human_actual_judgment TEXT,
        timestamp TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS embedding_cache (
        text_hash TEXT PRIMARY KEY,
        embedding BLOB NOT NULL,
        model TEXT NOT NULL,
        created_at TEXT NOT NULL
    );
    """)
    conn.commit()
    conn.close()
|
|
|
|
| |
| |
| |
|
|
def to_fixed(value: float) -> int:
    """Scale a probability/confidence float into a fixed-point integer.

    Research OS Rule 5: all probability/confidence values are persisted as
    integers scaled by 1000 (e.g. 0.85 -> 850). Uses Python's banker's
    rounding via round().
    """
    scaled = value * 1000
    return round(scaled)
|
|
def from_fixed(value: int) -> float:
    """Inverse of to_fixed: unscale a x1000 fixed-point integer to a float."""
    thousandths = value
    return thousandths / 1000.0
|
|
|
|
| |
| |
| |
|
|
def now_iso() -> str:
    """Current UTC time as a timezone-aware ISO-8601 string."""
    utc_now = datetime.now(timezone.utc)
    return utc_now.isoformat()
|
|
def gen_id(prefix: str) -> str:
    """Build a short unique identifier: '<PREFIX>_' + 8 uppercase hex chars."""
    token = uuid.uuid4().hex[:8].upper()
    return "{}_{}".format(prefix, token)
|
|
|
|
| |
|
|
def create_claim(conn: sqlite3.Connection, text: str, epistemic_tag: str,
                 confidence: float, source_doi: str = None,
                 evidence_strength: float = None, study_type: str = None,
                 study_quality_weight: float = None, journal_tier_weight: float = None,
                 completeness_penalty: float = None, missing_fields: list = None,
                 parameters: dict = None, is_canonical: bool = False) -> str:
    """Create a new Claim Object. Returns claim_id.

    Status is derived: 'Incomplete' when missing_fields is non-empty,
    otherwise 'Complete'. Float weights are stored as fixed-point integers
    (x1000); lists/dicts are stored as JSON text.

    Raises:
        sqlite3.IntegrityError: if epistemic_tag/status violate the table's
            CHECK constraints.
    """
    claim_id = gen_id("CLM")
    status = "Incomplete" if missing_fields else "Complete"
    now = now_iso()

    def _fx(v):
        # BUGFIX: test `is not None`, not truthiness — the old code turned a
        # legitimate value of 0.0 into NULL, silently losing zero weights.
        return to_fixed(v) if v is not None else None

    conn.execute("""
        INSERT INTO claims (claim_id, text, status, epistemic_tag, confidence,
            evidence_strength, study_type, study_quality_weight, journal_tier_weight,
            completeness_penalty, source_doi, missing_fields, parameters,
            is_canonical, schema_version, created_at, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (claim_id, text, status, epistemic_tag, to_fixed(confidence),
          _fx(evidence_strength),
          study_type,
          _fx(study_quality_weight),
          _fx(journal_tier_weight),
          _fx(completeness_penalty),
          source_doi,
          json.dumps(missing_fields or []),
          json.dumps(parameters or {}),
          int(is_canonical),
          SCHEMA_VERSION, now, now))
    conn.commit()
    return claim_id
|
|
|
|
def get_claim(conn: sqlite3.Connection, claim_id: str) -> Optional[dict]:
    """Fetch one claim as a dict, or None when the id is unknown.

    Fixed-point columns are converted back to floats; JSON columns are
    deserialized into Python lists/dicts.
    """
    cursor = conn.execute("SELECT * FROM claims WHERE claim_id = ?", (claim_id,))
    row = cursor.fetchone()
    if row is None:
        return None
    claim = dict(row)

    fixed_cols = ('confidence', 'evidence_strength', 'study_quality_weight',
                  'journal_tier_weight', 'completeness_penalty')
    for col in fixed_cols:
        raw = claim.get(col)
        if raw is not None:
            claim[col] = from_fixed(raw)
    claim['missing_fields'] = json.loads(claim.get('missing_fields', '[]'))
    claim['parameters'] = json.loads(claim.get('parameters', '{}'))
    override_json = claim.get('expert_override')
    if override_json:
        claim['expert_override'] = json.loads(override_json)
    return claim
|
|
|
|
def update_claim(conn: sqlite3.Connection, claim_id: str, **kwargs) -> bool:
    """Update claim columns; always refreshes updated_at.

    Float confidence-style fields are converted to fixed-point integers and
    list/dict fields are JSON-encoded automatically.

    Args:
        claim_id: Target claim (no-op at the row level if it doesn't exist).
        **kwargs: Column-name/value pairs to set.

    Raises:
        ValueError: if a keyword does not name a known claims column.
            SECURITY FIX: column names are interpolated into the SQL string,
            so they must be validated against a whitelist — the old code
            passed caller-supplied identifiers straight into the statement.

    Returns:
        True. (The UPDATE is always issued because updated_at is always
        appended; the old `return False` branch was unreachable.)
    """
    fixed_fields = {'confidence', 'evidence_strength', 'study_quality_weight',
                    'journal_tier_weight', 'completeness_penalty'}
    json_fields = {'missing_fields', 'parameters', 'expert_override'}
    # Whitelist mirrors the updatable columns of the claims table.
    allowed = fixed_fields | json_fields | {
        'text', 'status', 'epistemic_tag', 'study_type', 'source_doi',
        'source_type', 'is_canonical', 'schema_version'}

    updates = []
    values = []
    for key, val in kwargs.items():
        if key not in allowed:
            raise ValueError(f"Unknown claims column: {key}")
        updates.append(f"{key} = ?")
        if key in fixed_fields and val is not None:
            values.append(to_fixed(val))
        elif key in json_fields and val is not None:
            values.append(json.dumps(val))
        else:
            values.append(val)

    updates.append("updated_at = ?")
    values.append(now_iso())
    values.append(claim_id)

    conn.execute(f"UPDATE claims SET {', '.join(updates)} WHERE claim_id = ?", values)
    conn.commit()
    return True
|
|
|
|
def search_claims(conn: sqlite3.Connection, query: str = None,
                  epistemic_tag: str = None, min_confidence: float = None,
                  source_doi: str = None, limit: int = 50) -> list:
    """Search claims with optional filters.

    All supplied filters are AND-ed together; results come back ordered by
    confidence (descending), capped at `limit`, with fixed-point columns
    converted to floats and JSON columns deserialized.
    """
    # Collect (condition, bind-value) pairs for each active filter.
    filters = []
    if query:
        filters.append(("text LIKE ?", f"%{query}%"))
    if epistemic_tag:
        filters.append(("epistemic_tag = ?", epistemic_tag))
    if min_confidence is not None:
        filters.append(("confidence >= ?", to_fixed(min_confidence)))
    if source_doi:
        filters.append(("source_doi = ?", source_doi))

    where = " AND ".join(cond for cond, _ in filters) if filters else "1=1"
    bind_values = [val for _, val in filters] + [limit]
    rows = conn.execute(
        f"SELECT * FROM claims WHERE {where} ORDER BY confidence DESC LIMIT ?",
        bind_values
    ).fetchall()

    fixed_cols = ('confidence', 'evidence_strength', 'study_quality_weight',
                  'journal_tier_weight', 'completeness_penalty')
    decoded = []
    for row in rows:
        item = dict(row)
        for col in fixed_cols:
            if item.get(col) is not None:
                item[col] = from_fixed(item[col])
        item['missing_fields'] = json.loads(item.get('missing_fields', '[]'))
        item['parameters'] = json.loads(item.get('parameters', '{}'))
        decoded.append(item)
    return decoded
|
|
|
|
| |
|
|
def create_source(conn: sqlite3.Connection, doi: str, title: str,
                  authors: list = None, year: int = None, journal: str = None,
                  journal_tier: int = None, study_type: str = None,
                  is_canonical: bool = False) -> str:
    """Insert or replace a Source Object keyed by DOI; returns the DOI.

    Re-inserting an existing DOI overwrites the stored record
    (INSERT OR REPLACE semantics).
    """
    record = (doi, title, json.dumps(authors or []), year, journal,
              journal_tier, study_type, int(is_canonical), SCHEMA_VERSION,
              now_iso())
    conn.execute("""
        INSERT OR REPLACE INTO sources (doi, title, authors, year, journal, journal_tier,
            study_type, is_canonical, schema_version, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, record)
    conn.commit()
    return doi
|
|
|
|
def get_source(conn: sqlite3.Connection, doi: str) -> Optional[dict]:
    """Look up a source by DOI; returns a dict (authors decoded) or None."""
    cursor = conn.execute("SELECT * FROM sources WHERE doi = ?", (doi,))
    row = cursor.fetchone()
    if row is None:
        return None
    source = dict(row)
    source['authors'] = json.loads(source.get('authors', '[]'))
    return source
|
|
|
|
| |
|
|
def create_goal(conn: sqlite3.Connection, description: str, priority: str,
                linked_claim_ids: list = None) -> str:
    """Create a Goal Object (status starts as 'Active'); returns goal_id."""
    goal_id = gen_id("GOAL")
    timestamp = now_iso()
    claim_ids_json = json.dumps(linked_claim_ids or [])
    conn.execute("""
        INSERT INTO goals (goal_id, description, priority, status, linked_claim_ids,
            schema_version, created_at, updated_at)
        VALUES (?, ?, ?, 'Active', ?, ?, ?, ?)
    """, (goal_id, description, priority, claim_ids_json,
          SCHEMA_VERSION, timestamp, timestamp))
    conn.commit()
    return goal_id
|
|
|
|
def get_goals_by_priority(conn: sqlite3.Connection, status: str = "Active") -> list:
    """Return goals with the given status, ordered high -> medium -> low.

    FIX: the old version did `ORDER BY priority` in SQL — which sorts the
    text alphabetically (high < low < medium) — and then re-sorted in
    Python. A single CASE expression sorts correctly in one pass; unknown
    priority values sort last (matching the old sentinel of 99).
    """
    rows = conn.execute(
        """
        SELECT * FROM goals WHERE status = ?
        ORDER BY CASE priority
            WHEN 'high' THEN 1
            WHEN 'medium' THEN 2
            WHEN 'low' THEN 3
            ELSE 99
        END
        """,
        (status,),
    ).fetchall()
    results = [dict(r) for r in rows]
    for r in results:
        r['linked_claim_ids'] = json.loads(r.get('linked_claim_ids', '[]'))
    return results
|
|
|
|
| |
|
|
def create_conflict(conn: sqlite3.Connection, claim_a_id: str, claim_b_id: str,
                    conflict_type: str, generated_hypothesis: str = None,
                    key_differences: list = None) -> str:
    """Create a Conflict Resolution Object linking two claims.

    Hypothesis confidence always starts at 'low'; resolution fields use the
    table defaults ('Unresolved'). Returns the new conflict_id.
    """
    conflict_id = gen_id("CONF")
    record = (conflict_id, claim_a_id, claim_b_id, conflict_type,
              generated_hypothesis, json.dumps(key_differences or []),
              SCHEMA_VERSION, now_iso())
    conn.execute("""
        INSERT INTO conflicts (conflict_id, claim_a_id, claim_b_id, conflict_type,
            generated_hypothesis, hypothesis_confidence, key_differences,
            schema_version, created_at)
        VALUES (?, ?, ?, ?, ?, 'low', ?, ?, ?)
    """, record)
    conn.commit()
    return conflict_id
|
|
|
|
| |
|
|
def create_decision(conn: sqlite3.Connection, recommended_action: str,
                    action_description: str, expected_info_gain: float,
                    linked_goal_id: str = None, linked_claim_ids: list = None,
                    priority: str = "medium", estimated_effort: str = None) -> str:
    """Create a Decision Object; returns decision_id.

    expected_info_gain is stored fixed-point (x1000); status uses the
    table default ('Proposed').
    """
    dec_id = gen_id("DEC")
    record = (dec_id, json.dumps(linked_claim_ids or []), recommended_action,
              action_description, to_fixed(expected_info_gain),
              linked_goal_id, priority, estimated_effort,
              SCHEMA_VERSION, now_iso())
    conn.execute("""
        INSERT INTO decisions (decision_id, linked_claim_ids, recommended_action,
            action_description, expected_information_gain, linked_goal_id,
            priority, estimated_effort, schema_version, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, record)
    conn.commit()
    return dec_id
|
|
|
|
| |
|
|
def create_override(conn: sqlite3.Connection, claim_id: str, who: str,
                    rationale: str, new_confidence: float,
                    authoritative_source: str = None) -> str:
    """Record an expert override and apply the new confidence to the claim.

    The claim's prior confidence is preserved on the override row, and the
    claim's expert_override field is set so downstream code can see the
    confidence is expert-locked. Returns override_id.

    Raises:
        ValueError: if claim_id does not exist.
    """
    existing = get_claim(conn, claim_id)
    if existing is None:
        raise ValueError(f"Claim {claim_id} not found")

    override_id = gen_id("OVR")
    conn.execute("""
        INSERT INTO overrides (override_id, claim_id, who, rationale,
            authoritative_source, original_confidence, new_confidence,
            schema_version, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """, (override_id, claim_id, who, rationale, authoritative_source,
          to_fixed(existing['confidence']), to_fixed(new_confidence),
          SCHEMA_VERSION, now_iso()))

    # Apply the override to the claim itself and stamp the audit marker.
    override_marker = {"override_id": override_id, "who": who,
                       "rationale": rationale}
    update_claim(conn, claim_id, confidence=new_confidence,
                 expert_override=override_marker)
    conn.commit()
    return override_id
|
|
|
|
| |
|
|
def log_api_usage(conn: sqlite3.Connection, model: str, tokens_in: int,
                  tokens_out: int, cost_usd: float, task_type: str = None):
    """Append one row to api_usage_log for cost tracking."""
    entry = (now_iso(), model, tokens_in, tokens_out, cost_usd, task_type)
    conn.execute("""
        INSERT INTO api_usage_log (timestamp, model, tokens_in, tokens_out, cost_usd, task_type)
        VALUES (?, ?, ?, ?, ?, ?)
    """, entry)
    conn.commit()
|
|
|
|
def get_cost_summary(conn: sqlite3.Connection, days: int = 7) -> dict:
    """Summarize API spend over the last `days` days.

    Returns a dict with total_cost, total_in, total_out, num_calls
    (zeros when there are no matching rows).

    Fixes two defects in the original:
    - Timestamps are stored as ISO-8601 with a 'T' separator (now_iso()),
      while datetime('now', ...) renders 'YYYY-MM-DD HH:MM:SS'; comparing
      the raw strings mis-ordered rows on the cutoff day because
      'T' > ' '. Wrapping the column in datetime() normalizes both sides
      before comparing (SQLite's date functions accept ISO-8601 with 'T'
      and a UTC offset, and operate in UTC — matching now_iso()).
    - SUM() over zero rows yields NULL; the aggregate always returns one
      row, so the old `if row` fallback never fired and callers got None
      totals. COALESCE pins the totals to 0.
    """
    row = conn.execute("""
        SELECT COALESCE(SUM(cost_usd), 0) as total_cost,
               COALESCE(SUM(tokens_in), 0) as total_in,
               COALESCE(SUM(tokens_out), 0) as total_out,
               COUNT(*) as num_calls
        FROM api_usage_log
        WHERE datetime(timestamp) >= datetime('now', ?)
    """, (f"-{days} days",)).fetchone()
    return dict(row)
|
|
|
|
| if __name__ == "__main__": |
| |
| init_db("test_research_os.db") |
| conn = get_db("test_research_os.db") |
| |
| |
| cid = create_claim(conn, "Test claim about graphene", "Fact", 0.85, |
| evidence_strength=0.9, study_quality_weight=1.0, |
| journal_tier_weight=1.0, completeness_penalty=1.0) |
| claim = get_claim(conn, cid) |
| assert claim is not None |
| assert claim['confidence'] == 0.85 |
| assert claim['status'] == 'Complete' |
| |
| |
| cid2 = create_claim(conn, "Incomplete claim", "Hypothesis", 0.35, |
| missing_fields=["temperature", "pH"]) |
| claim2 = get_claim(conn, cid2) |
| assert claim2['status'] == 'Incomplete' |
| assert len(claim2['missing_fields']) == 2 |
| |
| |
| create_source(conn, "10.1234/test", "Test Paper", ["Author A"], 2024, |
| "Nature", 1) |
| src = get_source(conn, "10.1234/test") |
| assert src is not None |
| |
| |
| gid = create_goal(conn, "Achieve LOD < 1 fM", "high", [cid]) |
| goals = get_goals_by_priority(conn) |
| assert len(goals) == 1 |
| |
| |
| conf_id = create_conflict(conn, cid, cid2, "value_mismatch", |
| "Different measurement conditions") |
| |
| |
| ovr_id = create_override(conn, cid, "Dr. Smith", "Expert knowledge", 0.95) |
| claim_updated = get_claim(conn, cid) |
| assert claim_updated['confidence'] == 0.95 |
| assert claim_updated['expert_override'] is not None |
| |
| |
| results = search_claims(conn, query="graphene") |
| assert len(results) >= 1 |
| |
| |
| assert to_fixed(0.85) == 850 |
| assert from_fixed(850) == 0.85 |
| assert to_fixed(0.123) == 123 |
| |
| conn.close() |
| os.remove("test_research_os.db") |
| print("All database tests passed! ✓") |
|
|