File size: 18,754 Bytes
d555e10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
"""
PhD Research OS v2.0 — Core Database
======================================
SQLite database supporting all 7 layers. Fixed-point math throughout.
Every table has schema_version. Every record has created_at.
"""

import sqlite3
import json
import uuid
import os
import hashlib
from datetime import datetime, timezone
from typing import Optional

# Version stamp seeded into system_state by init_db; the table definitions
# also carry it as the default of their schema_version column.
SCHEMA_VERSION = "2.0"
# Version of the extraction pipeline, seeded into system_state by init_db.
PIPELINE_VERSION = "2.1.0"
# Database file location; the RESEARCH_OS_DB environment variable overrides
# the bundled default path.
DB_PATH = os.getenv("RESEARCH_OS_DB", "data/research_os_v2.db")


def to_fixed(value: float) -> int:
    """Convert a float to its fixed-point integer form (scaled by 1000).

    Implements Research OS Rule 5. The scaled value is passed to the built-in
    ``round``, so the result is the nearest integer to ``value * 1000``.
    """
    scaled = value * 1000
    return round(scaled)

def from_fixed(value: int) -> float:
    """Convert a fixed-point integer (scaled by 1000) back to a float.

    A ``None`` input is mapped to ``0.0`` instead of raising.
    """
    if value is None:
        return 0.0
    return value / 1000.0

def now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()

def gen_id(prefix: str) -> str:
    """Build a random identifier of the form ``<prefix>_<8 uppercase hex chars>``.

    The suffix is the first eight hex digits of a freshly generated UUID4.
    """
    token = uuid.uuid4().hex[:8].upper()
    return "{}_{}".format(prefix, token)

def hash_text(text: str) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of ``text``.

    The text is encoded with ``str.encode()``'s default encoding before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    full_digest = hasher.hexdigest()
    return full_digest[:16]


def get_db(db_path: str = None) -> sqlite3.Connection:
    """Open a SQLite connection configured for this module.

    Args:
        db_path: Database file to open. A falsy value falls back to the
            module-level ``DB_PATH``.

    Returns:
        A connection whose rows are ``sqlite3.Row`` objects, with WAL
        journaling requested and foreign-key enforcement switched on.
    """
    target = db_path or DB_PATH

    # Make sure the containing directory exists; a bare file name has no
    # directory component, in which case the current directory is used.
    parent_dir = os.path.dirname(target)
    if not parent_dir:
        parent_dir = "."
    os.makedirs(parent_dir, exist_ok=True)

    connection = sqlite3.connect(target)
    connection.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        connection.execute(pragma)
    return connection


def init_db(db_path: Optional[str] = None):
    """Initialize ALL tables for the complete 7-layer system.

    Creates every table and index of the schema and seeds the initial
    ``system_state`` rows. All DDL uses ``IF NOT EXISTS`` and the seed rows use
    ``INSERT OR IGNORE``, so running this against an existing database leaves
    existing tables and state values as they are.

    Args:
        db_path: Database file to initialize; passed straight to ``get_db``,
            which falls back to ``DB_PATH`` when it is falsy.

    Raises:
        sqlite3.Error: If the schema script or a seed insert fails. The
            connection is closed before the exception propagates.
    """
    conn = get_db(db_path)
    # try/finally guarantees the connection is released even when the schema
    # script or a seed insert raises.
    try:
        conn.executescript("""
    -- ═══════════════════════════════════════════════════════════
    -- LAYER 0: Structural Ingestion
    -- ═══════════════════════════════════════════════════════════
    
    CREATE TABLE IF NOT EXISTS documents (
        doc_id TEXT PRIMARY KEY,
        file_path TEXT NOT NULL,
        doc_type TEXT NOT NULL DEFAULT 'main',  -- main, supplement, dataset, code_repo
        title TEXT,
        doi TEXT,
        arxiv_id TEXT,
        vor_status TEXT DEFAULT 'unknown',      -- preprint, vor, erratum, retracted
        lineage_parent TEXT,                     -- doc_id of parent version
        parse_method TEXT,
        parse_quality_avg INTEGER,               -- Fixed-point Γ—1000
        total_regions INTEGER DEFAULT 0,
        total_pages INTEGER DEFAULT 0,
        ingestion_status TEXT DEFAULT 'pending', -- pending, processing, complete, failed
        metadata TEXT,                           -- JSON
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS regions (
        region_id TEXT PRIMARY KEY,
        doc_id TEXT NOT NULL,
        page INTEGER NOT NULL,
        bbox TEXT,                               -- JSON: [x1, y1, x2, y2]
        region_type TEXT NOT NULL,               -- body_text, table, figure, equation, caption, header, reference, footnote
        section TEXT,
        subsection TEXT,
        content_text TEXT,
        content_markdown TEXT,
        parse_method TEXT,
        parse_confidence INTEGER,                -- Fixed-point Γ—1000
        ocr_source INTEGER DEFAULT 0,
        extraction_status TEXT DEFAULT 'extractable',  -- extractable, low_confidence, unextractable
        quality_flags TEXT,                      -- JSON array
        cross_refs TEXT,                         -- JSON array of {ref_text, ref_type, resolved_to, verified}
        figure_type TEXT,                        -- scatter_plot, bar_chart, diagram, micrograph, schematic, null
        digitized_data TEXT,                     -- JSON: recovered data points from plot
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        FOREIGN KEY(doc_id) REFERENCES documents(doc_id)
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 1: Entity Resolution
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS entities (
        entity_id TEXT PRIMARY KEY,
        canonical_name TEXT NOT NULL,
        entity_type TEXT NOT NULL,               -- gene, protein, chemical, assay, disease, instrument, method
        aliases TEXT,                             -- JSON array of alternative names
        external_ids TEXT,                        -- JSON: {uniprot: ..., pubchem: ..., mesh: ...}
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS citation_chains (
        chain_id TEXT PRIMARY KEY,
        citing_doc TEXT NOT NULL,
        cited_doi TEXT,
        cited_title TEXT,
        in_text_ref TEXT,                        -- e.g., "[32]"
        is_in_knowledge_base INTEGER DEFAULT 0,
        resolved_doc_id TEXT,
        chain_type TEXT DEFAULT 'direct',        -- direct, inherited, self_cite
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        FOREIGN KEY(citing_doc) REFERENCES documents(doc_id)
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 2: Qualified Extraction (Claims)
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS claims (
        claim_id TEXT PRIMARY KEY,
        canonical_id TEXT,                       -- Layer 3: canonical claim this maps to
        text TEXT NOT NULL,
        epistemic_tag TEXT NOT NULL CHECK(epistemic_tag IN 
            ('Fact','Interpretation','Hypothesis','Conflict_Hypothesis')),
        
        -- Confidence components (all fixed-point Γ—1000)
        evidence_strength INTEGER,
        study_quality_weight INTEGER,
        journal_tier_weight INTEGER,
        completeness_penalty INTEGER,
        section_modifier INTEGER,
        qualifier_penalty INTEGER,
        
        -- Computed scores (code-computed, NOT LLM-stated)
        evidence_quality INTEGER,                -- Fixed-point Γ—1000
        truth_likelihood INTEGER,                -- Fixed-point Γ—1000
        qualifier_strength_score INTEGER,        -- Fixed-point Γ—1000
        composite_confidence INTEGER,            -- Fixed-point Γ—1000
        
        status TEXT NOT NULL CHECK(status IN ('Complete','Incomplete','Unextractable')),
        is_null_result INTEGER DEFAULT 0,
        is_inherited_citation INTEGER DEFAULT 0,
        causal_direction TEXT DEFAULT 'unspecified',  -- observed_correlation, causal_claim, unspecified
        practical_significance INTEGER DEFAULT 1,
        
        -- Qualifiers
        qualifiers TEXT,                         -- JSON array
        missing_fields TEXT,                     -- JSON array
        
        -- Statistical evidence
        stat_p_value REAL,
        stat_effect_size REAL,
        stat_effect_type TEXT,
        stat_sample_size INTEGER,
        stat_ci_lower REAL,
        stat_ci_upper REAL,
        
        -- Source provenance
        source_quote TEXT,
        source_page INTEGER,
        source_bbox TEXT,                        -- JSON: [x1, y1, x2, y2]
        source_section TEXT,
        source_region_id TEXT,
        source_doc_id TEXT,
        source_doi TEXT,
        
        -- Council provenance
        council_votes TEXT,                      -- JSON: {member: {tag, reasoning}}
        
        -- Granularity
        granularity TEXT DEFAULT 'atomic',       -- atomic, aggregate
        parent_claim_id TEXT,
        
        -- Version tracking
        ontology_version TEXT,
        pipeline_version TEXT DEFAULT '2.1.0',
        taxonomy_version TEXT,
        extraction_timestamp TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 3: Canonicalization
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS canonical_claims (
        canonical_id TEXT PRIMARY KEY,
        representative_text TEXT NOT NULL,
        epistemic_tag TEXT NOT NULL,
        composite_confidence INTEGER,            -- Aggregated across sources
        evidence_count INTEGER DEFAULT 1,
        source_dois TEXT,                        -- JSON array
        aliases TEXT,                            -- JSON array of claim_ids
        version_history TEXT,                    -- JSON array of {version, source, confidence, date}
        current_version INTEGER DEFAULT 1,
        supersedes TEXT,
        superseded_by TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 4: Knowledge Graph
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS graph_nodes (
        node_id TEXT PRIMARY KEY,
        node_type TEXT NOT NULL,                 -- claim, entity, method, condition, lab
        label TEXT NOT NULL,
        properties TEXT,                         -- JSON
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS graph_edges (
        edge_id TEXT PRIMARY KEY,
        source_node TEXT NOT NULL,
        target_node TEXT NOT NULL,
        edge_type TEXT NOT NULL,                 -- supports, refutes, extends, depends_on, supersedes, blocks, investigative_hypothesis
        confidence INTEGER NOT NULL,             -- Fixed-point Γ—1000
        evidence_sources TEXT,                   -- JSON array of DOIs
        is_inferred INTEGER DEFAULT 0,
        inference_chain TEXT,                    -- JSON
        method_compatible INTEGER,               -- NULL, 0, 1
        resolution_id TEXT,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL,
        FOREIGN KEY(source_node) REFERENCES graph_nodes(node_id),
        FOREIGN KEY(target_node) REFERENCES graph_nodes(node_id)
    );

    CREATE INDEX IF NOT EXISTS idx_edges_source ON graph_edges(source_node);
    CREATE INDEX IF NOT EXISTS idx_edges_target ON graph_edges(target_node);
    CREATE INDEX IF NOT EXISTS idx_edges_type ON graph_edges(edge_type);

    CREATE TABLE IF NOT EXISTS conflicts (
        conflict_id TEXT PRIMARY KEY,
        claim_a_id TEXT NOT NULL,
        claim_b_id TEXT NOT NULL,
        conflict_type TEXT NOT NULL,
        generated_hypothesis TEXT,
        hypothesis_confidence TEXT DEFAULT 'low',
        resolution_status TEXT DEFAULT 'Unresolved',
        resolution_id TEXT,
        comparability_confidence INTEGER,        -- Fixed-point Γ—1000
        method_comparison TEXT,                  -- JSON
        case_file_id TEXT,
        key_differences TEXT,                    -- JSON
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        resolved_at TEXT
    );

    CREATE TABLE IF NOT EXISTS case_files (
        case_file_id TEXT PRIMARY KEY,
        root_cause TEXT NOT NULL,
        conflict_ids TEXT NOT NULL,              -- JSON array
        status TEXT DEFAULT 'open',
        resolution_summary TEXT,
        created_at TEXT NOT NULL,
        resolved_at TEXT
    );

    CREATE TABLE IF NOT EXISTS resolutions (
        resolution_id TEXT PRIMARY KEY,
        conflict_id TEXT,
        case_file_id TEXT,
        resolved_by TEXT NOT NULL,
        resolution_type TEXT NOT NULL,
        rationale TEXT NOT NULL,
        evidence_cited TEXT,                     -- JSON array
        status TEXT DEFAULT 'active',            -- active, under_review, superseded
        superseded_by TEXT,
        downstream_resolutions TEXT,             -- JSON array
        created_at TEXT NOT NULL,
        reopened_at TEXT,
        reopen_reason TEXT
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 5: Scoring (calibration tracking)
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS calibration_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        claim_id TEXT,
        system_confidence INTEGER,               -- Fixed-point Γ—1000
        human_judgment TEXT,                      -- correct, partially_correct, incorrect
        brier_contribution REAL,
        timestamp TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 6: Evaluation
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS eval_runs (
        run_id TEXT PRIMARY KEY,
        run_type TEXT NOT NULL,                  -- regression, llm_judge, calibration, holdout
        metrics TEXT NOT NULL,                   -- JSON
        passed INTEGER,
        pipeline_version TEXT,
        model_checkpoint TEXT,
        prompt_hash TEXT,
        created_at TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 7: Provenance
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS pipeline_lineage (
        lineage_id TEXT PRIMARY KEY,
        claim_id TEXT NOT NULL,
        pipeline_version TEXT NOT NULL,
        model_checkpoint TEXT,
        parser_version TEXT,
        taxonomy_version TEXT,
        prompt_hash TEXT,
        extraction_timestamp TEXT NOT NULL,
        FOREIGN KEY(claim_id) REFERENCES claims(claim_id)
    );

    -- ═══════════════════════════════════════════════════════════
    -- CROSS-CUTTING: Sources, Goals, Taxonomy, API Logging
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS sources (
        doi TEXT PRIMARY KEY,
        title TEXT,
        authors TEXT,                            -- JSON array
        year INTEGER,
        journal TEXT,
        journal_tier INTEGER,
        study_type TEXT,
        is_canonical INTEGER DEFAULT 0,
        taxonomy_version TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS goals (
        goal_id TEXT PRIMARY KEY,
        description TEXT NOT NULL,
        priority TEXT NOT NULL CHECK(priority IN ('high','medium','low')),
        status TEXT NOT NULL DEFAULT 'Active',
        linked_claim_ids TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS decisions (
        decision_id TEXT PRIMARY KEY,
        recommended_action TEXT NOT NULL,
        action_description TEXT,
        expected_information_gain INTEGER,
        linked_goal_id TEXT,
        linked_claim_ids TEXT,
        status TEXT DEFAULT 'Proposed',
        priority TEXT DEFAULT 'medium',
        estimated_effort TEXT,
        source TEXT DEFAULT 'gap_analysis',      -- gap_analysis, meta_improver, human
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS api_usage_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT NOT NULL,
        model TEXT NOT NULL,
        tokens_in INTEGER NOT NULL,
        tokens_out INTEGER NOT NULL,
        cost_usd REAL NOT NULL,
        task_type TEXT,
        layer TEXT
    );

    CREATE TABLE IF NOT EXISTS system_state (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );
    """)

        # Set initial system state. INSERT OR IGNORE keeps any value that is
        # already stored under the same key.
        for key, value in [
            ("schema_version", SCHEMA_VERSION),
            ("pipeline_version", PIPELINE_VERSION),
            ("setup_phase", "0"),
            ("total_papers_ingested", "0"),
            ("total_claims_extracted", "0"),
        ]:
            conn.execute(
                "INSERT OR IGNORE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)",
                (key, value, now_iso())
            )

        conn.commit()
    finally:
        conn.close()


def get_state(db_path: Optional[str] = None, key: Optional[str] = None) -> Optional[str]:
    """Read one value from the ``system_state`` table.

    Args:
        db_path: Database file to read; passed to ``get_db``, which falls back
            to ``DB_PATH`` when it is falsy.
        key: The state key to look up.

    Returns:
        The stored value, or ``None`` when no row matches ``key``.

    Raises:
        sqlite3.Error: If the query fails (for example the table does not
            exist). The connection is closed before the exception propagates.
    """
    conn = get_db(db_path)
    # Close the connection even when the query raises, so a failed lookup
    # does not leak an open handle.
    try:
        row = conn.execute(
            "SELECT value FROM system_state WHERE key = ?", (key,)
        ).fetchone()
    finally:
        conn.close()
    return row[0] if row else None


def set_state(db_path: Optional[str] = None, key: Optional[str] = None, value: Optional[str] = None):
    """Insert or overwrite one row of the ``system_state`` table.

    The row's ``updated_at`` column is set to the current ``now_iso()``
    timestamp.

    Args:
        db_path: Database file to write; passed to ``get_db``, which falls
            back to ``DB_PATH`` when it is falsy.
        key: The state key to write.
        value: The value to store under ``key``.

    Raises:
        sqlite3.Error: If the write fails (the ``value`` column is declared
            NOT NULL in the schema). The connection is closed before the
            exception propagates.
    """
    conn = get_db(db_path)
    # Close the connection even when the insert or commit raises.
    try:
        conn.execute(
            "INSERT OR REPLACE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)",
            (key, value, now_iso())
        )
        conn.commit()
    finally:
        conn.close()


def get_stats(db_path: Optional[str] = None) -> dict:
    """Return the row count of each core table.

    Args:
        db_path: Database file to inspect; passed to ``get_db``, which falls
            back to ``DB_PATH`` when it is falsy.

    Returns:
        A dict mapping each table name to its ``COUNT(*)``. A table whose
        count query fails with a SQLite error (for example because it has not
        been created yet) is reported as ``0``.
    """
    # Fixed, trusted table names: interpolating them into the SQL text is not
    # an injection vector (identifiers cannot be bound as parameters).
    tables = (
        "documents", "regions", "claims", "canonical_claims",
        "graph_nodes", "graph_edges", "conflicts", "sources", "goals",
    )
    stats = {}
    conn = get_db(db_path)
    try:
        for table in tables:
            try:
                row = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()
            except sqlite3.Error:
                # Best effort: only database errors are swallowed, so
                # KeyboardInterrupt and programming errors still surface.
                stats[table] = 0
            else:
                stats[table] = row[0]
    finally:
        conn.close()
    return stats