nkshirsa committed on
Commit
d555e10
·
verified ·
1 Parent(s): d45d708

v2.0: phd_research_os_v2/core/database.py

Browse files
Files changed (1) hide show
  1. phd_research_os_v2/core/database.py +444 -0
phd_research_os_v2/core/database.py ADDED
@@ -0,0 +1,444 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhD Research OS v2.0 β€” Core Database
3
+ ======================================
4
+ SQLite database supporting all 7 layers. Fixed-point math throughout.
5
+ Every table has schema_version. Every record has created_at.
6
+ """
7
+
8
+ import sqlite3
9
+ import json
10
+ import uuid
11
+ import os
12
+ import hashlib
13
+ from datetime import datetime, timezone
14
+ from typing import Optional
15
+
16
+ SCHEMA_VERSION = "2.0"
17
+ PIPELINE_VERSION = "2.1.0"
18
+ DB_PATH = os.environ.get("RESEARCH_OS_DB", "data/research_os_v2.db")
19
+
20
+
21
def to_fixed(value: float) -> Optional[int]:
    """Convert a float to a fixed-point integer (×1000). Research OS Rule 5.

    Mirrors ``from_fixed``: a ``None`` input passes through as ``None`` so
    values read from nullable fixed-point DB columns can round-trip without
    crashing. (Previously ``to_fixed(None)`` raised ``TypeError``.)

    Args:
        value: The float to scale, or ``None``.

    Returns:
        ``value * 1000`` rounded to the nearest integer (Python's built-in
        ``round``, i.e. banker's rounding on exact halves), or ``None`` when
        the input is ``None``.
    """
    if value is None:
        return None
    return round(value * 1000)
24
+
25
def from_fixed(value: int) -> float:
    """Convert a fixed-point integer (×1000) back to a float.

    A ``None`` input (e.g. a NULL database column) maps to ``0.0``.
    """
    if value is None:
        return 0.0
    return value / 1000.0
28
+
29
def now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    utc_now = datetime.now(timezone.utc)
    return utc_now.isoformat()
31
+
32
def gen_id(prefix: str) -> str:
    """Generate a readable unique id: ``<prefix>_`` + 8 uppercase hex chars."""
    token = uuid.uuid4().hex[:8].upper()
    return f"{prefix}_{token}"
34
+
35
def hash_text(text: str) -> str:
    """Return a short content fingerprint: first 16 hex chars of SHA-256."""
    digest = hashlib.sha256(text.encode()).hexdigest()
    return digest[:16]
37
+
38
+
39
def get_db(db_path: str = None) -> sqlite3.Connection:
    """Open a SQLite connection with the project's standard settings.

    Ensures the database's parent directory exists, enables WAL journaling
    and foreign-key enforcement, and returns rows as ``sqlite3.Row`` so
    columns are addressable by name.

    Args:
        db_path: Path to the database file; defaults to ``DB_PATH``.

    Returns:
        An open ``sqlite3.Connection``.
    """
    path = db_path or DB_PATH
    parent = os.path.dirname(path) or "."
    os.makedirs(parent, exist_ok=True)
    conn = sqlite3.connect(path)
    conn.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        conn.execute(pragma)
    return conn
47
+
48
+
49
def init_db(db_path: str = None):
    """Initialize ALL tables for the complete 7-layer system.

    Idempotent: every statement uses ``IF NOT EXISTS`` / ``INSERT OR
    IGNORE``, so calling this on an existing database is safe.

    Args:
        db_path: Optional database path; defaults to ``DB_PATH`` via get_db.
    """
    conn = get_db(db_path)
    # One script creates the full schema: layers 0-7 plus cross-cutting
    # tables. Fixed-point INTEGER columns store floats ×1000 (Rule 5).
    conn.executescript("""
    -- ═══════════════════════════════════════════════════════════
    -- LAYER 0: Structural Ingestion
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS documents (
        doc_id TEXT PRIMARY KEY,
        file_path TEXT NOT NULL,
        doc_type TEXT NOT NULL DEFAULT 'main', -- main, supplement, dataset, code_repo
        title TEXT,
        doi TEXT,
        arxiv_id TEXT,
        vor_status TEXT DEFAULT 'unknown', -- preprint, vor, erratum, retracted
        lineage_parent TEXT, -- doc_id of parent version
        parse_method TEXT,
        parse_quality_avg INTEGER, -- Fixed-point ×1000
        total_regions INTEGER DEFAULT 0,
        total_pages INTEGER DEFAULT 0,
        ingestion_status TEXT DEFAULT 'pending', -- pending, processing, complete, failed
        metadata TEXT, -- JSON
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS regions (
        region_id TEXT PRIMARY KEY,
        doc_id TEXT NOT NULL,
        page INTEGER NOT NULL,
        bbox TEXT, -- JSON: [x1, y1, x2, y2]
        region_type TEXT NOT NULL, -- body_text, table, figure, equation, caption, header, reference, footnote
        section TEXT,
        subsection TEXT,
        content_text TEXT,
        content_markdown TEXT,
        parse_method TEXT,
        parse_confidence INTEGER, -- Fixed-point ×1000
        ocr_source INTEGER DEFAULT 0,
        extraction_status TEXT DEFAULT 'extractable', -- extractable, low_confidence, unextractable
        quality_flags TEXT, -- JSON array
        cross_refs TEXT, -- JSON array of {ref_text, ref_type, resolved_to, verified}
        figure_type TEXT, -- scatter_plot, bar_chart, diagram, micrograph, schematic, null
        digitized_data TEXT, -- JSON: recovered data points from plot
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        FOREIGN KEY(doc_id) REFERENCES documents(doc_id)
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 1: Entity Resolution
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS entities (
        entity_id TEXT PRIMARY KEY,
        canonical_name TEXT NOT NULL,
        entity_type TEXT NOT NULL, -- gene, protein, chemical, assay, disease, instrument, method
        aliases TEXT, -- JSON array of alternative names
        external_ids TEXT, -- JSON: {uniprot: ..., pubchem: ..., mesh: ...}
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS citation_chains (
        chain_id TEXT PRIMARY KEY,
        citing_doc TEXT NOT NULL,
        cited_doi TEXT,
        cited_title TEXT,
        in_text_ref TEXT, -- e.g., "[32]"
        is_in_knowledge_base INTEGER DEFAULT 0,
        resolved_doc_id TEXT,
        chain_type TEXT DEFAULT 'direct', -- direct, inherited, self_cite
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        FOREIGN KEY(citing_doc) REFERENCES documents(doc_id)
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 2: Qualified Extraction (Claims)
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS claims (
        claim_id TEXT PRIMARY KEY,
        canonical_id TEXT, -- Layer 3: canonical claim this maps to
        text TEXT NOT NULL,
        epistemic_tag TEXT NOT NULL CHECK(epistemic_tag IN
            ('Fact','Interpretation','Hypothesis','Conflict_Hypothesis')),

        -- Confidence components (all fixed-point ×1000)
        evidence_strength INTEGER,
        study_quality_weight INTEGER,
        journal_tier_weight INTEGER,
        completeness_penalty INTEGER,
        section_modifier INTEGER,
        qualifier_penalty INTEGER,

        -- Computed scores (code-computed, NOT LLM-stated)
        evidence_quality INTEGER, -- Fixed-point ×1000
        truth_likelihood INTEGER, -- Fixed-point ×1000
        qualifier_strength_score INTEGER, -- Fixed-point ×1000
        composite_confidence INTEGER, -- Fixed-point ×1000

        status TEXT NOT NULL CHECK(status IN ('Complete','Incomplete','Unextractable')),
        is_null_result INTEGER DEFAULT 0,
        is_inherited_citation INTEGER DEFAULT 0,
        causal_direction TEXT DEFAULT 'unspecified', -- observed_correlation, causal_claim, unspecified
        practical_significance INTEGER DEFAULT 1,

        -- Qualifiers
        qualifiers TEXT, -- JSON array
        missing_fields TEXT, -- JSON array

        -- Statistical evidence
        stat_p_value REAL,
        stat_effect_size REAL,
        stat_effect_type TEXT,
        stat_sample_size INTEGER,
        stat_ci_lower REAL,
        stat_ci_upper REAL,

        -- Source provenance
        source_quote TEXT,
        source_page INTEGER,
        source_bbox TEXT, -- JSON: [x1, y1, x2, y2]
        source_section TEXT,
        source_region_id TEXT,
        source_doc_id TEXT,
        source_doi TEXT,

        -- Council provenance
        council_votes TEXT, -- JSON: {member: {tag, reasoning}}

        -- Granularity
        granularity TEXT DEFAULT 'atomic', -- atomic, aggregate
        parent_claim_id TEXT,

        -- Version tracking
        ontology_version TEXT,
        pipeline_version TEXT DEFAULT '2.1.0',
        taxonomy_version TEXT,
        extraction_timestamp TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 3: Canonicalization
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS canonical_claims (
        canonical_id TEXT PRIMARY KEY,
        representative_text TEXT NOT NULL,
        epistemic_tag TEXT NOT NULL,
        composite_confidence INTEGER, -- Aggregated across sources
        evidence_count INTEGER DEFAULT 1,
        source_dois TEXT, -- JSON array
        aliases TEXT, -- JSON array of claim_ids
        version_history TEXT, -- JSON array of {version, source, confidence, date}
        current_version INTEGER DEFAULT 1,
        supersedes TEXT,
        superseded_by TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 4: Knowledge Graph
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS graph_nodes (
        node_id TEXT PRIMARY KEY,
        node_type TEXT NOT NULL, -- claim, entity, method, condition, lab
        label TEXT NOT NULL,
        properties TEXT, -- JSON
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS graph_edges (
        edge_id TEXT PRIMARY KEY,
        source_node TEXT NOT NULL,
        target_node TEXT NOT NULL,
        edge_type TEXT NOT NULL, -- supports, refutes, extends, depends_on, supersedes, blocks, investigative_hypothesis
        confidence INTEGER NOT NULL, -- Fixed-point ×1000
        evidence_sources TEXT, -- JSON array of DOIs
        is_inferred INTEGER DEFAULT 0,
        inference_chain TEXT, -- JSON
        method_compatible INTEGER, -- NULL, 0, 1
        resolution_id TEXT,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL,
        FOREIGN KEY(source_node) REFERENCES graph_nodes(node_id),
        FOREIGN KEY(target_node) REFERENCES graph_nodes(node_id)
    );

    CREATE INDEX IF NOT EXISTS idx_edges_source ON graph_edges(source_node);
    CREATE INDEX IF NOT EXISTS idx_edges_target ON graph_edges(target_node);
    CREATE INDEX IF NOT EXISTS idx_edges_type ON graph_edges(edge_type);

    CREATE TABLE IF NOT EXISTS conflicts (
        conflict_id TEXT PRIMARY KEY,
        claim_a_id TEXT NOT NULL,
        claim_b_id TEXT NOT NULL,
        conflict_type TEXT NOT NULL,
        generated_hypothesis TEXT,
        hypothesis_confidence TEXT DEFAULT 'low',
        resolution_status TEXT DEFAULT 'Unresolved',
        resolution_id TEXT,
        comparability_confidence INTEGER, -- Fixed-point ×1000
        method_comparison TEXT, -- JSON
        case_file_id TEXT,
        key_differences TEXT, -- JSON
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        resolved_at TEXT
    );

    CREATE TABLE IF NOT EXISTS case_files (
        case_file_id TEXT PRIMARY KEY,
        root_cause TEXT NOT NULL,
        conflict_ids TEXT NOT NULL, -- JSON array
        status TEXT DEFAULT 'open',
        resolution_summary TEXT,
        created_at TEXT NOT NULL,
        resolved_at TEXT
    );

    CREATE TABLE IF NOT EXISTS resolutions (
        resolution_id TEXT PRIMARY KEY,
        conflict_id TEXT,
        case_file_id TEXT,
        resolved_by TEXT NOT NULL,
        resolution_type TEXT NOT NULL,
        rationale TEXT NOT NULL,
        evidence_cited TEXT, -- JSON array
        status TEXT DEFAULT 'active', -- active, under_review, superseded
        superseded_by TEXT,
        downstream_resolutions TEXT, -- JSON array
        created_at TEXT NOT NULL,
        reopened_at TEXT,
        reopen_reason TEXT
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 5: Scoring (calibration tracking)
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS calibration_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        claim_id TEXT,
        system_confidence INTEGER, -- Fixed-point ×1000
        human_judgment TEXT, -- correct, partially_correct, incorrect
        brier_contribution REAL,
        timestamp TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 6: Evaluation
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS eval_runs (
        run_id TEXT PRIMARY KEY,
        run_type TEXT NOT NULL, -- regression, llm_judge, calibration, holdout
        metrics TEXT NOT NULL, -- JSON
        passed INTEGER,
        pipeline_version TEXT,
        model_checkpoint TEXT,
        prompt_hash TEXT,
        created_at TEXT NOT NULL
    );

    -- ═══════════════════════════════════════════════════════════
    -- LAYER 7: Provenance
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS pipeline_lineage (
        lineage_id TEXT PRIMARY KEY,
        claim_id TEXT NOT NULL,
        pipeline_version TEXT NOT NULL,
        model_checkpoint TEXT,
        parser_version TEXT,
        taxonomy_version TEXT,
        prompt_hash TEXT,
        extraction_timestamp TEXT NOT NULL,
        FOREIGN KEY(claim_id) REFERENCES claims(claim_id)
    );

    -- ═══════════════════════════════════════════════════════════
    -- CROSS-CUTTING: Sources, Goals, Taxonomy, API Logging
    -- ═══════════════════════════════════════════════════════════

    CREATE TABLE IF NOT EXISTS sources (
        doi TEXT PRIMARY KEY,
        title TEXT,
        authors TEXT, -- JSON array
        year INTEGER,
        journal TEXT,
        journal_tier INTEGER,
        study_type TEXT,
        is_canonical INTEGER DEFAULT 0,
        taxonomy_version TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS goals (
        goal_id TEXT PRIMARY KEY,
        description TEXT NOT NULL,
        priority TEXT NOT NULL CHECK(priority IN ('high','medium','low')),
        status TEXT NOT NULL DEFAULT 'Active',
        linked_claim_ids TEXT,
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS decisions (
        decision_id TEXT PRIMARY KEY,
        recommended_action TEXT NOT NULL,
        action_description TEXT,
        expected_information_gain INTEGER,
        linked_goal_id TEXT,
        linked_claim_ids TEXT,
        status TEXT DEFAULT 'Proposed',
        priority TEXT DEFAULT 'medium',
        estimated_effort TEXT,
        source TEXT DEFAULT 'gap_analysis', -- gap_analysis, meta_improver, human
        schema_version TEXT DEFAULT '2.0',
        created_at TEXT NOT NULL
    );

    CREATE TABLE IF NOT EXISTS api_usage_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT NOT NULL,
        model TEXT NOT NULL,
        tokens_in INTEGER NOT NULL,
        tokens_out INTEGER NOT NULL,
        cost_usd REAL NOT NULL,
        task_type TEXT,
        layer TEXT
    );

    CREATE TABLE IF NOT EXISTS system_state (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL,
        updated_at TEXT NOT NULL
    );
    """)

    # Set initial system state. INSERT OR IGNORE seeds defaults without
    # overwriting values a previously initialized database may have updated.
    for key, value in [
        ("schema_version", SCHEMA_VERSION),
        ("pipeline_version", PIPELINE_VERSION),
        ("setup_phase", "0"),
        ("total_papers_ingested", "0"),
        ("total_claims_extracted", "0"),
    ]:
        conn.execute(
            "INSERT OR IGNORE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)",
            (key, value, now_iso())
        )

    conn.commit()
    conn.close()
415
+
416
+
417
def get_state(db_path: str = None, key: str = None) -> Optional[str]:
    """Look up a single value in the ``system_state`` table.

    Args:
        db_path: Optional database path override.
        key: State key to read.

    Returns:
        The stored value, or ``None`` when the key is absent.
    """
    conn = get_db(db_path)
    query = "SELECT value FROM system_state WHERE key = ?"
    row = conn.execute(query, (key,)).fetchone()
    conn.close()
    if row is None:
        return None
    return row[0]
422
+
423
+
424
def set_state(db_path: str = None, key: str = None, value: str = None):
    """Upsert a key/value pair into ``system_state``, stamping ``updated_at``.

    Args:
        db_path: Optional database path override.
        key: State key to write.
        value: Value to store for the key.
    """
    conn = get_db(db_path)
    params = (key, value, now_iso())
    conn.execute(
        "INSERT OR REPLACE INTO system_state (key, value, updated_at) VALUES (?, ?, ?)",
        params,
    )
    conn.commit()
    conn.close()
432
+
433
+
434
def get_stats(db_path: str = None) -> dict:
    """Return row counts for the main tables.

    Tables that do not exist yet (e.g. before ``init_db`` has run) report
    0 instead of raising.

    Args:
        db_path: Optional database path override.

    Returns:
        Mapping of table name -> row count.
    """
    conn = get_db(db_path)
    stats = {}
    try:
        for table in ["documents", "regions", "claims", "canonical_claims",
                      "graph_nodes", "graph_edges", "conflicts", "sources", "goals"]:
            try:
                # Table names come from the fixed list above, so the f-string
                # interpolation is not an injection risk.
                stats[table] = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
            except sqlite3.Error:
                # Was a bare `except:` — narrow to SQLite errors so real bugs
                # (and KeyboardInterrupt) are no longer silently swallowed.
                stats[table] = 0
    finally:
        # Close the connection even if an unexpected error escapes the loop.
        conn.close()
    return stats