nkshirsa committed on
Commit
eb25e86
·
verified ·
1 Parent(s): 88f66d8

Add Quantum-Bio taxonomy V2: phd_research_os/taxonomy.py

Browse files
Files changed (1) hide show
  1. phd_research_os/taxonomy.py +604 -0
phd_research_os/taxonomy.py ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhD Research OS — Quantum-Bio Taxonomy V2
3
+ ===========================================
4
+ 8-tier study type taxonomy with domain management, migration, and rollback.
5
+
6
+ Implements the Quantum-Bio Taxonomy V2 specification:
7
+ - 8 study types with calibrated weights
8
+ - Backward compatibility with legacy 4-tier system
9
+ - Idempotent SQLite migrations
10
+ - Cache invalidation matrix
11
+ - Multi-domain taxonomy support (add new STEM domains)
12
+
13
+ All weights use FIXED-POINT math (×1000) per Research OS Rule 5.
14
+ """
15
+
16
+ import json
17
+ import os
18
+ import sqlite3
19
+ import shutil
20
+ import hashlib
21
+ from datetime import datetime, timezone
22
+ from typing import Optional
23
+ from dataclasses import dataclass, field, asdict
24
+
25
+ from .db import get_db, init_db, now_iso, gen_id, to_fixed, from_fixed
26
+
27
+ # ============================================================
28
+ # Version Constants
29
+ # ============================================================
30
+
31
# Version tags: returned with every confidence score and folded into cache keys.
TAXONOMY_VERSION = "quantum_bio_v1"
PIPELINE_VERSION = "2.1.0"

# ============================================================
# 8-Tier Quantum-Bio Study Types
# ============================================================

# Quality weight per canonical study type, stored as fixed-point integers
# (weight x 1000) and listed from strongest to weakest evidence.
STUDY_TYPE_WEIGHTS = {
    # 1.000
    "in_vivo": 1000,
    "direct_physical_measurement": 1000,
    # 0.950
    "mathematical_proof": 950,
    # 0.850
    "in_vitro": 850,
    # 0.800
    "first_principles_simulation": 800,
    # 0.600
    "phenomenological_simulation": 600,
    # 0.400
    "review": 400,
    # 0.200
    "perspective": 200,
}

# Canonical study type names, in the same order as the weight table.
ALLOWED_STUDY_TYPES = list(STUDY_TYPE_WEIGHTS)

# Human-readable description of each canonical study type (shown in the UI).
STUDY_TYPE_DESCRIPTIONS = {
    "in_vivo":
        "Living organism experiments, clinical trials, animal studies",
    "direct_physical_measurement":
        "Direct instrumental measurements without biological intermediaries",
    "mathematical_proof":
        "Formal mathematical derivations and proofs",
    "in_vitro":
        "Cell culture, tissue samples, ex vivo experiments",
    "first_principles_simulation":
        "Ab initio calculations, DFT, quantum mechanical simulations",
    "phenomenological_simulation":
        "Empirical models, fitted parameters, coarse-grained simulations",
    "review":
        "Meta-analyses, systematic reviews, literature surveys",
    "perspective":
        "Opinion pieces, commentaries, editorials, hypotheses",
}
62
+
63
+ # ============================================================
64
+ # Legacy V1 → V2 Mapping
65
+ # ============================================================
66
+
67
# Lookup from any accepted spelling of a study type to its canonical V2 name.
# Keys are expected to be lower-case already; entry order is kept stable.
LEGACY_TO_V2_MAP = {
    # --- the four legacy tiers and their spelling variants ---
    "primaryexperimental": "direct_physical_measurement",
    "primary_experimental": "direct_physical_measurement",
    "invitro": "in_vitro",
    "in_vitro": "in_vitro",
    "simulation": "phenomenological_simulation",
    "review": "review",
    "review_non_systematic": "review",

    # --- extra aliases seen in source metadata ---
    "meta_analysis": "review",
    "meta-analysis": "review",
    "clinical": "in_vivo",
    "clinical_trial": "in_vivo",
    "case_study": "perspective",
    "preprint": "perspective",
    "rct": "in_vivo",
    "cohort": "in_vivo",
    "case_control": "in_vivo",
    "cross_sectional": "in_vivo",
    "case_report": "perspective",
    "opinion": "perspective",

    # --- V2 names map to themselves, so normalizing twice is harmless ---
    "in_vivo": "in_vivo",
    "direct_physical_measurement": "direct_physical_measurement",
    "mathematical_proof": "mathematical_proof",
    "first_principles_simulation": "first_principles_simulation",
    "phenomenological_simulation": "phenomenological_simulation",
    "perspective": "perspective",
}

# Reverse direction, used when rolling back: each V2 type collapses onto the
# closest legacy tier. Several V2 types share a tier, so this is lossy.
V2_TO_LEGACY_MAP = {
    "in_vivo": "primary_experimental",
    "direct_physical_measurement": "primary_experimental",
    "mathematical_proof": "primary_experimental",
    "in_vitro": "in_vitro",
    "first_principles_simulation": "simulation",
    "phenomenological_simulation": "simulation",
    "review": "review_non_systematic",
    "perspective": "review_non_systematic",
}
109
+
110
+
111
+ # ============================================================
112
+ # Domain Taxonomy Management
113
+ # ============================================================
114
+
115
@dataclass
class DomainTaxonomy:
    """Domain-specific overlay that adds (or re-weights) study types on top of the base taxonomy."""

    domain_id: str
    name: str
    description: str
    # Maps type name -> {"weight": int, "description": str}
    custom_study_types: dict
    parent_domain: Optional[str] = None
    created_at: str = ""
    is_active: bool = True

    def get_all_weights(self) -> dict:
        """Return the base weights overlaid with this domain's weights.

        A domain entry whose name matches a base study type replaces the
        base weight; every other domain entry is appended after the base
        types.
        """
        domain_weights = {
            type_name: info["weight"]
            for type_name, info in self.custom_study_types.items()
        }
        return {**STUDY_TYPE_WEIGHTS, **domain_weights}
132
+
133
+
134
+ # Default domain taxonomies
135
def _custom_type(weight: int, description: str) -> dict:
    """Build one custom study type entry (fixed-point weight x1000 plus description)."""
    return {"weight": weight, "description": description}


# Domains seeded into the database on first use, keyed by domain_id.
DEFAULT_DOMAINS = {
    "quantum_bio": {
        "name": "Quantum-Bio (Default)",
        "description": "Core 8-tier taxonomy for quantum mechanics and biological systems",
        "custom_study_types": {},
    },
    "materials_science": {
        "name": "Materials Science",
        "description": "Extended taxonomy for materials characterization and synthesis",
        "custom_study_types": {
            "characterization": _custom_type(
                950, "XRD, SEM, TEM, AFM, Raman — direct structural/chemical measurement"
            ),
            "synthesis_report": _custom_type(
                800, "Novel material synthesis with reproducibility data"
            ),
            "device_fabrication": _custom_type(
                750, "Fabricated device performance measurements"
            ),
        },
    },
    "biosensors": {
        "name": "Biosensors & Diagnostics",
        "description": "Taxonomy for biosensor development and clinical diagnostics",
        "custom_study_types": {
            "clinical_validation": _custom_type(
                1000, "Tested with real clinical samples (blood, serum, saliva)"
            ),
            "spiked_sample": _custom_type(
                850, "Known analyte spiked into buffer or simplified matrix"
            ),
            "buffer_only": _custom_type(
                700, "Measurements in clean buffer solution only"
            ),
            "selectivity_panel": _custom_type(
                800, "Cross-reactivity testing against panel of interferents"
            ),
        },
    },
    "computational_chemistry": {
        "name": "Computational Chemistry",
        "description": "Taxonomy for computational and theoretical chemistry methods",
        "custom_study_types": {
            "coupled_cluster": _custom_type(
                950, "CCSD(T) or higher-level coupled cluster calculations"
            ),
            "dft_hybrid": _custom_type(
                850, "Hybrid DFT (B3LYP, PBE0, etc.) with verified basis sets"
            ),
            "semi_empirical": _custom_type(
                650, "AM1, PM3, DFTB, or other semi-empirical methods"
            ),
            "force_field_md": _custom_type(
                600, "Classical MD with empirical force fields"
            ),
            "machine_learned_potential": _custom_type(
                750, "Neural network potentials, GAP, ACE fitted to QM data"
            ),
        },
    },
    "neuroscience": {
        "name": "Neuroscience",
        "description": "Taxonomy for neuroscience and brain imaging studies",
        "custom_study_types": {
            "fmri_task": _custom_type(
                850, "Task-based fMRI with proper controls and correction"
            ),
            "eeg_erp": _custom_type(
                800, "EEG event-related potential studies"
            ),
            "lesion_study": _custom_type(
                900, "Natural lesion or TMS studies establishing causal role"
            ),
            "behavioral_only": _custom_type(
                700, "Behavioral measures without neural recording"
            ),
        },
    },
}
182
+
183
+
184
+ # ============================================================
185
+ # Database Schema Extension
186
+ # ============================================================
187
+
188
def init_taxonomy_db(db_path: Optional[str] = None):
    """Create the taxonomy tables in the Research OS database if they are missing.

    Runs the base ``init_db`` first, then creates ``domain_taxonomies``,
    ``taxonomy_audit_log`` and ``study_type_overrides`` with
    ``CREATE TABLE IF NOT EXISTS``, so calling this repeatedly is harmless.

    Args:
        db_path: Database location, passed unchanged to ``init_db`` and
            ``get_db``. ``None`` lets those helpers pick their own default.
    """
    init_db(db_path)
    conn = get_db(db_path)
    try:
        conn.executescript("""
            CREATE TABLE IF NOT EXISTS domain_taxonomies (
                domain_id TEXT PRIMARY KEY,
                name TEXT NOT NULL,
                description TEXT,
                custom_study_types TEXT NOT NULL, -- JSON
                parent_domain TEXT,
                is_active INTEGER DEFAULT 1,
                created_at TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                schema_version TEXT NOT NULL DEFAULT '1.0'
            );

            CREATE TABLE IF NOT EXISTS taxonomy_audit_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                action TEXT NOT NULL,
                domain_id TEXT,
                details TEXT,
                timestamp TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS study_type_overrides (
                override_id TEXT PRIMARY KEY,
                domain_id TEXT NOT NULL,
                study_type TEXT NOT NULL,
                custom_weight INTEGER NOT NULL, -- Fixed-point ×1000
                description TEXT,
                rationale TEXT NOT NULL,
                created_by TEXT NOT NULL,
                created_at TEXT NOT NULL,
                FOREIGN KEY(domain_id) REFERENCES domain_taxonomies(domain_id)
            );
        """)
        conn.commit()
    finally:
        # Close even when the script fails, so the handle is never leaked.
        conn.close()
228
+
229
+
230
+ # ============================================================
231
+ # Taxonomy Manager
232
+ # ============================================================
233
+
234
class TaxonomyManager:
    """
    Manages domain taxonomies for the Research OS.

    Provides:
    - CRUD for domain taxonomies
    - Study type normalization (legacy → V2)
    - Confidence scoring with domain-aware weights
    - Migration and rollback
    - Cache invalidation

    Every method opens its own short-lived connection through ``get_db`` and
    closes it in a ``finally`` block, so a failing statement never leaks a
    database handle.
    """

    # (table, key column) pairs whose study_type values carry a taxonomy
    # version tag. These are internal constants, which is why they may be
    # interpolated into SQL text below; never put caller input here.
    _VERSIONED_TABLES = (("claims", "claim_id"), ("sources", "doi"))

    def __init__(self, db_path: Optional[str] = None):
        """Open (and if needed initialise) the taxonomy store.

        Args:
            db_path: Database file. Falls back to the ``RESEARCH_OS_DB``
                environment variable, then to ``data/research_os.db``.
        """
        self.db_path = db_path or os.environ.get("RESEARCH_OS_DB", "data/research_os.db")
        init_taxonomy_db(self.db_path)
        self._ensure_default_domains()

    def _ensure_default_domains(self):
        """Seed default domain taxonomies if they don't exist."""
        conn = get_db(self.db_path)
        try:
            for domain_id, info in DEFAULT_DOMAINS.items():
                existing = conn.execute(
                    "SELECT 1 FROM domain_taxonomies WHERE domain_id = ?", (domain_id,)
                ).fetchone()
                if existing:
                    continue
                now = now_iso()
                conn.execute("""
                    INSERT INTO domain_taxonomies (domain_id, name, description,
                        custom_study_types, is_active, created_at, updated_at, schema_version)
                    VALUES (?, ?, ?, ?, 1, ?, ?, '1.0')
                """, (domain_id, info["name"], info["description"],
                      json.dumps(info["custom_study_types"]), now, now))
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def _row_to_domain(row) -> dict:
        """Convert a domain_taxonomies row to a dict with its JSON column decoded."""
        domain = dict(row)
        # Treat a missing/NULL payload as "no custom types" instead of crashing.
        domain["custom_study_types"] = json.loads(domain.get("custom_study_types") or "{}")
        return domain

    # ============================================================
    # Domain CRUD
    # ============================================================

    def create_domain(self, domain_id: str, name: str, description: str,
                      custom_study_types: Optional[dict] = None,
                      parent_domain: Optional[str] = None) -> str:
        """Create a new domain taxonomy and record it in the audit log.

        Args:
            domain_id: Primary key of the new domain.
            name: Display name.
            description: Free-text description.
            custom_study_types: Mapping of type name to
                ``{"weight": int, "description": str}``; defaults to empty.
            parent_domain: Optional id of a parent domain.

        Returns:
            The ``domain_id`` that was inserted.

        Raises:
            sqlite3.Error: Propagated from the INSERT, for example when
                ``domain_id`` already exists. The connection is still closed.
        """
        conn = get_db(self.db_path)
        try:
            now = now_iso()
            conn.execute("""
                INSERT INTO domain_taxonomies (domain_id, name, description,
                    custom_study_types, parent_domain, is_active, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, 1, ?, ?)
            """, (domain_id, name, description,
                  json.dumps(custom_study_types or {}), parent_domain, now, now))
            self._log_audit(conn, "create_domain", domain_id, f"Created domain: {name}")
            conn.commit()
        finally:
            conn.close()
        return domain_id

    def get_domain(self, domain_id: str) -> Optional[dict]:
        """Return one domain as a dict (JSON decoded), or None if it does not exist.

        Inactive (soft-deleted) domains are returned as well.
        """
        conn = get_db(self.db_path)
        try:
            row = conn.execute(
                "SELECT * FROM domain_taxonomies WHERE domain_id = ?", (domain_id,)
            ).fetchone()
        finally:
            conn.close()
        return self._row_to_domain(row) if row else None

    def list_domains(self, active_only: bool = True) -> list:
        """List domain taxonomies ordered by name.

        Args:
            active_only: When True (default) soft-deleted domains are skipped.
        """
        query = "SELECT * FROM domain_taxonomies ORDER BY name"
        if active_only:
            query = "SELECT * FROM domain_taxonomies WHERE is_active = 1 ORDER BY name"
        conn = get_db(self.db_path)
        try:
            rows = conn.execute(query).fetchall()
        finally:
            conn.close()
        return [self._row_to_domain(row) for row in rows]

    def update_domain(self, domain_id: str, name: Optional[str] = None,
                      description: Optional[str] = None,
                      custom_study_types: Optional[dict] = None) -> bool:
        """Update the given fields of a domain and bump its ``updated_at``.

        Arguments left as None are not touched.

        Returns:
            True when a row was updated, False when ``domain_id`` does not
            exist (in which case nothing is written to the audit log).
        """
        updates, values = [], []
        if name is not None:
            updates.append("name = ?")
            values.append(name)
        if description is not None:
            updates.append("description = ?")
            values.append(description)
        if custom_study_types is not None:
            updates.append("custom_study_types = ?")
            values.append(json.dumps(custom_study_types))
        updates.append("updated_at = ?")
        values.append(now_iso())
        values.append(domain_id)

        conn = get_db(self.db_path)
        try:
            cursor = conn.execute(
                f"UPDATE domain_taxonomies SET {', '.join(updates)} WHERE domain_id = ?",
                values,
            )
            if cursor.rowcount < 1:
                # Unknown domain: report failure rather than a phantom update.
                return False
            self._log_audit(conn, "update_domain", domain_id, f"Updated: {updates}")
            conn.commit()
            return True
        finally:
            conn.close()

    def delete_domain(self, domain_id: str) -> bool:
        """Soft-delete a domain (set inactive). Cannot delete quantum_bio base.

        Returns:
            True when the domain was marked inactive; False for the protected
            ``quantum_bio`` base taxonomy or an unknown ``domain_id``.
        """
        if domain_id == "quantum_bio":
            return False  # Cannot delete base taxonomy
        conn = get_db(self.db_path)
        try:
            cursor = conn.execute(
                "UPDATE domain_taxonomies SET is_active = 0, updated_at = ? WHERE domain_id = ?",
                (now_iso(), domain_id),
            )
            if cursor.rowcount < 1:
                return False
            self._log_audit(conn, "delete_domain", domain_id, "Soft-deleted")
            conn.commit()
            return True
        finally:
            conn.close()

    def add_study_type(self, domain_id: str, type_name: str,
                       weight: float, description: str) -> bool:
        """Add (or overwrite) a custom study type on a domain.

        Args:
            weight: Weight as a float; stored via ``to_fixed``.

        Returns:
            False when the domain does not exist, otherwise the result of
            the underlying ``update_domain`` call.
        """
        domain = self.get_domain(domain_id)
        if not domain:
            return False

        types = domain["custom_study_types"]
        types[type_name] = {
            "weight": to_fixed(weight),
            "description": description,
        }
        return self.update_domain(domain_id, custom_study_types=types)

    def remove_study_type(self, domain_id: str, type_name: str) -> bool:
        """Remove a custom study type from a domain.

        Returns:
            False when the domain or the type is unknown.
        """
        domain = self.get_domain(domain_id)
        if not domain:
            return False
        types = domain["custom_study_types"]
        if type_name not in types:
            return False
        del types[type_name]
        return self.update_domain(domain_id, custom_study_types=types)

    # ============================================================
    # Study Type Normalization & Scoring
    # ============================================================

    def normalize_study_type(self, raw_type: str) -> str:
        """Normalize a study type string to V2 canonical form.

        The input is stripped, lower-cased, and hyphens/spaces become
        underscores before the alias table is consulted. Unknown names are
        returned in that cleaned form. ``None`` is treated like an empty
        string so callers reading NULL study types do not crash.
        """
        if raw_type is None:
            return ""
        normalized = raw_type.strip().lower().replace("-", "_").replace(" ", "_")
        return LEGACY_TO_V2_MAP.get(normalized, normalized)

    def get_weight(self, study_type: str, domain_id: str = "quantum_bio") -> int:
        """
        Get the weight for a study type, considering domain overrides.
        Returns fixed-point integer (×1000).

        A custom type of the domain wins over the base taxonomy; a type
        known to neither gets the default 600.
        """
        normalized = self.normalize_study_type(study_type)

        # Check domain-specific override first
        domain = self.get_domain(domain_id)
        if domain:
            override = domain.get("custom_study_types", {}).get(normalized)
            if override is not None:
                return override["weight"]

        # Fall back to base taxonomy
        return STUDY_TYPE_WEIGHTS.get(normalized, 600)  # Default 0.6 for unknown

    def get_weight_float(self, study_type: str, domain_id: str = "quantum_bio") -> float:
        """Get weight as float (``get_weight`` converted via ``from_fixed``)."""
        return from_fixed(self.get_weight(study_type, domain_id))

    def get_all_study_types(self, domain_id: str = "quantum_bio") -> dict:
        """Get all study types and weights for a domain (base + custom).

        Returns:
            Mapping of type name to a dict with ``weight``, ``weight_float``,
            ``description`` and ``source`` (``"base"`` or
            ``"domain:<domain_id>"``). Domain entries replace base entries
            of the same name.
        """
        result = {
            st: {
                "weight": weight,
                "weight_float": from_fixed(weight),
                "description": STUDY_TYPE_DESCRIPTIONS.get(st, ""),
                "source": "base",
            }
            for st, weight in STUDY_TYPE_WEIGHTS.items()
        }

        domain = self.get_domain(domain_id)
        if domain:
            for st, info in domain.get("custom_study_types", {}).items():
                result[st] = {
                    "weight": info["weight"],
                    "weight_float": from_fixed(info["weight"]),
                    "description": info.get("description", ""),
                    "source": f"domain:{domain_id}",
                }

        return result

    def score_confidence(self, evidence_strength: float, study_type: str,
                         journal_tier: int, is_complete: bool,
                         domain_id: str = "quantum_bio") -> dict:
        """
        Calculate confidence using the V2 taxonomy.

        confidence = evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty

        Args:
            evidence_strength: Float strength, converted with ``to_fixed``.
            study_type: Raw or canonical study type name.
            journal_tier: 1, 2 or 3; any other value gets the preprint weight.
            is_complete: False applies the 0.7 completeness penalty.
            domain_id: Domain whose custom weights take precedence.

        Returns full breakdown with taxonomy version tag.
        """
        journal_tier_map = {1: 1000, 2: 850, 3: 700}
        preprint_weight = 500

        sq_weight = self.get_weight(study_type, domain_id)
        jt_weight = journal_tier_map.get(journal_tier, preprint_weight)
        completeness = 1000 if is_complete else 700

        # Fixed-point product: rescale by 1000 after every factor so the
        # running value stays in the ×1000 domain.
        raw = to_fixed(evidence_strength)
        for factor in (sq_weight, jt_weight, completeness):
            raw = raw * factor // 1000
        confidence = max(0, min(1000, raw))

        return {
            "confidence": from_fixed(confidence),
            "confidence_fixed": confidence,
            "evidence_strength": evidence_strength,
            "study_quality_weight": from_fixed(sq_weight),
            "journal_tier_weight": from_fixed(jt_weight),
            "completeness_penalty": from_fixed(completeness),
            "study_type_normalized": self.normalize_study_type(study_type),
            "taxonomy_version": TAXONOMY_VERSION,
            "domain_id": domain_id,
        }

    # ============================================================
    # Migration & Rollback
    # ============================================================

    @staticmethod
    def _has_taxonomy_column(conn, table: str) -> bool:
        """Return True when ``table`` already has a taxonomy_version column."""
        columns = [row[1] for row in conn.execute(f"PRAGMA table_info({table})").fetchall()]
        return "taxonomy_version" in columns

    def _backfill_table(self, conn, table: str, key_column: str) -> int:
        """Normalize and tag every not-yet-migrated row of ``table``; return rows updated."""
        pending = conn.execute(
            f"SELECT {key_column}, study_type FROM {table} "
            "WHERE study_type IS NOT NULL AND study_type != '' "
            "AND (taxonomy_version IS NULL OR taxonomy_version != ?)",
            (TAXONOMY_VERSION,),
        ).fetchall()
        updated = 0
        for row in pending:
            cursor = conn.execute(
                f"UPDATE {table} SET study_type = ?, taxonomy_version = ? "
                f"WHERE {key_column} = ?",
                (self.normalize_study_type(row[1]), TAXONOMY_VERSION, row[0]),
            )
            updated += cursor.rowcount
        return updated

    def migrate_to_v2(self) -> dict:
        """
        Idempotent migration from legacy 4-tier to Quantum-Bio V2.

        Adds the ``taxonomy_version`` column to ``claims`` and ``sources``
        when missing, then normalizes the study type of every row that is
        not yet tagged with the current taxonomy version. Schema change and
        backfill run in one transaction, so a failure leaves the database
        exactly as it was and the next run starts from scratch.

        Returns:
            ``{"rows_backfilled": int, "errors": list, "already_migrated": bool}``.
            ``rows_backfilled`` counts claims only. ``already_migrated`` is
            True when no column had to be added and no row needed updating.
        """
        summary = {"rows_backfilled": 0, "errors": [], "already_migrated": False}
        conn = get_db(self.db_path)

        try:
            # DDL does not open a transaction by itself under the default
            # sqlite3 settings; begin one explicitly so ALTER TABLE and the
            # backfill commit or roll back together.
            if not conn.in_transaction:
                conn.execute("BEGIN")

            schema_changed = False
            for table, _key in self._VERSIONED_TABLES:
                if not self._has_taxonomy_column(conn, table):
                    conn.execute(
                        f"ALTER TABLE {table} ADD COLUMN taxonomy_version TEXT DEFAULT 'legacy_v1'"
                    )
                    schema_changed = True

            updated = {
                table: self._backfill_table(conn, table, key)
                for table, key in self._VERSIONED_TABLES
            }
            summary["rows_backfilled"] = updated["claims"]

            if not schema_changed and not any(updated.values()):
                summary["already_migrated"] = True
                conn.rollback()  # nothing was changed; just end the transaction
            else:
                self._log_audit(conn, "migrate_v2", None,
                                f"Migrated {summary['rows_backfilled']} claims to V2")
                conn.commit()

        except Exception as e:
            conn.rollback()
            summary["errors"].append(str(e))
        finally:
            conn.close()

        return summary

    def rollback_to_v1(self) -> dict:
        """Rollback from V2 to legacy V1 study types.

        Rewrites V2 study types in ``claims`` and ``sources`` to their legacy
        tier and resets the ``taxonomy_version`` tag (where the column
        exists) so a later ``migrate_to_v2`` processes those rows again.

        Returns:
            ``{"rows_reverted": int, "errors": list}``; the count covers
            claims only.
        """
        conn = get_db(self.db_path)
        summary = {"rows_reverted": 0, "errors": []}

        try:
            for v2_type, v1_type in V2_TO_LEGACY_MAP.items():
                cursor = conn.execute(
                    "UPDATE claims SET study_type = ? WHERE study_type = ?",
                    (v1_type, v2_type)
                )
                summary["rows_reverted"] += cursor.rowcount

                conn.execute(
                    "UPDATE sources SET study_type = ? WHERE study_type = ?",
                    (v1_type, v2_type)
                )

            # Drop the V2 tag as well; otherwise rolled-back rows would still
            # claim to be migrated and re-migration would skip them.
            for table, _key in self._VERSIONED_TABLES:
                if self._has_taxonomy_column(conn, table):
                    conn.execute(
                        f"UPDATE {table} SET taxonomy_version = 'legacy_v1' "
                        "WHERE taxonomy_version = ?",
                        (TAXONOMY_VERSION,),
                    )

            self._log_audit(conn, "rollback_v1", None,
                            f"Rolled back {summary['rows_reverted']} claims to V1")
            conn.commit()

        except Exception as e:
            conn.rollback()
            summary["errors"].append(str(e))
        finally:
            conn.close()

        return summary

    # ============================================================
    # Cache Invalidation
    # ============================================================

    def generate_cache_key(self, pdf_hash: str, schema_version: str = "1.0") -> str:
        """Generate a versioned cache key.

        The key is the SHA-256 hex digest of the PDF hash, schema version,
        pipeline version and taxonomy version, so changing any of them
        yields a different key.
        """
        raw = f"{pdf_hash}_{schema_version}_{PIPELINE_VERSION}_{TAXONOMY_VERSION}"
        return hashlib.sha256(raw.encode()).hexdigest()

    def validate_cache_entry(self, entry: dict) -> bool:
        """Check if a cache entry is still valid against current versions.

        An entry is valid only when both its ``taxonomy_version`` and its
        ``pipeline_version`` equal the current module constants.
        """
        return (
            entry.get("taxonomy_version") == TAXONOMY_VERSION and
            entry.get("pipeline_version") == PIPELINE_VERSION
        )

    # ============================================================
    # Audit
    # ============================================================

    def _log_audit(self, conn, action: str, domain_id: Optional[str], details: str):
        """Append one audit row on ``conn``; the caller is responsible for committing."""
        conn.execute("""
            INSERT INTO taxonomy_audit_log (action, domain_id, details, timestamp)
            VALUES (?, ?, ?, ?)
        """, (action, domain_id, details, now_iso()))

    def get_audit_log(self, limit: int = 50) -> list:
        """Return up to ``limit`` audit entries as dicts, newest first.

        Entries sharing a timestamp are ordered by descending id so the
        result is deterministic.
        """
        conn = get_db(self.db_path)
        try:
            rows = conn.execute(
                "SELECT * FROM taxonomy_audit_log ORDER BY timestamp DESC, id DESC LIMIT ?",
                (limit,),
            ).fetchall()
        finally:
            conn.close()
        return [dict(r) for r in rows]