Add Quantum-Bio taxonomy V2: phd_research_os/taxonomy.py

Browse files

Files changed (1) hide show

phd_research_os/taxonomy.py +604 -0

phd_research_os/taxonomy.py ADDED Viewed

	@@ -0,0 +1,604 @@

+"""
+PhD Research OS — Quantum-Bio Taxonomy V2
+===========================================
+8-tier study type taxonomy with domain management, migration, and rollback.
+Implements the Quantum-Bio Taxonomy V2 specification:
+  - 8 study types with calibrated weights
+  - Backward compatibility with legacy 4-tier system
+  - Idempotent SQLite migrations
+  - Cache invalidation matrix
+  - Multi-domain taxonomy support (add new STEM domains)
+All weights use FIXED-POINT math (×1000) per Research OS Rule 5.
+"""
+import json
+import os
+import sqlite3
+import shutil
+import hashlib
+from datetime import datetime, timezone
+from typing import Optional
+from dataclasses import dataclass, field, asdict
+from .db import get_db, init_db, now_iso, gen_id, to_fixed, from_fixed
# ============================================================
# Version Constants
# ============================================================
# Taxonomy tag; mixed into cache keys and compared by cache validation.
TAXONOMY_VERSION = "quantum_bio_v1"
# Pipeline release identifier; also mixed into every cache key.
PIPELINE_VERSION = "2.1.0"

# ============================================================
# 8-Tier Quantum-Bio Study Types
# ============================================================
# Weights are fixed-point integers (×1000), e.g. 950 means 0.950.
STUDY_TYPE_WEIGHTS = {
    # 1.000 — Living organism experiments, clinical trials
    "in_vivo": 1000,
    # 1.000 — Direct instrumental measurements
    "direct_physical_measurement": 1000,
    # 0.950 — Formal mathematical derivations
    "mathematical_proof": 950,
    # 0.850 — Cell culture, tissue samples, ex vivo
    "in_vitro": 850,
    # 0.800 — Ab initio, DFT, quantum mechanical
    "first_principles_simulation": 800,
    # 0.600 — Empirical models, fitted parameters
    "phenomenological_simulation": 600,
    # 0.400 — Meta-analyses, systematic reviews
    "review": 400,
    # 0.200 — Opinion pieces, commentaries, editorials
    "perspective": 200,
}

# Canonical V2 type names, in declaration order.
ALLOWED_STUDY_TYPES = list(STUDY_TYPE_WEIGHTS)

# Study type descriptions for UI
STUDY_TYPE_DESCRIPTIONS = {
    "in_vivo": (
        "Living organism experiments, clinical trials, animal studies"
    ),
    "direct_physical_measurement": (
        "Direct instrumental measurements without biological intermediaries"
    ),
    "mathematical_proof": (
        "Formal mathematical derivations and proofs"
    ),
    "in_vitro": (
        "Cell culture, tissue samples, ex vivo experiments"
    ),
    "first_principles_simulation": (
        "Ab initio calculations, DFT, quantum mechanical simulations"
    ),
    "phenomenological_simulation": (
        "Empirical models, fitted parameters, coarse-grained simulations"
    ),
    "review": (
        "Meta-analyses, systematic reviews, literature surveys"
    ),
    "perspective": (
        "Opinion pieces, commentaries, editorials, hypotheses"
    ),
}
# ============================================================
# Legacy V1 → V2 Mapping
# ============================================================
# Keys are study type spellings as they may appear in stored data;
# values are canonical V2 type names.
LEGACY_TO_V2_MAP = {
    # --- Legacy 4-tier names ---
    "primaryexperimental": "direct_physical_measurement",
    "primary_experimental": "direct_physical_measurement",
    "invitro": "in_vitro",
    "in_vitro": "in_vitro",
    "simulation": "phenomenological_simulation",
    "review": "review",
    "review_non_systematic": "review",
    # --- Additional aliases ---
    "meta_analysis": "review",
    "meta-analysis": "review",
    "clinical": "in_vivo",
    "clinical_trial": "in_vivo",
    "case_study": "perspective",
    "preprint": "perspective",
    "rct": "in_vivo",
    "cohort": "in_vivo",
    "case_control": "in_vivo",
    "cross_sectional": "in_vivo",
    "case_report": "perspective",
    "opinion": "perspective",
    # --- V2 names map to themselves ---
    "in_vivo": "in_vivo",
    "direct_physical_measurement": "direct_physical_measurement",
    "mathematical_proof": "mathematical_proof",
    "first_principles_simulation": "first_principles_simulation",
    "phenomenological_simulation": "phenomenological_simulation",
    "perspective": "perspective",
}

# V2 → V1 reverse mapping (for rollback). Several V2 types share one
# legacy name, so this direction is lossy.
V2_TO_LEGACY_MAP = {
    "in_vivo": "primary_experimental",
    "direct_physical_measurement": "primary_experimental",
    "mathematical_proof": "primary_experimental",
    "in_vitro": "in_vitro",
    "first_principles_simulation": "simulation",
    "phenomenological_simulation": "simulation",
    "review": "review_non_systematic",
    "perspective": "review_non_systematic",
}
+# ============================================================
+# Domain Taxonomy Management
+# ============================================================
@dataclass
class DomainTaxonomy:
    """Domain-specific overlay that layers custom study types on the base taxonomy."""

    domain_id: str
    name: str
    description: str
    # Shape: {type_name: {"weight": int, "description": str}}
    custom_study_types: dict
    parent_domain: Optional[str] = None
    created_at: str = ""
    is_active: bool = True

    def get_all_weights(self) -> dict:
        """Return the base weights with this domain's custom weights applied on top.

        A custom type whose name matches a base type replaces that base
        weight; any other custom type is appended after the base entries.
        """
        domain_weights = {
            type_name: info["weight"]
            for type_name, info in self.custom_study_types.items()
        }
        return {**STUDY_TYPE_WEIGHTS, **domain_weights}
+# Default domain taxonomies
+DEFAULT_DOMAINS = {
+    "quantum_bio": {
+        "name": "Quantum-Bio (Default)",
+        "description": "Core 8-tier taxonomy for quantum mechanics and biological systems",
+        "custom_study_types": {},
+    },
+    "materials_science": {
+        "name": "Materials Science",
+        "description": "Extended taxonomy for materials characterization and synthesis",
+        "custom_study_types": {
+            "characterization": {"weight": 950, "description": "XRD, SEM, TEM, AFM, Raman — direct structural/chemical measurement"},
+            "synthesis_report": {"weight": 800, "description": "Novel material synthesis with reproducibility data"},
+            "device_fabrication": {"weight": 750, "description": "Fabricated device performance measurements"},
+        },
+    },
+    "biosensors": {
+        "name": "Biosensors & Diagnostics",
+        "description": "Taxonomy for biosensor development and clinical diagnostics",
+        "custom_study_types": {
+            "clinical_validation": {"weight": 1000, "description": "Tested with real clinical samples (blood, serum, saliva)"},
+            "spiked_sample": {"weight": 850, "description": "Known analyte spiked into buffer or simplified matrix"},
+            "buffer_only": {"weight": 700, "description": "Measurements in clean buffer solution only"},
+            "selectivity_panel": {"weight": 800, "description": "Cross-reactivity testing against panel of interferents"},
+        },
+    },
+    "computational_chemistry": {
+        "name": "Computational Chemistry",
+        "description": "Taxonomy for computational and theoretical chemistry methods",
+        "custom_study_types": {
+            "coupled_cluster": {"weight": 950, "description": "CCSD(T) or higher-level coupled cluster calculations"},
+            "dft_hybrid": {"weight": 850, "description": "Hybrid DFT (B3LYP, PBE0, etc.) with verified basis sets"},
+            "semi_empirical": {"weight": 650, "description": "AM1, PM3, DFTB, or other semi-empirical methods"},
+            "force_field_md": {"weight": 600, "description": "Classical MD with empirical force fields"},
+            "machine_learned_potential": {"weight": 750, "description": "Neural network potentials, GAP, ACE fitted to QM data"},
+        },
+    },
+    "neuroscience": {
+        "name": "Neuroscience",
+        "description": "Taxonomy for neuroscience and brain imaging studies",
+        "custom_study_types": {
+            "fmri_task": {"weight": 850, "description": "Task-based fMRI with proper controls and correction"},
+            "eeg_erp": {"weight": 800, "description": "EEG event-related potential studies"},
+            "lesion_study": {"weight": 900, "description": "Natural lesion or TMS studies establishing causal role"},
+            "behavioral_only": {"weight": 700, "description": "Behavioral measures without neural recording"},
+        },
+    },
+}
+# ============================================================
+# Database Schema Extension
+# ============================================================
def init_taxonomy_db(db_path: Optional[str] = None):
    """Add taxonomy tables to the Research OS database.

    Runs ``init_db`` first, then creates ``domain_taxonomies``,
    ``taxonomy_audit_log`` and ``study_type_overrides`` with
    ``CREATE TABLE IF NOT EXISTS``, so calling this repeatedly is harmless.

    Args:
        db_path: Database location, passed unchanged to ``init_db`` and
            ``get_db``.
    """
    schema_sql = """
    CREATE TABLE IF NOT EXISTS domain_taxonomies (
        domain_id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        description TEXT,
        custom_study_types TEXT NOT NULL,  -- JSON
        parent_domain TEXT,
        is_active INTEGER DEFAULT 1,
        created_at TEXT NOT NULL,
        updated_at TEXT NOT NULL,
        schema_version TEXT NOT NULL DEFAULT '1.0'
    );
    CREATE TABLE IF NOT EXISTS taxonomy_audit_log (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        action TEXT NOT NULL,
        domain_id TEXT,
        details TEXT,
        timestamp TEXT NOT NULL
    );
    CREATE TABLE IF NOT EXISTS study_type_overrides (
        override_id TEXT PRIMARY KEY,
        domain_id TEXT NOT NULL,
        study_type TEXT NOT NULL,
        custom_weight INTEGER NOT NULL,  -- Fixed-point ×1000
        description TEXT,
        rationale TEXT NOT NULL,
        created_by TEXT NOT NULL,
        created_at TEXT NOT NULL,
        FOREIGN KEY(domain_id) REFERENCES domain_taxonomies(domain_id)
    );
    """
    init_db(db_path)
    conn = get_db(db_path)
    try:
        conn.executescript(schema_sql)
        conn.commit()
    finally:
        # Release the connection even when the schema script fails.
        conn.close()
+# ============================================================
+# Taxonomy Manager
+# ============================================================
+class TaxonomyManager:
+    """
+    Manages domain taxonomies for the Research OS.
+    Provides:
+    - CRUD for domain taxonomies
+    - Study type normalization (legacy → V2)
+    - Confidence scoring with domain-aware weights
+    - Migration and rollback
+    - Cache invalidation
+    """
+    def __init__(self, db_path: str = None):
+        self.db_path = db_path or os.environ.get("RESEARCH_OS_DB", "data/research_os.db")
+        init_taxonomy_db(self.db_path)
+        self._ensure_default_domains()
+    def _ensure_default_domains(self):
+        """Seed default domain taxonomies if they don't exist."""
+        conn = get_db(self.db_path)
+        for domain_id, info in DEFAULT_DOMAINS.items():
+            existing = conn.execute(
+                "SELECT 1 FROM domain_taxonomies WHERE domain_id = ?", (domain_id,)
+            ).fetchone()
+            if not existing:
+                now = now_iso()
+                conn.execute("""
+                    INSERT INTO domain_taxonomies (domain_id, name, description,
+                        custom_study_types, is_active, created_at, updated_at, schema_version)
+                    VALUES (?, ?, ?, ?, 1, ?, ?, '1.0')
+                """, (domain_id, info["name"], info["description"],
+                      json.dumps(info["custom_study_types"]), now, now))
+        conn.commit()
+        conn.close()
+    # ============================================================
+    # Domain CRUD
+    # ============================================================
+    def create_domain(self, domain_id: str, name: str, description: str,
+                      custom_study_types: dict = None,
+                      parent_domain: str = None) -> str:
+        """Create a new domain taxonomy."""
+        conn = get_db(self.db_path)
+        now = now_iso()
+        conn.execute("""
+            INSERT INTO domain_taxonomies (domain_id, name, description,
+                custom_study_types, parent_domain, is_active, created_at, updated_at)
+            VALUES (?, ?, ?, ?, ?, 1, ?, ?)
+        """, (domain_id, name, description,
+              json.dumps(custom_study_types or {}), parent_domain, now, now))
+        self._log_audit(conn, "create_domain", domain_id, f"Created domain: {name}")
+        conn.commit()
+        conn.close()
+        return domain_id
+    def get_domain(self, domain_id: str) -> Optional[dict]:
+        """Get a domain taxonomy."""
+        conn = get_db(self.db_path)
+        row = conn.execute(
+            "SELECT * FROM domain_taxonomies WHERE domain_id = ?", (domain_id,)
+        ).fetchone()
+        conn.close()
+        if not row:
+            return None
+        d = dict(row)
+        d["custom_study_types"] = json.loads(d.get("custom_study_types", "{}"))
+        return d
+    def list_domains(self, active_only: bool = True) -> list:
+        """List all domain taxonomies."""
+        conn = get_db(self.db_path)
+        if active_only:
+            rows = conn.execute(
+                "SELECT * FROM domain_taxonomies WHERE is_active = 1 ORDER BY name"
+            ).fetchall()
+        else:
+            rows = conn.execute(
+                "SELECT * FROM domain_taxonomies ORDER BY name"
+            ).fetchall()
+        conn.close()
+        results = []
+        for row in rows:
+            d = dict(row)
+            d["custom_study_types"] = json.loads(d.get("custom_study_types", "{}"))
+            results.append(d)
+        return results
+    def update_domain(self, domain_id: str, name: str = None,
+                      description: str = None,
+                      custom_study_types: dict = None) -> bool:
+        """Update a domain taxonomy."""
+        conn = get_db(self.db_path)
+        updates, values = [], []
+        if name is not None:
+            updates.append("name = ?"); values.append(name)
+        if description is not None:
+            updates.append("description = ?"); values.append(description)
+        if custom_study_types is not None:
+            updates.append("custom_study_types = ?")
+            values.append(json.dumps(custom_study_types))
+        updates.append("updated_at = ?"); values.append(now_iso())
+        values.append(domain_id)
+        if updates:
+            conn.execute(
+                f"UPDATE domain_taxonomies SET {', '.join(updates)} WHERE domain_id = ?",
+                values
+            )
+            self._log_audit(conn, "update_domain", domain_id, f"Updated: {updates}")
+            conn.commit()
+        conn.close()
+        return True
+    def delete_domain(self, domain_id: str) -> bool:
+        """Soft-delete a domain (set inactive). Cannot delete quantum_bio base."""
+        if domain_id == "quantum_bio":
+            return False  # Cannot delete base taxonomy
+        conn = get_db(self.db_path)
+        conn.execute(
+            "UPDATE domain_taxonomies SET is_active = 0, updated_at = ? WHERE domain_id = ?",
+            (now_iso(), domain_id)
+        )
+        self._log_audit(conn, "delete_domain", domain_id, "Soft-deleted")
+        conn.commit()
+        conn.close()
+        return True
+    def add_study_type(self, domain_id: str, type_name: str,
+                       weight: float, description: str) -> bool:
+        """Add a custom study type to a domain."""
+        domain = self.get_domain(domain_id)
+        if not domain:
+            return False
+        types = domain["custom_study_types"]
+        types[type_name] = {
+            "weight": to_fixed(weight),
+            "description": description
+        }
+        return self.update_domain(domain_id, custom_study_types=types)
+    def remove_study_type(self, domain_id: str, type_name: str) -> bool:
+        """Remove a custom study type from a domain."""
+        domain = self.get_domain(domain_id)
+        if not domain:
+            return False
+        types = domain["custom_study_types"]
+        if type_name in types:
+            del types[type_name]
+            return self.update_domain(domain_id, custom_study_types=types)
+        return False
+    # ============================================================
+    # Study Type Normalization & Scoring
+    # ============================================================
+    def normalize_study_type(self, raw_type: str) -> str:
+        """Normalize a study type string to V2 canonical form."""
+        normalized = raw_type.strip().lower().replace("-", "_").replace(" ", "_")
+        return LEGACY_TO_V2_MAP.get(normalized, normalized)
+    def get_weight(self, study_type: str, domain_id: str = "quantum_bio") -> int:
+        """
+        Get the weight for a study type, considering domain overrides.
+        Returns fixed-point integer (×1000).
+        """
+        normalized = self.normalize_study_type(study_type)
+        # Check domain-specific override first
+        domain = self.get_domain(domain_id)
+        if domain and normalized in domain.get("custom_study_types", {}):
+            return domain["custom_study_types"][normalized]["weight"]
+        # Fall back to base taxonomy
+        return STUDY_TYPE_WEIGHTS.get(normalized, 600)  # Default 0.6 for unknown
+    def get_weight_float(self, study_type: str, domain_id: str = "quantum_bio") -> float:
+        """Get weight as float."""
+        return from_fixed(self.get_weight(study_type, domain_id))
+    def get_all_study_types(self, domain_id: str = "quantum_bio") -> dict:
+        """Get all study types and weights for a domain (base + custom)."""
+        result = {}
+        # Base types
+        for st, weight in STUDY_TYPE_WEIGHTS.items():
+            result[st] = {
+                "weight": weight,
+                "weight_float": from_fixed(weight),
+                "description": STUDY_TYPE_DESCRIPTIONS.get(st, ""),
+                "source": "base",
+            }
+        # Domain custom types
+        domain = self.get_domain(domain_id)
+        if domain:
+            for st, info in domain.get("custom_study_types", {}).items():
+                result[st] = {
+                    "weight": info["weight"],
+                    "weight_float": from_fixed(info["weight"]),
+                    "description": info.get("description", ""),
+                    "source": f"domain:{domain_id}",
+                }
+        return result
+    def score_confidence(self, evidence_strength: float, study_type: str,
+                         journal_tier: int, is_complete: bool,
+                         domain_id: str = "quantum_bio") -> dict:
+        """
+        Calculate confidence using the V2 taxonomy.
+        confidence = evidence_strength × study_quality_weight × journal_tier_weight × completeness_penalty
+        Returns full breakdown with taxonomy version tag.
+        """
+        journal_tier_map = {1: 1000, 2: 850, 3: 700}
+        preprint_weight = 500
+        sq_weight = self.get_weight(study_type, domain_id)
+        jt_weight = journal_tier_map.get(journal_tier, preprint_weight)
+        completeness = 1000 if is_complete else 700
+        # Fixed-point multiplication: (a×b×c×d) / 1000^3
+        # To avoid overflow: chain multiply and divide
+        es_fp = to_fixed(evidence_strength)
+        raw = es_fp * sq_weight // 1000
+        raw = raw * jt_weight // 1000
+        raw = raw * completeness // 1000
+        confidence = max(0, min(1000, raw))
+        return {
+            "confidence": from_fixed(confidence),
+            "confidence_fixed": confidence,
+            "evidence_strength": evidence_strength,
+            "study_quality_weight": from_fixed(sq_weight),
+            "journal_tier_weight": from_fixed(jt_weight),
+            "completeness_penalty": from_fixed(completeness),
+            "study_type_normalized": self.normalize_study_type(study_type),
+            "taxonomy_version": TAXONOMY_VERSION,
+            "domain_id": domain_id,
+        }
+    # ============================================================
+    # Migration & Rollback
+    # ============================================================
+    def migrate_to_v2(self) -> dict:
+        """
+        Idempotent migration from legacy 4-tier to Quantum-Bio V2.
+        Returns migration summary.
+        """
+        conn = get_db(self.db_path)
+        summary = {"rows_backfilled": 0, "errors": [], "already_migrated": False}
+        try:
+            # Check if already migrated
+            cols = [row[1] for row in conn.execute("PRAGMA table_info(claims)").fetchall()]
+            if "taxonomy_version" in cols:
+                summary["already_migrated"] = True
+                conn.close()
+                return summary
+            # Add taxonomy_version column
+            conn.execute("ALTER TABLE claims ADD COLUMN taxonomy_version TEXT DEFAULT 'legacy_v1'")
+            # Backfill: normalize study types in claims
+            claims = conn.execute("SELECT claim_id, study_type FROM claims WHERE study_type IS NOT NULL").fetchall()
+            for claim in claims:
+                old_type = dict(claim).get("study_type", "")
+                if old_type:
+                    new_type = self.normalize_study_type(old_type)
+                    conn.execute(
+                        "UPDATE claims SET study_type = ?, taxonomy_version = ? WHERE claim_id = ?",
+                        (new_type, TAXONOMY_VERSION, dict(claim)["claim_id"])
+                    )
+                    summary["rows_backfilled"] += 1
+            # Backfill sources table too
+            src_cols = [row[1] for row in conn.execute("PRAGMA table_info(sources)").fetchall()]
+            if "taxonomy_version" not in src_cols:
+                conn.execute("ALTER TABLE sources ADD COLUMN taxonomy_version TEXT DEFAULT 'legacy_v1'")
+            sources = conn.execute("SELECT doi, study_type FROM sources WHERE study_type IS NOT NULL").fetchall()
+            for src in sources:
+                old_type = dict(src).get("study_type", "")
+                if old_type:
+                    new_type = self.normalize_study_type(old_type)
+                    conn.execute(
+                        "UPDATE sources SET study_type = ?, taxonomy_version = ? WHERE doi = ?",
+                        (new_type, TAXONOMY_VERSION, dict(src)["doi"])
+                    )
+            self._log_audit(conn, "migrate_v2", None,
+                          f"Migrated {summary['rows_backfilled']} claims to V2")
+            conn.commit()
+        except Exception as e:
+            conn.rollback()
+            summary["errors"].append(str(e))
+        finally:
+            conn.close()
+        return summary
+    def rollback_to_v1(self) -> dict:
+        """Rollback from V2 to legacy V1 study types."""
+        conn = get_db(self.db_path)
+        summary = {"rows_reverted": 0, "errors": []}
+        try:
+            for v2_type, v1_type in V2_TO_LEGACY_MAP.items():
+                cursor = conn.execute(
+                    "UPDATE claims SET study_type = ? WHERE study_type = ?",
+                    (v1_type, v2_type)
+                )
+                summary["rows_reverted"] += cursor.rowcount
+                conn.execute(
+                    "UPDATE sources SET study_type = ? WHERE study_type = ?",
+                    (v1_type, v2_type)
+                )
+            self._log_audit(conn, "rollback_v1", None,
+                          f"Rolled back {summary['rows_reverted']} claims to V1")
+            conn.commit()
+        except Exception as e:
+            conn.rollback()
+            summary["errors"].append(str(e))
+        finally:
+            conn.close()
+        return summary
+    # ============================================================
+    # Cache Invalidation
+    # ============================================================
+    def generate_cache_key(self, pdf_hash: str, schema_version: str = "1.0") -> str:
+        """Generate a versioned cache key."""
+        raw = f"{pdf_hash}_{schema_version}_{PIPELINE_VERSION}_{TAXONOMY_VERSION}"
+        return hashlib.sha256(raw.encode()).hexdigest()
+    def validate_cache_entry(self, entry: dict) -> bool:
+        """Check if a cache entry is still valid against current versions."""
+        return (
+            entry.get("taxonomy_version") == TAXONOMY_VERSION and
+            entry.get("pipeline_version") == PIPELINE_VERSION
+        )
+    # ============================================================
+    # Audit
+    # ============================================================
+    def _log_audit(self, conn, action: str, domain_id: Optional[str], details: str):
+        conn.execute("""
+            INSERT INTO taxonomy_audit_log (action, domain_id, details, timestamp)
+            VALUES (?, ?, ?, ?)
+        """, (action, domain_id, details, now_iso()))
+    def get_audit_log(self, limit: int = 50) -> list:
+        conn = get_db(self.db_path)
+        rows = conn.execute(
+            "SELECT * FROM taxonomy_audit_log ORDER BY timestamp DESC LIMIT ?", (limit,)
+        ).fetchall()
+        conn.close()
+        return [dict(r) for r in rows]