Add SPECTER2 embedding-based deduplication (replaces Jaccard word overlap)

Browse files

Files changed (1) hide show

phd_research_os_v2/layer3/embedding_dedup.py +222 -0

phd_research_os_v2/layer3/embedding_dedup.py ADDED Viewed

	@@ -0,0 +1,222 @@

+"""
+Layer 3: Embedding-Based Claim Deduplication (SPECTER2)
+=========================================================
+Replaces Jaccard word-overlap deduplication with SPECTER2 scientific
+embeddings for semantic matching.
+Addresses blindspots: M-1, M-2, PA-1
+Source: SYSTEM_INSPIRATIONS.md DA-1
+Dependencies:
+    pip install adapters torch
+Falls back to Jaccard if adapters/torch not available.
+"""
+import json
+import re
+import logging
+from typing import Optional
+import numpy as np
+logger = logging.getLogger(__name__)
# ── Try to load SPECTER2 ──────────────────────────────────────────────
# Module-level cache for the lazily loaded model. The tokenizer, the model and
# the availability flag are only updated together, after a complete load.
_SPECTER2_AVAILABLE = False
_specter2_model = None
_specter2_tokenizer = None


def _load_specter2():
    """Lazy-load the SPECTER2 base model and its adapter.

    The first successful call caches the tokenizer and model in module
    globals; later calls return immediately. Loading is done into local
    variables and published to the globals only once every step (tokenizer,
    base model, adapter, ``eval()``) has succeeded, so a failure part-way
    through cannot leave a half-initialised model behind that a later call
    would mistake for a usable one.

    Returns:
        True if SPECTER2 is loaded and ready. False if the optional
        dependencies cannot be imported or loading raised; a warning is
        logged and the caller is expected to fall back to Jaccard. A failed
        load is attempted again on the next call.
    """
    global _SPECTER2_AVAILABLE, _specter2_model, _specter2_tokenizer
    if _specter2_model is not None and _specter2_tokenizer is not None:
        return True
    try:
        from adapters import AutoAdapterModel
        from transformers import AutoTokenizer
        logger.info("Loading SPECTER2 base model...")
        tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
        model = AutoAdapterModel.from_pretrained("allenai/specter2_base")
        logger.info("Loading SPECTER2 proximity adapter...")
        model.load_adapter("allenai/specter2", source="hf", set_active=True)
        model.eval()
    except ImportError:
        logger.warning(
            "SPECTER2 not available (install: pip install adapters torch). "
            "Falling back to Jaccard word overlap for deduplication."
        )
        _SPECTER2_AVAILABLE = False
        return False
    except Exception as e:
        # Best-effort by design: any load failure degrades to the fallback.
        logger.warning(f"SPECTER2 failed to load: {e}. Using Jaccard fallback.")
        _SPECTER2_AVAILABLE = False
        return False
    # Publish only after everything above succeeded.
    _specter2_tokenizer = tokenizer
    _specter2_model = model
    _SPECTER2_AVAILABLE = True
    logger.info("SPECTER2 loaded successfully (768-dim embeddings)")
    return True
def embed_claims(texts: list[str], batch_size: int = 32) -> np.ndarray:
    """Embed a list of claim texts using SPECTER2.

    For SPECTER2 the expected input format is ``title + [SEP] + abstract``.
    Claims have no title, so each claim text is passed to the tokenizer
    directly.

    Args:
        texts: Claim texts to embed. May be empty.
        batch_size: Maximum number of texts sent through the model in one
            forward pass. Batching keeps peak memory bounded instead of
            growing with ``len(texts)``.

    Returns:
        Array of shape ``(len(texts), 768)`` whose rows are L2-normalized,
        so the dot product of two rows is their cosine similarity.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If SPECTER2 cannot be loaded.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    texts = list(texts)
    if not texts:
        # Nothing to embed, so there is no need to load the model or call the
        # tokenizer. float32 is assumed to match the model output dtype.
        return np.zeros((0, 768), dtype=np.float32)
    # Check availability before importing torch so a missing optional
    # dependency surfaces as the documented RuntimeError, not ImportError.
    if not _load_specter2():
        raise RuntimeError("SPECTER2 not available")
    import torch

    chunks = []
    for start in range(0, len(texts), batch_size):
        inputs = _specter2_tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        with torch.no_grad():
            outputs = _specter2_model(**inputs)
        # CLS token embedding (first position of the last hidden state)
        chunks.append(outputs.last_hidden_state[:, 0, :].numpy())
    embeddings = np.concatenate(chunks, axis=0)
    # L2 normalize for cosine similarity via dot product; a zero vector is
    # divided by 1 so it stays zero instead of producing NaNs.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1, norms)
    return embeddings / norms
def cosine_similarity(emb_a: np.ndarray, emb_b: np.ndarray) -> float:
    """Return the dot product of two embedding vectors as a Python float.

    For inputs that are already L2-normalized this dot product equals their
    cosine similarity; no normalization is performed here.
    """
    score = np.dot(emb_a, emb_b)
    return float(score)
def cosine_similarity_matrix(embeddings: np.ndarray) -> np.ndarray:
    """Return the product of ``embeddings`` with its own transpose.

    When the rows are L2-normalized, entry ``[i, j]`` of the result is the
    cosine similarity between row ``i`` and row ``j`` (for batch operations).
    """
    transposed = embeddings.T
    return np.matmul(embeddings, transposed)
# ── Jaccard fallback (identical to existing canonicalizer.py) ─────────
# Function words that are dropped before token sets are compared.
_STOPWORDS = {
    'the', 'a', 'an',
    'is', 'was', 'were', 'are', 'been', 'be',
    'have', 'has', 'had',
    'do', 'does', 'did',
    'will', 'would', 'could', 'should', 'may', 'might',
    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
    'and', 'or', 'but', 'not',
    'this', 'that', 'it', 'its', 'we', 'our', 'they',
}


def _normalize(text: str) -> str:
    """Lower-case and trim ``text``, collapse whitespace, drop stray symbols.

    Word characters, whitespace and the characters ``. , - + = < > ( )`` are
    kept; every other character is removed.
    """
    lowered = text.lower().strip()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return re.sub(r'[^\w\s\.\,\-\+\=\<\>\(\)]', '', collapsed)


def _content_words(text: str) -> set:
    """Return the set of whitespace-separated tokens of ``text`` minus stopwords."""
    return set(_normalize(text).split()) - _STOPWORDS


def jaccard_similarity(text_a: str, text_b: str) -> float:
    """Return the Jaccard overlap of the two texts' non-stopword token sets.

    The result is ``|A & B| / |A | B|``. If either text has no tokens left
    after normalization and stopword removal, the result is 0.0.
    """
    tokens_a = _content_words(text_a)
    tokens_b = _content_words(text_b)
    if not (tokens_a and tokens_b):
        return 0.0
    shared = len(tokens_a & tokens_b)
    # Both sets are non-empty here, so the union is never empty.
    total = len(tokens_a | tokens_b)
    return shared / total
# ── Unified similarity function ───────────────────────────────────────
def claim_similarity(text_a: str, text_b: str, method: str = "auto") -> float:
    """Compute the similarity between two claim texts.

    Args:
        text_a: First claim text.
        text_b: Second claim text.
        method: Which similarity measure to use:
            "auto"     - SPECTER2 if it can be loaded, else Jaccard
            "specter2" - Force SPECTER2 (the error from ``embed_claims``
                         propagates if it is not available)
            "jaccard"  - Force Jaccard word overlap

    Returns:
        The similarity score as a float: the dot product of the two SPECTER2
        embeddings, or the Jaccard overlap of the two texts.

    Raises:
        ValueError: If ``method`` is not one of the three names above. An
            unrecognised name used to be treated silently as "specter2".
    """
    if method not in ("auto", "specter2", "jaccard"):
        raise ValueError(
            f"Unknown similarity method {method!r}; "
            "expected 'auto', 'specter2' or 'jaccard'"
        )
    if method == "jaccard":
        return jaccard_similarity(text_a, text_b)
    if method == "auto" and not _load_specter2():
        return jaccard_similarity(text_a, text_b)
    # SPECTER2 (forced, or selected by "auto")
    embeddings = embed_claims([text_a, text_b])
    return cosine_similarity(embeddings[0], embeddings[1])
def batch_deduplicate(texts: list[str], threshold: float = 0.85,
                      method: str = "auto") -> dict:
    """Greedily deduplicate a batch of claim texts.

    Texts are visited in order. Each text not already marked as a duplicate
    becomes canonical, and every later unmarked text whose similarity to it
    is at least ``threshold`` is recorded as its duplicate.

    Args:
        texts: Claim texts to deduplicate.
        threshold: Minimum similarity (inclusive) for two texts to count as
            duplicates.
        method: "auto" (SPECTER2 if it can be loaded, else Jaccard),
            "specter2" or "jaccard".

    Returns:
        {
            "canonical_indices": [0, 2, 5, ...],  # indices of unique claims
            "duplicates": {1: 0, 3: 0, 4: 2},    # duplicate_idx -> canonical_idx
            "similarity_method": "specter2" | "jaccard" | "none"
        }
        "none" is reported for zero or one input text, where no comparison
        is made.

    Raises:
        ValueError: If ``method`` is not a recognised name. An unrecognised
            name used to fall through silently to Jaccard.

    NOTE(review): the same ``threshold`` is applied to embedding dot products
    and to Jaccard overlap, which are different measures; confirm that one
    default suits both when ``method="auto"``.
    """
    if method not in ("auto", "specter2", "jaccard"):
        raise ValueError(
            f"Unknown similarity method {method!r}; "
            "expected 'auto', 'specter2' or 'jaccard'"
        )
    n = len(texts)
    if n <= 1:
        return {
            "canonical_indices": list(range(n)),
            "duplicates": {},
            "similarity_method": "none",
        }
    use_specter = (method == "specter2") or (method == "auto" and _load_specter2())
    if use_specter:
        sim_matrix = cosine_similarity_matrix(embed_claims(texts))
        actual_method = "specter2"
    else:
        # Build the symmetric Jaccard matrix. Only pairs with j > i are read
        # by the greedy pass below, so the diagonal is left at zero.
        sim_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(i + 1, n):
                sim = jaccard_similarity(texts[i], texts[j])
                sim_matrix[i, j] = sim
                sim_matrix[j, i] = sim
        actual_method = "jaccard"
    # Greedy deduplication: the keys of ``duplicates`` double as the set of
    # indices that have already been assigned to a canonical claim.
    canonical_indices = []
    duplicates = {}
    for i in range(n):
        if i in duplicates:
            continue
        canonical_indices.append(i)
        for j in range(i + 1, n):
            if j not in duplicates and sim_matrix[i, j] >= threshold:
                duplicates[j] = i
    return {
        "canonical_indices": canonical_indices,
        "duplicates": duplicates,
        "similarity_method": actual_method,
    }
def is_available() -> bool:
    """Report whether SPECTER2 can be used for embedding-based dedup.

    Delegates to ``_load_specter2``, so calling this triggers the lazy model
    load if it has not already happened.
    """
    loaded = _load_specter2()
    return loaded