Add phd_research_os_v2/layer3/canonicalizer.py

Browse files

Files changed (1) hide show

phd_research_os_v2/layer3/canonicalizer.py +212 -0

phd_research_os_v2/layer3/canonicalizer.py ADDED Viewed

	@@ -0,0 +1,212 @@

+"""
+Layer 3: Claim Canonicalization
+=================================
+Deduplicate claims using text similarity, maintain canonical registry,
+aggregate evidence across sources, track temporal versions.
+"""
+import json
+import re
+from typing import Optional
+from ..core.database import get_db, gen_id, now_iso, to_fixed, from_fixed
+def normalize_claim_text(text: str) -> str:
+    """Normalize claim text for comparison.
+
+    Lowercases the text, removes every character that is not a word
+    character, whitespace, or one of ``. , - + = < > ( )``, then collapses
+    whitespace runs to a single space and trims both ends.
+
+    Args:
+        text: Raw claim text.
+
+    Returns:
+        The normalized text (possibly empty).
+    """
+    t = text.lower()
+    # Remove disallowed characters *before* collapsing whitespace: dropping a
+    # stand-alone symbol (e.g. "a ! b") would otherwise leave a double space
+    # or a trailing space in the "normalized" result.
+    t = re.sub(r'[^\w\s.,\-+=<>()]', '', t)
+    return re.sub(r'\s+', ' ', t).strip()
+def jaccard_similarity(text_a: str, text_b: str) -> float:
+    """Compute Jaccard similarity between two texts (word-level).
+
+    Both texts are passed through ``normalize_claim_text``, split on
+    whitespace, stripped of leading/trailing ``. , ( )`` and filtered against
+    a small stopword list. The score is ``|A & B| / |A | B|`` over the
+    remaining word sets.
+
+    Args:
+        text_a: First text.
+        text_b: Second text.
+
+    Returns:
+        A value in ``[0.0, 1.0]``; ``0.0`` when either text has no content
+        words left after filtering.
+    """
+    # 'not' is deliberately NOT a stopword: discarding it makes a claim and
+    # its negation produce identical word sets (similarity 1.0). Keeping it
+    # only lowers the score; it does not guarantee the pair stays unmerged.
+    stopwords = {'the', 'a', 'an', 'is', 'was', 'were', 'are', 'been', 'be',
+                 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
+                 'could', 'should', 'may', 'might', 'in', 'on', 'at', 'to',
+                 'for', 'of', 'with', 'by', 'from', 'and', 'or', 'but',
+                 'this', 'that', 'it', 'its', 'we', 'our', 'they'}
+
+    def content_words(text: str) -> set:
+        # Normalization keeps '.', ',', '(' and ')', so a sentence-final
+        # "dna." would not match "dna" unless edge punctuation is trimmed.
+        # Interior punctuation ("0.05", "p<0.05") is left intact.
+        tokens = (tok.strip('.,()') for tok in normalize_claim_text(text).split())
+        return {tok for tok in tokens if tok and tok not in stopwords}
+
+    words_a = content_words(text_a)
+    words_b = content_words(text_b)
+    if not words_a or not words_b:
+        return 0.0
+    # Both sets are non-empty here, so the union cannot be empty.
+    return len(words_a & words_b) / len(words_a | words_b)
+class Canonicalizer:
+    """
+    Deduplicates claims into canonical entries.
+    When a new claim is extracted, its best word-level Jaccard similarity
+    against the existing canonical claims decides the action:
+    - If similarity >= 0.85 (MERGE_THRESHOLD): MERGE (add source as evidence)
+    - If 0.70 (REVIEW_THRESHOLD) <= similarity < 0.85: FLAG for human review
+    - If similarity < 0.70: CREATE new canonical
+    """
+    # Minimum similarity at which a claim is merged into an existing canonical.
+    MERGE_THRESHOLD = 0.85
+    # Minimum similarity at which a near-match is reported for human review.
+    REVIEW_THRESHOLD = 0.70
+    def __init__(self, db_path: Optional[str] = None):
+        """Remember which database the canonicalizer operates on.
+
+        Args:
+            db_path: Value handed to ``get_db`` every time a connection is
+                opened. ``None`` is forwarded unchanged.
+        """
+        self.db_path = db_path
+    def canonicalize_claim(self, claim_id: str) -> dict:
+        """
+        Canonicalize a single claim. Returns action taken.
+
+        The claim text is compared against every row of ``canonical_claims``
+        with ``jaccard_similarity`` and the best score picks the action:
+
+        - ``>= MERGE_THRESHOLD``: link the claim to that canonical and record
+          it as additional evidence (``"merged"``).
+        - ``>= REVIEW_THRESHOLD``: write nothing and report the candidate
+          (``"review_needed"``).
+        - otherwise: insert a new canonical seeded from the claim
+          (``"created"``).
+
+        Args:
+            claim_id: Key of the row in ``claims`` to canonicalize.
+
+        Returns:
+            ``{"action": "error", "reason": ...}`` when the claim does not
+            exist or has no text; otherwise a dict with ``claim_id``,
+            ``similarity`` (rounded to 3 places), ``action`` and
+            action-specific keys.
+        """
+        conn = get_db(self.db_path)
+        # try/finally: the connection is released on every path, including
+        # exceptions raised by a query or by JSON decoding.
+        try:
+            claim_row = conn.execute(
+                "SELECT * FROM claims WHERE claim_id = ?", (claim_id,)
+            ).fetchone()
+            if not claim_row:
+                return {"action": "error", "reason": "Claim not found"}
+            claim = dict(claim_row)
+            claim_text = claim["text"]
+            # A NULL/blank text cannot be compared; without this guard every
+            # such claim would crash or spawn its own empty canonical entry.
+            if not claim_text or not claim_text.strip():
+                return {"action": "error", "reason": "Claim has no text"}
+
+            best_match, best_similarity = self._find_best_canonical(conn, claim_text)
+            result = {"claim_id": claim_id, "similarity": round(best_similarity, 3)}
+            if best_match and best_similarity >= self.MERGE_THRESHOLD:
+                result.update(self._merge_into_canonical(conn, claim_id, claim, best_match))
+                conn.commit()
+            elif best_match and best_similarity >= self.REVIEW_THRESHOLD:
+                # FLAG for review: nothing is persisted for this claim.
+                result.update({
+                    "action": "review_needed",
+                    "candidate_canonical_id": best_match["canonical_id"],
+                    "candidate_text": best_match["representative_text"][:100],
+                })
+            else:
+                result.update(self._create_canonical(conn, claim_id, claim))
+                conn.commit()
+            return result
+        finally:
+            conn.close()
+
+    @staticmethod
+    def _confidence_or_default(row: dict) -> int:
+        """Return the row's ``composite_confidence``, or 500 if absent or NULL."""
+        value = row.get("composite_confidence")
+        # ``is None`` rather than ``or``: a stored confidence of 0 is kept.
+        return 500 if value is None else value
+
+    def _find_best_canonical(self, conn, claim_text: str):
+        """Return ``(canonical_row_dict, similarity)`` of the closest canonical, or ``(None, 0.0)``."""
+        best_match = None
+        best_similarity = 0.0
+        for canon_row in conn.execute("SELECT * FROM canonical_claims").fetchall():
+            canon = dict(canon_row)
+            # A NULL representative_text is scored as empty text (0.0).
+            sim = jaccard_similarity(claim_text, canon["representative_text"] or "")
+            if sim > best_similarity:
+                best_similarity = sim
+                best_match = canon
+        return best_match, best_similarity
+
+    def _merge_into_canonical(self, conn, claim_id: str, claim: dict, canonical: dict) -> dict:
+        """Record ``claim`` as evidence for ``canonical``; returns the result fields (no commit)."""
+        canonical_id = canonical["canonical_id"]
+        # ``or "[]"`` instead of a ``.get`` default: a NULL column is present
+        # in the row dict with value None, which json.loads cannot parse.
+        source_dois = json.loads(canonical.get("source_dois") or "[]")
+        aliases = json.loads(canonical.get("aliases") or "[]")
+        evidence_count = canonical["evidence_count"]
+        confidence = self._confidence_or_default(canonical)
+
+        if claim.get("source_doi") and claim["source_doi"] not in source_dois:
+            source_dois.append(claim["source_doi"])
+        # Count the claim only the first time it is merged, so re-running
+        # canonicalization on the same claim does not inflate evidence_count
+        # or re-weight the average.
+        if claim_id not in aliases:
+            aliases.append(claim_id)
+            # Running integer mean over all evidence seen so far.
+            confidence = (
+                confidence * evidence_count + self._confidence_or_default(claim)
+            ) // (evidence_count + 1)
+            evidence_count += 1
+
+        conn.execute("""
+            UPDATE canonical_claims SET
+                evidence_count = ?,
+                source_dois = ?,
+                aliases = ?,
+                composite_confidence = ?,
+                updated_at = ?
+            WHERE canonical_id = ?
+        """, (evidence_count, json.dumps(source_dois), json.dumps(aliases),
+              confidence, now_iso(), canonical_id))
+        # Link claim to canonical
+        conn.execute("UPDATE claims SET canonical_id = ? WHERE claim_id = ?",
+                     (canonical_id, claim_id))
+        return {
+            "action": "merged",
+            "canonical_id": canonical_id,
+            "evidence_count": evidence_count,
+        }
+
+    def _create_canonical(self, conn, claim_id: str, claim: dict) -> dict:
+        """Insert a new canonical seeded from ``claim`` and link it; returns the result fields (no commit)."""
+        canonical_id = gen_id("CANON")
+        # One timestamp for the whole insert so created_at, updated_at and
+        # the version-history date cannot disagree.
+        timestamp = now_iso()
+        source_doi = claim.get("source_doi")
+        confidence = self._confidence_or_default(claim)
+        version_history = [{
+            "version": 1,
+            "source": source_doi,
+            "confidence": confidence,
+            "date": timestamp[:10],
+        }]
+        conn.execute("""
+            INSERT INTO canonical_claims (canonical_id, representative_text, epistemic_tag,
+                composite_confidence, evidence_count, source_dois, aliases,
+                version_history, current_version,
+                schema_version, created_at, updated_at)
+            VALUES (?, ?, ?, ?, 1, ?, ?, ?, 1, '2.0', ?, ?)
+        """, (canonical_id, claim["text"], claim.get("epistemic_tag") or "Interpretation",
+              confidence,
+              json.dumps([source_doi] if source_doi else []), json.dumps([claim_id]),
+              json.dumps(version_history),
+              timestamp, timestamp))
+        conn.execute("UPDATE claims SET canonical_id = ? WHERE claim_id = ?",
+                     (canonical_id, claim_id))
+        return {
+            "action": "created",
+            "canonical_id": canonical_id,
+        }
+    def canonicalize_all(self) -> dict:
+        """Canonicalize all uncanonicalized claims.
+
+        Selects every claim whose ``canonical_id`` is NULL and runs
+        ``canonicalize_claim`` on each one in turn.
+
+        Returns:
+            Counts keyed by ``"merged"``, ``"created"``, ``"review_needed"``
+            and ``"errors"``. Any result whose action is not one of those
+            keys (including ``"error"``) is tallied under ``"errors"``.
+        """
+        conn = get_db(self.db_path)
+        # Release the connection even if the query itself raises.
+        try:
+            uncanonicalized = conn.execute(
+                "SELECT claim_id FROM claims WHERE canonical_id IS NULL"
+            ).fetchall()
+        finally:
+            conn.close()
+        stats = {"merged": 0, "created": 0, "review_needed": 0, "errors": 0}
+        for row in uncanonicalized:
+            result = self.canonicalize_claim(dict(row)["claim_id"])
+            action = result.get("action", "error")
+            stats[action if action in stats else "errors"] += 1
+        return stats
+    def get_canonical_claims(self, min_evidence: int = 1) -> list:
+        """Get canonical claims sorted by evidence count.
+
+        Args:
+            min_evidence: Only rows with ``evidence_count >= min_evidence``
+                are returned.
+
+        Returns:
+            A list of row dicts ordered by ``evidence_count`` then
+            ``composite_confidence``, both descending. ``source_dois``,
+            ``aliases`` and ``version_history`` are decoded from JSON, and
+            ``composite_confidence`` is passed through ``from_fixed``.
+        """
+        conn = get_db(self.db_path)
+        # Release the connection even if the query raises.
+        try:
+            rows = conn.execute("""
+            SELECT * FROM canonical_claims
+            WHERE evidence_count >= ?
+            ORDER BY evidence_count DESC, composite_confidence DESC
+        """, (min_evidence,)).fetchall()
+        finally:
+            conn.close()
+        results = []
+        for r in rows:
+            d = dict(r)
+            # ``or`` instead of a ``.get`` default: a NULL column is present
+            # in the row dict with value None, so the default never applied.
+            for column in ("source_dois", "aliases", "version_history"):
+                d[column] = json.loads(d.get(column) or "[]")
+            # NOTE(review): a NULL confidence is treated like the original
+            # missing-key default (0); confirm from_fixed expects a number.
+            d["composite_confidence"] = from_fixed(d.get("composite_confidence") or 0)
+            results.append(d)
+        return results