nkshirsa committed on
Commit
07dc4a2
·
verified ·
1 Parent(s): 5d27056

Add phd_research_os_v2/layer3/canonicalizer.py

Browse files
phd_research_os_v2/layer3/canonicalizer.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Layer 3: Claim Canonicalization
3
+ =================================
4
+ Deduplicate claims using text similarity, maintain canonical registry,
5
+ aggregate evidence across sources, track temporal versions.
6
+ """
7
+
8
+ import json
9
+ import re
10
+ from typing import Optional
11
+ from ..core.database import get_db, gen_id, now_iso, to_fixed, from_fixed
12
+
13
+
14
def normalize_claim_text(text: str) -> str:
    """Normalize claim text for comparison.

    The text is lower-cased, every character that is not a word character,
    whitespace, or one of ``. , - + = < > ( )`` is removed, and runs of
    whitespace are collapsed to a single space with the ends trimmed.

    Args:
        text: Raw claim text.

    Returns:
        The normalized text, free of leading, trailing, or repeated spaces.
    """
    t = text.lower()
    # Drop disallowed characters *before* collapsing whitespace: deleting a
    # symbol that sits between two spaces ("a & b") would otherwise leave a
    # double space behind, and a trailing symbol would leave a trailing space.
    t = re.sub(r'[^\w\s.,\-+=<>()]', '', t)
    t = re.sub(r'\s+', ' ', t)
    return t.strip()
20
+
21
+
22
# Function words ignored when comparing claims. Built once at import time
# rather than on every call to jaccard_similarity.
_STOPWORDS = frozenset({
    'the', 'a', 'an', 'is', 'was', 'were', 'are', 'been', 'be',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
    'could', 'should', 'may', 'might', 'in', 'on', 'at', 'to',
    'for', 'of', 'with', 'by', 'from', 'and', 'or', 'but', 'not',
    'this', 'that', 'it', 'its', 'we', 'our', 'they',
})


def _content_words(text: str) -> set:
    """Return the set of non-stopword tokens of *text* after normalization."""
    words = set()
    for token in normalize_claim_text(text).split():
        # normalize_claim_text keeps '.', ',', '(' and ')' so that numbers such
        # as "0.05" survive; trim them from token edges so that "binding." and
        # "binding" (or "the," and "the") are treated as the same word.
        token = token.strip('.,()')
        if token and token not in _STOPWORDS:
            words.add(token)
    return words


def jaccard_similarity(text_a: str, text_b: str) -> float:
    """Compute Jaccard similarity between two texts (word-level).

    Both texts are normalized, split on whitespace, stripped of edge
    punctuation and stopwords, and compared as sets.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        ``|A & B| / |A | B|`` in the range [0.0, 1.0]; 0.0 when either text
        has no content words left after filtering.
    """
    words_a = _content_words(text_a)
    words_b = _content_words(text_b)

    if not words_a or not words_b:
        return 0.0

    # The union is non-empty here because both sets are non-empty.
    return len(words_a & words_b) / len(words_a | words_b)
39
+
40
+
41
class Canonicalizer:
    """
    Deduplicates claims into canonical entries.

    When a new claim is extracted:
    - If similarity >= 0.85 to an existing canonical: MERGE (add source as evidence)
    - If 0.70 <= similarity < 0.85: FLAG for human review
    - If similarity < 0.70 (or no canonical exists yet): CREATE new canonical
    """

    MERGE_THRESHOLD = 0.85
    REVIEW_THRESHOLD = 0.70

    # Confidence assumed when a row carries no composite_confidence value.
    # NOTE(review): presumably a fixed-point value (see to_fixed/from_fixed) -
    # confirm against core.database.
    DEFAULT_CONFIDENCE = 500

    def __init__(self, db_path: Optional[str] = None):
        """Store the database path that is handed to ``get_db`` on each call."""
        self.db_path = db_path

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _load_json_list(raw) -> list:
        """Decode a JSON column, treating NULL/empty as an empty list.

        ``dict(row).get(col, "[]")`` does not help for NULL columns: the key
        exists with value ``None``, and ``json.loads(None)`` raises TypeError.
        """
        if not raw:
            return []
        return json.loads(raw)

    @classmethod
    def _confidence_of(cls, row: dict) -> int:
        """Return the row's composite_confidence, or the default when missing/NULL."""
        value = row.get("composite_confidence")
        return cls.DEFAULT_CONFIDENCE if value is None else value

    @staticmethod
    def _merged_confidence(old_conf: int, old_count: int, new_conf: int) -> int:
        """Return the running integer average after adding one more observation."""
        return (old_conf * old_count + new_conf) // (old_count + 1)

    # ------------------------------------------------------------------
    # Canonicalization
    # ------------------------------------------------------------------

    def canonicalize_claim(self, claim_id: str) -> dict:
        """
        Canonicalize a single claim. Returns action taken.

        Args:
            claim_id: Primary key of the row in ``claims``.

        Returns:
            A dict whose ``action`` is one of ``"merged"``, ``"created"``,
            ``"review_needed"`` or ``"error"``. Non-error results also carry
            ``claim_id`` and ``similarity`` (best match, rounded to 3 places),
            plus action-specific keys.
        """
        conn = get_db(self.db_path)
        try:
            claim_row = conn.execute(
                "SELECT * FROM claims WHERE claim_id = ?", (claim_id,)
            ).fetchone()
            if not claim_row:
                return {"action": "error", "reason": "Claim not found"}

            claim = dict(claim_row)
            best_match, best_similarity = self._find_best_match(conn, claim["text"])
            result = {"claim_id": claim_id, "similarity": round(best_similarity, 3)}

            if best_match and best_similarity >= self.MERGE_THRESHOLD:
                result.update(self._merge_into(conn, claim_id, claim, best_match))
            elif best_match and best_similarity >= self.REVIEW_THRESHOLD:
                # Flag for human review; nothing is written to the database.
                result.update({
                    "action": "review_needed",
                    "candidate_canonical_id": best_match["canonical_id"],
                    "candidate_text": best_match["representative_text"][:100],
                })
            else:
                result.update(self._create_canonical(conn, claim_id, claim))
            return result
        finally:
            # Always release the connection, including when a query raises.
            conn.close()

    def _find_best_match(self, conn, claim_text: str):
        """Return ``(canonical_row_dict_or_None, similarity)`` for the closest canonical."""
        best_match = None
        best_similarity = 0.0
        for canon_row in conn.execute("SELECT * FROM canonical_claims").fetchall():
            canon = dict(canon_row)
            sim = jaccard_similarity(claim_text, canon["representative_text"])
            # Strict '>' keeps the first canonical encountered on ties.
            if sim > best_similarity:
                best_similarity = sim
                best_match = canon
        return best_match, best_similarity

    def _merge_into(self, conn, claim_id: str, claim: dict, canonical: dict) -> dict:
        """Attach *claim* to an existing canonical as additional evidence and commit."""
        canonical_id = canonical["canonical_id"]
        source_dois = self._load_json_list(canonical.get("source_dois"))
        aliases = self._load_json_list(canonical.get("aliases"))

        doi = claim.get("source_doi")
        if doi and doi not in source_dois:
            source_dois.append(doi)

        old_count = canonical["evidence_count"]
        old_conf = self._confidence_of(canonical)
        if claim_id in aliases:
            # This claim was already counted for this canonical; re-running
            # must not inflate the evidence count or re-average confidence.
            new_count, avg_conf = old_count, old_conf
        else:
            aliases.append(claim_id)
            new_count = old_count + 1
            avg_conf = self._merged_confidence(
                old_conf, old_count, self._confidence_of(claim)
            )

        conn.execute("""
            UPDATE canonical_claims SET
                evidence_count = ?,
                source_dois = ?,
                aliases = ?,
                composite_confidence = ?,
                updated_at = ?
            WHERE canonical_id = ?
        """, (new_count, json.dumps(source_dois), json.dumps(aliases),
              avg_conf, now_iso(), canonical_id))

        # Link claim to canonical
        conn.execute("UPDATE claims SET canonical_id = ? WHERE claim_id = ?",
                     (canonical_id, claim_id))
        conn.commit()

        return {
            "action": "merged",
            "canonical_id": canonical_id,
            "evidence_count": new_count,
        }

    def _create_canonical(self, conn, claim_id: str, claim: dict) -> dict:
        """Insert a new canonical seeded from *claim*, link the claim, and commit."""
        canonical_id = gen_id("CANON")
        doi = claim.get("source_doi")
        source_dois = [doi] if doi else []
        confidence = self._confidence_of(claim)
        # One timestamp so created_at, updated_at and the version date agree.
        timestamp = now_iso()
        first_version = {
            "version": 1,
            "source": doi,
            "confidence": confidence,
            "date": timestamp[:10],
        }

        conn.execute("""
            INSERT INTO canonical_claims (canonical_id, representative_text, epistemic_tag,
                composite_confidence, evidence_count, source_dois, aliases,
                version_history, current_version,
                schema_version, created_at, updated_at)
            VALUES (?, ?, ?, ?, 1, ?, ?, ?, 1, '2.0', ?, ?)
        """, (canonical_id, claim["text"],
              claim.get("epistemic_tag") or "Interpretation",
              confidence,
              json.dumps(source_dois), json.dumps([claim_id]),
              json.dumps([first_version]),
              timestamp, timestamp))

        conn.execute("UPDATE claims SET canonical_id = ? WHERE claim_id = ?",
                     (canonical_id, claim_id))
        conn.commit()

        return {
            "action": "created",
            "canonical_id": canonical_id,
        }

    def canonicalize_all(self) -> dict:
        """Canonicalize all uncanonicalized claims.

        Returns:
            Counts keyed by ``merged``, ``created``, ``review_needed`` and
            ``errors``. Any action not in the first three is counted as an error.
        """
        conn = get_db(self.db_path)
        try:
            uncanonicalized = conn.execute(
                "SELECT claim_id FROM claims WHERE canonical_id IS NULL"
            ).fetchall()
        finally:
            conn.close()

        stats = {"merged": 0, "created": 0, "review_needed": 0, "errors": 0}

        for row in uncanonicalized:
            result = self.canonicalize_claim(dict(row)["claim_id"])
            action = result.get("action", "error")
            if action in stats:
                stats[action] += 1
            else:
                stats["errors"] += 1

        return stats

    def get_canonical_claims(self, min_evidence: int = 1) -> list:
        """Get canonical claims sorted by evidence count.

        Args:
            min_evidence: Only canonicals with at least this many pieces of
                evidence are returned.

        Returns:
            A list of dicts ordered by evidence count, then confidence (both
            descending). JSON columns are decoded to lists and
            ``composite_confidence`` is passed through ``from_fixed``.
        """
        conn = get_db(self.db_path)
        try:
            rows = conn.execute("""
                SELECT * FROM canonical_claims
                WHERE evidence_count >= ?
                ORDER BY evidence_count DESC, composite_confidence DESC
            """, (min_evidence,)).fetchall()
        finally:
            conn.close()

        results = []
        for r in rows:
            d = dict(r)
            d["source_dois"] = self._load_json_list(d.get("source_dois"))
            d["aliases"] = self._load_json_list(d.get("aliases"))
            d["version_history"] = self._load_json_list(d.get("version_history"))
            d["composite_confidence"] = from_fixed(d.get("composite_confidence", 0))
            results.append(d)
        return results