nkshirsa committed on
Commit
6cd8c77
·
verified ·
1 Parent(s): 4b8a9f2

Add phd_research_os/conflict_detector.py

Browse files
Files changed (1) hide show
  1. phd_research_os/conflict_detector.py +184 -0
phd_research_os/conflict_detector.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhD Research OS — Conflict Detection (Phase 5)
3
+ =================================================
4
+ Pairwise contradiction detection across claim database.
5
+ Uses embedding similarity + AI brain for conflict assessment.
6
+ """
7
+
8
+ import json
9
+ from typing import Optional
10
+
11
+ from .db import get_db, search_claims, create_conflict, from_fixed
12
+ from .agents import ResearchOSBrain
13
+
14
+
15
class ConflictDetector:
    """
    Detect contradictions between claims in the database.

    Strategy:
        1. Find claim pairs with high semantic similarity but different
           conclusions (currently a keyword-overlap heuristic).
        2. Use the AI brain to assess whether they truly conflict.
        3. Generate conflict resolution hypotheses (always LOW confidence).

    Per Research OS spec:
        - hypothesis_confidence is ALWAYS "low"
        - Agent can NEVER auto-promote conflict resolution above Level 5
        - Human review required for all resolutions
    """

    # Hoisted out of the O(n^2) pair loop: the original rebuilt this
    # 44-element set literal once per candidate pair. frozenset because
    # it is never mutated.
    _STOPWORDS = frozenset({
        'the', 'a', 'an', 'is', 'was', 'were', 'are', 'been',
        'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
        'would', 'could', 'should', 'may', 'might', 'shall',
        'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
        'from', 'and', 'or', 'but', 'not', 'no', 'this', 'that',
        'these', 'those', 'it', 'its', 'we', 'our', 'they',
    })

    def __init__(self, db_path: Optional[str] = None,
                 brain: Optional["ResearchOSBrain"] = None):
        """
        Args:
            db_path: Path forwarded to get_db(); None selects the default DB.
            brain: AI assessor. When None, detect_conflicts() falls back to
                the keyword-only heuristic.
        """
        self.db_path = db_path
        self.brain = brain

    @staticmethod
    def _keyword_overlap(text_a: str, text_b: str) -> Optional[float]:
        """
        Fraction of shared non-stopword keywords between two claim texts,
        relative to the smaller keyword set.

        Returns None when either text has no keywords left after
        stopword removal (such a pair cannot be scored).
        """
        words_a = set(text_a.lower().split()) - ConflictDetector._STOPWORDS
        words_b = set(text_b.lower().split()) - ConflictDetector._STOPWORDS
        if not words_a or not words_b:
            return None
        return len(words_a & words_b) / min(len(words_a), len(words_b))

    def find_candidate_pairs(self, topic_filter: Optional[str] = None,
                             max_pairs: int = 100) -> list:
        """
        Find claim pairs that might conflict.

        Heuristic: claims with overlapping keywords but from different
        sources. (Upgradeable to embedding similarity > 0.85 when
        ChromaDB is set up.)

        Args:
            topic_filter: Optional query string forwarded to search_claims().
            max_pairs: Hard cap on the number of pairs returned.

        Returns:
            List of dicts with 'claim_a', 'claim_b' and 'keyword_overlap',
            sorted by overlap descending (most likely conflicts first).
        """
        conn = get_db(self.db_path)
        try:
            # Get all claims, optionally filtered.
            claims = search_claims(conn, query=topic_filter, limit=500)
        finally:
            conn.close()

        if len(claims) < 2:
            return []

        pairs = []
        seen = set()

        for i, claim_a in enumerate(claims):
            for claim_b in claims[i + 1:]:
                # Skip pairs from the same source — a paper cannot
                # conflict with itself under this heuristic.
                if claim_a.get('source_doi') == claim_b.get('source_doi'):
                    continue

                overlap = self._keyword_overlap(claim_a['text'], claim_b['text'])
                if overlap is None or overlap < 0.3:  # need >= 30% keyword overlap
                    continue

                # Deduplicate unordered pairs.
                pair_key = tuple(sorted([claim_a['claim_id'], claim_b['claim_id']]))
                if pair_key in seen:
                    continue
                seen.add(pair_key)
                pairs.append({
                    'claim_a': claim_a,
                    'claim_b': claim_b,
                    'keyword_overlap': overlap,
                })

                if len(pairs) >= max_pairs:
                    break
            if len(pairs) >= max_pairs:
                break

        # Sort by overlap (highest first — most likely conflicts).
        pairs.sort(key=lambda p: p['keyword_overlap'], reverse=True)
        return pairs

    def detect_conflicts(self, topic_filter: Optional[str] = None,
                         max_pairs: int = 50) -> list:
        """
        Run the full conflict detection pipeline.

        Args:
            topic_filter: Optional query string forwarded to search_claims().
            max_pairs: Maximum number of candidate pairs to analyze.

        Returns:
            List of detected conflicts with hypotheses; 'confidence' is
            ALWAYS 'low' per the Research OS spec.
        """
        if self.brain is None:
            print("Warning: No brain configured. Using keyword-only heuristic.")
            return self._keyword_only_detection(topic_filter, max_pairs)

        pairs = self.find_candidate_pairs(topic_filter, max_pairs)
        print(f"Found {len(pairs)} candidate pairs for conflict analysis")

        conflicts = []
        conn = get_db(self.db_path)
        try:
            for i, pair in enumerate(pairs):
                print(f" Analyzing pair {i+1}/{len(pairs)}...")

                response = self.brain.detect_conflicts(
                    pair['claim_a']['text'],
                    pair['claim_b']['text']
                )

                if response.success and response.data.get('conflict_detected', False):
                    conflict_id = create_conflict(
                        conn,
                        pair['claim_a']['claim_id'],
                        pair['claim_b']['claim_id'],
                        response.data.get('conflict_type', 'value_mismatch'),
                        response.data.get('generated_hypothesis', ''),
                        response.data.get('key_differences', [])
                    )
                    conflicts.append({
                        'conflict_id': conflict_id,
                        'claim_a': pair['claim_a']['claim_id'],
                        'claim_b': pair['claim_b']['claim_id'],
                        'type': response.data.get('conflict_type'),
                        'hypothesis': response.data.get('generated_hypothesis'),
                        'confidence': 'low',  # ALWAYS low per spec
                    })
        finally:
            # Release the connection even if a brain call or DB write raises
            # (the original leaked it on any exception in the loop).
            conn.close()

        print(f"\nDetected {len(conflicts)} conflicts from {len(pairs)} candidates")
        print("False positive rate estimate: check manually")

        return conflicts

    def _keyword_only_detection(self, topic_filter: Optional[str] = None,
                                max_pairs: int = 50) -> list:
        """Fallback: keyword-only conflict detection without AI brain."""
        pairs = self.find_candidate_pairs(topic_filter, max_pairs)

        # Without AI brain, we flag high-overlap pairs from different sources
        # that have different epistemic tags or very different confidence.
        flagged = []
        conn = get_db(self.db_path)
        try:
            for pair in pairs:
                a = pair['claim_a']
                b = pair['claim_b']

                # Flag if: high overlap + different-conclusion indicators.
                confidence_diff = abs(a['confidence'] - b['confidence'])
                different_tags = a['epistemic_tag'] != b['epistemic_tag']

                if pair['keyword_overlap'] > 0.5 and (confidence_diff > 0.3 or different_tags):
                    conflict_id = create_conflict(
                        conn,
                        a['claim_id'],
                        b['claim_id'],
                        'value_mismatch',
                        'Flagged by keyword overlap heuristic — requires AI or human review',
                        ['keyword_overlap > 0.5', f'confidence_diff = {confidence_diff:.2f}']
                    )
                    flagged.append({
                        'conflict_id': conflict_id,
                        'claim_a': a['claim_id'],
                        'claim_b': b['claim_id'],
                        'overlap': pair['keyword_overlap'],
                    })
        finally:
            # Release the connection even if create_conflict raises.
            conn.close()
        return flagged