nkshirsa committed on
Commit
adc45e6
·
verified ·
1 Parent(s): cf74560

Add comprehensive test suite for all foundation components (98 test cases)

Browse files
Files changed (1) hide show
  1. tests/test_foundation_components.py +787 -0
tests/test_foundation_components.py ADDED
@@ -0,0 +1,787 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for Foundation Components
3
+ =================================
4
+ Tests for all "strongly implementable" features:
5
+ - SPECTER2 embedding dedup (with Jaccard fallback)
6
+ - SciFact benchmark evaluation
7
+ - Epistemic Trigger Words validator
8
+ - Low Confidence Quarantine
9
+ - SciBERT-NLI contradiction pre-filter (with fallback)
10
+ - Epistemic Velocity tracking
11
+ - Confidence Decomposition Display
12
+ """
13
+
14
+ import pytest
15
+ import json
16
+ import os
17
+ import sys
18
+ import tempfile
19
+
20
+ # Add project root to path
21
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
22
+
23
+
24
+ # ══════════════════════════════════════════════════════════════════════
25
+ # FIXTURES
26
+ # ══════════════════════════════════════════════════════════════════════
27
+
28
@pytest.fixture
def db_path():
    """Yield the path of a freshly initialised temporary database file.

    A named temporary ``.db`` file is created (and closed again) so that only
    its path is handed to ``init_db``. The file is removed after the
    requesting test finishes, and also when ``init_db`` itself raises, so a
    failed schema setup does not leave stray files in the temp directory.

    Yields:
        str: filesystem path of the initialised database.
    """
    from phd_research_os_v2.core.database import init_db

    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as handle:
        path = handle.name
    try:
        init_db(path)
        yield path
    finally:
        # Guarded so teardown does not raise if the file is already gone.
        if os.path.exists(path):
            os.unlink(path)
37
+
38
+
39
@pytest.fixture
def sample_claims():
    """Provide four example claim records as plain dictionaries.

    The records are two "Fact" claims from the results sections of different
    papers, one "Hypothesis" claim from a discussion section, and one "Fact"
    claim flagged as a null result. ``qualifiers`` and ``missing_fields`` are
    stored as JSON-encoded strings.
    """

    def build(claim_id, text, tag, section, doi, strength, confidence,
              qualifiers, missing, null_result):
        # Keeps the key order identical for every record.
        return {
            "claim_id": claim_id,
            "text": text,
            "epistemic_tag": tag,
            "source_section": section,
            "source_doi": doi,
            "evidence_strength": strength,
            "composite_confidence": confidence,
            "qualifiers": json.dumps(qualifiers),
            "missing_fields": json.dumps(missing),
            "is_null_result": null_result,
            "is_inherited_citation": False,
        }

    return [
        build(
            "CLM_TEST001",
            "The limit of detection was 0.8 fM in 10 mM PBS buffer.",
            "Fact", "results", "10.1234/paper1", 800, 750,
            ["in 10 mM PBS"], [], False,
        ),
        build(
            "CLM_TEST002",
            "A detection limit of 800 attomolar was achieved using the graphene sensor.",
            "Fact", "results", "10.1234/paper2", 750, 700,
            [], [], False,
        ),
        build(
            "CLM_TEST003",
            "This approach may potentially reduce diagnostic costs in low-resource settings.",
            "Hypothesis", "discussion", "10.1234/paper1", 300, 200,
            ["may", "potentially"], ["cost_analysis", "field_testing"], False,
        ),
        build(
            "CLM_TEST004",
            "The sensor did not show significant improvement over the control group.",
            "Fact", "results", "10.1234/paper3", 600, 400,
            ["not significant"], [], True,
        ),
    ]
96
+
97
+
98
+ # ══════════════════════════════════════════════════════════════════════
99
+ # TEST: EMBEDDING DEDUP (Layer 3)
100
+ # ══════════════════════════════════════════════════════════════════════
101
+
102
class TestEmbeddingDedup:
    """Checks for the helpers in phd_research_os_v2.layer3.embedding_dedup."""

    def test_jaccard_identical_texts(self):
        """Two identical strings must score exactly 1.0."""
        from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity

        text = "The LOD was 0.8 fM"
        assert jaccard_similarity(text, text) == 1.0

    def test_jaccard_different_texts(self):
        """Unrelated strings must score below 0.2."""
        from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity

        score = jaccard_similarity(
            "The LOD was 0.8 fM",
            "Completely unrelated text about cooking",
        )
        assert score < 0.2

    def test_jaccard_similar_texts(self):
        """Paraphrases sharing most words must score above 0.3."""
        from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity

        first = "The detection limit was 0.8 femtomolar"
        second = "The detection limit was measured at 0.8 fM"
        assert jaccard_similarity(first, second) > 0.3

    def test_jaccard_empty_texts(self):
        """An empty string on either side yields a similarity of 0.0."""
        from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity

        for left, right in (("", ""), ("hello", "")):
            assert jaccard_similarity(left, right) == 0.0

    def test_claim_similarity_auto_mode(self):
        """claim_similarity returns 1.0 for identical texts.

        NOTE(review): despite the test name, the call forces
        method="jaccard" rather than exercising an automatic mode.
        """
        from phd_research_os_v2.layer3.embedding_dedup import claim_similarity

        score = claim_similarity("LOD was 0.8 fM", "LOD was 0.8 fM", method="jaccard")
        assert score == 1.0

    def test_batch_deduplicate_jaccard(self):
        """An exact repeat is reported as a duplicate of the first entry."""
        from phd_research_os_v2.layer3.embedding_dedup import batch_deduplicate

        repeated = "The LOD was 0.8 fM in PBS buffer"
        corpus = [
            repeated,
            repeated,  # exact duplicate
            "Completely different topic about weather",
        ]
        outcome = batch_deduplicate(corpus, threshold=0.85, method="jaccard")
        assert len(outcome["canonical_indices"]) <= 2  # At most 2 unique
        assert 1 in outcome["duplicates"]  # Index 1 is a duplicate of 0

    def test_batch_deduplicate_empty(self):
        """An empty input list produces no canonical indices."""
        from phd_research_os_v2.layer3.embedding_dedup import batch_deduplicate

        outcome = batch_deduplicate([], method="jaccard")
        assert outcome["canonical_indices"] == []

    def test_batch_deduplicate_single(self):
        """A single text is its own canonical entry."""
        from phd_research_os_v2.layer3.embedding_dedup import batch_deduplicate

        outcome = batch_deduplicate(["one claim"], method="jaccard")
        assert outcome["canonical_indices"] == [0]

    def test_normalize_claim_text(self):
        """_normalize lower-cases the text and strips surrounding whitespace."""
        from phd_research_os_v2.layer3.embedding_dedup import _normalize

        cleaned = _normalize(" The LOD was 0.8 fM ")
        assert cleaned == "the lod was 0.8 fm"
157
+
158
+
159
+ # ══════════════════════════════════════════════════════════════════════
160
+ # TEST: SCIFACT BENCHMARK (Layer 6)
161
+ # ══════════════════════════════════════════════════════════════════════
162
+
163
class TestSciFact:
    """Checks for phd_research_os_v2.layer6.scifact_benchmark."""

    def test_baseline_classifier_support(self):
        """The baseline classifier returns one of the three known labels."""
        from phd_research_os_v2.layer6.scifact_benchmark import quick_baseline_classifier

        claim = "Vitamin C helps prevent scurvy"
        evidence = "Studies have shown vitamin C is essential for preventing scurvy in sailors"
        label = quick_baseline_classifier(claim, evidence)
        assert label in ["SUPPORT", "CONTRADICT", "NOT_ENOUGH_INFO"]

    def test_baseline_classifier_contradict(self):
        """Conflicting evidence still yields one of the three known labels."""
        from phd_research_os_v2.layer6.scifact_benchmark import quick_baseline_classifier

        claim = "The drug has no side effects"
        evidence = "The drug was found to have significant adverse effects including nausea"
        label = quick_baseline_classifier(claim, evidence)
        assert label in ["SUPPORT", "CONTRADICT", "NOT_ENOUGH_INFO"]

    def test_evaluate_returns_correct_structure(self):
        """The evaluation report exposes the expected keys and sane values."""
        from phd_research_os_v2.layer6.scifact_benchmark import evaluate_against_scifact

        def dummy_classifier(claim, evidence):
            return "SUPPORT"

        examples = [
            {"claim": "test claim 1", "evidence": "test evidence 1", "label": "SUPPORT"},
            {"claim": "test claim 2", "evidence": "test evidence 2", "label": "CONTRADICT"},
            {"claim": "test claim 3", "evidence": "test evidence 3", "label": "NOT_ENOUGH_INFO"},
        ]

        report = evaluate_against_scifact(dummy_classifier, examples)

        for key in ("accuracy", "per_class", "confusion_matrix", "total_examples"):
            assert key in report
        assert report["total_examples"] == 3
        assert 0 <= report["accuracy"] <= 1

    def test_evaluate_perfect_classifier(self):
        """A classifier that echoes the gold label reaches accuracy 1.0."""
        from phd_research_os_v2.layer6.scifact_benchmark import evaluate_against_scifact

        examples = [
            {"claim": "c1", "evidence": "e1", "label": "SUPPORT"},
            {"claim": "c2", "evidence": "e2", "label": "CONTRADICT"},
        ]

        def perfect(claim, evidence):
            # First example with a matching claim wins; unknown claims fall back.
            return next(
                (ex["label"] for ex in examples if ex["claim"] == claim),
                "NOT_ENOUGH_INFO",
            )

        report = evaluate_against_scifact(perfect, examples)
        assert report["accuracy"] == 1.0

    def test_evaluate_handles_errors(self):
        """A classifier that raises must not abort the evaluation."""
        from phd_research_os_v2.layer6.scifact_benchmark import evaluate_against_scifact

        def broken(claim, evidence):
            raise ValueError("broken")

        single = [{"claim": "c", "evidence": "e", "label": "SUPPORT"}]
        report = evaluate_against_scifact(broken, single)
        assert report["total_examples"] == 1  # Should not crash
229
+
230
+
231
+ # ══════════════════════════════════════════════════════════════════════
232
+ # TEST: EPISTEMIC TRIGGER WORDS (Layer 2)
233
+ # ══════════════════════════════════════════════════════════════════════
234
+
235
class TestTriggerValidator:
    """Checks for phd_research_os_v2.layer2.trigger_validator."""

    def test_fact_detection(self):
        """Measurement language in a results section is tagged as Fact."""
        from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores

        sentence = "We measured a detection limit of 0.8 fM with p < 0.001"
        outcome = compute_trigger_scores(sentence, source_section="results")
        assert outcome["predicted_tag"] == "Fact"
        assert outcome["scores"]["Fact"] > 0.3

    def test_hypothesis_detection(self):
        """Hedged, forward-looking language is tagged as Hypothesis."""
        from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores

        sentence = "This may potentially reduce costs and further investigation is needed"
        outcome = compute_trigger_scores(sentence, source_section="discussion")
        assert outcome["predicted_tag"] == "Hypothesis"
        assert outcome["scores"]["Hypothesis"] > 0.3

    def test_interpretation_detection(self):
        """Explanatory language is tagged as Interpretation."""
        from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores

        sentence = "These findings suggest that the mechanism is likely due to charge transfer"
        outcome = compute_trigger_scores(sentence, source_section="discussion")
        assert outcome["predicted_tag"] == "Interpretation"

    def test_conflict_detection(self):
        """Contradiction wording raises the Conflict_Hypothesis score."""
        from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores

        sentence = (
            "Contrary to previous reports, our results show inconsistent findings that refutes the hypothesis"
        )
        outcome = compute_trigger_scores(sentence)
        assert outcome["scores"]["Conflict_Hypothesis"] > 0.2

    def test_section_prior_results(self):
        """A neutral sentence from a results section gets a positive Fact score."""
        from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores

        sentence = "The value was obtained from the experiment"
        outcome = compute_trigger_scores(sentence, source_section="results")
        assert outcome["scores"]["Fact"] > 0  # Results prior boosts Fact

    def test_section_prior_abstract(self):
        """A neutral sentence from an abstract gets a positive Interpretation score."""
        from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores

        sentence = "A novel approach was developed"
        outcome = compute_trigger_scores(sentence, source_section="abstract")
        assert outcome["scores"]["Interpretation"] > 0  # Abstract prior boosts Interpretation

    def test_validate_ai_tag_agreement(self):
        """A matching AI tag is reported as agreement and accepted."""
        from phd_research_os_v2.layer2.trigger_validator import validate_ai_tag

        verdict = validate_ai_tag(
            "We measured a detection limit of 0.8 fM with p < 0.001",
            ai_tag="Fact",
            source_section="results",
        )
        assert verdict["agreement"] == True  # noqa: E712
        assert verdict["recommendation"] == "accept"

    def test_validate_ai_tag_disagreement(self):
        """When the validator disagrees, a severity level must be reported.

        NOTE(review): the assertion is skipped whenever the validator agrees
        with the AI tag, so the test passes vacuously in that case.
        """
        from phd_research_os_v2.layer2.trigger_validator import validate_ai_tag

        verdict = validate_ai_tag(
            "This may potentially reduce costs and further investigation is needed",
            ai_tag="Fact",
            source_section="discussion",
        )
        # Trigger words should detect hypothesis language
        if verdict["agreement"]:
            return
        assert verdict["disagreement_severity"] in ["mild", "strong"]

    def test_batch_validate(self):
        """Batch validation reports the claim count and an agreement rate."""
        from phd_research_os_v2.layer2.trigger_validator import batch_validate

        batch = [
            {"text": "We measured 0.8 fM with p < 0.001", "epistemic_tag": "Fact", "source_section": "results"},
            {"text": "May potentially reduce costs", "epistemic_tag": "Fact", "source_section": "discussion"},
            {"text": "Suggests a novel mechanism", "epistemic_tag": "Interpretation", "source_section": "discussion"},
        ]
        summary = batch_validate(batch)
        assert summary["total"] == 3
        assert "agreement_rate" in summary

    def test_empty_text(self):
        """Empty input still produces a result carrying a predicted tag."""
        from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores

        outcome = compute_trigger_scores("", source_section="results")
        assert "predicted_tag" in outcome

    def test_scores_bounded(self):
        """Every score stays within [0, 1] even for trigger-word-saturated text."""
        from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores

        saturated = "may possibly might could potentially suggests hypothesize propose speculate"
        outcome = compute_trigger_scores(saturated, source_section="discussion")
        for value in outcome["scores"].values():
            assert 0 <= value <= 1.0
332
+
333
+
334
+ # ══════════════════════════════════════════════════════════════════════
335
+ # TEST: LOW CONFIDENCE QUARANTINE (Layer 4)
336
+ # ══════════════════════════════════════════════════════════════════════
337
+
338
class TestQuarantine:
    """Tests for phd_research_os_v2.layer4.quarantine_and_nli.ConfidenceQuarantine.

    Database access goes through two private helpers that close the
    connection in a ``finally`` block. A failing statement therefore cannot
    leave the temporary database file open while the ``db_path`` fixture
    tries to delete it.
    """

    @staticmethod
    def _run_inserts(db_path, statement, rows):
        """Execute ``statement`` once per parameter tuple in ``rows``, then commit.

        The connection is closed even when a statement raises.
        """
        from phd_research_os_v2.core.database import get_db

        conn = get_db(db_path)
        try:
            for params in rows:
                conn.execute(statement, params)
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def _claim_status(db_path, claim_id):
        """Return the ``status`` column of the claim with the given id."""
        from phd_research_os_v2.core.database import get_db

        conn = get_db(db_path)
        try:
            row = conn.execute(
                "SELECT status FROM claims WHERE claim_id = ?", (claim_id,)
            ).fetchone()
        finally:
            conn.close()
        assert row is not None, f"Claim {claim_id} not found"
        return dict(row)["status"]

    def test_quarantine_check_low_confidence(self):
        """A claim with composite confidence 200 is quarantined by default."""
        from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine

        q = ConfidenceQuarantine()
        result = q.quarantine_check({"composite_confidence": 200})
        assert result["quarantined"] == True  # noqa: E712
        assert result["reason"] == "confidence_too_low"

    def test_quarantine_check_high_confidence(self):
        """A claim with composite confidence 800 is not quarantined by default."""
        from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine

        q = ConfidenceQuarantine()
        result = q.quarantine_check({"composite_confidence": 800})
        assert result["quarantined"] == False  # noqa: E712
        assert result["reason"] is None

    def test_quarantine_check_threshold(self):
        """The threshold is exclusive: 499 is quarantined at 500, 500 is not."""
        from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine

        q = ConfidenceQuarantine(threshold=500)

        assert q.quarantine_check({"composite_confidence": 499})["quarantined"] == True  # noqa: E712
        assert q.quarantine_check({"composite_confidence": 500})["quarantined"] == False  # noqa: E712

    def test_quarantine_claim_in_db(self, db_path):
        """quarantine_claim sets the stored status to 'Quarantined'."""
        from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
        from phd_research_os_v2.core.database import now_iso

        # Insert a test claim
        stamp = now_iso()
        self._run_inserts(
            db_path,
            """
            INSERT INTO claims (claim_id, text, epistemic_tag, composite_confidence,
                                status, created_at, updated_at)
            VALUES ('CLM_Q1', 'test claim', 'Fact', 200, 'Complete', ?, ?)
            """,
            [(stamp, stamp)],
        )

        q = ConfidenceQuarantine(db_path=db_path)
        q.quarantine_claim("CLM_Q1")

        assert self._claim_status(db_path, "CLM_Q1") == "Quarantined"

    def test_promote_claim(self, db_path):
        """Promoting a quarantined claim with no missing fields yields 'Complete'."""
        from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
        from phd_research_os_v2.core.database import now_iso

        stamp = now_iso()
        self._run_inserts(
            db_path,
            """
            INSERT INTO claims (claim_id, text, epistemic_tag, composite_confidence,
                                status, missing_fields, created_at, updated_at)
            VALUES ('CLM_Q2', 'test', 'Fact', 200, 'Quarantined', '[]', ?, ?)
            """,
            [(stamp, stamp)],
        )

        q = ConfidenceQuarantine(db_path=db_path)
        result = q.promote_claim("CLM_Q2")
        assert result["new_status"] == "Complete"

    def test_quarantine_sweep(self, db_path):
        """A sweep at threshold 300 quarantines exactly the two claims below it."""
        from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
        from phd_research_os_v2.core.database import now_iso

        # Insert claims with various confidence levels
        stamp = now_iso()
        self._run_inserts(
            db_path,
            """
            INSERT INTO claims (claim_id, text, epistemic_tag, composite_confidence,
                                status, created_at, updated_at)
            VALUES (?, 'test', 'Fact', ?, 'Complete', ?, ?)
            """,
            [
                (f"CLM_SW{i}", conf, stamp, stamp)
                for i, conf in enumerate([100, 200, 500, 800])
            ],
        )

        q = ConfidenceQuarantine(db_path=db_path, threshold=300)
        result = q.quarantine_sweep()
        assert result["quarantined_count"] == 2  # 100 and 200 are below 300

    def test_quarantine_stats(self, db_path):
        """get_stats exposes totals, the quarantined count and the rate."""
        from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine

        q = ConfidenceQuarantine(db_path=db_path)
        stats = q.get_stats()
        assert "total_claims" in stats
        assert "quarantined" in stats
        assert "quarantine_rate" in stats
427
+
428
+
429
+ # ══════════════════════════════════════════════════════════════════════
430
+ # TEST: NLI PRE-FILTER (Layer 4)
431
+ # ══════════════════════════════════════════════════════════════════════
432
+
433
class TestNLIPreFilter:
    """Tests for contradiction pre-filter (keyword fallback only — SciBERT may not be installed)"""

    def test_nli_classify_fallback(self):
        """nli_classify returns a known label and names the method used."""
        from phd_research_os_v2.layer4.quarantine_and_nli import nli_classify

        premise = "The drug reduces inflammation"
        hypothesis = "The drug has no effect on inflammation contrary to expectations"
        verdict = nli_classify(premise, hypothesis)
        assert verdict["label"] in ["ENTAILMENT", "CONTRADICTION", "NEUTRAL"]
        assert "method" in verdict

    def test_prefilter_contradictions(self):
        """The pre-filter returns a list for claims from different documents.

        NOTE(review): only the return type is asserted; the number of
        candidate pairs found is not checked.
        """
        from phd_research_os_v2.layer4.quarantine_and_nli import prefilter_contradictions

        pool = [
            {"claim_id": "A", "text": "The sensor achieved 0.8 fM detection limit", "source_doi": "d1"},
            {"claim_id": "B", "text": "The sensor failed to detect anything below 10 fM contrary to previous claims", "source_doi": "d2"},
            {"claim_id": "C", "text": "Weather patterns affect global temperature", "source_doi": "d3"},
        ]
        candidates = prefilter_contradictions(pool, contradiction_threshold=0.0)
        assert isinstance(candidates, list)
        # Should find at least some pairs

    def test_prefilter_skips_same_document(self):
        """Claims sharing a source DOI are never paired with each other."""
        from phd_research_os_v2.layer4.quarantine_and_nli import prefilter_contradictions

        pool = [
            {"claim_id": "A", "text": "X is true", "source_doi": "same_doi"},
            {"claim_id": "B", "text": "X is false", "source_doi": "same_doi"},
        ]
        candidates = prefilter_contradictions(pool)
        # Same-document pairs should be skipped
        for pair in candidates:
            assert pair["claim_a_id"] != "A" or pair["claim_b_id"] != "B"

    def test_prefilter_empty_claims(self):
        """With fewer than two claims there is nothing to compare."""
        from phd_research_os_v2.layer4.quarantine_and_nli import prefilter_contradictions

        no_claims = []
        lone_claim = [{"claim_id": "A", "text": "only one"}]
        assert prefilter_contradictions(no_claims) == []
        assert prefilter_contradictions(lone_claim) == []
471
+
472
+
473
+ # ══════════════════════════════════════════════════════════════════════
474
+ # TEST: EPISTEMIC VELOCITY (Layer 5)
475
+ # ══════════════════════════════════════════════════════════════════════
476
+
477
class TestEpistemicVelocity:
    """Tests for phd_research_os_v2.layer5.velocity_and_decomposition.EpistemicVelocity.

    The canonical-claim rows used by the trend tests are written through one
    helper, so the insert statement exists only once and the connection is
    always closed, even if the insert fails.
    """

    @staticmethod
    def _insert_canonical(db_path, canonical_id, text, points):
        """Insert one canonical 'Fact' claim with a generated version history.

        Args:
            db_path: path of the database created by the ``db_path`` fixture.
            canonical_id: value for the ``canonical_id`` column.
            text: value for the ``representative_text`` column.
            points: non-empty sequence of ``(confidence, date, source)``
                tuples in chronological order. Each confidence is a float
                passed through ``to_fixed``. Versions are numbered from 1.

        The row's ``composite_confidence`` is the last point's confidence.
        ``evidence_count`` and ``current_version`` equal the number of points.
        """
        from phd_research_os_v2.core.database import get_db, now_iso, to_fixed

        history = [
            {"version": number, "confidence": to_fixed(conf), "date": date, "source": source}
            for number, (conf, date, source) in enumerate(points, start=1)
        ]
        latest_confidence = to_fixed(points[-1][0])
        stamp = now_iso()

        conn = get_db(db_path)
        try:
            conn.execute(
                """
                INSERT INTO canonical_claims (canonical_id, representative_text, epistemic_tag,
                    composite_confidence, evidence_count, source_dois, aliases,
                    version_history, current_version, schema_version, created_at, updated_at)
                VALUES (?, ?, 'Fact', ?, ?, '[]', '[]', ?, ?, '2.0', ?, ?)
                """,
                (
                    canonical_id,
                    text,
                    latest_confidence,
                    len(history),
                    json.dumps(history),
                    len(history),
                    stamp,
                    stamp,
                ),
            )
            conn.commit()
        finally:
            conn.close()

    def test_insufficient_data(self, db_path):
        """An unknown canonical id is reported as 'insufficient_data'."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity

        ev = EpistemicVelocity(db_path=db_path)
        result = ev.compute_velocity("NONEXISTENT")
        assert result["trend"] == "insufficient_data"

    def test_rising_trend(self, db_path):
        """Confidence climbing over three versions gives a positive velocity."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity

        # Insert canonical claim with rising version history
        self._insert_canonical(
            db_path,
            "CANON_RISE",
            "test rising claim",
            [
                (0.5, "2025-01-01", "paper1"),
                (0.7, "2025-06-01", "paper2"),
                (0.9, "2026-01-01", "paper3"),
            ],
        )

        ev = EpistemicVelocity(db_path=db_path)
        result = ev.compute_velocity("CANON_RISE")
        assert result["trend"] == "rising"
        assert result["velocity"] > 0

    def test_falling_trend(self, db_path):
        """Confidence dropping over three versions gives a negative velocity."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity

        self._insert_canonical(
            db_path,
            "CANON_FALL",
            "test falling claim",
            [
                (0.9, "2025-01-01", "p1"),
                (0.6, "2025-06-01", "p2"),
                (0.3, "2026-01-01", "p3"),
            ],
        )

        ev = EpistemicVelocity(db_path=db_path)
        result = ev.compute_velocity("CANON_FALL")
        assert result["trend"] == "falling"
        assert result["velocity"] < 0

    def test_single_version_insufficient(self, db_path):
        """A history with a single version is reported as 'insufficient_data'."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity

        self._insert_canonical(
            db_path,
            "CANON_SINGLE",
            "test single",
            [(0.7, "2025-01-01", "p1")],
        )

        ev = EpistemicVelocity(db_path=db_path)
        result = ev.compute_velocity("CANON_SINGLE")
        assert result["trend"] == "insufficient_data"
553
+
554
+
555
+ # ══════════════════════════════════════════════════════════════════════
556
+ # TEST: CONFIDENCE DECOMPOSITION (Layer 5)
557
+ # ══════════════════════════════════════════════════════════════════════
558
+
559
class TestConfidenceDecomposition:
    """Checks for decompose_confidence and its text/markdown formatters
    in phd_research_os_v2.layer5.velocity_and_decomposition."""

    @staticmethod
    def _claim(evidence_quality, truth_likelihood, qualifier_strength,
               composite, evidence_strength, section, qualifiers, missing,
               null_result=False, inherited=False, **extra):
        """Assemble a claim dict with the keys the tests pass to decompose_confidence.

        ``qualifiers`` and ``missing`` are JSON-encoded. Any ``extra`` keys
        are appended after the standard ones.
        """
        record = {
            "evidence_quality": evidence_quality,
            "truth_likelihood": truth_likelihood,
            "qualifier_strength_score": qualifier_strength,
            "composite_confidence": composite,
            "evidence_strength": evidence_strength,
            "source_section": section,
            "qualifiers": json.dumps(qualifiers),
            "missing_fields": json.dumps(missing),
            "is_null_result": null_result,
            "is_inherited_citation": inherited,
            "practical_significance": True,
        }
        record.update(extra)
        return record

    def test_basic_decomposition(self):
        """The decomposition carries the expected top-level and per-score keys."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence

        claim = self._claim(
            800, 700, 600, 700, 850, "results", ["in PBS"], [],
            parse_confidence=950,
        )
        source = {"study_type": "in_vitro", "journal_tier": 1}

        breakdown = decompose_confidence(claim, source=source)

        for key in ("composite_confidence", "scores", "headline", "warnings", "action_items"):
            assert key in breakdown

        for key in ("evidence_quality", "truth_likelihood", "qualifier_strength"):
            assert key in breakdown["scores"]

        # Each score should have value, bar, explanation
        for entry in breakdown["scores"].values():
            for field in ("value", "bar", "explanation"):
                assert field in entry

    def test_decomposition_null_result_warning(self):
        """A null-result claim produces a warning that mentions 'null'."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence

        claim = self._claim(
            400, 300, 300, 333, 500, "results", ["not significant"], [],
            null_result=True,
        )

        breakdown = decompose_confidence(claim)
        assert any("null" in warning.lower() for warning in breakdown["warnings"])

    def test_decomposition_abstract_warning(self):
        """A claim taken from an abstract produces a warning that mentions it."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence

        claim = self._claim(500, 500, 500, 500, 700, "abstract", [], [])

        breakdown = decompose_confidence(claim)
        assert any("abstract" in warning.lower() for warning in breakdown["warnings"])

    def test_format_text(self):
        """The plain-text rendering is a string naming the main sections."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import (
            decompose_confidence, format_decomposition_text
        )

        claim = self._claim(800, 700, 900, 800, 850, "results", [], [])

        rendered = format_decomposition_text(decompose_confidence(claim))

        assert isinstance(rendered, str)
        assert "Composite Confidence" in rendered
        assert "Evidence Quality" in rendered

    def test_format_markdown(self):
        """The markdown rendering is a string with a bold header and a table."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import (
            decompose_confidence, format_decomposition_markdown
        )

        claim = self._claim(800, 700, 900, 800, 850, "results", [], [])

        rendered = format_decomposition_markdown(decompose_confidence(claim))

        assert isinstance(rendered, str)
        assert "**Confidence:" in rendered
        assert "|" in rendered  # Table format

    def test_low_confidence_headline(self):
        """A very weak claim gets a headline mentioning 'quarantine' or 'low'."""
        from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence

        claim = self._claim(
            100, 100, 100, 100, 200, "discussion",
            ["may", "possibly", "potentially"], ["data", "statistics"],
            inherited=True,
        )

        breakdown = decompose_confidence(claim)
        assert "quarantine" in breakdown["headline"].lower() or "low" in breakdown["headline"].lower()
709
+
710
+
711
+ # ══════════════════════════════════════════════════════════════════════
712
+ # TEST: SCIRIFF INTEGRATION (Training)
713
+ # ══════════════════════════════════════════════════════════════════════
714
+
715
class TestSciRIFFIntegration:
    """Checks on the constants of the SciRIFF integration module (no download involved)."""

    def test_relevant_task_families_defined(self):
        """The ie, classification and entailment families are registered."""
        from phd_research_os_v2.training.sciriff_integration import RELEVANT_TASK_FAMILIES

        for family in ("ie", "classification", "entailment"):
            assert family in RELEVANT_TASK_FAMILIES

    def test_system_prompts_exist(self):
        """Prompts exist for ie, classification and qa, and each names the product."""
        from phd_research_os_v2.training.sciriff_integration import SYSTEM_PROMPTS

        for family in ("ie", "classification", "qa"):
            assert family in SYSTEM_PROMPTS
        for text in SYSTEM_PROMPTS.values():
            assert "PhD Research OS" in text

    def test_high_priority_tasks_defined(self):
        """scifact and scierc are listed as high-priority tasks."""
        from phd_research_os_v2.training.sciriff_integration import HIGH_PRIORITY_TASKS

        for task in ("scifact", "scierc"):
            assert task in HIGH_PRIORITY_TASKS
737
+
738
+ # ══════════════════════════════════════════════════════════════════════
739
+ # TEST: DATABASE SCHEMA SUPPORTS NEW FEATURES
740
+ # ══════════════════════════════════════════════════════════════════════
741
+
742
class TestDatabaseSchema:
    """Verify the database schema supports quarantine and new features."""

    @staticmethod
    def _column_names(db_path, table):
        """Return the set of column names SQLite reports for ``table``.

        ``table`` is interpolated into the PRAGMA statement, so pass only the
        hard-coded table names used in this class. The connection is closed
        even if the PRAGMA raises. A table that does not exist yields an
        empty set.
        """
        from phd_research_os_v2.core.database import get_db

        conn = get_db(db_path)
        try:
            rows = conn.execute(f"PRAGMA table_info({table})").fetchall()
        finally:
            conn.close()
        # Index 1 of each table_info row is the column name.
        return {row[1] for row in rows}

    def test_claims_table_has_required_columns(self, db_path):
        """The claims table has every column the foundation components read."""
        columns = self._column_names(db_path, "claims")

        required = {
            "claim_id", "text", "epistemic_tag", "composite_confidence",
            "status", "is_null_result", "is_inherited_citation",
            "qualifiers", "missing_fields", "source_section",
            "evidence_quality", "truth_likelihood", "qualifier_strength_score",
        }

        # Report every absent column at once instead of stopping at the first.
        missing = required - columns
        assert not missing, f"Missing columns: {sorted(missing)}"

    def test_canonical_claims_has_version_history(self, db_path):
        """canonical_claims has the version_history and evidence_count columns."""
        columns = self._column_names(db_path, "canonical_claims")

        assert "version_history" in columns
        assert "evidence_count" in columns

    def test_eval_runs_table_exists(self, db_path):
        """eval_runs exists and has the run_id, metrics and passed columns."""
        columns = self._column_names(db_path, "eval_runs")

        assert "run_id" in columns
        assert "metrics" in columns
        assert "passed" in columns
784
+
785
+
786
if __name__ == "__main__":
    # pytest.main returns the run's exit code. Pass it to sys.exit so that
    # running this file as a script fails when any test fails.
    sys.exit(pytest.main([__file__, "-v", "--tb=short"]))