| """ |
| Tests for Foundation Components |
| ================================= |
| Tests for all "strongly implementable" features: |
| - SPECTER2 embedding dedup (with Jaccard fallback) |
| - SciFact benchmark evaluation |
| - Epistemic Trigger Words validator |
| - Low Confidence Quarantine |
| - SciBERT-NLI contradiction pre-filter (with fallback) |
| - Epistemic Velocity tracking |
| - Confidence Decomposition Display |
| """ |
|
|
| import pytest |
| import json |
| import os |
| import sys |
import tempfile

# Add the repository root to sys.path so phd_research_os_v2 is importable
# without an editable install when the tests are run straight from a checkout.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
| @pytest.fixture |
| def db_path(): |
| """Create a temporary database for testing.""" |
| from phd_research_os_v2.core.database import init_db |
| with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: |
| path = f.name |
| init_db(path) |
| yield path |
| os.unlink(path) |
|
|
| @pytest.fixture |
| def sample_claims(): |
| """Sample claims for testing.""" |
| return [ |
| { |
| "claim_id": "CLM_TEST001", |
| "text": "The limit of detection was 0.8 fM in 10 mM PBS buffer.", |
| "epistemic_tag": "Fact", |
| "source_section": "results", |
| "source_doi": "10.1234/paper1", |
| "evidence_strength": 800, |
| "composite_confidence": 750, |
| "qualifiers": json.dumps(["in 10 mM PBS"]), |
| "missing_fields": json.dumps([]), |
| "is_null_result": False, |
| "is_inherited_citation": False, |
| }, |
| { |
| "claim_id": "CLM_TEST002", |
| "text": "A detection limit of 800 attomolar was achieved using the graphene sensor.", |
| "epistemic_tag": "Fact", |
| "source_section": "results", |
| "source_doi": "10.1234/paper2", |
| "evidence_strength": 750, |
| "composite_confidence": 700, |
| "qualifiers": json.dumps([]), |
| "missing_fields": json.dumps([]), |
| "is_null_result": False, |
| "is_inherited_citation": False, |
| }, |
| { |
| "claim_id": "CLM_TEST003", |
| "text": "This approach may potentially reduce diagnostic costs in low-resource settings.", |
| "epistemic_tag": "Hypothesis", |
| "source_section": "discussion", |
| "source_doi": "10.1234/paper1", |
| "evidence_strength": 300, |
| "composite_confidence": 200, |
| "qualifiers": json.dumps(["may", "potentially"]), |
| "missing_fields": json.dumps(["cost_analysis", "field_testing"]), |
| "is_null_result": False, |
| "is_inherited_citation": False, |
| }, |
| { |
| "claim_id": "CLM_TEST004", |
| "text": "The sensor did not show significant improvement over the control group.", |
| "epistemic_tag": "Fact", |
| "source_section": "results", |
| "source_doi": "10.1234/paper3", |
| "evidence_strength": 600, |
| "composite_confidence": 400, |
| "qualifiers": json.dumps(["not significant"]), |
| "missing_fields": json.dumps([]), |
| "is_null_result": True, |
| "is_inherited_citation": False, |
| }, |
| ] |
|
|
| class TestEmbeddingDedup: |
| """Tests for phd_research_os_v2.layer3.embedding_dedup""" |
| |
| def test_jaccard_identical_texts(self): |
| from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity |
| sim = jaccard_similarity("The LOD was 0.8 fM", "The LOD was 0.8 fM") |
| assert sim == 1.0 |
| |
| def test_jaccard_different_texts(self): |
| from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity |
| sim = jaccard_similarity("The LOD was 0.8 fM", "Completely unrelated text about cooking") |
| assert sim < 0.2 |
| |
| def test_jaccard_similar_texts(self): |
| from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity |
| sim = jaccard_similarity( |
| "The detection limit was 0.8 femtomolar", |
| "The detection limit was measured at 0.8 fM" |
| ) |
| assert sim > 0.3 |
| |
| def test_jaccard_empty_texts(self): |
| from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity |
| assert jaccard_similarity("", "") == 0.0 |
| assert jaccard_similarity("hello", "") == 0.0 |
| |
    def test_claim_similarity_jaccard_method(self):
| from phd_research_os_v2.layer3.embedding_dedup import claim_similarity |
| sim = claim_similarity("LOD was 0.8 fM", "LOD was 0.8 fM", method="jaccard") |
| assert sim == 1.0 |
| |
| def test_batch_deduplicate_jaccard(self): |
| from phd_research_os_v2.layer3.embedding_dedup import batch_deduplicate |
| texts = [ |
| "The LOD was 0.8 fM in PBS buffer", |
| "The LOD was 0.8 fM in PBS buffer", |
| "Completely different topic about weather", |
| ] |
| result = batch_deduplicate(texts, threshold=0.85, method="jaccard") |
| assert len(result["canonical_indices"]) <= 2 |
| assert 1 in result["duplicates"] |
| |
| def test_batch_deduplicate_empty(self): |
| from phd_research_os_v2.layer3.embedding_dedup import batch_deduplicate |
| result = batch_deduplicate([], method="jaccard") |
| assert result["canonical_indices"] == [] |
| |
| def test_batch_deduplicate_single(self): |
| from phd_research_os_v2.layer3.embedding_dedup import batch_deduplicate |
| result = batch_deduplicate(["one claim"], method="jaccard") |
| assert result["canonical_indices"] == [0] |
| |
| def test_normalize_claim_text(self): |
| from phd_research_os_v2.layer3.embedding_dedup import _normalize |
| assert _normalize(" The LOD was 0.8 fM ") == "the lod was 0.8 fm" |
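

# For reference, the Jaccard similarity exercised above is
#     J(A, B) = |A ∩ B| / |A ∪ B|
# over normalised token sets. A minimal sketch of that computation (a
# hypothetical helper using plain whitespace tokenisation, not the module's
# _normalize):
def _reference_jaccard(a: str, b: str) -> float:
    tokens_a, tokens_b = set(a.lower().split()), set(b.lower().split())
    if not tokens_a or not tokens_b:
        # Mirror test_jaccard_empty_texts: empty input scores 0.0.
        return 0.0
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)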
|
|
| class TestSciFact: |
| """Tests for phd_research_os_v2.layer6.scifact_benchmark""" |
| |
| def test_baseline_classifier_support(self): |
| from phd_research_os_v2.layer6.scifact_benchmark import quick_baseline_classifier |
| result = quick_baseline_classifier( |
| "Vitamin C helps prevent scurvy", |
| "Studies have shown vitamin C is essential for preventing scurvy in sailors" |
| ) |
| assert result in ["SUPPORT", "CONTRADICT", "NOT_ENOUGH_INFO"] |
| |
| def test_baseline_classifier_contradict(self): |
| from phd_research_os_v2.layer6.scifact_benchmark import quick_baseline_classifier |
| result = quick_baseline_classifier( |
| "The drug has no side effects", |
| "The drug was found to have significant adverse effects including nausea" |
| ) |
| assert result in ["SUPPORT", "CONTRADICT", "NOT_ENOUGH_INFO"] |
| |
| def test_evaluate_returns_correct_structure(self): |
| from phd_research_os_v2.layer6.scifact_benchmark import evaluate_against_scifact |
| |
| def dummy_classifier(claim, evidence): |
| return "SUPPORT" |
| |
| examples = [ |
| {"claim": "test claim 1", "evidence": "test evidence 1", "label": "SUPPORT"}, |
| {"claim": "test claim 2", "evidence": "test evidence 2", "label": "CONTRADICT"}, |
| {"claim": "test claim 3", "evidence": "test evidence 3", "label": "NOT_ENOUGH_INFO"}, |
| ] |
| |
| result = evaluate_against_scifact(dummy_classifier, examples) |
| |
| assert "accuracy" in result |
| assert "per_class" in result |
| assert "confusion_matrix" in result |
| assert "total_examples" in result |
| assert result["total_examples"] == 3 |
| assert 0 <= result["accuracy"] <= 1 |
| |
| def test_evaluate_perfect_classifier(self): |
| from phd_research_os_v2.layer6.scifact_benchmark import evaluate_against_scifact |
| |
| examples = [ |
| {"claim": "c1", "evidence": "e1", "label": "SUPPORT"}, |
| {"claim": "c2", "evidence": "e2", "label": "CONTRADICT"}, |
| ] |
| |
| def perfect(claim, evidence): |
| for ex in examples: |
| if ex["claim"] == claim: |
| return ex["label"] |
| return "NOT_ENOUGH_INFO" |
| |
| result = evaluate_against_scifact(perfect, examples) |
| assert result["accuracy"] == 1.0 |
| |
| def test_evaluate_handles_errors(self): |
| from phd_research_os_v2.layer6.scifact_benchmark import evaluate_against_scifact |
| |
| def broken(claim, evidence): |
| raise ValueError("broken") |
| |
| examples = [{"claim": "c", "evidence": "e", "label": "SUPPORT"}] |
| result = evaluate_against_scifact(broken, examples) |
| assert result["total_examples"] == 1 |
|
|
| class TestTriggerValidator: |
| """Tests for phd_research_os_v2.layer2.trigger_validator""" |
| |
| def test_fact_detection(self): |
| from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores |
| result = compute_trigger_scores( |
| "We measured a detection limit of 0.8 fM with p < 0.001", |
| source_section="results" |
| ) |
| assert result["predicted_tag"] == "Fact" |
| assert result["scores"]["Fact"] > 0.3 |
| |
| def test_hypothesis_detection(self): |
| from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores |
| result = compute_trigger_scores( |
| "This may potentially reduce costs and further investigation is needed", |
| source_section="discussion" |
| ) |
| assert result["predicted_tag"] == "Hypothesis" |
| assert result["scores"]["Hypothesis"] > 0.3 |
| |
| def test_interpretation_detection(self): |
| from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores |
| result = compute_trigger_scores( |
| "These findings suggest that the mechanism is likely due to charge transfer", |
| source_section="discussion" |
| ) |
| assert result["predicted_tag"] == "Interpretation" |
| |
| def test_conflict_detection(self): |
| from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores |
| result = compute_trigger_scores( |
| "Contrary to previous reports, our results show inconsistent findings that refutes the hypothesis" |
| ) |
| assert result["scores"]["Conflict_Hypothesis"] > 0.2 |
| |
| def test_section_prior_results(self): |
| from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores |
| result = compute_trigger_scores( |
| "The value was obtained from the experiment", |
| source_section="results" |
| ) |
| assert result["scores"]["Fact"] > 0 |
| |
| def test_section_prior_abstract(self): |
| from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores |
| result = compute_trigger_scores( |
| "A novel approach was developed", |
| source_section="abstract" |
| ) |
| assert result["scores"]["Interpretation"] > 0 |
| |
| def test_validate_ai_tag_agreement(self): |
| from phd_research_os_v2.layer2.trigger_validator import validate_ai_tag |
| result = validate_ai_tag( |
| "We measured a detection limit of 0.8 fM with p < 0.001", |
| ai_tag="Fact", |
| source_section="results" |
| ) |
| assert result["agreement"] == True |
| assert result["recommendation"] == "accept" |
| |
| def test_validate_ai_tag_disagreement(self): |
| from phd_research_os_v2.layer2.trigger_validator import validate_ai_tag |
| result = validate_ai_tag( |
| "This may potentially reduce costs and further investigation is needed", |
| ai_tag="Fact", |
| source_section="discussion" |
| ) |
| |
| if not result["agreement"]: |
| assert result["disagreement_severity"] in ["mild", "strong"] |
| |
| def test_batch_validate(self): |
| from phd_research_os_v2.layer2.trigger_validator import batch_validate |
| claims = [ |
| {"text": "We measured 0.8 fM with p < 0.001", "epistemic_tag": "Fact", "source_section": "results"}, |
| {"text": "May potentially reduce costs", "epistemic_tag": "Fact", "source_section": "discussion"}, |
| {"text": "Suggests a novel mechanism", "epistemic_tag": "Interpretation", "source_section": "discussion"}, |
| ] |
| result = batch_validate(claims) |
| assert result["total"] == 3 |
| assert "agreement_rate" in result |
| |
| def test_empty_text(self): |
| from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores |
| result = compute_trigger_scores("", source_section="results") |
| assert "predicted_tag" in result |
| |
| def test_scores_bounded(self): |
| from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores |
| result = compute_trigger_scores( |
| "may possibly might could potentially suggests hypothesize propose speculate", |
| source_section="discussion" |
| ) |
| for score in result["scores"].values(): |
| assert 0 <= score <= 1.0 |
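

# A minimal sketch of the scoring shape the tests above assume: lexical cue
# hits accumulate per tag, a section prior nudges the matching tag, and all
# scores stay within [0, 1]. The cue lists here are hypothetical, not the
# module's own.
def _sketch_trigger_scores(text: str, source_section: str = "") -> dict:
    cues = {
        "Fact": ["measured", "p <", "obtained"],
        "Hypothesis": ["may", "potentially", "further investigation"],
        "Interpretation": ["suggest", "likely"],
    }
    lowered = text.lower()
    scores = {
        tag: min(1.0, 0.2 * sum(cue in lowered for cue in cue_list))
        for tag, cue_list in cues.items()
    }
    if source_section == "results":
        scores["Fact"] = min(1.0, scores["Fact"] + 0.1)  # section prior
    return scores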
|
|
| class TestQuarantine: |
| """Tests for phd_research_os_v2.layer4.quarantine_and_nli.ConfidenceQuarantine""" |
| |
| def test_quarantine_check_low_confidence(self): |
| from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine |
| q = ConfidenceQuarantine() |
| result = q.quarantine_check({"composite_confidence": 200}) |
| assert result["quarantined"] == True |
| assert result["reason"] == "confidence_too_low" |
| |
| def test_quarantine_check_high_confidence(self): |
| from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine |
| q = ConfidenceQuarantine() |
| result = q.quarantine_check({"composite_confidence": 800}) |
| assert result["quarantined"] == False |
| assert result["reason"] is None |
| |
| def test_quarantine_check_threshold(self): |
| from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine |
| q = ConfidenceQuarantine(threshold=500) |
| |
        assert q.quarantine_check({"composite_confidence": 499})["quarantined"] is True
        assert q.quarantine_check({"composite_confidence": 500})["quarantined"] is False
| |
| def test_quarantine_claim_in_db(self, db_path): |
| from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine |
| from phd_research_os_v2.core.database import get_db, now_iso |
| |
| conn = get_db(db_path) |
| conn.execute(""" |
| INSERT INTO claims (claim_id, text, epistemic_tag, composite_confidence, |
| status, created_at, updated_at) |
| VALUES ('CLM_Q1', 'test claim', 'Fact', 200, 'Complete', ?, ?) |
| """, (now_iso(), now_iso())) |
| conn.commit() |
| conn.close() |
| |
| q = ConfidenceQuarantine(db_path=db_path) |
| q.quarantine_claim("CLM_Q1") |
| |
| conn = get_db(db_path) |
| row = conn.execute("SELECT status FROM claims WHERE claim_id = 'CLM_Q1'").fetchone() |
| conn.close() |
| assert dict(row)["status"] == "Quarantined" |
| |
| def test_promote_claim(self, db_path): |
| from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine |
| from phd_research_os_v2.core.database import get_db, now_iso |
| |
| conn = get_db(db_path) |
| conn.execute(""" |
| INSERT INTO claims (claim_id, text, epistemic_tag, composite_confidence, |
| status, missing_fields, created_at, updated_at) |
| VALUES ('CLM_Q2', 'test', 'Fact', 200, 'Quarantined', '[]', ?, ?) |
| """, (now_iso(), now_iso())) |
| conn.commit() |
| conn.close() |
| |
| q = ConfidenceQuarantine(db_path=db_path) |
| result = q.promote_claim("CLM_Q2") |
| assert result["new_status"] == "Complete" |
| |
| def test_quarantine_sweep(self, db_path): |
| from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine |
| from phd_research_os_v2.core.database import get_db, now_iso |
| |
| conn = get_db(db_path) |
| |
| for i, conf in enumerate([100, 200, 500, 800]): |
| conn.execute(""" |
| INSERT INTO claims (claim_id, text, epistemic_tag, composite_confidence, |
| status, created_at, updated_at) |
| VALUES (?, 'test', 'Fact', ?, 'Complete', ?, ?) |
| """, (f"CLM_SW{i}", conf, now_iso(), now_iso())) |
| conn.commit() |
| conn.close() |
| |
| q = ConfidenceQuarantine(db_path=db_path, threshold=300) |
| result = q.quarantine_sweep() |
| assert result["quarantined_count"] == 2 |
| |
| def test_quarantine_stats(self, db_path): |
| from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine |
| q = ConfidenceQuarantine(db_path=db_path) |
| stats = q.get_stats() |
| assert "total_claims" in stats |
| assert "quarantined" in stats |
| assert "quarantine_rate" in stats |
|
|
| class TestNLIPreFilter: |
| """Tests for contradiction pre-filter (keyword fallback only β SciBERT may not be installed)""" |
| |
| def test_nli_classify_fallback(self): |
| from phd_research_os_v2.layer4.quarantine_and_nli import nli_classify |
| result = nli_classify( |
| "The drug reduces inflammation", |
| "The drug has no effect on inflammation contrary to expectations" |
| ) |
| assert result["label"] in ["ENTAILMENT", "CONTRADICTION", "NEUTRAL"] |
| assert "method" in result |
| |
| def test_prefilter_contradictions(self): |
| from phd_research_os_v2.layer4.quarantine_and_nli import prefilter_contradictions |
| claims = [ |
| {"claim_id": "A", "text": "The sensor achieved 0.8 fM detection limit", "source_doi": "d1"}, |
| {"claim_id": "B", "text": "The sensor failed to detect anything below 10 fM contrary to previous claims", "source_doi": "d2"}, |
| {"claim_id": "C", "text": "Weather patterns affect global temperature", "source_doi": "d3"}, |
| ] |
| results = prefilter_contradictions(claims, contradiction_threshold=0.0) |
| assert isinstance(results, list) |
| |
| def test_prefilter_skips_same_document(self): |
| from phd_research_os_v2.layer4.quarantine_and_nli import prefilter_contradictions |
| claims = [ |
| {"claim_id": "A", "text": "X is true", "source_doi": "same_doi"}, |
| {"claim_id": "B", "text": "X is false", "source_doi": "same_doi"}, |
| ] |
| results = prefilter_contradictions(claims) |
| |
| for r in results: |
| assert not (r["claim_a_id"] == "A" and r["claim_b_id"] == "B") |
| |
| def test_prefilter_empty_claims(self): |
| from phd_research_os_v2.layer4.quarantine_and_nli import prefilter_contradictions |
| assert prefilter_contradictions([]) == [] |
| assert prefilter_contradictions([{"claim_id": "A", "text": "only one"}]) == [] |
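

# As test_prefilter_skips_same_document shows, the pre-filter only scores
# cross-document pairs: claims sharing a source_doi are never compared, so
# restatements within one paper cannot be flagged as contradictions.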
|
|
| class TestEpistemicVelocity: |
| """Tests for phd_research_os_v2.layer5.velocity_and_decomposition.EpistemicVelocity""" |
| |
| def test_insufficient_data(self, db_path): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity |
| ev = EpistemicVelocity(db_path=db_path) |
| result = ev.compute_velocity("NONEXISTENT") |
| assert result["trend"] == "insufficient_data" |
| |
| def test_rising_trend(self, db_path): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity |
| from phd_research_os_v2.core.database import get_db, now_iso, to_fixed |
| |
| conn = get_db(db_path) |
| history = [ |
| {"version": 1, "confidence": to_fixed(0.5), "date": "2025-01-01", "source": "paper1"}, |
| {"version": 2, "confidence": to_fixed(0.7), "date": "2025-06-01", "source": "paper2"}, |
| {"version": 3, "confidence": to_fixed(0.9), "date": "2026-01-01", "source": "paper3"}, |
| ] |
| conn.execute(""" |
| INSERT INTO canonical_claims (canonical_id, representative_text, epistemic_tag, |
| composite_confidence, evidence_count, source_dois, aliases, |
| version_history, current_version, schema_version, created_at, updated_at) |
| VALUES ('CANON_RISE', 'test rising claim', 'Fact', ?, 3, '[]', '[]', ?, 3, '2.0', ?, ?) |
| """, (to_fixed(0.9), json.dumps(history), now_iso(), now_iso())) |
| conn.commit() |
| conn.close() |
| |
| ev = EpistemicVelocity(db_path=db_path) |
| result = ev.compute_velocity("CANON_RISE") |
| assert result["trend"] == "rising" |
| assert result["velocity"] > 0 |
| |
| def test_falling_trend(self, db_path): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity |
| from phd_research_os_v2.core.database import get_db, now_iso, to_fixed |
| |
| conn = get_db(db_path) |
| history = [ |
| {"version": 1, "confidence": to_fixed(0.9), "date": "2025-01-01", "source": "p1"}, |
| {"version": 2, "confidence": to_fixed(0.6), "date": "2025-06-01", "source": "p2"}, |
| {"version": 3, "confidence": to_fixed(0.3), "date": "2026-01-01", "source": "p3"}, |
| ] |
| conn.execute(""" |
| INSERT INTO canonical_claims (canonical_id, representative_text, epistemic_tag, |
| composite_confidence, evidence_count, source_dois, aliases, |
| version_history, current_version, schema_version, created_at, updated_at) |
| VALUES ('CANON_FALL', 'test falling claim', 'Fact', ?, 3, '[]', '[]', ?, 3, '2.0', ?, ?) |
| """, (to_fixed(0.3), json.dumps(history), now_iso(), now_iso())) |
| conn.commit() |
| conn.close() |
| |
| ev = EpistemicVelocity(db_path=db_path) |
| result = ev.compute_velocity("CANON_FALL") |
| assert result["trend"] == "falling" |
| assert result["velocity"] < 0 |
| |
| def test_single_version_insufficient(self, db_path): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity |
| from phd_research_os_v2.core.database import get_db, now_iso, to_fixed |
| |
| conn = get_db(db_path) |
| history = [{"version": 1, "confidence": to_fixed(0.7), "date": "2025-01-01", "source": "p1"}] |
| conn.execute(""" |
| INSERT INTO canonical_claims (canonical_id, representative_text, epistemic_tag, |
| composite_confidence, evidence_count, source_dois, aliases, |
| version_history, current_version, schema_version, created_at, updated_at) |
| VALUES ('CANON_SINGLE', 'test single', 'Fact', ?, 1, '[]', '[]', ?, 1, '2.0', ?, ?) |
| """, (to_fixed(0.7), json.dumps(history), now_iso(), now_iso())) |
| conn.commit() |
| conn.close() |
| |
| ev = EpistemicVelocity(db_path=db_path) |
| result = ev.compute_velocity("CANON_SINGLE") |
| assert result["trend"] == "insufficient_data" |
|
|
| class TestConfidenceDecomposition: |
| """Tests for phd_research_os_v2.layer5.velocity_and_decomposition (decomposition)""" |
| |
| def test_basic_decomposition(self): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence |
| |
| claim = { |
| "evidence_quality": 800, |
| "truth_likelihood": 700, |
| "qualifier_strength_score": 600, |
| "composite_confidence": 700, |
| "evidence_strength": 850, |
| "source_section": "results", |
| "qualifiers": json.dumps(["in PBS"]), |
| "missing_fields": json.dumps([]), |
| "is_null_result": False, |
| "is_inherited_citation": False, |
| "practical_significance": True, |
| "parse_confidence": 950, |
| } |
| |
| result = decompose_confidence(claim, source={"study_type": "in_vitro", "journal_tier": 1}) |
| |
| assert "composite_confidence" in result |
| assert "scores" in result |
| assert "headline" in result |
| assert "warnings" in result |
| assert "action_items" in result |
| |
| assert "evidence_quality" in result["scores"] |
| assert "truth_likelihood" in result["scores"] |
| assert "qualifier_strength" in result["scores"] |
| |
| |
| for score_data in result["scores"].values(): |
| assert "value" in score_data |
| assert "bar" in score_data |
| assert "explanation" in score_data |
| |
| def test_decomposition_null_result_warning(self): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence |
| |
| claim = { |
| "evidence_quality": 400, |
| "truth_likelihood": 300, |
| "qualifier_strength_score": 300, |
| "composite_confidence": 333, |
| "evidence_strength": 500, |
| "source_section": "results", |
| "qualifiers": json.dumps(["not significant"]), |
| "missing_fields": json.dumps([]), |
| "is_null_result": True, |
| "is_inherited_citation": False, |
| "practical_significance": True, |
| } |
| |
| result = decompose_confidence(claim) |
| assert any("null" in w.lower() for w in result["warnings"]) |
| |
| def test_decomposition_abstract_warning(self): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence |
| |
| claim = { |
| "evidence_quality": 500, |
| "truth_likelihood": 500, |
| "qualifier_strength_score": 500, |
| "composite_confidence": 500, |
| "evidence_strength": 700, |
| "source_section": "abstract", |
| "qualifiers": json.dumps([]), |
| "missing_fields": json.dumps([]), |
| "is_null_result": False, |
| "is_inherited_citation": False, |
| "practical_significance": True, |
| } |
| |
| result = decompose_confidence(claim) |
| assert any("abstract" in w.lower() for w in result["warnings"]) |
| |
| def test_format_text(self): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import ( |
| decompose_confidence, format_decomposition_text |
| ) |
| |
| claim = { |
| "evidence_quality": 800, |
| "truth_likelihood": 700, |
| "qualifier_strength_score": 900, |
| "composite_confidence": 800, |
| "evidence_strength": 850, |
| "source_section": "results", |
| "qualifiers": json.dumps([]), |
| "missing_fields": json.dumps([]), |
| "is_null_result": False, |
| "is_inherited_citation": False, |
| "practical_significance": True, |
| } |
| |
| decomposition = decompose_confidence(claim) |
| text = format_decomposition_text(decomposition) |
| |
| assert isinstance(text, str) |
| assert "Composite Confidence" in text |
| assert "Evidence Quality" in text |
| |
| def test_format_markdown(self): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import ( |
| decompose_confidence, format_decomposition_markdown |
| ) |
| |
| claim = { |
| "evidence_quality": 800, |
| "truth_likelihood": 700, |
| "qualifier_strength_score": 900, |
| "composite_confidence": 800, |
| "evidence_strength": 850, |
| "source_section": "results", |
| "qualifiers": json.dumps([]), |
| "missing_fields": json.dumps([]), |
| "is_null_result": False, |
| "is_inherited_citation": False, |
| "practical_significance": True, |
| } |
| |
| decomposition = decompose_confidence(claim) |
| md = format_decomposition_markdown(decomposition) |
| |
| assert isinstance(md, str) |
| assert "**Confidence:" in md |
| assert "|" in md |
| |
| def test_low_confidence_headline(self): |
| from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence |
| |
| claim = { |
| "evidence_quality": 100, |
| "truth_likelihood": 100, |
| "qualifier_strength_score": 100, |
| "composite_confidence": 100, |
| "evidence_strength": 200, |
| "source_section": "discussion", |
| "qualifiers": json.dumps(["may", "possibly", "potentially"]), |
| "missing_fields": json.dumps(["data", "statistics"]), |
| "is_null_result": False, |
| "is_inherited_citation": True, |
| "practical_significance": True, |
| } |
| |
| result = decompose_confidence(claim) |
| assert "quarantine" in result["headline"].lower() or "low" in result["headline"].lower() |
|
|
| class TestSciRIFFIntegration: |
| """Tests for the SciRIFF data integration logic (without actually downloading).""" |
| |
| def test_relevant_task_families_defined(self): |
| from phd_research_os_v2.training.sciriff_integration import RELEVANT_TASK_FAMILIES |
| assert "ie" in RELEVANT_TASK_FAMILIES |
| assert "classification" in RELEVANT_TASK_FAMILIES |
| assert "entailment" in RELEVANT_TASK_FAMILIES |
| |
| def test_system_prompts_exist(self): |
| from phd_research_os_v2.training.sciriff_integration import SYSTEM_PROMPTS |
| assert "ie" in SYSTEM_PROMPTS |
| assert "classification" in SYSTEM_PROMPTS |
| assert "qa" in SYSTEM_PROMPTS |
| for prompt in SYSTEM_PROMPTS.values(): |
| assert "PhD Research OS" in prompt |
| |
| def test_high_priority_tasks_defined(self): |
| from phd_research_os_v2.training.sciriff_integration import HIGH_PRIORITY_TASKS |
| assert "scifact" in HIGH_PRIORITY_TASKS |
| assert "scierc" in HIGH_PRIORITY_TASKS |
|
|
| class TestDatabaseSchema: |
| """Verify the database schema supports quarantine and new features.""" |
| |
| def test_claims_table_has_required_columns(self, db_path): |
| from phd_research_os_v2.core.database import get_db |
        conn = get_db(db_path)

        # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk);
        # index 1 is the column name.
        cursor = conn.execute("PRAGMA table_info(claims)")
        columns = {row[1] for row in cursor.fetchall()}
| conn.close() |
| |
| required = { |
| "claim_id", "text", "epistemic_tag", "composite_confidence", |
| "status", "is_null_result", "is_inherited_citation", |
| "qualifiers", "missing_fields", "source_section", |
| "evidence_quality", "truth_likelihood", "qualifier_strength_score", |
| } |
| |
| for col in required: |
| assert col in columns, f"Missing column: {col}" |
| |
| def test_canonical_claims_has_version_history(self, db_path): |
| from phd_research_os_v2.core.database import get_db |
| conn = get_db(db_path) |
| cursor = conn.execute("PRAGMA table_info(canonical_claims)") |
| columns = {row[1] for row in cursor.fetchall()} |
| conn.close() |
| |
| assert "version_history" in columns |
| assert "evidence_count" in columns |
| |
| def test_eval_runs_table_exists(self, db_path): |
| from phd_research_os_v2.core.database import get_db |
| conn = get_db(db_path) |
| cursor = conn.execute("PRAGMA table_info(eval_runs)") |
| columns = {row[1] for row in cursor.fetchall()} |
| conn.close() |
| |
| assert "run_id" in columns |
| assert "metrics" in columns |
| assert "passed" in columns |
|
|
| if __name__ == "__main__": |
| pytest.main([__file__, "-v", "--tb=short"]) |