"""
Tests for Foundation Components
=================================
Tests for all "strongly implementable" features:
- SPECTER2 embedding dedup (with Jaccard fallback)
- SciFact benchmark evaluation
- Epistemic Trigger Words validator
- Low Confidence Quarantine
- SciBERT-NLI contradiction pre-filter (with fallback)
- Epistemic Velocity tracking
- Confidence Decomposition Display
- SciRIFF integration constants
- Database schema support for the features above
"""
import pytest
import json
import os
import sys
import tempfile
# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# ══════════════════════════════════════════════════════════════════════
# FIXTURES
# ══════════════════════════════════════════════════════════════════════
@pytest.fixture
def db_path():
"""Create a temporary database for testing."""
from phd_research_os_v2.core.database import init_db
with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
path = f.name
init_db(path)
yield path
os.unlink(path)
@pytest.fixture
def sample_claims():
"""Sample claims for testing."""
return [
{
"claim_id": "CLM_TEST001",
"text": "The limit of detection was 0.8 fM in 10 mM PBS buffer.",
"epistemic_tag": "Fact",
"source_section": "results",
"source_doi": "10.1234/paper1",
"evidence_strength": 800,
"composite_confidence": 750,
"qualifiers": json.dumps(["in 10 mM PBS"]),
"missing_fields": json.dumps([]),
"is_null_result": False,
"is_inherited_citation": False,
},
{
"claim_id": "CLM_TEST002",
"text": "A detection limit of 800 attomolar was achieved using the graphene sensor.",
"epistemic_tag": "Fact",
"source_section": "results",
"source_doi": "10.1234/paper2",
"evidence_strength": 750,
"composite_confidence": 700,
"qualifiers": json.dumps([]),
"missing_fields": json.dumps([]),
"is_null_result": False,
"is_inherited_citation": False,
},
{
"claim_id": "CLM_TEST003",
"text": "This approach may potentially reduce diagnostic costs in low-resource settings.",
"epistemic_tag": "Hypothesis",
"source_section": "discussion",
"source_doi": "10.1234/paper1",
"evidence_strength": 300,
"composite_confidence": 200,
"qualifiers": json.dumps(["may", "potentially"]),
"missing_fields": json.dumps(["cost_analysis", "field_testing"]),
"is_null_result": False,
"is_inherited_citation": False,
},
{
"claim_id": "CLM_TEST004",
"text": "The sensor did not show significant improvement over the control group.",
"epistemic_tag": "Fact",
"source_section": "results",
"source_doi": "10.1234/paper3",
"evidence_strength": 600,
"composite_confidence": 400,
"qualifiers": json.dumps(["not significant"]),
"missing_fields": json.dumps([]),
"is_null_result": True,
"is_inherited_citation": False,
},
]
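
# NOTE (editorial assumption): the confidence-like fields in these fixtures
# (evidence_strength, composite_confidence) appear to use a 0-1000 fixed-point
# scale, consistent with the to_fixed() helper used in the velocity tests
# below (e.g. to_fixed(0.9) for a confidence of 0.9).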
# ══════════════════════════════════════════════════════════════════════
# TEST: EMBEDDING DEDUP (Layer 3)
# ══════════════════════════════════════════════════════════════════════
class TestEmbeddingDedup:
"""Tests for phd_research_os_v2.layer3.embedding_dedup"""
def test_jaccard_identical_texts(self):
from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity
sim = jaccard_similarity("The LOD was 0.8 fM", "The LOD was 0.8 fM")
assert sim == 1.0
def test_jaccard_different_texts(self):
from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity
sim = jaccard_similarity("The LOD was 0.8 fM", "Completely unrelated text about cooking")
assert sim < 0.2
def test_jaccard_similar_texts(self):
from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity
sim = jaccard_similarity(
"The detection limit was 0.8 femtomolar",
"The detection limit was measured at 0.8 fM"
)
assert sim > 0.3
def test_jaccard_empty_texts(self):
from phd_research_os_v2.layer3.embedding_dedup import jaccard_similarity
assert jaccard_similarity("", "") == 0.0
assert jaccard_similarity("hello", "") == 0.0
    def test_claim_similarity_jaccard_method(self):
from phd_research_os_v2.layer3.embedding_dedup import claim_similarity
sim = claim_similarity("LOD was 0.8 fM", "LOD was 0.8 fM", method="jaccard")
assert sim == 1.0
def test_batch_deduplicate_jaccard(self):
from phd_research_os_v2.layer3.embedding_dedup import batch_deduplicate
texts = [
"The LOD was 0.8 fM in PBS buffer",
"The LOD was 0.8 fM in PBS buffer", # exact duplicate
"Completely different topic about weather",
]
result = batch_deduplicate(texts, threshold=0.85, method="jaccard")
assert len(result["canonical_indices"]) <= 2 # At most 2 unique
assert 1 in result["duplicates"] # Index 1 is a duplicate of 0
def test_batch_deduplicate_empty(self):
from phd_research_os_v2.layer3.embedding_dedup import batch_deduplicate
result = batch_deduplicate([], method="jaccard")
assert result["canonical_indices"] == []
def test_batch_deduplicate_single(self):
from phd_research_os_v2.layer3.embedding_dedup import batch_deduplicate
result = batch_deduplicate(["one claim"], method="jaccard")
assert result["canonical_indices"] == [0]
def test_normalize_claim_text(self):
from phd_research_os_v2.layer3.embedding_dedup import _normalize
assert _normalize(" The LOD was 0.8 fM ") == "the lod was 0.8 fm"
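
# Reference sketch (an editorial assumption, not the code under test): a
# minimal token-set Jaccard similarity consistent with the expectations above.
# The real phd_research_os_v2.layer3.embedding_dedup implementation may
# tokenize and normalize differently.
def _sketch_jaccard_similarity(a: str, b: str) -> float:
    """Intersection-over-union of lower-cased whitespace tokens."""
    tokens_a = set(a.lower().split())
    tokens_b = set(b.lower().split())
    if not tokens_a or not tokens_b:
        return 0.0  # mirrors test_jaccard_empty_texts
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)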
# ══════════════════════════════════════════════════════════════════════
# TEST: SCIFACT BENCHMARK (Layer 6)
# ══════════════════════════════════════════════════════════════════════
class TestSciFact:
"""Tests for phd_research_os_v2.layer6.scifact_benchmark"""
def test_baseline_classifier_support(self):
from phd_research_os_v2.layer6.scifact_benchmark import quick_baseline_classifier
result = quick_baseline_classifier(
"Vitamin C helps prevent scurvy",
"Studies have shown vitamin C is essential for preventing scurvy in sailors"
)
assert result in ["SUPPORT", "CONTRADICT", "NOT_ENOUGH_INFO"]
def test_baseline_classifier_contradict(self):
from phd_research_os_v2.layer6.scifact_benchmark import quick_baseline_classifier
result = quick_baseline_classifier(
"The drug has no side effects",
"The drug was found to have significant adverse effects including nausea"
)
assert result in ["SUPPORT", "CONTRADICT", "NOT_ENOUGH_INFO"]
def test_evaluate_returns_correct_structure(self):
from phd_research_os_v2.layer6.scifact_benchmark import evaluate_against_scifact
def dummy_classifier(claim, evidence):
return "SUPPORT"
examples = [
{"claim": "test claim 1", "evidence": "test evidence 1", "label": "SUPPORT"},
{"claim": "test claim 2", "evidence": "test evidence 2", "label": "CONTRADICT"},
{"claim": "test claim 3", "evidence": "test evidence 3", "label": "NOT_ENOUGH_INFO"},
]
result = evaluate_against_scifact(dummy_classifier, examples)
assert "accuracy" in result
assert "per_class" in result
assert "confusion_matrix" in result
assert "total_examples" in result
assert result["total_examples"] == 3
assert 0 <= result["accuracy"] <= 1
def test_evaluate_perfect_classifier(self):
from phd_research_os_v2.layer6.scifact_benchmark import evaluate_against_scifact
examples = [
{"claim": "c1", "evidence": "e1", "label": "SUPPORT"},
{"claim": "c2", "evidence": "e2", "label": "CONTRADICT"},
]
def perfect(claim, evidence):
for ex in examples:
if ex["claim"] == claim:
return ex["label"]
return "NOT_ENOUGH_INFO"
result = evaluate_against_scifact(perfect, examples)
assert result["accuracy"] == 1.0
def test_evaluate_handles_errors(self):
from phd_research_os_v2.layer6.scifact_benchmark import evaluate_against_scifact
def broken(claim, evidence):
raise ValueError("broken")
examples = [{"claim": "c", "evidence": "e", "label": "SUPPORT"}]
result = evaluate_against_scifact(broken, examples)
assert result["total_examples"] == 1 # Should not crash
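
# Reference sketch (assumption): a lexical-overlap baseline of the kind
# quick_baseline_classifier plausibly implements. Baselines like this are
# heuristic, which is why the tests above only assert that a valid label is
# returned rather than pinning exact predictions.
def _sketch_baseline_classifier(claim: str, evidence: str) -> str:
    """Classify a claim/evidence pair via token overlap plus negation cues."""
    negation_cues = {"no", "not", "contrary", "failed", "adverse", "without"}
    claim_tokens = set(claim.lower().split())
    evidence_tokens = set(evidence.lower().split())
    overlap = len(claim_tokens & evidence_tokens) / max(len(claim_tokens), 1)
    if overlap < 0.2:
        return "NOT_ENOUGH_INFO"
    # Negation on exactly one side suggests opposite polarity.
    if bool(negation_cues & claim_tokens) != bool(negation_cues & evidence_tokens):
        return "CONTRADICT"
    return "SUPPORT"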
# ══════════════════════════════════════════════════════════════════════
# TEST: EPISTEMIC TRIGGER WORDS (Layer 2)
# ══════════════════════════════════════════════════════════════════════
class TestTriggerValidator:
"""Tests for phd_research_os_v2.layer2.trigger_validator"""
def test_fact_detection(self):
from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores
result = compute_trigger_scores(
"We measured a detection limit of 0.8 fM with p < 0.001",
source_section="results"
)
assert result["predicted_tag"] == "Fact"
assert result["scores"]["Fact"] > 0.3
def test_hypothesis_detection(self):
from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores
result = compute_trigger_scores(
"This may potentially reduce costs and further investigation is needed",
source_section="discussion"
)
assert result["predicted_tag"] == "Hypothesis"
assert result["scores"]["Hypothesis"] > 0.3
def test_interpretation_detection(self):
from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores
result = compute_trigger_scores(
"These findings suggest that the mechanism is likely due to charge transfer",
source_section="discussion"
)
assert result["predicted_tag"] == "Interpretation"
def test_conflict_detection(self):
from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores
        result = compute_trigger_scores(
            "Contrary to previous reports, our results show inconsistent findings that refute the hypothesis"
        )
assert result["scores"]["Conflict_Hypothesis"] > 0.2
def test_section_prior_results(self):
from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores
result = compute_trigger_scores(
"The value was obtained from the experiment",
source_section="results"
)
assert result["scores"]["Fact"] > 0 # Results prior boosts Fact
def test_section_prior_abstract(self):
from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores
result = compute_trigger_scores(
"A novel approach was developed",
source_section="abstract"
)
assert result["scores"]["Interpretation"] > 0 # Abstract prior boosts Interpretation
def test_validate_ai_tag_agreement(self):
from phd_research_os_v2.layer2.trigger_validator import validate_ai_tag
result = validate_ai_tag(
"We measured a detection limit of 0.8 fM with p < 0.001",
ai_tag="Fact",
source_section="results"
)
        assert result["agreement"]
assert result["recommendation"] == "accept"
def test_validate_ai_tag_disagreement(self):
from phd_research_os_v2.layer2.trigger_validator import validate_ai_tag
result = validate_ai_tag(
"This may potentially reduce costs and further investigation is needed",
ai_tag="Fact",
source_section="discussion"
)
        # Trigger words should detect hypothesis language here; when the
        # validator disagrees with the AI tag, it must grade the severity.
        if not result["agreement"]:
            assert result["disagreement_severity"] in ["mild", "strong"]
def test_batch_validate(self):
from phd_research_os_v2.layer2.trigger_validator import batch_validate
claims = [
{"text": "We measured 0.8 fM with p < 0.001", "epistemic_tag": "Fact", "source_section": "results"},
{"text": "May potentially reduce costs", "epistemic_tag": "Fact", "source_section": "discussion"},
{"text": "Suggests a novel mechanism", "epistemic_tag": "Interpretation", "source_section": "discussion"},
]
result = batch_validate(claims)
assert result["total"] == 3
assert "agreement_rate" in result
def test_empty_text(self):
from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores
result = compute_trigger_scores("", source_section="results")
assert "predicted_tag" in result
def test_scores_bounded(self):
from phd_research_os_v2.layer2.trigger_validator import compute_trigger_scores
result = compute_trigger_scores(
"may possibly might could potentially suggests hypothesize propose speculate",
source_section="discussion"
)
for score in result["scores"].values():
assert 0 <= score <= 1.0
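
# Reference sketch (assumption): the scoring contract these tests imply.
# Trigger-word hits per tag combine with a small section prior and are
# clamped to [0, 1]; the real lexicons in trigger_validator are richer.
def _sketch_trigger_scores(text: str, source_section: str = "") -> dict:
    lexicons = {
        "Fact": {"measured", "obtained", "achieved", "p"},
        "Hypothesis": {"may", "might", "could", "potentially", "hypothesize"},
        "Interpretation": {"suggest", "suggests", "likely", "indicates", "novel"},
        "Conflict_Hypothesis": {"contrary", "inconsistent", "refute", "refutes"},
    }
    section_priors = {"results": "Fact", "abstract": "Interpretation"}
    tokens = set(text.lower().split())
    scores = {}
    for tag, words in lexicons.items():
        score = 0.2 * len(words & tokens)
        if section_priors.get(source_section) == tag:
            score += 0.1  # the section prior gives its tag a small head start
        scores[tag] = min(score, 1.0)  # keep every score inside [0, 1]
    return {"predicted_tag": max(scores, key=scores.get), "scores": scores}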
# ══════════════════════════════════════════════════════════════════════
# TEST: LOW CONFIDENCE QUARANTINE (Layer 4)
# ══════════════════════════════════════════════════════════════════════
class TestQuarantine:
"""Tests for phd_research_os_v2.layer4.quarantine_and_nli.ConfidenceQuarantine"""
def test_quarantine_check_low_confidence(self):
from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
q = ConfidenceQuarantine()
result = q.quarantine_check({"composite_confidence": 200})
        assert result["quarantined"]
assert result["reason"] == "confidence_too_low"
def test_quarantine_check_high_confidence(self):
from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
q = ConfidenceQuarantine()
result = q.quarantine_check({"composite_confidence": 800})
        assert not result["quarantined"]
assert result["reason"] is None
def test_quarantine_check_threshold(self):
from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
q = ConfidenceQuarantine(threshold=500)
        assert q.quarantine_check({"composite_confidence": 499})["quarantined"]
        assert not q.quarantine_check({"composite_confidence": 500})["quarantined"]
def test_quarantine_claim_in_db(self, db_path):
from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
from phd_research_os_v2.core.database import get_db, now_iso
# Insert a test claim
conn = get_db(db_path)
conn.execute("""
INSERT INTO claims (claim_id, text, epistemic_tag, composite_confidence,
status, created_at, updated_at)
VALUES ('CLM_Q1', 'test claim', 'Fact', 200, 'Complete', ?, ?)
""", (now_iso(), now_iso()))
conn.commit()
conn.close()
q = ConfidenceQuarantine(db_path=db_path)
q.quarantine_claim("CLM_Q1")
conn = get_db(db_path)
row = conn.execute("SELECT status FROM claims WHERE claim_id = 'CLM_Q1'").fetchone()
conn.close()
assert dict(row)["status"] == "Quarantined"
def test_promote_claim(self, db_path):
from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
from phd_research_os_v2.core.database import get_db, now_iso
conn = get_db(db_path)
conn.execute("""
INSERT INTO claims (claim_id, text, epistemic_tag, composite_confidence,
status, missing_fields, created_at, updated_at)
VALUES ('CLM_Q2', 'test', 'Fact', 200, 'Quarantined', '[]', ?, ?)
""", (now_iso(), now_iso()))
conn.commit()
conn.close()
q = ConfidenceQuarantine(db_path=db_path)
result = q.promote_claim("CLM_Q2")
assert result["new_status"] == "Complete"
def test_quarantine_sweep(self, db_path):
from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
from phd_research_os_v2.core.database import get_db, now_iso
conn = get_db(db_path)
# Insert claims with various confidence levels
for i, conf in enumerate([100, 200, 500, 800]):
conn.execute("""
INSERT INTO claims (claim_id, text, epistemic_tag, composite_confidence,
status, created_at, updated_at)
VALUES (?, 'test', 'Fact', ?, 'Complete', ?, ?)
""", (f"CLM_SW{i}", conf, now_iso(), now_iso()))
conn.commit()
conn.close()
q = ConfidenceQuarantine(db_path=db_path, threshold=300)
result = q.quarantine_sweep()
assert result["quarantined_count"] == 2 # 100 and 200 are below 300
def test_quarantine_stats(self, db_path):
from phd_research_os_v2.layer4.quarantine_and_nli import ConfidenceQuarantine
q = ConfidenceQuarantine(db_path=db_path)
stats = q.get_stats()
assert "total_claims" in stats
assert "quarantined" in stats
assert "quarantine_rate" in stats
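
# Reference sketch (assumption): the strict-below-threshold contract the tests
# exercise (499 quarantined at threshold=500, 500 not, so `<` rather than
# `<=`). The real default threshold is only bounded by these tests (above 200,
# at most 800); 300 here is a placeholder.
def _sketch_quarantine_check(claim: dict, threshold: int = 300) -> dict:
    """Flag a claim whose composite confidence falls below the threshold."""
    if claim.get("composite_confidence", 0) < threshold:
        return {"quarantined": True, "reason": "confidence_too_low"}
    return {"quarantined": False, "reason": None}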
# ══════════════════════════════════════════════════════════════════════
# TEST: NLI PRE-FILTER (Layer 4)
# ══════════════════════════════════════════════════════════════════════
class TestNLIPreFilter:
    """Tests for the contradiction pre-filter (keyword fallback only; SciBERT may not be installed)."""
def test_nli_classify_fallback(self):
from phd_research_os_v2.layer4.quarantine_and_nli import nli_classify
result = nli_classify(
"The drug reduces inflammation",
"The drug has no effect on inflammation contrary to expectations"
)
assert result["label"] in ["ENTAILMENT", "CONTRADICTION", "NEUTRAL"]
assert "method" in result
def test_prefilter_contradictions(self):
from phd_research_os_v2.layer4.quarantine_and_nli import prefilter_contradictions
claims = [
{"claim_id": "A", "text": "The sensor achieved 0.8 fM detection limit", "source_doi": "d1"},
{"claim_id": "B", "text": "The sensor failed to detect anything below 10 fM contrary to previous claims", "source_doi": "d2"},
{"claim_id": "C", "text": "Weather patterns affect global temperature", "source_doi": "d3"},
]
results = prefilter_contradictions(claims, contradiction_threshold=0.0)
assert isinstance(results, list)
        # With a 0.0 threshold any cross-document pair may be returned,
        # so this test only asserts the result shape, not specific pairs.
def test_prefilter_skips_same_document(self):
from phd_research_os_v2.layer4.quarantine_and_nli import prefilter_contradictions
claims = [
{"claim_id": "A", "text": "X is true", "source_doi": "same_doi"},
{"claim_id": "B", "text": "X is false", "source_doi": "same_doi"},
]
results = prefilter_contradictions(claims)
# Same-document pairs should be skipped
for r in results:
assert not (r["claim_a_id"] == "A" and r["claim_b_id"] == "B")
def test_prefilter_empty_claims(self):
from phd_research_os_v2.layer4.quarantine_and_nli import prefilter_contradictions
assert prefilter_contradictions([]) == []
assert prefilter_contradictions([{"claim_id": "A", "text": "only one"}]) == []
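
# Reference sketch (assumption): the pair-generation contract implied above.
# prefilter_contradictions compares claims across different source documents
# and skips same-DOI pairs entirely.
def _sketch_prefilter_pairs(claims: list) -> list:
    """Yield the cross-document claim pairs eligible for NLI scoring."""
    pairs = []
    for i, a in enumerate(claims):
        for b in claims[i + 1:]:
            if a.get("source_doi") == b.get("source_doi"):
                continue  # same-document pairs are never contradiction candidates
            pairs.append((a["claim_id"], b["claim_id"]))
    return pairs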
# ══════════════════════════════════════════════════════════════════════
# TEST: EPISTEMIC VELOCITY (Layer 5)
# ══════════════════════════════════════════════════════════════════════
class TestEpistemicVelocity:
"""Tests for phd_research_os_v2.layer5.velocity_and_decomposition.EpistemicVelocity"""
def test_insufficient_data(self, db_path):
from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity
ev = EpistemicVelocity(db_path=db_path)
result = ev.compute_velocity("NONEXISTENT")
assert result["trend"] == "insufficient_data"
def test_rising_trend(self, db_path):
from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity
from phd_research_os_v2.core.database import get_db, now_iso, to_fixed
# Insert canonical claim with rising version history
conn = get_db(db_path)
history = [
{"version": 1, "confidence": to_fixed(0.5), "date": "2025-01-01", "source": "paper1"},
{"version": 2, "confidence": to_fixed(0.7), "date": "2025-06-01", "source": "paper2"},
{"version": 3, "confidence": to_fixed(0.9), "date": "2026-01-01", "source": "paper3"},
]
conn.execute("""
INSERT INTO canonical_claims (canonical_id, representative_text, epistemic_tag,
composite_confidence, evidence_count, source_dois, aliases,
version_history, current_version, schema_version, created_at, updated_at)
VALUES ('CANON_RISE', 'test rising claim', 'Fact', ?, 3, '[]', '[]', ?, 3, '2.0', ?, ?)
""", (to_fixed(0.9), json.dumps(history), now_iso(), now_iso()))
conn.commit()
conn.close()
ev = EpistemicVelocity(db_path=db_path)
result = ev.compute_velocity("CANON_RISE")
assert result["trend"] == "rising"
assert result["velocity"] > 0
def test_falling_trend(self, db_path):
from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity
from phd_research_os_v2.core.database import get_db, now_iso, to_fixed
conn = get_db(db_path)
history = [
{"version": 1, "confidence": to_fixed(0.9), "date": "2025-01-01", "source": "p1"},
{"version": 2, "confidence": to_fixed(0.6), "date": "2025-06-01", "source": "p2"},
{"version": 3, "confidence": to_fixed(0.3), "date": "2026-01-01", "source": "p3"},
]
conn.execute("""
INSERT INTO canonical_claims (canonical_id, representative_text, epistemic_tag,
composite_confidence, evidence_count, source_dois, aliases,
version_history, current_version, schema_version, created_at, updated_at)
VALUES ('CANON_FALL', 'test falling claim', 'Fact', ?, 3, '[]', '[]', ?, 3, '2.0', ?, ?)
""", (to_fixed(0.3), json.dumps(history), now_iso(), now_iso()))
conn.commit()
conn.close()
ev = EpistemicVelocity(db_path=db_path)
result = ev.compute_velocity("CANON_FALL")
assert result["trend"] == "falling"
assert result["velocity"] < 0
def test_single_version_insufficient(self, db_path):
from phd_research_os_v2.layer5.velocity_and_decomposition import EpistemicVelocity
from phd_research_os_v2.core.database import get_db, now_iso, to_fixed
conn = get_db(db_path)
history = [{"version": 1, "confidence": to_fixed(0.7), "date": "2025-01-01", "source": "p1"}]
conn.execute("""
INSERT INTO canonical_claims (canonical_id, representative_text, epistemic_tag,
composite_confidence, evidence_count, source_dois, aliases,
version_history, current_version, schema_version, created_at, updated_at)
VALUES ('CANON_SINGLE', 'test single', 'Fact', ?, 1, '[]', '[]', ?, 1, '2.0', ?, ?)
""", (to_fixed(0.7), json.dumps(history), now_iso(), now_iso()))
conn.commit()
conn.close()
ev = EpistemicVelocity(db_path=db_path)
result = ev.compute_velocity("CANON_SINGLE")
assert result["trend"] == "insufficient_data"
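
# Reference sketch (assumption): a minimal velocity computation matching the
# trends asserted above. With fewer than two history entries there is no
# slope, hence "insufficient_data". The real implementation may weight by
# elapsed time between the `date` fields rather than by version steps.
def _sketch_velocity(version_history: list) -> dict:
    if len(version_history) < 2:
        return {"trend": "insufficient_data", "velocity": 0.0}
    delta = version_history[-1]["confidence"] - version_history[0]["confidence"]
    velocity = delta / (len(version_history) - 1)  # confidence per version step
    trend = "rising" if velocity > 0 else "falling" if velocity < 0 else "stable"
    return {"trend": trend, "velocity": velocity}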
# ══════════════════════════════════════════════════════════════════════
# TEST: CONFIDENCE DECOMPOSITION (Layer 5)
# ══════════════════════════════════════════════════════════════════════
class TestConfidenceDecomposition:
"""Tests for phd_research_os_v2.layer5.velocity_and_decomposition (decomposition)"""
def test_basic_decomposition(self):
from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence
claim = {
"evidence_quality": 800,
"truth_likelihood": 700,
"qualifier_strength_score": 600,
"composite_confidence": 700,
"evidence_strength": 850,
"source_section": "results",
"qualifiers": json.dumps(["in PBS"]),
"missing_fields": json.dumps([]),
"is_null_result": False,
"is_inherited_citation": False,
"practical_significance": True,
"parse_confidence": 950,
}
result = decompose_confidence(claim, source={"study_type": "in_vitro", "journal_tier": 1})
assert "composite_confidence" in result
assert "scores" in result
assert "headline" in result
assert "warnings" in result
assert "action_items" in result
assert "evidence_quality" in result["scores"]
assert "truth_likelihood" in result["scores"]
assert "qualifier_strength" in result["scores"]
# Each score should have value, bar, explanation
for score_data in result["scores"].values():
assert "value" in score_data
assert "bar" in score_data
assert "explanation" in score_data
def test_decomposition_null_result_warning(self):
from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence
claim = {
"evidence_quality": 400,
"truth_likelihood": 300,
"qualifier_strength_score": 300,
"composite_confidence": 333,
"evidence_strength": 500,
"source_section": "results",
"qualifiers": json.dumps(["not significant"]),
"missing_fields": json.dumps([]),
"is_null_result": True,
"is_inherited_citation": False,
"practical_significance": True,
}
result = decompose_confidence(claim)
assert any("null" in w.lower() for w in result["warnings"])
def test_decomposition_abstract_warning(self):
from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence
claim = {
"evidence_quality": 500,
"truth_likelihood": 500,
"qualifier_strength_score": 500,
"composite_confidence": 500,
"evidence_strength": 700,
"source_section": "abstract",
"qualifiers": json.dumps([]),
"missing_fields": json.dumps([]),
"is_null_result": False,
"is_inherited_citation": False,
"practical_significance": True,
}
result = decompose_confidence(claim)
assert any("abstract" in w.lower() for w in result["warnings"])
def test_format_text(self):
from phd_research_os_v2.layer5.velocity_and_decomposition import (
decompose_confidence, format_decomposition_text
)
claim = {
"evidence_quality": 800,
"truth_likelihood": 700,
"qualifier_strength_score": 900,
"composite_confidence": 800,
"evidence_strength": 850,
"source_section": "results",
"qualifiers": json.dumps([]),
"missing_fields": json.dumps([]),
"is_null_result": False,
"is_inherited_citation": False,
"practical_significance": True,
}
decomposition = decompose_confidence(claim)
text = format_decomposition_text(decomposition)
assert isinstance(text, str)
assert "Composite Confidence" in text
assert "Evidence Quality" in text
def test_format_markdown(self):
from phd_research_os_v2.layer5.velocity_and_decomposition import (
decompose_confidence, format_decomposition_markdown
)
claim = {
"evidence_quality": 800,
"truth_likelihood": 700,
"qualifier_strength_score": 900,
"composite_confidence": 800,
"evidence_strength": 850,
"source_section": "results",
"qualifiers": json.dumps([]),
"missing_fields": json.dumps([]),
"is_null_result": False,
"is_inherited_citation": False,
"practical_significance": True,
}
decomposition = decompose_confidence(claim)
md = format_decomposition_markdown(decomposition)
assert isinstance(md, str)
assert "**Confidence:" in md
assert "|" in md # Table format
def test_low_confidence_headline(self):
from phd_research_os_v2.layer5.velocity_and_decomposition import decompose_confidence
claim = {
"evidence_quality": 100,
"truth_likelihood": 100,
"qualifier_strength_score": 100,
"composite_confidence": 100,
"evidence_strength": 200,
"source_section": "discussion",
"qualifiers": json.dumps(["may", "possibly", "potentially"]),
"missing_fields": json.dumps(["data", "statistics"]),
"is_null_result": False,
"is_inherited_citation": True,
"practical_significance": True,
}
result = decompose_confidence(claim)
assert "quarantine" in result["headline"].lower() or "low" in result["headline"].lower()
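
# Reference sketch (assumption): one plausible rendering of the "bar" field
# that decompose_confidence attaches to each score, using the same 0-1000
# fixed-point scale as the fixtures above.
def _sketch_confidence_bar(value: int, width: int = 10) -> str:
    """Render 700/1000 as seven filled cells out of ten."""
    filled = round(width * max(0, min(value, 1000)) / 1000)
    return "#" * filled + "-" * (width - filled)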
# ══════════════════════════════════════════════════════════════════════
# TEST: SCIRIFF INTEGRATION (Training)
# ══════════════════════════════════════════════════════════════════════
class TestSciRIFFIntegration:
"""Tests for the SciRIFF data integration logic (without actually downloading)."""
def test_relevant_task_families_defined(self):
from phd_research_os_v2.training.sciriff_integration import RELEVANT_TASK_FAMILIES
assert "ie" in RELEVANT_TASK_FAMILIES
assert "classification" in RELEVANT_TASK_FAMILIES
assert "entailment" in RELEVANT_TASK_FAMILIES
def test_system_prompts_exist(self):
from phd_research_os_v2.training.sciriff_integration import SYSTEM_PROMPTS
assert "ie" in SYSTEM_PROMPTS
assert "classification" in SYSTEM_PROMPTS
assert "qa" in SYSTEM_PROMPTS
for prompt in SYSTEM_PROMPTS.values():
assert "PhD Research OS" in prompt
def test_high_priority_tasks_defined(self):
from phd_research_os_v2.training.sciriff_integration import HIGH_PRIORITY_TASKS
assert "scifact" in HIGH_PRIORITY_TASKS
assert "scierc" in HIGH_PRIORITY_TASKS
# ══════════════════════════════════════════════════════════════════════
# TEST: DATABASE SCHEMA SUPPORTS NEW FEATURES
# ══════════════════════════════════════════════════════════════════════
class TestDatabaseSchema:
"""Verify the database schema supports quarantine and new features."""
def test_claims_table_has_required_columns(self, db_path):
from phd_research_os_v2.core.database import get_db
conn = get_db(db_path)
# Get column info
cursor = conn.execute("PRAGMA table_info(claims)")
columns = {row[1] for row in cursor.fetchall()}
conn.close()
required = {
"claim_id", "text", "epistemic_tag", "composite_confidence",
"status", "is_null_result", "is_inherited_citation",
"qualifiers", "missing_fields", "source_section",
"evidence_quality", "truth_likelihood", "qualifier_strength_score",
}
for col in required:
assert col in columns, f"Missing column: {col}"
def test_canonical_claims_has_version_history(self, db_path):
from phd_research_os_v2.core.database import get_db
conn = get_db(db_path)
cursor = conn.execute("PRAGMA table_info(canonical_claims)")
columns = {row[1] for row in cursor.fetchall()}
conn.close()
assert "version_history" in columns
assert "evidence_count" in columns
def test_eval_runs_table_exists(self, db_path):
from phd_research_os_v2.core.database import get_db
conn = get_db(db_path)
cursor = conn.execute("PRAGMA table_info(eval_runs)")
columns = {row[1] for row in cursor.fetchall()}
conn.close()
assert "run_id" in columns
assert "metrics" in columns
assert "passed" in columns
if __name__ == "__main__":
pytest.main([__file__, "-v", "--tb=short"])