"""
PhD Research OS — AI Model Council Tests
==========================================
Tests for the 4-member council pipeline: Query Planner → Extractor → Critic → Chairman
"""
import os
import sys
import json
import pytest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from phd_research_os.council import (
ModelCouncil, CouncilRound, COUNCIL_PROMPTS
)
from phd_research_os.db import init_db, get_db, create_claim
from phd_research_os.taxonomy import TaxonomyManager
# Database file shared by every test in this module; the autouse fixture
# initialises it before each test and deletes it afterwards.
TEST_DB = "test_council.db"
# Sample passage handed to the council as input text by the pipeline tests.
SAMPLE_TEXT = """
We investigated the sensitivity of graphene field-effect transistor (GFET) biosensors
to cardiac troponin I (cTnI) in phosphate-buffered saline (PBS). Our measurements
show that the Dirac point shifts by 45 ± 3 mV upon binding of 1 pM cTnI in 10 mM PBS
(n=5, p<0.001). The limit of detection was determined to be 0.8 fM using the 3σ/slope
method. We interpret these results as evidence that aptamer-functionalized GFETs can
achieve clinically relevant sensitivity for cardiac biomarker detection. However, ionic
strength effects at physiological conditions (150 mM) significantly reduce sensitivity,
suggesting that a desalting step may be necessary for clinical translation. We hypothesize
that a PEG spacer of 5 kDa molecular weight could mitigate Debye screening while
maintaining binding affinity, though this remains to be experimentally validated.
"""
@pytest.fixture(autouse=True)
def setup_teardown():
    """Prepare the test database before each test and delete it afterwards.

    Setup initialises ``TEST_DB`` and constructs a ``TaxonomyManager`` bound
    to it. Teardown removes the database file together with its ``-wal`` and
    ``-shm`` companions (presumably SQLite sidecar files -- confirm against
    ``init_db``), skipping any that do not exist.
    """
    init_db(TEST_DB)
    TaxonomyManager(db_path=TEST_DB)

    yield

    leftover_paths = [TEST_DB + suffix for suffix in ("", "-wal", "-shm")]
    for leftover in leftover_paths:
        if os.path.exists(leftover):
            os.remove(leftover)
# ============================================================
# Council Prompt Tests
# ============================================================
def test_all_4_council_members_defined():
    """Each of the four council roles must have an entry in COUNCIL_PROMPTS."""
    for role in ("query_planner", "extractor", "critic", "chairman"):
        assert role in COUNCIL_PROMPTS
def test_query_planner_prompt_has_json_instruction():
    """The query planner prompt must mention a "JSON array"."""
    planner_prompt = COUNCIL_PROMPTS["query_planner"]
    assert "JSON array" in planner_prompt
def test_extractor_prompt_has_epistemic_tags():
    """The extractor prompt must name all four epistemic tags."""
    extractor_prompt = COUNCIL_PROMPTS["extractor"]
    expected_tags = ("Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis")
    for tag in expected_tags:
        assert tag in extractor_prompt
def test_critic_prompt_checks_5_things():
    """The critic prompt must spell out each of its five review checks."""
    critic_prompt = COUNCIL_PROMPTS["critic"]
    review_checks = (
        "Missing important claims",
        "Incorrect epistemic tags",
        "Overly confident",
        "Taxonomy correctness",
        "Missing fields",
    )
    for check in review_checks:
        assert check in critic_prompt
def test_chairman_prompt_has_completeness_penalty():
    """The chairman prompt must state the 0.7 completeness penalty."""
    chairman_prompt = COUNCIL_PROMPTS["chairman"]
    assert "0.7 completeness penalty" in chairman_prompt
def test_chairman_output_schema():
    """The chairman prompt must mention every field of the output schema."""
    chairman_prompt = COUNCIL_PROMPTS["chairman"]
    for field_name in ("epistemic_tag", "confidence", "missing_fields", "status"):
        assert field_name in chairman_prompt
# ============================================================
# Mock Council Pipeline Tests (no brain)
# ============================================================
def test_council_deliberate_without_brain():
    """Deliberating with ``brain=None`` still yields a populated CouncilRound."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    assert isinstance(outcome, CouncilRound)
    assert outcome.round_id.startswith("CNCL_")
    assert len(outcome.final_claims) >= 1
    # Both timestamps only need to be truthy; their format is not checked.
    assert outcome.started_at
    assert outcome.completed_at
def test_council_produces_valid_claims():
    """Every final claim carries the required keys with in-range values."""
    required_keys = ("text", "epistemic_tag", "confidence", "missing_fields", "status")
    allowed_tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    allowed_statuses = ["Complete", "Incomplete"]

    council = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = council.deliberate(SAMPLE_TEXT)

    for claim in outcome.final_claims:
        # Key presence is checked first so the value checks cannot KeyError.
        for key in required_keys:
            assert key in claim
        assert claim["epistemic_tag"] in allowed_tags
        assert 0.0 <= claim["confidence"] <= 1.0
        assert claim["status"] in allowed_statuses
def test_council_claims_have_consistent_status():
    """A claim that lists missing fields must be marked "Incomplete".

    Only this direction is checked; claims with no missing fields are skipped.
    """
    council = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = council.deliberate(SAMPLE_TEXT)

    for claim in outcome.final_claims:
        if not claim["missing_fields"]:
            continue
        assert claim["status"] == "Incomplete"
def test_council_with_query():
    """Passing a query yields a list-typed query plan and at least one claim."""
    council = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = council.deliberate(
        SAMPLE_TEXT,
        query="What is the LOD for GFET cTnI sensors?",
    )

    assert isinstance(outcome.query_plan, list)
    assert len(outcome.final_claims) >= 1
def test_council_query_planner_standalone():
    """``deliberate_query`` alone returns a non-empty list."""
    council = ModelCouncil(brain=None, db_path=TEST_DB)
    planned = council.deliberate_query(
        "What are the ionic strength effects on GFET sensitivity?"
    )

    assert isinstance(planned, list)
    assert len(planned) >= 1
def test_council_metadata():
    """The round metadata exposes the four expected bookkeeping keys."""
    council = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = council.deliberate(SAMPLE_TEXT)

    expected_keys = (
        "council_version",
        "taxonomy_domain",
        "extractor_claim_count",
        "final_claim_count",
    )
    for key in expected_keys:
        assert key in outcome.metadata
def test_council_critique_structure():
    """A non-empty critique must contain a "feedback" entry.

    An empty or missing critique is accepted: the test then asserts nothing.
    """
    council = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = council.deliberate(SAMPLE_TEXT)

    if not outcome.critique:
        return
    assert "feedback" in outcome.critique
def test_council_raw_extraction_preserved():
    """The round keeps its raw extraction as a list."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)
    raw_claims = outcome.raw_extraction
    assert isinstance(raw_claims, list)
def test_council_round_logged_to_db():
    """A finished round appears as the first entry of the council history."""
    council = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = council.deliberate(SAMPLE_TEXT)

    history = council.get_council_history()
    assert len(history) >= 1

    first_entry = history[0]
    assert first_entry["round_id"] == outcome.round_id
def test_council_history_retrieval():
    """Two deliberations leave at least two entries in the history."""
    council = ModelCouncil(brain=None, db_path=TEST_DB)
    for _ in range(2):
        council.deliberate(SAMPLE_TEXT)

    entries = council.get_council_history(limit=10)
    assert len(entries) >= 2
def test_council_confidence_clamped():
    """Every final claim's confidence lies within the closed range [0, 1]."""
    council = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = council.deliberate(SAMPLE_TEXT)

    for claim in outcome.final_claims:
        score = claim["confidence"]
        assert 0.0 <= score <= 1.0
def test_council_invalid_tag_defaulted():
    """Final claims only ever carry one of the four recognised epistemic tags.

    The intent is that invalid tags are defaulted to Interpretation. This test
    does not inject an invalid tag; it only checks that every tag produced by
    the ``brain=None`` path belongs to the allowed set.
    """
    allowed_tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]

    council = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = council.deliberate(SAMPLE_TEXT)

    for claim in outcome.final_claims:
        tag = claim["epistemic_tag"]
        assert tag in allowed_tags
def test_multiple_deliberations_independent():
    """Two deliberations on the same council receive distinct round ids."""
    council = ModelCouncil(brain=None, db_path=TEST_DB)

    first_round = council.deliberate("Text about graphene sensors.")
    second_round = council.deliberate("Text about lithium batteries.")

    assert first_round.round_id != second_round.round_id
if __name__ == "__main__":
    # pytest.main() returns the run's exit status instead of exiting itself.
    # Propagate it so that running this file directly reports failure to the
    # shell or CI rather than always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))
|