phd-research-os-brain / tests /test_council.py
nkshirsa's picture
Add AI Model Council: tests/test_council.py
e5fd9d4 verified
"""
PhD Research OS — AI Model Council Tests
==========================================
Tests for the 4-member council pipeline: Query Planner → Extractor → Critic → Chairman
"""
import os
import sys
import json
import pytest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from phd_research_os.council import (
ModelCouncil, CouncilRound, COUNCIL_PROMPTS
)
from phd_research_os.db import init_db, get_db, create_claim
from phd_research_os.taxonomy import TaxonomyManager
# Path of the throwaway database file used by every test in this module.
# It is handed to init_db, TaxonomyManager and ModelCouncil as the database
# path, and the fixture below deletes it (plus "-wal"/"-shm" variants).
TEST_DB = "test_council.db"
# Sample passage fed to ModelCouncil.deliberate(). It mixes measured results
# ("45 ± 3 mV", "0.8 fM"), an explicit interpretation ("We interpret ...") and
# an explicit hypothesis ("We hypothesize ...").
SAMPLE_TEXT = """
We investigated the sensitivity of graphene field-effect transistor (GFET) biosensors
to cardiac troponin I (cTnI) in phosphate-buffered saline (PBS). Our measurements
show that the Dirac point shifts by 45 ± 3 mV upon binding of 1 pM cTnI in 10 mM PBS
(n=5, p<0.001). The limit of detection was determined to be 0.8 fM using the 3σ/slope
method. We interpret these results as evidence that aptamer-functionalized GFETs can
achieve clinically relevant sensitivity for cardiac biomarker detection. However, ionic
strength effects at physiological conditions (150 mM) significantly reduce sensitivity,
suggesting that a desalting step may be necessary for clinical translation. We hypothesize
that a PEG spacer of 5 kDa molecular weight could mitigate Debye screening while
maintaining binding affinity, though this remains to be experimentally validated.
"""
@pytest.fixture(autouse=True)
def setup_teardown():
    """Give every test a freshly initialised database and delete it afterwards.

    Setup removes any database files left behind by an earlier run (for
    example an interrupted session, or a setup that failed after ``init_db``)
    before calling ``init_db`` and constructing a ``TaxonomyManager`` on
    ``TEST_DB``. Teardown removes the same files again once the test finishes.
    """

    def _remove_db_files():
        # The "-wal"/"-shm" suffixes are presumably SQLite write-ahead-log
        # sidecar files; confirm against phd_research_os.db.
        for suffix in ("", "-wal", "-shm"):
            try:
                os.remove(TEST_DB + suffix)
            except FileNotFoundError:
                # Nothing to clean up for this suffix.
                pass

    # Start from a clean slate so leftovers cannot leak into this test.
    _remove_db_files()
    init_db(TEST_DB)
    # Instantiated only for its effect on the database; the object is unused.
    TaxonomyManager(db_path=TEST_DB)
    yield
    _remove_db_files()
# ============================================================
# Council Prompt Tests
# ============================================================
def test_all_4_council_members_defined():
    """COUNCIL_PROMPTS contains an entry for each of the four council roles."""
    for role in ("query_planner", "extractor", "critic", "chairman"):
        assert role in COUNCIL_PROMPTS
def test_query_planner_prompt_has_json_instruction():
    """The query planner prompt mentions a "JSON array"."""
    planner_prompt = COUNCIL_PROMPTS["query_planner"]
    assert "JSON array" in planner_prompt
def test_extractor_prompt_has_epistemic_tags():
    """The extractor prompt names all four epistemic tags."""
    extractor_prompt = COUNCIL_PROMPTS["extractor"]
    expected_tags = ("Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis")
    for tag in expected_tags:
        assert tag in extractor_prompt
def test_critic_prompt_checks_5_things():
    """The critic prompt contains each of its five review-check phrases."""
    critic_prompt = COUNCIL_PROMPTS["critic"]
    expected_phrases = (
        "Missing important claims",
        "Incorrect epistemic tags",
        "Overly confident",
        "Taxonomy correctness",
        "Missing fields",
    )
    for phrase in expected_phrases:
        assert phrase in critic_prompt
def test_chairman_prompt_has_completeness_penalty():
    """The chairman prompt states the "0.7 completeness penalty"."""
    chairman_prompt = COUNCIL_PROMPTS["chairman"]
    assert "0.7 completeness penalty" in chairman_prompt
def test_chairman_output_schema():
    """The chairman prompt mentions every field name of the output schema."""
    chairman_prompt = COUNCIL_PROMPTS["chairman"]
    for field_name in ("epistemic_tag", "confidence", "missing_fields", "status"):
        assert field_name in chairman_prompt
# ============================================================
# Mock Council Pipeline Tests (no brain)
# ============================================================
def test_council_deliberate_without_brain():
    """deliberate() with brain=None returns a populated CouncilRound."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    assert isinstance(outcome, CouncilRound)
    assert outcome.round_id.startswith("CNCL_")
    # At least one claim must come out of the sample passage.
    assert len(outcome.final_claims) >= 1
    # Both timestamps must be set (truthy).
    assert outcome.started_at
    assert outcome.completed_at
def test_council_produces_valid_claims():
    """Every final claim has the required keys and values within allowed sets."""
    required_keys = ("text", "epistemic_tag", "confidence", "missing_fields", "status")
    allowed_tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    allowed_statuses = ["Complete", "Incomplete"]

    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    for entry in outcome.final_claims:
        # Structural check first, then the value checks.
        for key in required_keys:
            assert key in entry
        assert entry["epistemic_tag"] in allowed_tags
        assert 0.0 <= entry["confidence"] <= 1.0
        assert entry["status"] in allowed_statuses
def test_council_claims_have_consistent_status():
    """A claim that lists missing fields must have status "Incomplete"."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    for entry in outcome.final_claims:
        if not entry["missing_fields"]:
            # No missing fields: this test places no constraint on status.
            continue
        assert entry["status"] == "Incomplete"
def test_council_with_query():
    """Passing a query yields a list-valued query plan and at least one claim."""
    question = "What is the LOD for GFET cTnI sensors?"
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(
        SAMPLE_TEXT, query=question
    )

    assert isinstance(outcome.query_plan, list)
    assert len(outcome.final_claims) >= 1
def test_council_query_planner_standalone():
    """deliberate_query() returns a non-empty list for a plain question."""
    question = "What are the ionic strength effects on GFET sensitivity?"
    planned = ModelCouncil(brain=None, db_path=TEST_DB).deliberate_query(question)

    assert isinstance(planned, list)
    assert len(planned) >= 1
def test_council_metadata():
    """The round metadata carries version, domain and claim-count keys."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)
    expected_keys = (
        "council_version",
        "taxonomy_domain",
        "extractor_claim_count",
        "final_claim_count",
    )
    for key in expected_keys:
        assert key in outcome.metadata
def test_council_critique_structure():
    """A non-empty critique must contain a "feedback" entry.

    The check is skipped when the round's critique is empty/falsy.
    """
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)
    if not outcome.critique:
        return
    assert "feedback" in outcome.critique
def test_council_raw_extraction_preserved():
    """The round exposes its raw extraction as a list."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)
    raw = outcome.raw_extraction
    assert isinstance(raw, list)
def test_council_round_logged_to_db():
    """After a deliberation, the first history entry is that round."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = panel.deliberate(SAMPLE_TEXT)

    logged_rounds = panel.get_council_history()
    assert len(logged_rounds) >= 1
    first_entry = logged_rounds[0]
    assert first_entry["round_id"] == outcome.round_id
def test_council_history_retrieval():
    """Two deliberations produce at least two history entries."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    for _ in range(2):
        panel.deliberate(SAMPLE_TEXT)

    logged_rounds = panel.get_council_history(limit=10)
    assert len(logged_rounds) >= 2
def test_council_confidence_clamped():
    """No final claim has a confidence outside the closed range [0.0, 1.0]."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)
    out_of_range = [
        entry
        for entry in outcome.final_claims
        if not (0.0 <= entry["confidence"] <= 1.0)
    ]
    assert not out_of_range
def test_council_invalid_tag_defaulted():
    """Every final claim carries one of the four recognised epistemic tags.

    Intended to cover the rule that invalid tags are defaulted to
    Interpretation. NOTE(review): no invalid tag is injected here, so the
    test only confirms that the tags which do come out are valid.
    """
    recognised_tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    # Per the original author, the mock produces "Interpretation" by default.
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)
    for entry in outcome.final_claims:
        assert entry["epistemic_tag"] in recognised_tags
def test_multiple_deliberations_independent():
    """Two deliberations on the same council get distinct round ids."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    first_round = panel.deliberate("Text about graphene sensors.")
    second_round = panel.deliberate("Text about lithium batteries.")
    assert first_round.round_id != second_round.round_id
# Allow running this file directly: hand it to pytest in verbose mode.
if __name__ == "__main__":
    pytest.main(args=[__file__, "-v"])