File size: 7,198 Bytes
e5fd9d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
"""
PhD Research OS — AI Model Council Tests
==========================================
Tests for the 4-member council pipeline: Query Planner → Extractor → Critic → Chairman
"""

import os
import sys
import json
import pytest

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from phd_research_os.council import (
    ModelCouncil, CouncilRound, COUNCIL_PROMPTS
)
from phd_research_os.db import init_db, get_db, create_claim
from phd_research_os.taxonomy import TaxonomyManager

# Path of the throwaway database file used by every test in this module; the
# autouse fixture initialises it before each test and removes it (together
# with any "-wal" / "-shm" companion files) afterwards.
TEST_DB = "test_council.db"

# Sample passage handed to ModelCouncil.deliberate() by the pipeline tests.
# It contains measured results, an explicit interpretation ("We interpret ...")
# and an unvalidated hypothesis ("We hypothesize ...").
SAMPLE_TEXT = """
We investigated the sensitivity of graphene field-effect transistor (GFET) biosensors 
to cardiac troponin I (cTnI) in phosphate-buffered saline (PBS). Our measurements 
show that the Dirac point shifts by 45 ± 3 mV upon binding of 1 pM cTnI in 10 mM PBS 
(n=5, p<0.001). The limit of detection was determined to be 0.8 fM using the 3σ/slope 
method. We interpret these results as evidence that aptamer-functionalized GFETs can 
achieve clinically relevant sensitivity for cardiac biomarker detection. However, ionic 
strength effects at physiological conditions (150 mM) significantly reduce sensitivity, 
suggesting that a desalting step may be necessary for clinical translation. We hypothesize 
that a PEG spacer of 5 kDa molecular weight could mitigate Debye screening while 
maintaining binding affinity, though this remains to be experimentally validated.
"""


@pytest.fixture(autouse=True)
def setup_teardown():
    """Initialise the test database before each test and delete its files afterwards.

    Setup calls ``init_db`` on ``TEST_DB`` and constructs a ``TaxonomyManager``
    against the same path. Teardown removes the database file and any
    ``-wal`` / ``-shm`` companions that exist on disk.
    """
    init_db(TEST_DB)
    TaxonomyManager(db_path=TEST_DB)

    yield

    # Candidate paths are generated lazily so each one is checked and removed
    # in turn, in the order: main file, "-wal", "-shm".
    candidates = (TEST_DB + suffix for suffix in ("", "-wal", "-shm"))
    for path in filter(os.path.exists, candidates):
        os.remove(path)


# ============================================================
# Council Prompt Tests
# ============================================================

def test_all_4_council_members_defined():
    """Every one of the four council roles must have an entry in COUNCIL_PROMPTS."""
    for member in ("query_planner", "extractor", "critic", "chairman"):
        assert member in COUNCIL_PROMPTS

def test_query_planner_prompt_has_json_instruction():
    """The query planner prompt must mention a "JSON array"."""
    planner_prompt = COUNCIL_PROMPTS["query_planner"]
    assert "JSON array" in planner_prompt

def test_extractor_prompt_has_epistemic_tags():
    """The extractor prompt must name all four epistemic tags."""
    extractor_prompt = COUNCIL_PROMPTS["extractor"]
    expected_tags = ("Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis")
    for tag in expected_tags:
        assert tag in extractor_prompt

def test_critic_prompt_checks_5_things():
    """The critic prompt must list each of its five review criteria."""
    critic_prompt = COUNCIL_PROMPTS["critic"]
    criteria = (
        "Missing important claims",
        "Incorrect epistemic tags",
        "Overly confident",
        "Taxonomy correctness",
        "Missing fields",
    )
    for criterion in criteria:
        assert criterion in critic_prompt

def test_chairman_prompt_has_completeness_penalty():
    """The chairman prompt must state the 0.7 completeness penalty."""
    chairman_prompt = COUNCIL_PROMPTS["chairman"]
    assert "0.7 completeness penalty" in chairman_prompt

def test_chairman_output_schema():
    """The chairman prompt must mention every field of the output claim schema."""
    chairman_prompt = COUNCIL_PROMPTS["chairman"]
    for field_name in ("epistemic_tag", "confidence", "missing_fields", "status"):
        assert field_name in chairman_prompt


# ============================================================
# Mock Council Pipeline Tests (no brain)
# ============================================================

def test_council_deliberate_without_brain():
    """With ``brain=None`` a deliberation still yields a populated CouncilRound."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    assert isinstance(outcome, CouncilRound)
    assert outcome.round_id.startswith("CNCL_")

    claim_count = len(outcome.final_claims)
    assert claim_count >= 1

    # Both timestamps must be set (truthy) once the round has finished.
    assert outcome.started_at
    assert outcome.completed_at

def test_council_produces_valid_claims():
    """Each final claim carries the required keys with values in the allowed ranges."""
    required_keys = ("text", "epistemic_tag", "confidence", "missing_fields", "status")
    allowed_tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    allowed_statuses = ["Complete", "Incomplete"]

    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    for entry in outcome.final_claims:
        for key in required_keys:
            assert key in entry
        assert entry["epistemic_tag"] in allowed_tags
        assert 0.0 <= entry["confidence"] <= 1.0
        assert entry["status"] in allowed_statuses

def test_council_claims_have_consistent_status():
    """A claim that reports missing fields must be marked "Incomplete"."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    for entry in outcome.final_claims:
        if not entry["missing_fields"]:
            # Nothing missing: this test places no constraint on the status.
            continue
        assert entry["status"] == "Incomplete"

def test_council_with_query():
    """Passing a query yields a list-valued query plan and at least one claim."""
    question = "What is the LOD for GFET cTnI sensors?"
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(
        SAMPLE_TEXT, query=question
    )

    assert isinstance(outcome.query_plan, list)
    claim_count = len(outcome.final_claims)
    assert claim_count >= 1

def test_council_query_planner_standalone():
    """``deliberate_query`` on its own returns a non-empty list."""
    question = "What are the ionic strength effects on GFET sensitivity?"
    planned = ModelCouncil(brain=None, db_path=TEST_DB).deliberate_query(question)

    assert isinstance(planned, list)
    assert len(planned) >= 1

def test_council_metadata():
    """The round metadata exposes version, domain and both claim counts."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    expected_keys = (
        "council_version",
        "taxonomy_domain",
        "extractor_claim_count",
        "final_claim_count",
    )
    for key in expected_keys:
        assert key in outcome.metadata

def test_council_critique_structure():
    """When a critique is present it must contain a "feedback" entry."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    if not outcome.critique:
        # An absent or empty critique is accepted; there is nothing to inspect.
        return
    assert "feedback" in outcome.critique

def test_council_raw_extraction_preserved():
    """The round keeps the extractor's raw output as a list."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)
    raw = outcome.raw_extraction

    assert isinstance(raw, list)

def test_council_round_logged_to_db():
    """A completed round appears first in the council history."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = panel.deliberate(SAMPLE_TEXT)

    records = panel.get_council_history()
    assert len(records) >= 1

    first_record = records[0]
    assert first_record["round_id"] == outcome.round_id

def test_council_history_retrieval():
    """Two deliberations leave at least two entries in the retrievable history."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    for _ in range(2):
        panel.deliberate(SAMPLE_TEXT)

    records = panel.get_council_history(limit=10)
    assert len(records) >= 2

def test_council_confidence_clamped():
    """Every final claim's confidence lies within the closed interval [0, 1]."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    for entry in outcome.final_claims:
        score = entry["confidence"]
        assert 0.0 <= score <= 1.0

def test_council_invalid_tag_defaulted():
    """Final claims only ever carry one of the four recognised epistemic tags.

    No invalid tag is injected here; the test only checks that whatever the
    brain-less council emits ends up inside the allowed tag set.
    """
    allowed_tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    for entry in outcome.final_claims:
        assert entry["epistemic_tag"] in allowed_tags

def test_multiple_deliberations_independent():
    """Two deliberations on the same council receive distinct round ids."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    passages = ("Text about graphene sensors.", "Text about lithium batteries.")

    first, second = [panel.deliberate(passage) for passage in passages]

    assert first.round_id != second.round_id


if __name__ == "__main__":
    # Direct invocation: run only this file in verbose mode and hand pytest's
    # exit status to the shell, so a failing run is not reported as success.
    raise SystemExit(pytest.main([__file__, "-v"]))