nkshirsa committed on
Commit
e5fd9d4
·
verified ·
1 Parent(s): 9ce4ec4

Add AI Model Council: tests/test_council.py

Browse files
Files changed (1) hide show
  1. tests/test_council.py +197 -0
tests/test_council.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PhD Research OS — AI Model Council Tests
3
+ ==========================================
4
+ Tests for the 4-member council pipeline: Query Planner → Extractor → Critic → Chairman
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import json
10
+ import pytest
11
+
12
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+
14
+ from phd_research_os.council import (
15
+ ModelCouncil, CouncilRound, COUNCIL_PROMPTS
16
+ )
17
+ from phd_research_os.db import init_db, get_db, create_claim
18
+ from phd_research_os.taxonomy import TaxonomyManager
19
+
20
# File name of the throwaway database that the autouse fixture initialises
# before each test and deletes afterwards.
TEST_DB = "test_council.db"

# Sample passage handed to ModelCouncil.deliberate() by the pipeline tests.
# It mixes reported measurements with an explicit interpretation
# ("We interpret ...") and an explicit hypothesis ("We hypothesize ...").
SAMPLE_TEXT = """
We investigated the sensitivity of graphene field-effect transistor (GFET) biosensors
to cardiac troponin I (cTnI) in phosphate-buffered saline (PBS). Our measurements
show that the Dirac point shifts by 45 ± 3 mV upon binding of 1 pM cTnI in 10 mM PBS
(n=5, p<0.001). The limit of detection was determined to be 0.8 fM using the 3σ/slope
method. We interpret these results as evidence that aptamer-functionalized GFETs can
achieve clinically relevant sensitivity for cardiac biomarker detection. However, ionic
strength effects at physiological conditions (150 mM) significantly reduce sensitivity,
suggesting that a desalting step may be necessary for clinical translation. We hypothesize
that a PEG spacer of 5 kDa molecular weight could mitigate Debye screening while
maintaining binding affinity, though this remains to be experimentally validated.
"""
34
+
35
+
36
def _remove_test_db_files():
    """Delete the test database file and its "-wal"/"-shm" companions, if present."""
    # NOTE(review): the "-wal"/"-shm" suffixes look like SQLite write-ahead-log
    # sidecar files -- confirm against phd_research_os.db.
    for suffix in ("", "-wal", "-shm"):
        try:
            os.remove(TEST_DB + suffix)
        except FileNotFoundError:
            # Nothing to clean up for this suffix.
            pass


@pytest.fixture(autouse=True)
def setup_teardown():
    """Give every test a freshly initialised database and remove it afterwards.

    Files left behind by an interrupted run (or by a setup step that raised)
    are removed *before* initialisation so that earlier state cannot leak into
    the current test.
    """
    _remove_test_db_files()
    init_db(TEST_DB)
    TaxonomyManager(db_path=TEST_DB)
    yield
    _remove_test_db_files()
45
+
46
+
47
+ # ============================================================
48
+ # Council Prompt Tests
49
+ # ============================================================
50
+
51
def test_all_4_council_members_defined():
    """Each of the four council roles must have an entry in COUNCIL_PROMPTS."""
    for member in ("query_planner", "extractor", "critic", "chairman"):
        assert member in COUNCIL_PROMPTS
56
+
57
def test_query_planner_prompt_has_json_instruction():
    """The query planner prompt must contain the phrase "JSON array"."""
    planner_prompt = COUNCIL_PROMPTS["query_planner"]
    assert "JSON array" in planner_prompt
59
+
60
def test_extractor_prompt_has_epistemic_tags():
    """The extractor prompt must name all four epistemic tags."""
    extractor_prompt = COUNCIL_PROMPTS["extractor"]
    for tag in ("Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"):
        assert tag in extractor_prompt
66
+
67
def test_critic_prompt_checks_5_things():
    """The critic prompt must list all five review checks."""
    critic_prompt = COUNCIL_PROMPTS["critic"]
    expected_checks = (
        "Missing important claims",
        "Incorrect epistemic tags",
        "Overly confident",
        "Taxonomy correctness",
        "Missing fields",
    )
    for check in expected_checks:
        assert check in critic_prompt
74
+
75
def test_chairman_prompt_has_completeness_penalty():
    """The chairman prompt must mention the "0.7 completeness penalty"."""
    chairman_prompt = COUNCIL_PROMPTS["chairman"]
    assert "0.7 completeness penalty" in chairman_prompt
77
+
78
def test_chairman_output_schema():
    """The chairman prompt must mention every field of the output schema."""
    chairman_prompt = COUNCIL_PROMPTS["chairman"]
    for field_name in ("epistemic_tag", "confidence", "missing_fields", "status"):
        assert field_name in chairman_prompt
84
+
85
+
86
+ # ============================================================
87
+ # Mock Council Pipeline Tests (no brain)
88
+ # ============================================================
89
+
90
def test_council_deliberate_without_brain():
    """Deliberating with brain=None still yields a populated CouncilRound."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    assert isinstance(outcome, CouncilRound)
    assert outcome.round_id.startswith("CNCL_")
    assert len(outcome.final_claims) >= 1
    # Both timestamps only need to be truthy (i.e. set to something).
    assert outcome.started_at
    assert outcome.completed_at
99
+
100
def test_council_produces_valid_claims():
    """Every final claim has the required keys and values in the allowed ranges."""
    required_keys = ("text", "epistemic_tag", "confidence", "missing_fields", "status")
    allowed_tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    allowed_statuses = ["Complete", "Incomplete"]

    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    for claim in outcome.final_claims:
        # Presence checks come first so the value checks below cannot KeyError.
        for key in required_keys:
            assert key in claim
        assert claim["epistemic_tag"] in allowed_tags
        assert 0.0 <= claim["confidence"] <= 1.0
        assert claim["status"] in allowed_statuses
113
+
114
def test_council_claims_have_consistent_status():
    """A claim that reports missing fields must carry the "Incomplete" status."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    for claim in outcome.final_claims:
        # Only the "missing fields => Incomplete" direction is enforced.
        assert not claim["missing_fields"] or claim["status"] == "Incomplete"
121
+
122
def test_council_with_query():
    """Passing a query yields a list-typed query plan and at least one claim."""
    question = "What is the LOD for GFET cTnI sensors?"
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = panel.deliberate(SAMPLE_TEXT, query=question)

    assert isinstance(outcome.query_plan, list)
    assert len(outcome.final_claims) >= 1
128
+
129
def test_council_query_planner_standalone():
    """deliberate_query() on its own returns a non-empty list."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    planned = panel.deliberate_query(
        "What are the ionic strength effects on GFET sensitivity?"
    )
    assert isinstance(planned, list)
    assert len(planned) >= 1
134
+
135
def test_council_metadata():
    """The round metadata exposes the expected bookkeeping keys."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    expected_keys = (
        "council_version",
        "taxonomy_domain",
        "extractor_claim_count",
        "final_claim_count",
    )
    for key in expected_keys:
        assert key in outcome.metadata
143
+
144
def test_council_critique_structure():
    """If a critique was produced, it must contain a "feedback" entry."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    critique = outcome.critique
    # A falsy critique is accepted without further checks.
    assert not critique or "feedback" in critique
150
+
151
def test_council_raw_extraction_preserved():
    """The round keeps its raw extraction as a list."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    assert isinstance(outcome.raw_extraction, list)
156
+
157
def test_council_round_logged_to_db():
    """A completed round appears as the first entry of the council history."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    outcome = panel.deliberate(SAMPLE_TEXT)

    logged_rounds = panel.get_council_history()
    assert len(logged_rounds) >= 1
    first_entry = logged_rounds[0]
    assert first_entry["round_id"] == outcome.round_id
164
+
165
def test_council_history_retrieval():
    """Two deliberations leave at least two entries in the history."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    for _ in range(2):
        panel.deliberate(SAMPLE_TEXT)

    logged_rounds = panel.get_council_history(limit=10)
    assert len(logged_rounds) >= 2
172
+
173
def test_council_confidence_clamped():
    """Every final claim's confidence lies within [0.0, 1.0]."""
    outcome = ModelCouncil(brain=None, db_path=TEST_DB).deliberate(SAMPLE_TEXT)

    assert all(
        0.0 <= claim["confidence"] <= 1.0 for claim in outcome.final_claims
    )
179
+
180
def test_council_invalid_tag_defaulted():
    """Claims with invalid epistemic tags should be defaulted to Interpretation.

    NOTE(review): no invalid tag is injected here; the test only checks that
    every final tag is one of the four allowed values.
    """
    allowed_tags = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    # Mock produces "Interpretation" by default, so this validates the path
    outcome = panel.deliberate(SAMPLE_TEXT)
    assert all(
        claim["epistemic_tag"] in allowed_tags for claim in outcome.final_claims
    )
187
+
188
def test_multiple_deliberations_independent():
    """Two deliberations on the same council receive distinct round ids."""
    panel = ModelCouncil(brain=None, db_path=TEST_DB)
    graphene_round = panel.deliberate("Text about graphene sensors.")
    battery_round = panel.deliberate("Text about lithium batteries.")

    assert graphene_round.round_id != battery_round.round_id
194
+
195
+
196
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly reports
    # failures to the calling shell/CI instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, "-v"]))