Spaces:
Sleeping
Sleeping
Pablo
feat: V6.0 — TokenDance Master-Mirror storage, JCR Safety Gate (INV-15), AITER ROCm config. 15/15 PASS
"""Tests for JCRSafetyGate.

Covers:
- Risk score computation across the role / candidate / shuffle / reuse axes
- INV-15: Critic with risk > threshold ALWAYS uses dense prefill
- Non-judge roles never trigger dense fallback
- gate_decision logging + summary stats
- Edge case: invalid args
"""
| from __future__ import annotations | |
| import pytest | |
| from apohara_context_forge.safety.jcr_gate import ( | |
| JCRDecision, | |
| JCRSafetyGate, | |
| ) | |
class TestJCRSafetyGateDefaults:
    """Constructor behaviour of JCRSafetyGate: default threshold and validation."""

    def test_default_threshold(self):
        """A gate built without arguments exposes ``jcr_threshold == 0.7``."""
        assert JCRSafetyGate().jcr_threshold == 0.7

    def test_invalid_threshold_rejected(self):
        """The thresholds 1.5 and -0.1 are each rejected with ValueError."""
        for out_of_range in (1.5, -0.1):
            with pytest.raises(ValueError, match="must be in"):
                JCRSafetyGate(jcr_threshold=out_of_range)
class TestJCRRiskComputation:
    """Exercise ``compute_jcr_risk``: expected scores, increments, clamping, validation."""

    def test_critic_base_risk(self):
        """Critic, 2 candidates, reuse 0.5, unshuffled -> approximately 0.6."""
        score = JCRSafetyGate().compute_jcr_risk("critic", 2, 0.5, False)
        assert score == pytest.approx(0.6)

    def test_non_critic_base_risk(self):
        """Retriever with the same inputs -> approximately 0.1."""
        score = JCRSafetyGate().compute_jcr_risk("retriever", 2, 0.5, False)
        assert score == pytest.approx(0.1)

    def test_extra_candidates_increase_risk(self):
        """Going from 2 to 5 candidates adds approximately 0.3 to the score."""
        gate = JCRSafetyGate()
        with_two, with_five = (
            gate.compute_jcr_risk(
                agent_role="critic",
                candidate_count=count,
                reuse_rate=0.0,
                layout_shuffled=False,
            )
            for count in (2, 5)
        )
        assert with_five == pytest.approx(with_two + 0.3)

    def test_layout_shuffled_increases_risk(self):
        """Turning on layout shuffling adds approximately 0.2 to the score."""
        gate = JCRSafetyGate()
        unshuffled, shuffled = (
            gate.compute_jcr_risk(
                agent_role="critic",
                candidate_count=2,
                reuse_rate=0.0,
                layout_shuffled=flag,
            )
            for flag in (False, True)
        )
        assert shuffled == pytest.approx(unshuffled + 0.2)

    def test_high_reuse_rate_increases_risk(self):
        """Raising reuse from 0.5 to 0.95 adds approximately 0.15 to the score."""
        gate = JCRSafetyGate()
        moderate, heavy = (
            gate.compute_jcr_risk(
                agent_role="critic",
                candidate_count=2,
                reuse_rate=rate,
                layout_shuffled=False,
            )
            for rate in (0.5, 0.95)
        )
        assert heavy == pytest.approx(moderate + 0.15)

    def test_risk_clamped_to_one(self):
        """With every axis maxed out the score stays in [0, 1] and equals ~1.0."""
        saturated = JCRSafetyGate().compute_jcr_risk("critic", 20, 1.0, True)
        assert 0.0 <= saturated <= 1.0
        assert saturated == pytest.approx(1.0)

    def test_invalid_candidate_count_rejected(self):
        """A candidate count of -1 raises ValueError mentioning 'non-negative'."""
        gate = JCRSafetyGate()  # built outside the context so only the call is checked
        with pytest.raises(ValueError, match="non-negative"):
            gate.compute_jcr_risk(
                agent_role="critic",
                candidate_count=-1,
                reuse_rate=0.5,
                layout_shuffled=False,
            )

    def test_invalid_reuse_rate_rejected(self):
        """A reuse rate of 1.5 raises ValueError mentioning 'reuse_rate must be'."""
        gate = JCRSafetyGate()  # built outside the context so only the call is checked
        with pytest.raises(ValueError, match="reuse_rate must be"):
            gate.compute_jcr_risk(
                agent_role="critic",
                candidate_count=2,
                reuse_rate=1.5,
                layout_shuffled=False,
            )
class TestINV15CriticAlwaysDense:
    """INV-15: Critic with risk > threshold ALWAYS returns use_dense=True."""

    def test_critic_5_candidates_shuffle_uses_dense(self):
        """A critic with 5 shuffled candidates is routed to dense prefill."""
        gate = JCRSafetyGate()
        # Risk = 0.6 + 0.3 + 0.2 = 1.1 → clamped to 1.0 → > 0.7
        assert gate.should_use_dense_prefill(
            agent_role="critic",
            candidate_count=5,
            reuse_rate=0.5,
            layout_shuffled=True,
        ) is True

    def test_retriever_2_candidates_no_dense(self):
        """A retriever with 2 unshuffled candidates does not get dense prefill."""
        gate = JCRSafetyGate()
        assert gate.should_use_dense_prefill(
            agent_role="retriever",
            candidate_count=2,
            reuse_rate=0.5,
            layout_shuffled=False,
        ) is False

    def test_non_critic_never_uses_dense_even_with_high_risk(self):
        """Non-judge roles aren't protected by INV-15."""
        gate = JCRSafetyGate()
        # Even with all risk knobs cranked up, a retriever passes through.
        assert gate.should_use_dense_prefill(
            agent_role="retriever",
            candidate_count=10,
            reuse_rate=1.0,
            layout_shuffled=True,
        ) is False

    # The test below takes (candidates, shuffle, reuse) but nothing supplied
    # them, so pytest failed at setup with "fixture 'candidates' not found".
    # Stacked parametrize decorators yield the full cross product (4 * 2 * 3 = 24
    # cases). The values are chosen here from those the sibling tests use, so the
    # sweep covers combinations both at/below and above the default threshold.
    @pytest.mark.parametrize("reuse", [0.0, 0.5, 0.95])
    @pytest.mark.parametrize("shuffle", [False, True])
    @pytest.mark.parametrize("candidates", [2, 3, 5, 10])
    def test_critic_above_threshold_always_dense(self, candidates, shuffle, reuse):
        """Comprehensive sweep: Critic above threshold always dense (INV-15).

        For each parameter combination, asks the gate for a decision and, when
        the reported risk score exceeds the gate's threshold, requires that the
        decision selects dense prefill. Combinations at or below the threshold
        are not constrained by this test.
        """
        gate = JCRSafetyGate()
        decision = gate.gate_decision(
            agent_role="critic",
            candidate_count=candidates,
            reuse_rate=reuse,
            layout_shuffled=shuffle,
        )
        if decision.risk_score > gate.jcr_threshold:
            assert decision.use_dense is True, (
                f"INV-15 violated: critic with risk {decision.risk_score} "
                f"> threshold {gate.jcr_threshold} did not get dense prefill"
            )

    def test_critic_exactly_at_threshold_uses_reuse(self):
        """Threshold is strict: > threshold triggers dense, not >=."""
        gate = JCRSafetyGate(jcr_threshold=0.6)
        # Critic, 2 candidates, no shuffle, low reuse → exactly 0.6
        decision = gate.gate_decision(
            agent_role="critic",
            candidate_count=2,
            reuse_rate=0.5,
            layout_shuffled=False,
        )
        assert decision.risk_score == pytest.approx(0.6)
        assert decision.use_dense is False
class TestGateDecisionLogging:
    """Checks on ``gate_decision`` records, the gate log, and ``summary()``."""

    def test_gate_decision_returns_structured_record(self):
        """A high-risk critic call yields a populated JCRDecision."""
        record = JCRSafetyGate().gate_decision(
            agent_role="critic",
            candidate_count=5,
            reuse_rate=0.9,
            layout_shuffled=True,
        )
        assert isinstance(record, JCRDecision)
        assert record.agent_role == "critic"
        assert record.use_dense is True
        assert "INV-15" in record.reason
        assert record.timestamp > 0

    def test_log_accumulates(self):
        """Four decisions leave four entries in ``gate_log``."""
        gate = JCRSafetyGate()
        calls = [("critic", 5, 0.9, True)] * 3 + [("retriever", 2, 0.1, False)]
        for call_args in calls:
            gate.gate_decision(*call_args)
        assert len(gate.gate_log) == 4

    def test_summary_aggregates(self):
        """``summary()`` reports totals and rates over the logged decisions."""
        gate = JCRSafetyGate()
        scenarios = (
            ("critic", 5, 0.9, True),      # dense
            ("critic", 2, 0.1, False),     # reuse
            ("retriever", 2, 0.1, False),  # reuse
        )
        for scenario in scenarios:
            gate.gate_decision(*scenario)
        stats = gate.summary()
        assert stats["total_decisions"] == 3
        assert stats["dense_fallback_count"] == 1
        # 2 critic decisions, 1 dense → 0.5
        assert stats["critic_dense_rate"] == pytest.approx(0.5)
        assert 0.0 <= stats["avg_risk_score"] <= 1.0

    def test_summary_empty_safe(self):
        """With no decisions logged, every summary field checked here is zero."""
        stats = JCRSafetyGate().summary()
        expected_zeros = {
            "total_decisions": 0,
            "dense_fallback_count": 0,
            "avg_risk_score": 0.0,
            "critic_dense_rate": 0.0,
        }
        for key, zero in expected_zeros.items():
            assert stats[key] == zero

    def test_role_case_insensitive(self):
        """An upper-case role is recorded as lower-case "critic" and still goes dense."""
        # Upper-case role still resolves to "critic".
        record = JCRSafetyGate().gate_decision(
            agent_role="CRITIC",
            candidate_count=5,
            reuse_rate=0.9,
            layout_shuffled=True,
        )
        assert record.agent_role == "critic"
        assert record.use_dense is True