Spaces:

TheLinconX
/

contextforge-demo

Sleeping

contextforge-demo / tests /test_jcr_gate.py

Pablo

feat: V6.0 — TokenDance Master-Mirror storage, JCR Safety Gate (INV-15), AITER ROCm config. 15/15 PASS

d9c2197 2 days ago

7.22 kB

	"""Tests for JCRSafetyGate.

	Covers:
	- Risk score computation across the role / candidate / shuffle / reuse axes
	- INV-15: Critic with risk > threshold ALWAYS uses dense prefill
	- Non-judge roles never trigger dense fallback
	- gate_decision logging + summary stats
	- Edge case: invalid args
	"""
	from __future__ import annotations

	import pytest

	from apohara_context_forge.safety.jcr_gate import (
	JCRDecision,
	JCRSafetyGate,
	)


	class TestJCRSafetyGateDefaults:
	    """Construction-time behaviour of JCRSafetyGate."""

	    def test_default_threshold(self):
	        """A gate built without arguments exposes jcr_threshold == 0.7."""
	        assert JCRSafetyGate().jcr_threshold == 0.7

	    def test_invalid_threshold_rejected(self):
	        """Constructing with 1.5 or -0.1 raises ValueError ("must be in")."""
	        # Same order as before: the too-large value first, then the negative.
	        for bad_threshold in (1.5, -0.1):
	            with pytest.raises(ValueError, match="must be in"):
	                JCRSafetyGate(jcr_threshold=bad_threshold)


	class TestJCRRiskComputation:
	    """Checks compute_jcr_risk along each axis the score depends on."""

	    def test_critic_base_risk(self):
	        """Critic, 2 candidates, reuse 0.5, no shuffle scores 0.6."""
	        score = JCRSafetyGate().compute_jcr_risk("critic", 2, 0.5, False)
	        assert score == pytest.approx(0.6)

	    def test_non_critic_base_risk(self):
	        """Retriever with the same inputs scores 0.1."""
	        score = JCRSafetyGate().compute_jcr_risk("retriever", 2, 0.5, False)
	        assert score == pytest.approx(0.1)

	    def test_extra_candidates_increase_risk(self):
	        """Going from 2 to 5 candidates adds 0.3 to the critic score."""
	        gate = JCRSafetyGate()
	        with_two = gate.compute_jcr_risk(
	            agent_role="critic",
	            candidate_count=2,
	            reuse_rate=0.0,
	            layout_shuffled=False,
	        )
	        with_five = gate.compute_jcr_risk(
	            agent_role="critic",
	            candidate_count=5,
	            reuse_rate=0.0,
	            layout_shuffled=False,
	        )
	        assert with_five == pytest.approx(with_two + 0.3)

	    def test_layout_shuffled_increases_risk(self):
	        """Setting layout_shuffled=True adds 0.2 to the critic score."""
	        gate = JCRSafetyGate()
	        unshuffled = gate.compute_jcr_risk(
	            agent_role="critic",
	            candidate_count=2,
	            reuse_rate=0.0,
	            layout_shuffled=False,
	        )
	        reordered = gate.compute_jcr_risk(
	            agent_role="critic",
	            candidate_count=2,
	            reuse_rate=0.0,
	            layout_shuffled=True,
	        )
	        assert reordered == pytest.approx(unshuffled + 0.2)

	    def test_high_reuse_rate_increases_risk(self):
	        """Raising reuse_rate from 0.5 to 0.95 adds 0.15 to the critic score."""
	        gate = JCRSafetyGate()
	        moderate_reuse = gate.compute_jcr_risk(
	            agent_role="critic",
	            candidate_count=2,
	            reuse_rate=0.5,
	            layout_shuffled=False,
	        )
	        heavy_reuse = gate.compute_jcr_risk(
	            agent_role="critic",
	            candidate_count=2,
	            reuse_rate=0.95,
	            layout_shuffled=False,
	        )
	        assert heavy_reuse == pytest.approx(moderate_reuse + 0.15)

	    def test_risk_clamped_to_one(self):
	        """With every input maxed out the score stays in [0, 1] and equals 1.0."""
	        score = JCRSafetyGate().compute_jcr_risk("critic", 20, 1.0, True)
	        assert 0.0 <= score <= 1.0
	        assert score == pytest.approx(1.0)

	    def test_invalid_candidate_count_rejected(self):
	        """candidate_count=-1 raises ValueError mentioning "non-negative"."""
	        gate = JCRSafetyGate()
	        with pytest.raises(ValueError, match="non-negative"):
	            gate.compute_jcr_risk(
	                agent_role="critic",
	                candidate_count=-1,
	                reuse_rate=0.5,
	                layout_shuffled=False,
	            )

	    def test_invalid_reuse_rate_rejected(self):
	        """reuse_rate=1.5 raises ValueError mentioning "reuse_rate must be"."""
	        gate = JCRSafetyGate()
	        with pytest.raises(ValueError, match="reuse_rate must be"):
	            gate.compute_jcr_risk(
	                agent_role="critic",
	                candidate_count=2,
	                reuse_rate=1.5,
	                layout_shuffled=False,
	            )


	class TestINV15CriticAlwaysDense:
	    """INV-15: Critic with risk > threshold ALWAYS returns use_dense=True."""

	    def test_critic_5_candidates_shuffle_uses_dense(self):
	        """A critic with 5 shuffled candidates is routed to dense prefill."""
	        gate = JCRSafetyGate()
	        # Risk = 0.6 + 0.3 + 0.2 = 1.1 → clamped to 1.0 → > 0.7
	        assert gate.should_use_dense_prefill(
	            agent_role="critic",
	            candidate_count=5,
	            reuse_rate=0.5,
	            layout_shuffled=True,
	        ) is True

	    def test_retriever_2_candidates_no_dense(self):
	        """A retriever with baseline inputs is not routed to dense prefill."""
	        gate = JCRSafetyGate()
	        assert gate.should_use_dense_prefill(
	            agent_role="retriever",
	            candidate_count=2,
	            reuse_rate=0.5,
	            layout_shuffled=False,
	        ) is False

	    def test_non_critic_never_uses_dense_even_with_high_risk(self):
	        """Non-judge roles aren't protected by INV-15."""
	        gate = JCRSafetyGate()
	        # Even with all risk knobs cranked up, a retriever passes through.
	        assert gate.should_use_dense_prefill(
	            agent_role="retriever",
	            candidate_count=10,
	            reuse_rate=1.0,
	            layout_shuffled=True,
	        ) is False

	    @pytest.mark.parametrize("candidates,shuffle,reuse", [
	        (5, True, 0.9),
	        (4, True, 0.85),
	        (8, False, 0.85),
	        (10, True, 0.5),
	    ])
	    def test_critic_above_threshold_always_dense(self, candidates, shuffle, reuse):
	        """Comprehensive sweep: Critic above threshold always dense (INV-15).

	        Every parametrized case is meant to score above the default
	        threshold. That precondition is asserted explicitly so the test
	        cannot pass vacuously if risk scoring regresses to a low value.
	        """
	        gate = JCRSafetyGate()
	        decision = gate.gate_decision(
	            agent_role="critic",
	            candidate_count=candidates,
	            reuse_rate=reuse,
	            layout_shuffled=shuffle,
	        )
	        # Previously the INV-15 assertion sat behind an `if`, so a case that
	        # scored at or below the threshold was silently skipped.
	        assert decision.risk_score > gate.jcr_threshold, (
	            f"sweep case did not exceed threshold: risk {decision.risk_score} "
	            f"<= threshold {gate.jcr_threshold}, INV-15 was not exercised"
	        )
	        assert decision.use_dense is True, (
	            f"INV-15 violated: critic with risk {decision.risk_score} "
	            f"> threshold {gate.jcr_threshold} did not get dense prefill"
	        )

	    def test_critic_exactly_at_threshold_uses_reuse(self):
	        """Threshold is strict: > threshold triggers dense, not >=."""
	        gate = JCRSafetyGate(jcr_threshold=0.6)
	        # Critic, 2 candidates, no shuffle, low reuse → exactly 0.6
	        decision = gate.gate_decision(
	            agent_role="critic",
	            candidate_count=2,
	            reuse_rate=0.5,
	            layout_shuffled=False,
	        )
	        assert decision.risk_score == pytest.approx(0.6)
	        assert decision.use_dense is False


	class TestGateDecisionLogging:
	    """Checks the JCRDecision record, the gate log and summary()."""

	    def test_gate_decision_returns_structured_record(self):
	        """gate_decision returns a JCRDecision with the expected fields set."""
	        record = JCRSafetyGate().gate_decision(
	            agent_role="critic",
	            candidate_count=5,
	            reuse_rate=0.9,
	            layout_shuffled=True,
	        )
	        assert isinstance(record, JCRDecision)
	        assert record.agent_role == "critic"
	        assert record.use_dense is True
	        assert "INV-15" in record.reason
	        assert record.timestamp > 0

	    def test_log_accumulates(self):
	        """Each gate_decision call adds one entry to gate_log."""
	        gate = JCRSafetyGate()
	        critic_calls = 3
	        for _ in range(critic_calls):
	            gate.gate_decision(
	                agent_role="critic",
	                candidate_count=5,
	                reuse_rate=0.9,
	                layout_shuffled=True,
	            )
	        gate.gate_decision(
	            agent_role="retriever",
	            candidate_count=2,
	            reuse_rate=0.1,
	            layout_shuffled=False,
	        )
	        # Three critic calls plus the single retriever call.
	        assert len(gate.gate_log) == critic_calls + 1

	    def test_summary_aggregates(self):
	        """summary() reports totals, dense count, critic dense rate, avg risk."""
	        gate = JCRSafetyGate()
	        # First call is expected to go dense; the other two reuse.
	        gate.gate_decision(
	            agent_role="critic",
	            candidate_count=5,
	            reuse_rate=0.9,
	            layout_shuffled=True,
	        )
	        gate.gate_decision(
	            agent_role="critic",
	            candidate_count=2,
	            reuse_rate=0.1,
	            layout_shuffled=False,
	        )
	        gate.gate_decision(
	            agent_role="retriever",
	            candidate_count=2,
	            reuse_rate=0.1,
	            layout_shuffled=False,
	        )
	        stats = gate.summary()
	        assert stats["total_decisions"] == 3
	        assert stats["dense_fallback_count"] == 1
	        # 2 critic decisions, 1 dense → 0.5
	        assert stats["critic_dense_rate"] == pytest.approx(0.5)
	        assert 0.0 <= stats["avg_risk_score"] <= 1.0

	    def test_summary_empty_safe(self):
	        """summary() on a gate with no decisions returns zeros for all keys."""
	        stats = JCRSafetyGate().summary()
	        assert stats["total_decisions"] == 0
	        assert stats["dense_fallback_count"] == 0
	        assert stats["avg_risk_score"] == 0.0
	        assert stats["critic_dense_rate"] == 0.0

	    def test_role_case_insensitive(self):
	        """An upper-case "CRITIC" role is recorded as "critic" and goes dense."""
	        # Upper-case role still resolves to "critic".
	        record = JCRSafetyGate().gate_decision(
	            agent_role="CRITIC",
	            candidate_count=5,
	            reuse_rate=0.9,
	            layout_shuffled=True,
	        )
	        assert record.agent_role == "critic"
	        assert record.use_dense is True