# DECEIT — tests/test_environment.py
# Origin: Jayant-Kernel, "Phase 2.5: multi-turn episodes, bug fixes, dataset cleanup"
# (commit 9737348, unverified) — header converted to comments; the raw
# copy-paste residue was not valid Python.
"""Integration tests for DeceitEnvironment — grader is always mocked."""
import pathlib
import pytest
from unittest.mock import MagicMock
from deceit_env.models import DeceitAction, DeceitObservation, DeceitState
from deceit_env.server.environment import DeceitEnvironment, STEP_PENALTY
from deceit_env.server.grader import GraderResult
# Path to the bundled level-1 question dataset (JSONL), resolved relative to
# this test file so the tests pass regardless of the current working directory.
DATASET_PATH = (
    pathlib.Path(__file__).parent.parent / "src" / "deceit_env" / "data" / "level1.jsonl"
)
def _make_grader(correct: bool, method: str = "exact") -> MagicMock:
    """Build a mock grader whose ``check()`` always returns the given verdict.

    The real grader is never exercised in these tests; every call yields a
    fixed :class:`GraderResult` so rewards are fully deterministic.
    """
    verdict = GraderResult(correct=correct, method=method, explanation="mocked")
    mock_grader = MagicMock()
    mock_grader.check.return_value = verdict
    return mock_grader
@pytest.fixture
def env_correct():
    """Environment whose mocked grader marks every answer as correct."""
    grader = _make_grader(correct=True)
    return DeceitEnvironment(dataset_path=DATASET_PATH, grader=grader)
@pytest.fixture
def env_wrong():
    """Environment whose mocked grader marks every answer as incorrect."""
    grader = _make_grader(correct=False)
    return DeceitEnvironment(dataset_path=DATASET_PATH, grader=grader)
class TestReset:
    """reset() must yield a fresh level-1 observation and a clean state."""

    def test_reset_returns_observation(self, env_correct):
        result = env_correct.reset()
        assert isinstance(result, DeceitObservation)

    def test_reset_observation_has_question(self, env_correct):
        question = env_correct.reset().question
        assert isinstance(question, str)
        assert question != ""

    def test_reset_observation_has_empty_context(self, env_correct):
        # A new episode starts with no accumulated reasoning context.
        assert env_correct.reset().context == []

    def test_reset_observation_level_1(self, env_correct):
        assert env_correct.reset().level == 1

    def test_reset_observation_max_turns_3(self, env_correct):
        assert env_correct.reset().max_turns == 3

    def test_reset_initializes_state(self, env_correct):
        env_correct.reset()
        fresh = env_correct.state
        assert isinstance(fresh, DeceitState)
        assert fresh.episode_id is not None
        assert fresh.step_count == 0
        assert fresh.ground_truth != ""
        assert fresh.prior_reasoning == []
class TestSingleStepFinal:
    """Single final step: reward signs/magnitudes, done flag, metadata, state."""

    def test_step_correct_confident_reward_is_1_3(self, env_correct):
        env_correct.reset(seed=42)
        result = env_correct.step(
            DeceitAction(reasoning="Sure.", answer="Canberra", confidence=0.9, is_final=True)
        )
        assert result.reward == pytest.approx(1.3)

    def test_step_correct_returns_positive_reward(self, env_correct):
        env_correct.reset(seed=42)
        result = env_correct.step(
            DeceitAction(reasoning="I know this.", answer="Canberra", confidence=0.9, is_final=True)
        )
        assert result.reward > 0

    def test_step_wrong_confident_reward_is_minus_1_3(self, env_wrong):
        env_wrong.reset(seed=42)
        result = env_wrong.step(
            DeceitAction(reasoning="Wrong.", answer="Sydney", confidence=0.9, is_final=True)
        )
        assert result.reward == pytest.approx(-1.3)

    def test_step_wrong_returns_negative_reward(self, env_wrong):
        env_wrong.reset(seed=42)
        result = env_wrong.step(
            DeceitAction(reasoning="Guessing.", answer="Sydney", confidence=0.9, is_final=True)
        )
        assert result.reward < 0

    def test_step_abstain_reward_is_zero(self, env_correct):
        # Abstaining is reward-neutral even when the grader would have
        # marked the answer correct.
        env_correct.reset(seed=42)
        result = env_correct.step(
            DeceitAction(reasoning="Not sure.", answer="", abstain=True, confidence=0.3, is_final=True)
        )
        assert result.reward == pytest.approx(0.0)

    def test_step_final_sets_done_true(self, env_correct):
        env_correct.reset(seed=42)
        result = env_correct.step(
            DeceitAction(reasoning="r", answer="Canberra", confidence=0.8, is_final=True)
        )
        assert result.done is True

    def test_step_metadata_contains_grader_info(self, env_correct):
        env_correct.reset(seed=42)
        result = env_correct.step(
            DeceitAction(reasoning="r", answer="Canberra", confidence=0.9, is_final=True)
        )
        expected_keys = ("grader_method", "correct", "correctness_reward", "calibration_reward")
        for key in expected_keys:
            assert key in result.metadata

    def test_state_updated_after_step(self, env_correct):
        env_correct.reset(seed=42)
        env_correct.step(
            DeceitAction(reasoning="r", answer="Canberra", confidence=0.9, is_final=True)
        )
        assert env_correct.state.step_count == 1
        assert len(env_correct.state.episode_rewards) == 1
class TestMultiTurnEpisodes:
    """Multi-turn behaviour: step penalties, context growth, forced termination.

    Covers non-final "thinking" steps, the full think→think→commit trajectory,
    and the environment's forced termination once ``max_turns`` is reached.
    """

    def test_non_final_step_returns_done_false(self, env_correct):
        env_correct.reset(seed=42)
        action = DeceitAction(reasoning="thinking...", is_final=False)
        obs = env_correct.step(action)
        assert obs.done is False

    def test_non_final_step_returns_step_penalty(self, env_correct):
        env_correct.reset(seed=42)
        action = DeceitAction(reasoning="thinking...", is_final=False)
        obs = env_correct.step(action)
        assert obs.reward == pytest.approx(STEP_PENALTY)

    def test_non_final_step_appends_reasoning_to_context(self, env_correct):
        env_correct.reset(seed=42)
        action = DeceitAction(reasoning="First I considered Sydney.", is_final=False)
        obs = env_correct.step(action)
        assert any("First I considered Sydney." in c for c in obs.context)

    def test_multi_turn_full_trajectory(self, env_correct):
        # reset → think → think → commit correct → positive final reward
        env_correct.reset(seed=42)
        obs1 = env_correct.step(DeceitAction(reasoning="Turn 1 thinking", is_final=False))
        assert obs1.done is False
        assert obs1.reward == pytest.approx(STEP_PENALTY)
        obs2 = env_correct.step(DeceitAction(reasoning="Turn 2 thinking", is_final=False))
        assert obs2.done is False
        assert obs2.reward == pytest.approx(STEP_PENALTY)
        obs3 = env_correct.step(
            DeceitAction(reasoning="Committed.", answer="Canberra", confidence=0.9, is_final=True)
        )
        assert obs3.done is True
        assert obs3.reward == pytest.approx(1.3)
        assert env_correct.state.step_count == 3

    def test_forced_termination_at_max_turns(self, env_correct):
        # 3 non-final steps — the 3rd hits max_turns and is forced terminal
        # regardless of the is_final flag.
        # FIX: the original first step bound a stray, misspelled walrus name
        # ("DecaitAction := DeceitAction(...)"); the action is now passed
        # directly with no accidental binding.
        env_correct.reset(seed=42)
        env_correct.step(DeceitAction(reasoning="t1", is_final=False))
        env_correct.step(DeceitAction(reasoning="t2", is_final=False))
        obs = env_correct.step(
            DeceitAction(reasoning="t3", answer="Canberra", confidence=0.8, is_final=False)
        )
        assert obs.done is True
        assert obs.metadata.get("forced_final") is True

    def test_prior_reasoning_in_context_grows_each_turn(self, env_correct):
        env_correct.reset(seed=42)
        env_correct.step(DeceitAction(reasoning="step1", is_final=False))
        obs = env_correct.step(DeceitAction(reasoning="step2", is_final=False))
        assert len(obs.context) == 2

    def test_state_prior_reasoning_accumulates(self, env_correct):
        env_correct.reset(seed=42)
        env_correct.step(DeceitAction(reasoning="thinking A", is_final=False))
        env_correct.step(DeceitAction(reasoning="thinking B", is_final=False))
        assert env_correct.state.prior_reasoning == ["thinking A", "thinking B"]

    def test_episode_rewards_include_step_penalties(self, env_correct):
        env_correct.reset(seed=42)
        env_correct.step(DeceitAction(reasoning="t1", is_final=False))
        env_correct.step(
            DeceitAction(reasoning="commit", answer="Canberra", confidence=0.9, is_final=True)
        )
        rewards = env_correct.state.episode_rewards
        assert rewards[0] == pytest.approx(STEP_PENALTY)
        assert rewards[1] == pytest.approx(1.3)
class TestMultipleEpisodes:
    """Back-to-back episodes must not leak state across reset() boundaries."""

    def test_reset_step_reset_step_sequence(self, env_correct):
        for _episode in range(3):
            first_obs = env_correct.reset()
            assert isinstance(first_obs, DeceitObservation)
            final = env_correct.step(
                DeceitAction(reasoning="r", answer="x", confidence=0.8, is_final=True)
            )
            assert final.done is True
            assert env_correct.state.step_count == 1

    def test_state_resets_between_episodes(self, env_correct):
        env_correct.reset(seed=1)
        first_id = env_correct.state.episode_id
        env_correct.step(DeceitAction(reasoning="r", answer="x", confidence=0.8, is_final=True))
        env_correct.reset(seed=2)
        fresh = env_correct.state
        assert fresh.episode_id != first_id
        assert fresh.step_count == 0
        assert fresh.episode_rewards == []
        assert fresh.prior_reasoning == []