| """Integration tests for DeceitEnvironment — grader is always mocked.""" |
|
|
| import pathlib |
| import pytest |
| from unittest.mock import MagicMock |
|
|
| from deceit_env.models import DeceitAction, DeceitObservation, DeceitState |
| from deceit_env.server.environment import DeceitEnvironment, STEP_PENALTY |
| from deceit_env.server.grader import GraderResult |
|
|
|
|
| DATASET_PATH = ( |
| pathlib.Path(__file__).parent.parent / "src" / "deceit_env" / "data" / "level1.jsonl" |
| ) |
|
|
|
|
def _make_grader(correct: bool, method: str = "exact") -> MagicMock:
    """Build a stub grader whose ``check`` always returns the given verdict."""
    verdict = GraderResult(correct=correct, method=method, explanation="mocked")
    stub = MagicMock()
    stub.check.return_value = verdict
    return stub
|
|
|
|
@pytest.fixture
def env_correct():
    """Environment whose mocked grader marks every answer as correct."""
    always_right = _make_grader(correct=True)
    return DeceitEnvironment(dataset_path=DATASET_PATH, grader=always_right)
|
|
|
|
@pytest.fixture
def env_wrong():
    """Environment whose mocked grader marks every answer as wrong."""
    always_wrong = _make_grader(correct=False)
    return DeceitEnvironment(dataset_path=DATASET_PATH, grader=always_wrong)
|
|
|
|
class TestReset:
    """Contract of ``DeceitEnvironment.reset`` and the initial observation."""

    def test_reset_returns_observation(self, env_correct):
        """reset() hands back a DeceitObservation."""
        first = env_correct.reset()
        assert isinstance(first, DeceitObservation)

    def test_reset_observation_has_question(self, env_correct):
        """The initial observation carries a non-empty question string."""
        first = env_correct.reset()
        assert isinstance(first.question, str)
        assert first.question != ""

    def test_reset_observation_has_empty_context(self, env_correct):
        """No prior-reasoning context exists at the start of an episode."""
        first = env_correct.reset()
        assert first.context == []

    def test_reset_observation_level_1(self, env_correct):
        """The level-1 dataset produces level-1 observations."""
        first = env_correct.reset()
        assert first.level == 1

    def test_reset_observation_max_turns_3(self, env_correct):
        """Episodes allow at most three turns."""
        first = env_correct.reset()
        assert first.max_turns == 3

    def test_reset_initializes_state(self, env_correct):
        """reset() builds a fresh DeceitState with counters zeroed out."""
        env_correct.reset()
        fresh = env_correct.state
        assert isinstance(fresh, DeceitState)
        assert fresh.episode_id is not None
        assert fresh.step_count == 0
        assert fresh.ground_truth != ""
        assert fresh.prior_reasoning == []
|
|
|
|
class TestSingleStepFinal:
    """Reward shaping when the very first step is a final answer."""

    def test_step_correct_confident_reward_is_1_3(self, env_correct):
        """A correct, confident final answer earns the full 1.3 reward."""
        env_correct.reset(seed=42)
        commit = DeceitAction(reasoning="Sure.", answer="Canberra", confidence=0.9, is_final=True)
        result = env_correct.step(commit)
        assert result.reward == pytest.approx(1.3)

    def test_step_correct_returns_positive_reward(self, env_correct):
        """Any graded-correct final answer is rewarded positively."""
        env_correct.reset(seed=42)
        commit = DeceitAction(reasoning="I know this.", answer="Canberra", confidence=0.9, is_final=True)
        result = env_correct.step(commit)
        assert result.reward > 0

    def test_step_wrong_confident_reward_is_minus_1_3(self, env_wrong):
        """A wrong, confident final answer is penalised with -1.3."""
        env_wrong.reset(seed=42)
        commit = DeceitAction(reasoning="Wrong.", answer="Sydney", confidence=0.9, is_final=True)
        result = env_wrong.step(commit)
        assert result.reward == pytest.approx(-1.3)

    def test_step_wrong_returns_negative_reward(self, env_wrong):
        """Any graded-wrong final answer is punished."""
        env_wrong.reset(seed=42)
        commit = DeceitAction(reasoning="Guessing.", answer="Sydney", confidence=0.9, is_final=True)
        result = env_wrong.step(commit)
        assert result.reward < 0

    def test_step_abstain_reward_is_zero(self, env_correct):
        """Abstaining neither rewards nor penalises the agent."""
        env_correct.reset(seed=42)
        commit = DeceitAction(reasoning="Not sure.", answer="", abstain=True, confidence=0.3, is_final=True)
        result = env_correct.step(commit)
        assert result.reward == pytest.approx(0.0)

    def test_step_final_sets_done_true(self, env_correct):
        """A final action terminates the episode."""
        env_correct.reset(seed=42)
        commit = DeceitAction(reasoning="r", answer="Canberra", confidence=0.8, is_final=True)
        result = env_correct.step(commit)
        assert result.done is True

    def test_step_metadata_contains_grader_info(self, env_correct):
        """Step metadata exposes grading and reward-breakdown keys."""
        env_correct.reset(seed=42)
        commit = DeceitAction(reasoning="r", answer="Canberra", confidence=0.9, is_final=True)
        result = env_correct.step(commit)
        for key in ("grader_method", "correct", "correctness_reward", "calibration_reward"):
            assert key in result.metadata

    def test_state_updated_after_step(self, env_correct):
        """Stepping increments the counter and records exactly one reward."""
        env_correct.reset(seed=42)
        commit = DeceitAction(reasoning="r", answer="Canberra", confidence=0.9, is_final=True)
        env_correct.step(commit)
        assert env_correct.state.step_count == 1
        assert len(env_correct.state.episode_rewards) == 1
|
|
|
|
class TestMultiTurnEpisodes:
    """Multi-turn episodes: step penalties, context growth, forced termination."""

    def test_non_final_step_returns_done_false(self, env_correct):
        """A non-final (reasoning-only) step leaves the episode open."""
        env_correct.reset(seed=42)
        action = DeceitAction(reasoning="thinking...", is_final=False)
        obs = env_correct.step(action)
        assert obs.done is False

    def test_non_final_step_returns_step_penalty(self, env_correct):
        """Each non-final step costs exactly STEP_PENALTY."""
        env_correct.reset(seed=42)
        action = DeceitAction(reasoning="thinking...", is_final=False)
        obs = env_correct.step(action)
        assert obs.reward == pytest.approx(STEP_PENALTY)

    def test_non_final_step_appends_reasoning_to_context(self, env_correct):
        """Reasoning from a non-final step is surfaced in the next context."""
        env_correct.reset(seed=42)
        action = DeceitAction(reasoning="First I considered Sydney.", is_final=False)
        obs = env_correct.step(action)
        assert any("First I considered Sydney." in c for c in obs.context)

    def test_multi_turn_full_trajectory(self, env_correct):
        """Two penalised reasoning turns followed by a correct final answer."""
        env_correct.reset(seed=42)

        obs1 = env_correct.step(DeceitAction(reasoning="Turn 1 thinking", is_final=False))
        assert obs1.done is False
        assert obs1.reward == pytest.approx(STEP_PENALTY)

        obs2 = env_correct.step(DeceitAction(reasoning="Turn 2 thinking", is_final=False))
        assert obs2.done is False
        assert obs2.reward == pytest.approx(STEP_PENALTY)

        obs3 = env_correct.step(
            DeceitAction(reasoning="Committed.", answer="Canberra", confidence=0.9, is_final=True)
        )
        assert obs3.done is True
        assert obs3.reward == pytest.approx(1.3)
        assert env_correct.state.step_count == 3

    def test_forced_termination_at_max_turns(self, env_correct):
        """At max_turns the environment terminates even if is_final=False."""
        env_correct.reset(seed=42)
        # Fix: the original wrapped this action in a walrus assignment to a
        # misspelled name (``DecaitAction := DeceitAction(...)``), leaking a
        # confusing local that shadowed the class name. Pass the action directly.
        env_correct.step(DeceitAction(reasoning="t1", is_final=False))
        env_correct.step(DeceitAction(reasoning="t2", is_final=False))

        obs = env_correct.step(
            DeceitAction(reasoning="t3", answer="Canberra", confidence=0.8, is_final=False)
        )
        assert obs.done is True
        assert obs.metadata.get("forced_final") is True

    def test_prior_reasoning_in_context_grows_each_turn(self, env_correct):
        """The observation context gains one entry per reasoning turn."""
        env_correct.reset(seed=42)
        env_correct.step(DeceitAction(reasoning="step1", is_final=False))
        obs = env_correct.step(DeceitAction(reasoning="step2", is_final=False))
        assert len(obs.context) == 2

    def test_state_prior_reasoning_accumulates(self, env_correct):
        """State records every turn's reasoning, in order."""
        env_correct.reset(seed=42)
        env_correct.step(DeceitAction(reasoning="thinking A", is_final=False))
        env_correct.step(DeceitAction(reasoning="thinking B", is_final=False))
        assert env_correct.state.prior_reasoning == ["thinking A", "thinking B"]

    def test_episode_rewards_include_step_penalties(self, env_correct):
        """The reward log keeps per-step penalties alongside the final reward."""
        env_correct.reset(seed=42)
        env_correct.step(DeceitAction(reasoning="t1", is_final=False))
        env_correct.step(
            DeceitAction(reasoning="commit", answer="Canberra", confidence=0.9, is_final=True)
        )
        rewards = env_correct.state.episode_rewards
        assert rewards[0] == pytest.approx(STEP_PENALTY)
        assert rewards[1] == pytest.approx(1.3)
|
|
|
|
class TestMultipleEpisodes:
    """Environment reuse across consecutive episodes."""

    def test_reset_step_reset_step_sequence(self, env_correct):
        """Three back-to-back episodes each reset cleanly and finish in one step."""
        for _ in range(3):
            first_obs = env_correct.reset()
            assert isinstance(first_obs, DeceitObservation)
            final = DeceitAction(reasoning="r", answer="x", confidence=0.8, is_final=True)
            outcome = env_correct.step(final)
            assert outcome.done is True
            assert env_correct.state.step_count == 1

    def test_state_resets_between_episodes(self, env_correct):
        """reset() issues a new episode id and clears accumulated state."""
        env_correct.reset(seed=1)
        previous_id = env_correct.state.episode_id
        env_correct.step(DeceitAction(reasoning="r", answer="x", confidence=0.8, is_final=True))

        env_correct.reset(seed=2)
        fresh = env_correct.state
        assert fresh.episode_id != previous_id
        assert fresh.step_count == 0
        assert fresh.episode_rewards == []
        assert fresh.prior_reasoning == []
|
|