Spaces:
Sleeping
Sleeping
| """Tests for the ClaimSense Pro Adjudication Gym. | |
| 34 tests ported verbatim (semantically) from | |
| ``insurance_agent_rl/tests/test_environment.py`` plus 5 new ones that | |
| exercise the VERIFY_PURCHASE verb, the Plaid fallback, the 6-component | |
| score bound, and a constant-policy collapse across the 5-task mix. | |
| """ | |
| import os | |
| import random | |
| import sys | |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) | |
| from server.claims_environment import ( | |
| ClaimsEnvProEnvironment, | |
| Claim, | |
| DENSE_PLAID_DISCREPANCY_BONUS, | |
| TASKS, | |
| TASK_INDEX, | |
| ) | |
| from server.plaid_mock import BankProbeStub, get_bank_probe | |
| from models import ClaimsAction | |
| # ============================================================================= | |
| # 1. Reset (4) | |
| # ============================================================================= | |
class TestEnvironmentReset:
    """Checks on ``ClaimsEnvProEnvironment.reset``: counters, seeding, state wipe."""

    def test_reset_returns_observation(self):
        """A reset on a new environment returns an observation with zeroed counters."""
        env = ClaimsEnvProEnvironment()
        initial = env.reset(seed=42)
        assert initial is not None
        for counter in (
            "step_number",
            "claims_processed",
            "correct_decisions",
            "wrong_decisions",
        ):
            assert getattr(initial, counter) == 0

    def test_reset_deterministic_with_same_seed(self):
        """Two separate environments reset with seed 42 expose the same view."""
        envs, observations = [], []
        for _ in range(2):
            envs.append(ClaimsEnvProEnvironment())
            observations.append(envs[-1].reset(seed=42))
        first, second = observations
        assert first.dashboard == second.dashboard
        assert first.claims_in_queue == second.claims_in_queue

    def test_reset_different_seeds_different_states(self):
        """Resetting one environment with seeds 2 and 9 gives different dashboards."""
        env = ClaimsEnvProEnvironment()
        # Each dashboard is captured before the next reset runs.
        dashboards = [env.reset(seed=seed).dashboard for seed in (2, 9)]
        assert dashboards[0] != dashboards[1]

    def test_reset_clears_previous_state(self):
        """A second reset discards step count, processed claims and the open claim."""
        env = ClaimsEnvProEnvironment()
        env.reset(seed=2)
        for command in ("VIEW_QUEUE", "OPEN_CLAIM 1"):
            env.step(ClaimsAction(message=command))
        fresh = env.reset(seed=2)
        assert fresh.step_number == 0
        assert fresh.claims_processed == 0
        # -1 is the value the observation reports when no claim is open.
        assert fresh.active_claim_id == -1
| # ============================================================================= | |
| # 2. Task config (5) | |
| # ============================================================================= | |
class TestTasks:
    """Sanity checks on the ``TASKS`` configuration table."""

    def test_five_tasks_exist(self):
        """Exactly five tasks are configured."""
        assert len(TASKS) == 5

    def test_task_names(self):
        """The configured task names are exactly the five expected ones."""
        assert set(TASKS.keys()) == {
            "routine_monday",
            "storm_surge",
            "multi_vehicle_pileup",
            "fraud_ring_day",
            "catastrophe_weekend",
        }

    def test_difficulty_progression(self):
        """Every difficulty tier is used by at least one task."""
        tiers_in_use = [spec["difficulty"] for spec in TASKS.values()]
        for tier in ("easy", "medium", "hard", "expert"):
            assert tier in tiers_in_use

    def test_claim_counts_increase(self):
        """The catastrophe task carries more claims than the routine one."""
        routine = TASKS["routine_monday"]["num_claims"]
        catastrophe = TASKS["catastrophe_weekend"]["num_claims"]
        assert catastrophe > routine

    def test_each_task_loads_via_seed(self):
        """Sweeping seeds 1..199 reaches all five task names."""
        env = ClaimsEnvProEnvironment()
        task_names = set()
        seed = 1
        # Stop as soon as all five names have been observed.
        while seed < 200 and len(task_names) != 5:
            task_names.add(env.reset(seed=seed).task_name)
            seed += 1
        assert len(task_names) == 5
| # ============================================================================= | |
| # 3. Commands (13) | |
| # ============================================================================= | |
class TestCommands:
    """One test per command verb, checking the dashboard text each produces."""

    def _setup_env(self, seed=2):
        """Return a new environment that has already been reset with ``seed``."""
        env = ClaimsEnvProEnvironment()
        env.reset(seed=seed)
        return env

    @staticmethod
    def _send(env, *messages):
        """Step ``env`` once per message, in order, and return the last observation."""
        latest = None
        for message in messages:
            latest = env.step(ClaimsAction(message=message))
        return latest

    def test_view_queue(self):
        """VIEW_QUEUE renders the queue and counts as step 1."""
        result = self._send(self._setup_env(), "VIEW_QUEUE")
        assert "CLAIMS QUEUE" in result.dashboard
        assert result.step_number == 1

    def test_open_claim(self):
        """OPEN_CLAIM 1 shows claim #1 and marks it active."""
        result = self._send(self._setup_env(), "OPEN_CLAIM 1")
        assert "CLAIM #1" in result.dashboard
        assert result.active_claim_id == 1

    def test_open_nonexistent_claim(self):
        """Opening an unknown claim id reports 'not found'."""
        result = self._send(self._setup_env(), "OPEN_CLAIM 999")
        assert "not found" in result.dashboard

    def test_review_documents_without_open_claim(self):
        """REVIEW_DOCUMENTS with nothing open reports 'No claim open'."""
        result = self._send(self._setup_env(), "REVIEW_DOCUMENTS")
        assert "No claim open" in result.dashboard

    def test_review_documents(self):
        """REVIEW_DOCUMENTS on an open claim renders the document review."""
        result = self._send(self._setup_env(), "OPEN_CLAIM 1", "REVIEW_DOCUMENTS")
        assert "DOCUMENT REVIEW" in result.dashboard

    def test_check_policy(self):
        """CHECK_POLICY renders a policy check that mentions APPROVE or DENY."""
        result = self._send(self._setup_env(), "OPEN_CLAIM 1", "CHECK_POLICY")
        assert "POLICY CHECK" in result.dashboard
        assert any(word in result.dashboard for word in ("APPROVE", "DENY"))

    def test_investigate_fraud(self):
        """INVESTIGATE_FRAUD renders the investigation and a risk score."""
        result = self._send(self._setup_env(), "OPEN_CLAIM 1", "INVESTIGATE_FRAUD")
        for marker in ("FRAUD INVESTIGATION", "FRAUD RISK SCORE"):
            assert marker in result.dashboard

    def test_approve_claim(self):
        """APPROVE after the full review sequence records one processed claim."""
        result = self._send(
            self._setup_env(),
            "OPEN_CLAIM 1",
            "REVIEW_DOCUMENTS",
            "CHECK_POLICY",
            "INVESTIGATE_FRAUD",
            "APPROVE 10000",
        )
        assert "DECISION" in result.dashboard
        assert "APPROVED" in result.dashboard
        assert result.claims_processed == 1

    def test_deny_claim(self):
        """DENY straight after opening records one processed claim."""
        result = self._send(
            self._setup_env(), "OPEN_CLAIM 1", "DENY fraud_detected"
        )
        assert "DENIED" in result.dashboard
        assert result.claims_processed == 1

    def test_end_shift(self):
        """END_SHIFT finishes the episode and shows the completion banner."""
        result = self._send(self._setup_env(), "END_SHIFT")
        assert result.done is True
        assert "SHIFT COMPLETE" in result.dashboard

    def test_semicolon_multi_command(self):
        """Three semicolon-separated commands in one message all take effect."""
        result = self._send(
            self._setup_env(),
            "OPEN_CLAIM 1; REVIEW_DOCUMENTS; CHECK_POLICY",
        )
        for marker in ("CLAIM #1", "DOCUMENT REVIEW", "POLICY CHECK"):
            assert marker in result.dashboard

    def test_max_three_commands_per_step(self):
        """Only the first three commands of a four-command message are executed."""
        result = self._send(
            self._setup_env(),
            "VIEW_QUEUE; OPEN_CLAIM 1; REVIEW_DOCUMENTS; CHECK_POLICY",
        )
        for marker in ("CLAIMS QUEUE", "CLAIM #1", "DOCUMENT REVIEW"):
            assert marker in result.dashboard
        # The fourth command (CHECK_POLICY) must have been dropped.
        assert "POLICY CHECK" not in result.dashboard

    def test_unknown_command(self):
        """An unrecognised verb is reported as 'Unknown command'."""
        result = self._send(self._setup_env(), "FOOBAR")
        assert "Unknown command" in result.dashboard
| # ============================================================================= | |
| # 4. Grading (4) | |
| # ============================================================================= | |
class TestGrading:
    """Bounds and ordering checks on the reward reported by the environment."""

    @staticmethod
    def _fresh_env():
        """Return a new environment reset with seed 2."""
        env = ClaimsEnvProEnvironment()
        env.reset(seed=2)
        return env

    @staticmethod
    def _adjudicate_every_claim_correctly(env):
        """Walk every claim and issue the decision stored on the claim itself.

        NOTE(review): the review, investigate and decision steps are issued
        for every claim, whether it was entered via OPEN_CLAIM or
        HANDLE_APPEAL -- confirm this nesting against the upstream test.
        """
        # Snapshot the ids first so the walk does not depend on env._claims
        # staying unchanged while steps are issued.
        for claim_id in list(env._claims.keys()):
            claim = env._claims[claim_id]
            opener = "HANDLE_APPEAL" if claim.appeal_pending else "OPEN_CLAIM"
            env.step(ClaimsAction(message=f"{opener} {claim_id}"))
            env.step(ClaimsAction(message="REVIEW_DOCUMENTS; CHECK_POLICY"))
            env.step(ClaimsAction(message="INVESTIGATE_FRAUD"))
            if claim.correct_decision == "approve":
                verdict = f"APPROVE {claim.correct_payout:.2f}"
            else:
                verdict = f"DENY {claim.deny_reason}"
            env.step(ClaimsAction(message=verdict))

    def test_reward_between_0_and_1(self):
        """Each of five VIEW_QUEUE steps reports a reward inside [0, 1]."""
        env = self._fresh_env()
        for _ in range(5):
            step_obs = env.step(ClaimsAction(message="VIEW_QUEUE"))
            assert 0.0 <= step_obs.reward <= 1.0

    def test_final_reward_on_end_shift(self):
        """END_SHIFT ends the episode with a reward inside [0, 1]."""
        final = self._fresh_env().step(ClaimsAction(message="END_SHIFT"))
        assert final.done is True
        assert 0.0 <= final.reward <= 1.0

    def test_perfect_run_scores_high(self):
        """Deciding every claim as the claim itself prescribes scores >= 0.85."""
        env = self._fresh_env()
        self._adjudicate_every_claim_correctly(env)
        final = env.step(ClaimsAction(message="END_SHIFT"))
        assert final.done is True
        # Perfect play across all 6 components should score very high.
        assert final.reward >= 0.85, f"Perfect play scored {final.reward}"

    def test_doing_nothing_scores_low(self):
        """Ending the shift immediately scores below 0.50."""
        final = self._fresh_env().step(ClaimsAction(message="END_SHIFT"))
        assert final.reward < 0.50
| # ============================================================================= | |
| # 5. Episode boundaries (4) | |
| # ============================================================================= | |
class TestEpisodeBoundaries:
    """Checks on when the ``done`` flag is raised."""

    @staticmethod
    def _end_shift(env):
        """Issue END_SHIFT on ``env`` and return the resulting observation."""
        return env.step(ClaimsAction(message="END_SHIFT"))

    def test_done_false_initially(self):
        """The observation returned by reset is not terminal."""
        env = ClaimsEnvProEnvironment()
        start = env.reset(seed=2)
        assert start.done is False

    def test_done_on_end_shift(self):
        """END_SHIFT marks the episode as done."""
        env = ClaimsEnvProEnvironment()
        env.reset(seed=2)
        assert self._end_shift(env).done is True

    def test_done_at_max_steps(self):
        """Repeated VIEW_QUEUE steps end the episode by step 50 at the latest."""
        env = ClaimsEnvProEnvironment()
        env.reset(seed=2)
        latest = None
        # Up to 60 attempts: more than the 50-step ceiling asserted below.
        for _attempt in range(60):
            latest = env.step(ClaimsAction(message="VIEW_QUEUE"))
            if latest.done:
                break
        assert latest.done is True
        assert latest.step_number <= 50

    def test_step_after_done_not_possible(self):
        """END_SHIFT leaves the episode in the done state.

        NOTE(review): despite the name, no step is issued after the episode
        ends; only the ``done`` flag of the END_SHIFT observation is checked.
        """
        env = ClaimsEnvProEnvironment()
        env.reset(seed=2)
        terminal = self._end_shift(env)
        assert terminal.done is True
| # ============================================================================= | |
| # 6. Claim generation (4) | |
| # ============================================================================= | |
class TestClaimGeneration:
    """Checks on individual ``Claim`` objects built from a seeded RNG."""

    @staticmethod
    def _first_matching_claim(difficulty, seed_count, predicate):
        """Return the first claim over seeds ``0..seed_count-1`` satisfying ``predicate``.

        Each candidate is ``Claim(1, random.Random(seed), difficulty)``.
        Returns ``None`` when no seed in the range produces a match.
        """
        for seed in range(seed_count):
            candidate = Claim(1, random.Random(seed), difficulty)
            if predicate(candidate):
                return candidate
        return None

    def test_claim_has_required_fields(self):
        """A generated claim carries an id, claimant, known type/priority and amounts."""
        rng = random.Random(42)
        c = Claim(1, rng, "easy")
        assert c.id == 1
        assert c.claimant != ""
        assert c.claim_type in [
            "auto_collision", "auto_theft", "health_emergency",
            "health_procedure", "property_fire", "property_water",
            "property_theft", "liability_slip_fall", "liability_product",
            "workers_comp",
        ]
        assert c.priority in ["low", "medium", "high", "urgent"]
        assert c.claimed_amount > 0
        assert c.policy_limit > 0
        assert c.correct_decision in ["approve", "deny"]

    def test_fraudulent_claim_should_be_denied(self):
        """The first fraudulent 'hard' claim must be a zero-payout denial with signals."""
        c = self._first_matching_claim("hard", 100, lambda claim: claim.is_fraudulent)
        if c is None:
            # Raised explicitly: ``assert False`` would be stripped under
            # ``python -O`` and the test would then pass without checking anything.
            raise AssertionError("No fraudulent claim found in 100 seeds")
        assert c.correct_decision == "deny"
        assert c.correct_payout == 0.0
        assert len(c.fraud_signals) > 0

    def test_lapsed_policy_should_be_denied(self):
        """The first 'easy' claim with an inactive policy must be a zero-payout denial."""
        c = self._first_matching_claim(
            "easy", 1000, lambda claim: not claim.policy_active
        )
        if c is None:
            raise AssertionError("No lapsed policy found in 1000 seeds")
        assert c.correct_decision == "deny"
        assert c.correct_payout == 0.0

    def test_valid_claim_payout_calculation(self):
        """An approvable claim pays min(claimed, limit) minus deductible, floored at 0."""
        c = self._first_matching_claim(
            "easy", 100, lambda claim: claim.correct_decision == "approve"
        )
        if c is None:
            raise AssertionError("No valid claim found")
        expected = max(0, min(c.claimed_amount, c.policy_limit) - c.deductible)
        assert abs(c.correct_payout - expected) < 0.01
| # ============================================================================= | |
| # 7. NEW — VERIFY_PURCHASE + Plaid + 6-component bound + constant-policy (5) | |
| # ============================================================================= | |
class TestPlaidVerifyPurchase:
    """5 new tests exclusive to claims-env-pro."""

    @staticmethod
    def _find_seed(matches, description):
        """Return the first seed in 1..499 whose claim #1 satisfies ``matches``.

        Raises:
            AssertionError: if no seed in the range qualifies. Failing here,
                rather than silently falling back to an arbitrary seed, keeps
                the dependent tests from running against a claim that does
                not meet their precondition.
        """
        for seed in range(1, 500):
            env = ClaimsEnvProEnvironment()
            env.reset(seed=seed)
            first = env._claims.get(1)
            if first and matches(first):
                return seed
        raise AssertionError(f"No seed in 1..499 yields {description}")

    @staticmethod
    def _play_perfect_episode(env):
        """Decide every claim in ``env`` exactly as the claim itself prescribes.

        NOTE(review): the review, investigate and decision steps are issued
        for every claim, whether it was entered via OPEN_CLAIM or
        HANDLE_APPEAL -- confirm this nesting against the upstream test.
        """
        for cid in list(env._claims.keys()):
            claim = env._claims[cid]
            if claim.appeal_pending:
                env.step(ClaimsAction(message=f"HANDLE_APPEAL {cid}"))
            else:
                env.step(ClaimsAction(message=f"OPEN_CLAIM {cid}"))
            env.step(ClaimsAction(message="REVIEW_DOCUMENTS; CHECK_POLICY"))
            env.step(ClaimsAction(message="INVESTIGATE_FRAUD"))
            if claim.correct_decision == "approve":
                env.step(ClaimsAction(message=f"APPROVE {claim.correct_payout:.2f}"))
            else:
                env.step(ClaimsAction(message=f"DENY {claim.deny_reason}"))

    def _find_seed_with_fraud(self) -> int:
        """Find a seed whose first claim is fraudulent so VERIFY surfaces it."""
        return self._find_seed(
            lambda claim: claim.is_fraudulent,
            "a fraudulent first claim",
        )

    def _find_seed_with_clean_claim(self) -> int:
        """Find a seed whose first claim is non-fraudulent and active."""
        return self._find_seed(
            lambda claim: not claim.is_fraudulent and claim.policy_active,
            "a non-fraudulent first claim with an active policy",
        )

    def test_verify_purchase_bonus_on_discrepancy(self):
        """VERIFY_PURCHASE on a fraudulent claim earns the discrepancy bonus."""
        seed = self._find_seed_with_fraud()
        env = ClaimsEnvProEnvironment()
        env.reset(seed=seed)
        env.step(ClaimsAction(message="OPEN_CLAIM 1"))
        obs = env.step(ClaimsAction(message="VERIFY_PURCHASE"))
        # Per the original notes: the mock biases towards a discrepancy when
        # the claim is fraudulent, and the dense reward is the bonus minus a
        # small verb cost -- hence the 0.50 slack below the bonus.
        assert obs.dense_step_reward >= DENSE_PLAID_DISCREPANCY_BONUS - 0.50, (
            f"expected dense bonus >= {DENSE_PLAID_DISCREPANCY_BONUS - 0.5}, "
            f"got {obs.dense_step_reward}"
        )
        # And revealed_info should contain the plaid hit.
        assert "plaid" in obs.revealed_info
        assert obs.revealed_info["plaid"]["discrepancy"] is True

    def test_verify_purchase_no_bonus_when_no_discrepancy(self):
        """VERIFY_PURCHASE on a clean claim earns no discrepancy bonus."""
        seed = self._find_seed_with_clean_claim()
        env = ClaimsEnvProEnvironment()
        env.reset(seed=seed)
        env.step(ClaimsAction(message="OPEN_CLAIM 1"))
        obs = env.step(ClaimsAction(message="VERIFY_PURCHASE"))
        # Dense reward should be just the verb cost -- no bonus on clean.
        assert obs.dense_step_reward < DENSE_PLAID_DISCREPANCY_BONUS / 2.0
        assert obs.dense_step_reward <= 0.0  # no bonus on clean
        assert "plaid" in obs.revealed_info

    def test_plaid_falls_back_to_mock_when_unset(self):
        """get_bank_probe() returns BankProbeStub when PLAID_CLIENT_ID missing."""
        # Remove the variable for the duration of the call, then restore it so
        # other tests in the same process see the original environment.
        original = os.environ.pop("PLAID_CLIENT_ID", None)
        try:
            client = get_bank_probe()
            assert isinstance(client, BankProbeStub)
        finally:
            if original is not None:
                os.environ["PLAID_CLIENT_ID"] = original

    def test_six_component_score_sums_le_1(self):
        """The 6-component graded final score must be in [0, 1]."""
        env = ClaimsEnvProEnvironment()
        env.reset(seed=2)
        # Drive a perfect episode, the case most likely to push the score up.
        self._play_perfect_episode(env)
        obs = env.step(ClaimsAction(message="END_SHIFT"))
        # final_score field must exist and be bounded.
        assert obs.final_score >= 0.0
        assert obs.final_score <= 1.0 + 1e-9, f"final_score {obs.final_score} > 1"

    def test_constant_approve_policy_collapses_across_tasks(self):
        """An 'always APPROVE 0' agent must average below 0.55 over 30 episodes.

        Runs 5 scenario indices x 6 seeds. In each episode every claim is
        opened (or its appeal handled) and immediately answered with
        ``APPROVE 0``; the mean ``final_score`` is then compared to 0.55.

        Per the original author's notes (not verifiable from this file): the
        constant policy still earns decision-accuracy credit on the easy
        task, which puts its mean around 0.50 -- well below the ~0.85
        perfect-play level asserted in ``test_perfect_run_scores_high``.
        """
        scores = []
        for sidx in range(5):
            for s in range(6):
                env = ClaimsEnvProEnvironment()
                # Seed formula kept as-is so the 30 episodes stay the same.
                env.reset(seed=sidx * 13 + s + 7, scenario_index=sidx)
                for cid in list(env._claims.keys()):
                    claim = env._claims[cid]
                    if claim.appeal_pending:
                        env.step(ClaimsAction(message=f"HANDLE_APPEAL {cid}"))
                    else:
                        env.step(ClaimsAction(message=f"OPEN_CLAIM {cid}"))
                    env.step(ClaimsAction(message="APPROVE 0"))
                obs = env.step(ClaimsAction(message="END_SHIFT"))
                scores.append(obs.final_score)
        mean_score = sum(scores) / len(scores)
        assert mean_score < 0.55, (
            f"Constant-APPROVE-0 policy averaged {mean_score:.3f} across "
            f"30 episodes (expected <0.55, far from the 0.85+ perfect-play "
            f"ceiling)."
        )