| """ | |
| Comprehensive tests for SmartPayEnv v2 graders, data generation, and environment. | |
| Run from the repo root: python test_graders.py | |
| """ | |
| import sys, math | |
| sys.path.insert(0, ".") | |
| sys.path.insert(0, "./server") | |
| import numpy as np | |
| from server.graders import ( | |
| RoutingEfficacyGrader, | |
| FraudDetectionGrader, | |
| UserRetentionGrader, | |
| process_combined_reward, | |
| ) | |
| from server.SmartPayEnv_environment import SmartpayenvEnvironment, DIFFICULTY_CONFIG | |
| from models import SmartpayenvAction | |
| SEP = "=" * 60 | |

# ── 1. RoutingEfficacyGrader (deterministic expected_outcome) ────────
print(f"\n{SEP}\n[1] RoutingEfficacyGrader — deterministic expected_outcome\n{SEP}")

rg = RoutingEfficacyGrader()
gw_rates = [0.70, 0.85, 0.95]  # GatewayC is best (index 2)

# Optimal choice: best gateway, high expected outcome
s_opt = rg.evaluate(expected_outcome=0.90, cost=0.5, retries=0, chosen_gateway=2, gateway_rates=gw_rates)
# Suboptimal choice: worst gateway, same expected outcome for a fair comparison (in practice it would be lower)
s_sub = rg.evaluate(expected_outcome=0.90, cost=0.5, retries=0, chosen_gateway=0, gateway_rates=gw_rates)
# Optimal gateway, low expected outcome
s_low = rg.evaluate(expected_outcome=0.20, cost=0.5, retries=0, chosen_gateway=2, gateway_rates=gw_rates)
# Worst case: suboptimal gateway, low outcome, retries, and expensive
s_bad = rg.evaluate(expected_outcome=0.10, cost=4.0, retries=2, chosen_gateway=0, gateway_rates=gw_rates)

print(f"  optimal gw + high outcome → {s_opt:.4f}")
print(f"  suboptimal gw + same cost → {s_sub:.4f} (lower: worse gateway choice)")
print(f"  optimal gw + low outcome  → {s_low:.4f} (mid)")
print(f"  worst case                → {s_bad:.4f} (expect lowest)")

for s in [s_opt, s_sub, s_low, s_bad]:
    assert 0.0 <= s <= 1.0, f"Out of [0,1]: {s}"
assert s_opt > s_sub, "Optimal gateway should outscore suboptimal"
assert s_opt > s_low, "High expected outcome should outscore low"
assert s_low > s_bad, "Any reasonable choice beats the worst case"

# DETERMINISM check: the same inputs must always give the same score
assert rg.evaluate(0.7, 1.5, 0, 1, gw_rates) == rg.evaluate(0.7, 1.5, 0, 1, gw_rates), "Not deterministic!"
print("  ✅ RoutingEfficacyGrader deterministic OK")

# ── 2. FraudDetectionGrader ──────────────────────────────────
print(f"\n{SEP}\n[2] FraudDetectionGrader\n{SEP}")

fg = FraudDetectionGrader()
for _ in range(70): fg.add_step(False, False)
for _ in range(30): fg.add_step(True, True)
assert abs(fg.evaluate() - 1.0) < 1e-9, f"Perfect: {fg.evaluate()}"

fg2 = FraudDetectionGrader()
for _ in range(70): fg2.add_step(True, False)
for _ in range(30): fg2.add_step(False, True)
assert abs(fg2.evaluate() - 0.0) < 1e-9, f"Worst: {fg2.evaluate()}"

fg3 = FraudDetectionGrader()
for _ in range(100): fg3.add_step(True, True)
assert abs(fg3.evaluate() - 0.5) < 1e-9, f"Neutral: {fg3.evaluate()}"

print("  perfect=1.0  worst=0.0  neutral=0.5 ✅")

# ── 3. UserRetentionGrader ───────────────────────────────────
print(f"\n{SEP}\n[3] UserRetentionGrader\n{SEP}")

urg = UserRetentionGrader(churn_rate=0.1, initial_users=100)
assert abs(urg.evaluate() - 1.0) < 1e-9
urg.add_step(0); assert abs(urg.evaluate() - 1.0) < 1e-9
urg.add_step(3); assert urg.evaluate() < 1.0

print(f"  initial=1.0, no-failure=1.0, 3-failures={urg.evaluate():.4f} ✅")

# ── 4. process_combined_reward ───────────────────────────────
print(f"\n{SEP}\n[4] process_combined_reward\n{SEP}")

r_best = process_combined_reward(1.0, True, False, 0)
r_worst = process_combined_reward(0.0, False, True, 5)
assert 0.0 <= r_best <= 1.0
assert 0.0 <= r_worst <= 1.0
assert r_best > r_worst

print(f"  best={r_best:.4f}  worst={r_worst:.4f} ✅")

# ── 5. Multi-factor fraud risk ───────────────────────────────
print(f"\n{SEP}\n[5] Multi-factor fraud risk via environment\n{SEP}")

rng_seed = np.random.default_rng(42)
env = SmartpayenvEnvironment()

# Collect 50 transactions per difficulty tier and check fraud_risk and field ranges
env.reset(difficulty=0)
risks_easy = []
for _ in range(50):
    obs = env._generate_transaction()
    risks_easy.append(obs.fraud_risk_score)
    assert 0.0 <= obs.fraud_risk_score <= 1.0
    assert obs.merchant_category in range(6)
    assert obs.device_type in (0, 1, 2)
    assert isinstance(obs.is_international, bool)
    assert isinstance(obs.card_present, bool)

env.reset(difficulty=2)
risks_hard = []
for _ in range(50):
    obs = env._generate_transaction()
    risks_hard.append(obs.fraud_risk_score)

mean_easy = sum(risks_easy) / len(risks_easy)
mean_hard = sum(risks_hard) / len(risks_hard)
print(f"  avg fraud_risk  easy={mean_easy:.3f}  hard={mean_hard:.3f}")
assert mean_hard > mean_easy, "Hard mode should have higher avg fraud risk"
print("  ✅ Multi-factor fraud + difficulty scaling OK")

# ── 6. Gateway state machine ─────────────────────────────────
print(f"\n{SEP}\n[6] Gateway state machine\n{SEP}")

env.reset(difficulty=2)  # high degrade_p for quick test
states_seen = set()
for _ in range(80):
    for gw in env._gateways:
        gw.step()
        states_seen.add(gw.state)
        assert 0.0 <= gw.current_rate <= 1.0

print(f"  States observed: {states_seen}")
assert "degraded" in states_seen or "recovering" in states_seen, \
    "Hard mode should see degraded/recovering states"
print("  ✅ Gateway state machine OK")

# ── 7. Transaction velocity tracking ─────────────────────────
print(f"\n{SEP}\n[7] Transaction velocity tracking\n{SEP}")

env.reset(difficulty=0)
velocities = []
for _ in range(20):
    obs = env._generate_transaction()
    velocities.append(obs.transaction_velocity)
    assert 0.0 <= obs.transaction_velocity <= 1.0

print(f"  velocity range: [{min(velocities):.2f}, {max(velocities):.2f}] ✅")

# ── 8. Episode smoke test — all 3 difficulty tiers ───────────
print(f"\n{SEP}\n[8] Full episode smoke test (15 steps × 3 difficulties)\n{SEP}")

for diff in [0, 1, 2]:
    obs = env.reset(difficulty=diff)
    assert obs.difficulty == diff
    rewards = []
    for step in range(15):
        action = SmartpayenvAction(
            gateway=int(np.argmax(obs.gateway_success_rates)),  # always choose best gw
            retry_strategy=1,
            fraud_decision=1 if obs.fraud_risk_score > 0.65 else 0,
        )
        obs = env.step(action)
        assert 0.0 <= obs.reward <= 1.0, f"reward out of [0,1]: {obs.reward}"
        assert 0.0 <= obs.task_routing_score <= 1.0
        assert 0.0 <= obs.task_fraud_mcc_score <= 1.0
        assert 0.0 <= obs.task_retention_score <= 1.0
        rewards.append(obs.reward)
        if obs.done:
            break
    avg = sum(rewards) / len(rewards)
    print(f"  difficulty={diff}: {len(rewards)} steps, avg_reward={avg:.4f}")
    assert any(r > 0 for r in rewards), "All rewards are still 0!"

print(f"\n  ✅ All difficulty tiers produce non-zero rewards")

# ── 9. Block → done=True immediately ─────────────────────────
print(f"\n{SEP}\n[9] fraud_decision=1 ends episode immediately\n{SEP}")

env.reset(difficulty=0)
obs = env.step(SmartpayenvAction(gateway=0, retry_strategy=0, fraud_decision=1))
assert obs.done is True, f"Expected done=True after block, got {obs.done}"
print(f"  Block step done={obs.done} ✅")

print(f"\n{SEP}")
print("  ALL TESTS PASSED ✅")
print(f"{SEP}\n")