# SmartPayEnv / tests / test_graders.py
# (Hugging Face page-header residue converted to comments; original upload
# by Pratap-K, commit 39c0d5b — the raw lines were not valid Python.)
"""
Comprehensive tests for SmartPayEnv v2 graders, data generation, and environment.
Run from the repo root: python test_graders.py
"""
import sys, math
sys.path.insert(0, ".")
sys.path.insert(0, "./server")
import numpy as np
from server.graders import (
RoutingEfficacyGrader,
FraudDetectionGrader,
UserRetentionGrader,
process_combined_reward,
)
from server.SmartPayEnv_environment import SmartpayenvEnvironment, DIFFICULTY_CONFIG
from models import SmartpayenvAction
SEP = "=" * 60
# ── 1. RoutingEfficacyGrader (deterministic expected_outcome) ────────
print(f"\n{SEP}\n[1] RoutingEfficacyGrader — deterministic expected_outcome\n{SEP}")
rg = RoutingEfficacyGrader()
gw_rates = [0.70, 0.85, 0.95] # GatewayC is best (index 2)
# Score four scenarios with the same grader, ordered best → worst.
score_optimal = rg.evaluate(expected_outcome=0.90, cost=0.5, retries=0, chosen_gateway=2, gateway_rates=gw_rates)
# Worst gateway chosen (index 0) but the same expected outcome, for fairness.
score_bad_gateway = rg.evaluate(expected_outcome=0.90, cost=0.5, retries=0, chosen_gateway=0, gateway_rates=gw_rates)
# Best gateway chosen but a poor expected outcome.
score_low_outcome = rg.evaluate(expected_outcome=0.20, cost=0.5, retries=0, chosen_gateway=2, gateway_rates=gw_rates)
# Everything wrong at once: bad gateway, poor outcome, retries, high cost.
score_worst = rg.evaluate(expected_outcome=0.10, cost=4.0, retries=2, chosen_gateway=0, gateway_rates=gw_rates)
print(f" optimal gw + high outcome → {score_optimal:.4f}")
print(f" suboptimal gw + same cost → {score_bad_gateway:.4f} (lower: worse gateway choice)")
print(f" optimal gw + low outcome → {score_low_outcome:.4f} (mid)")
print(f" worst case → {score_worst:.4f} (expect lowest)")
# Every score must land inside the unit interval.
for score in (score_optimal, score_bad_gateway, score_low_outcome, score_worst):
    assert 0.0 <= score <= 1.0, f"Out of [0,1]: {score}"
assert score_optimal > score_bad_gateway, "Optimal gateway should outscore suboptimal"
assert score_optimal > score_low_outcome, "High expected outcome should outscore low"
assert score_low_outcome > score_worst, "Any reasonable choice beats the worst case"
# DETERMINISM check: same inputs must always give same score
first_run = rg.evaluate(0.7, 1.5, 0, 1, gw_rates)
second_run = rg.evaluate(0.7, 1.5, 0, 1, gw_rates)
assert first_run == second_run, "Not deterministic!"
print(" ✅ RoutingEfficacyGrader deterministic OK")
# ── 2. FraudDetectionGrader ──────────────────────────────────
print(f"\n{SEP}\n[2] FraudDetectionGrader\n{SEP}")
# Perfect classifier: 70 true negatives followed by 30 true positives.
perfect = FraudDetectionGrader()
for _ in range(70):
    perfect.add_step(False, False)
for _ in range(30):
    perfect.add_step(True, True)
assert abs(perfect.evaluate() - 1.0) < 1e-9, f"Perfect: {perfect.evaluate()}"
# Worst classifier: every call is the opposite of the truth.
inverted = FraudDetectionGrader()
for _ in range(70):
    inverted.add_step(True, False)
for _ in range(30):
    inverted.add_step(False, True)
assert abs(inverted.evaluate() - 0.0) < 1e-9, f"Worst: {inverted.evaluate()}"
# 100 identical (True, True) steps should land on a neutral 0.5.
neutral = FraudDetectionGrader()
for _ in range(100):
    neutral.add_step(True, True)
assert abs(neutral.evaluate() - 0.5) < 1e-9, f"Neutral: {neutral.evaluate()}"
print(f" perfect=1.0 worst=0.0 neutral=0.5 ✅")
# ── 3. UserRetentionGrader ───────────────────────────────────
print(f"\n{SEP}\n[3] UserRetentionGrader\n{SEP}")
retention = UserRetentionGrader(churn_rate=0.1, initial_users=100)
# Before any steps the score starts at a perfect 1.0.
assert abs(retention.evaluate() - 1.0) < 1e-9
# A step with zero failures keeps the score at 1.0.
retention.add_step(0)
assert abs(retention.evaluate() - 1.0) < 1e-9
# A step with failures must drag the score below 1.0.
retention.add_step(3)
assert retention.evaluate() < 1.0
print(f" initial=1.0, no-failure=1.0, 3-failures={retention.evaluate():.4f} ✅")
# ── 4. process_combined_reward ────────────────────────────────
print(f"\n{SEP}\n[4] process_combined_reward\n{SEP}")
best_reward = process_combined_reward(1.0, True, False, 0)
worst_reward = process_combined_reward(0.0, False, True, 5)
# Both rewards stay inside [0, 1] and the best case must outrank the worst.
for reward in (best_reward, worst_reward):
    assert 0.0 <= reward <= 1.0
assert best_reward > worst_reward
print(f" best={best_reward:.4f} worst={worst_reward:.4f} ✅")
# ── 5. Multi-factor fraud risk ────────────────────────────────
print(f"\n{SEP}\n[5] Multi-factor fraud risk via environment\n{SEP}")
# NOTE(review): an unused `rng_seed = np.random.default_rng(42)` used to be
# created here. It was never passed to the environment, so nothing was
# actually seeded — removed as dead/misleading code. The environment draws
# its own randomness internally.
env = SmartpayenvEnvironment()
# Sample 50 transactions in easy mode and validate fraud_risk ranges and
# field types (the old comment said 200; the loop has always sampled 50).
env.reset(difficulty=0)
risks_easy = []
for _ in range(50):
    obs = env._generate_transaction()
    risks_easy.append(obs.fraud_risk_score)
    assert 0.0 <= obs.fraud_risk_score <= 1.0
    assert obs.merchant_category in range(6)
    assert obs.device_type in (0, 1, 2)
    assert isinstance(obs.is_international, bool)
    assert isinstance(obs.card_present, bool)
# Sample 50 more in hard mode; only the risk scores are compared here.
env.reset(difficulty=2)
risks_hard = []
for _ in range(50):
    obs = env._generate_transaction()
    risks_hard.append(obs.fraud_risk_score)
mean_easy = sum(risks_easy) / len(risks_easy)
mean_hard = sum(risks_hard) / len(risks_hard)
print(f" avg fraud_risk easy={mean_easy:.3f} hard={mean_hard:.3f}")
assert mean_hard > mean_easy, "Hard mode should have higher avg fraud risk"
print(" ✅ Multi-factor fraud + difficulty scaling OK")
# ── 6. Gateway state machine ──────────────────────────────────
print(f"\n{SEP}\n[6] Gateway state machine\n{SEP}")
env.reset(difficulty=2) # high degrade_p for quick test
observed_states = set()
# Advance every gateway 80 times and record each state it passes through.
for _ in range(80):
    for gateway in env._gateways:
        gateway.step()
        observed_states.add(gateway.state)
        assert 0.0 <= gateway.current_rate <= 1.0
print(f" States observed: {observed_states}")
assert "degraded" in observed_states or "recovering" in observed_states, \
    "Hard mode should see degraded/recovering states"
print(" ✅ Gateway state machine OK")
# ── 7. Transaction velocity tracking ─────────────────────────
print(f"\n{SEP}\n[7] Transaction velocity tracking\n{SEP}")
env.reset(difficulty=0)
velocities = []
# Every sampled transaction must carry a velocity inside [0, 1].
for _ in range(20):
    txn = env._generate_transaction()
    assert 0.0 <= txn.transaction_velocity <= 1.0
    velocities.append(txn.transaction_velocity)
print(f" velocity range: [{min(velocities):.2f}, {max(velocities):.2f}] ✅")
# ── 8. Episode smoke test — all 3 difficulty tiers ───────────
print(f"\n{SEP}\n[8] Full episode smoke test (15 steps × 3 difficulties)\n{SEP}")
for tier in (0, 1, 2):
    obs = env.reset(difficulty=tier)
    assert obs.difficulty == tier
    episode_rewards = []
    for _ in range(15):
        # Greedy policy: best gateway, single retry, block only high risk.
        action = SmartpayenvAction(
            gateway=int(np.argmax(obs.gateway_success_rates)),
            retry_strategy=1,
            fraud_decision=1 if obs.fraud_risk_score > 0.65 else 0,
        )
        obs = env.step(action)
        assert 0.0 <= obs.reward <= 1.0, f"reward out of [0,1]: {obs.reward}"
        assert 0.0 <= obs.task_routing_score <= 1.0
        assert 0.0 <= obs.task_fraud_mcc_score <= 1.0
        assert 0.0 <= obs.task_retention_score <= 1.0
        episode_rewards.append(obs.reward)
        if obs.done:
            break
    avg = sum(episode_rewards) / len(episode_rewards)
    print(f" difficulty={tier}: {len(episode_rewards)} steps, avg_reward={avg:.4f}")
    assert any(r > 0 for r in episode_rewards), "All rewards are still 0!"
print(f"\n ✅ All difficulty tiers produce non-zero rewards")
# ── 9. Block → done=True immediately ─────────────────────────
print(f"\n{SEP}\n[9] fraud_decision=1 ends episode immediately\n{SEP}")
env.reset(difficulty=0)
# A blocking fraud decision should terminate the episode on the same step.
block_action = SmartpayenvAction(gateway=0, retry_strategy=0, fraud_decision=1)
obs = env.step(block_action)
assert obs.done is True, f"Expected done=True after block, got {obs.done}"
print(f" Block step done={obs.done} ✅")
print(f"\n{SEP}")
print(" ALL TESTS PASSED ✅")
print(f"{SEP}\n")