# SmartPayEnv/server/preference_utils.py
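"""
Preference-learning utilities for SmartPayEnv: context bucketing,
GRPO-style advantage scoring over branched simulation results, and
reward-based action ranking.
"""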
import numpy as np
from typing import List, Tuple, Any


def get_context_bucket(obs: Any) -> Tuple[int, int, int]:
    """
    Discretizes the observation into a context bucket for preference learning.

    Args:
        obs: SmartpayenvObservation object or dict

    Returns:
        tuple: (bin_category, amount_bucket, risk_bucket)
    """
    # Extract values whether obs is an attribute-style object or a dict
    if hasattr(obs, 'bin_category'):
        bin_cat = int(obs.bin_category)
        amount = float(obs.amount)
        risk = float(obs.observed_fraud_risk)
    else:
        bin_cat = int(obs.get('bin_category', 0))
        amount = float(obs.get('amount', 0))
        risk = float(obs.get('observed_fraud_risk', 0))

    return (
        bin_cat,
        int(amount // 500),            # Bucket amounts by $500
        int(np.clip(risk * 5, 0, 4)),  # Risk buckets 0-4
    )
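
# Illustrative bucketing (hypothetical values, not from the SmartPayEnv data):
# a $750 transaction with observed_fraud_risk 0.2 in bin_category 1 maps to
# (1, 1, 1), since 750 // 500 == 1 and int(np.clip(0.2 * 5, 0, 4)) == 1.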


def calculate_advantages(results: List[Tuple[Any, float]], baseline: float = 0.5) -> List[Tuple[Any, float]]:
    """
    Calculates standardized advantage scores from simulation results.

    Args:
        results: List of (action, reward) tuples
        baseline: Neutral reward baseline, used when only one result is available

    Returns:
        List of (action, advantage) tuples
    """
    if not results:
        return []

    scores = [r for _, r in results]
    if len(scores) < 2:
        # With a single action there is no group to standardize against,
        # so the advantage is measured relative to the neutral baseline.
        return [(results[0][0], results[0][1] - baseline)]

    mean = np.mean(scores)
    std = np.std(scores) + 1e-6  # Avoid division by zero when all rewards are equal
    return [(a, (r - mean) / std) for (a, r) in results]
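
# Worked example (made-up rewards): for the group [0.9, 0.1, 0.4] the mean is
# ~0.467 and the std ~0.33, so the advantages come out to roughly +1.31, -1.11,
# and -0.20. Positive advantages mark actions preferred over the group average,
# which is the GRPO-style group-relative signal used for preference learning.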


def rank_actions(results: List[Tuple[Any, float]]) -> List[Tuple[Any, int]]:
    """
    Ranks actions by reward in ascending order (higher rank index = better).
    Returns a list of (action, rank) tuples, where rank 0 is the worst action.
    """
    sorted_results = sorted(results, key=lambda x: x[1])
    return [(a, i) for i, (a, _) in enumerate(sorted_results)]
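

# Minimal smoke-test sketch (hypothetical action labels and rewards, not part
# of the SmartPayEnv API):
if __name__ == "__main__":
    obs = {'bin_category': 2, 'amount': 1250.0, 'observed_fraud_risk': 0.62}
    print(get_context_bucket(obs))  # -> (2, 2, 3)

    # Three branched simulation outcomes for the same context:
    branch_results = [('approve', 0.9), ('decline', 0.1), ('review', 0.4)]
    print(calculate_advantages(branch_results))  # approve gets the largest advantage
    print(rank_actions(branch_results))          # [('decline', 0), ('review', 1), ('approve', 2)]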