File size: 1,899 Bytes
27a0d2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import numpy as np
from typing import List, Tuple, Any

def get_context_bucket(obs: Any) -> Tuple[int, int, int]:
    """
    Map an observation onto a coarse, hashable context key.

    The observation may either expose ``bin_category``, ``amount`` and
    ``observed_fraud_risk`` as attributes (e.g. a SmartpayenvObservation)
    or be a dict-like object offering ``.get()``; in the latter case any
    missing key is treated as 0.

    Args:
        obs: Observation object or dict

    Returns:
        tuple: (bin_category, amount_bucket, risk_bucket)
    """
    # Anything lacking a ``bin_category`` attribute is handled as a mapping.
    if not hasattr(obs, 'bin_category'):
        category = int(obs.get('bin_category', 0))
        amount_value = float(obs.get('amount', 0))
        risk_value = float(obs.get('observed_fraud_risk', 0))
    else:
        category = int(obs.bin_category)
        amount_value = float(obs.amount)
        risk_value = float(obs.observed_fraud_risk)

    # Amounts fall into 500-wide bands (floor division, so negative
    # amounts land in negative bands).
    amount_bucket = int(amount_value // 500)

    # Risk is scaled by 5, clamped to [0, 4], then truncated to an int,
    # giving five buckets numbered 0-4.
    risk_bucket = int(np.clip(risk_value * 5, 0, 4))

    return category, amount_bucket, risk_bucket

def calculate_advantages(results: List[Tuple[Any, float]], baseline: float = 0.5) -> List[Tuple[Any, float]]:
    """
    Turn raw (action, reward) pairs into (action, advantage) pairs.

    With two or more results, each reward is standardized against the
    mean and (population) standard deviation of all rewards. With exactly
    one result there is nothing to standardize against, so the advantage
    is simply the reward minus ``baseline``.

    Args:
        results: List of (action, reward) tuples
        baseline: Reference reward used only in the single-result case

    Returns:
        List of (action, advantage) tuples, in the same order as ``results``;
        an empty list when ``results`` is empty
    """
    if not results:
        return []

    rewards = []
    for _, reward in results:
        rewards.append(reward)

    if len(rewards) < 2:
        # A lone action is scored against the fixed baseline instead.
        only_action, only_reward = results[0][0], results[0][1]
        return [(only_action, only_reward - baseline)]

    center = np.mean(rewards)
    # The small epsilon keeps the division finite when all rewards are equal.
    spread = np.std(rewards) + 1e-6

    advantages = []
    for action, reward in results:
        advantages.append((action, (reward - center) / spread))
    return advantages

def rank_actions(results: List[Tuple[Any, float]]) -> List[Tuple[Any, int]]:
    """
    Assign each action its position in ascending reward order.

    Rank 0 goes to the lowest reward and the largest rank to the highest.
    The sort is stable, so actions with equal rewards keep their input
    order and still receive distinct, consecutive ranks.

    Args:
        results: List of (action, reward) tuples

    Returns:
        List of (action, rank) tuples, ordered from worst to best
    """
    by_reward = list(results)
    by_reward.sort(key=lambda pair: pair[1])

    ranked = []
    for position, (action, _) in enumerate(by_reward):
        ranked.append((action, position))
    return ranked