File size: 5,069 Bytes
b2806bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
Late Fusion: App序列向量 + 征信模型预测 → 最终风控决策
=========================================================
两个模型各自独立建模后,在决策层融合

方法: Late Fusion (拼接各自输出 → 简单分类器)
原因: App 序列和征信数据本质不同,early fusion 会相互干扰
"""

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.stats import ks_2samp
import logging

logger = logging.getLogger(__name__)


def _stack_features(app_embeddings: np.ndarray,
                    app_risk_prob: np.ndarray,
                    credit_risk_prob: np.ndarray) -> np.ndarray:
    """Build the stacked feature matrix: [app prob | credit prob | app embedding]."""
    return np.column_stack([
        app_risk_prob.reshape(-1, 1),
        credit_risk_prob.reshape(-1, 1),
        app_embeddings,
    ])


def _evaluate(y_eval: np.ndarray, pred: np.ndarray):
    """Return (AUC, KS) for predictions *pred* against binary labels *y_eval*."""
    auc = roc_auc_score(y_eval, pred)
    # KS = max distance between the score CDFs of the positive and negative class.
    ks = ks_2samp(pred[y_eval == 1], pred[y_eval == 0]).statistic
    return auc, ks


def late_fusion(
    app_embeddings: np.ndarray,       # (n_users, 256) CoLES user embeddings
    app_risk_prob: np.ndarray,        # (n_users,) app-model predicted probabilities
    credit_risk_prob: np.ndarray,     # (n_users,) credit-bureau model probabilities
    credit_features: np.ndarray,      # (n_users, n_features) raw credit features (currently unused; kept for API compatibility)
    y_true: np.ndarray,              # (n_users,) ground-truth labels (1 = bad)
    method: str = 'stacking'         # 'simple_avg', 'weighted_avg', 'stacking', 'lgbm'
):
    """Fuse app-sequence and credit-bureau model outputs at the decision layer.

    Fusion strategies:
      1. simple_avg:   unweighted mean of the two probabilities.
      2. weighted_avg: grid-searched convex combination. NOTE: the weight is
         selected on *y_true* itself, so pass held-out data to avoid an
         optimistically biased AUC.
      3. stacking:     logistic regression on [both probs + app embedding],
         trained on the first 80% of rows, evaluated on the last 20%.
      4. lgbm:         LightGBM stacker on the same features (strongest).

    Returns:
        (fusion_pred, auc, ks). For 'stacking'/'lgbm' the predictions and
        metrics cover only the 20% hold-out slice; for the averaging methods
        they cover all rows.

    Raises:
        ValueError: if *method* is not one of the four strategies.
    """
    if method == 'simple_avg':
        fusion_pred = 0.5 * app_risk_prob + 0.5 * credit_risk_prob

    elif method == 'weighted_avg':
        # Grid-search the blend weight that maximizes AUC on the given labels.
        best_auc, best_w = 0.0, 0.5
        for w in np.arange(0.1, 1.0, 0.05):
            auc = roc_auc_score(y_true, w * app_risk_prob + (1 - w) * credit_risk_prob)
            if auc > best_auc:
                best_auc, best_w = auc, w
        fusion_pred = best_w * app_risk_prob + (1 - best_w) * credit_risk_prob
        logger.info(f"Optimal weight: App={best_w:.2f}, Credit={1-best_w:.2f}")

    elif method == 'stacking':
        X_stack = _stack_features(app_embeddings, app_risk_prob, credit_risk_prob)
        split = int(len(y_true) * 0.8)  # first 80% train, last 20% evaluate

        lr = LogisticRegression(C=1.0, max_iter=1000, class_weight='balanced')
        lr.fit(X_stack[:split], y_true[:split])
        fusion_pred = lr.predict_proba(X_stack[split:])[:, 1]

        auc, ks = _evaluate(y_true[split:], fusion_pred)
        logger.info(f"Stacking (LR): AUC={auc:.4f}, KS={ks:.4f}")
        return fusion_pred, auc, ks

    elif method == 'lgbm':
        import lightgbm as lgb  # optional dependency; imported lazily on purpose

        X_stack = _stack_features(app_embeddings, app_risk_prob, credit_risk_prob)
        split = int(len(y_true) * 0.8)  # same 80/20 time split as 'stacking'

        train_data = lgb.Dataset(X_stack[:split], label=y_true[:split])
        val_data = lgb.Dataset(X_stack[split:], label=y_true[split:])

        params = {
            'objective': 'binary', 'metric': 'auc',
            'learning_rate': 0.05, 'num_leaves': 31,
            'verbose': -1, 'n_jobs': -1,
        }

        model = lgb.train(params, train_data, num_boost_round=200,
                          valid_sets=[val_data],
                          callbacks=[lgb.early_stopping(30)])

        fusion_pred = model.predict(X_stack[split:])
        auc, ks = _evaluate(y_true[split:], fusion_pred)
        logger.info(f"Stacking (LightGBM): AUC={auc:.4f}, KS={ks:.4f}")
        return fusion_pred, auc, ks

    else:
        # An unknown method previously fell through every branch and crashed
        # with a NameError on the undefined fusion_pred; fail fast instead.
        raise ValueError(f"Unknown fusion method: {method!r}")

    # Averaging methods: evaluate on all rows.
    auc, ks = _evaluate(y_true, fusion_pred)
    logger.info(f"Fusion ({method}): AUC={auc:.4f}, KS={ks:.4f}")

    return fusion_pred, auc, ks


# ============================================================
# 使用示例
# ============================================================
"""
完整工作流:

# 1. App 序列模型
from app_sequence_model import pretrain_coles, preprocess_app_sequence
pretrained = pretrain_coles(user_sequences)
app_embeddings = extract_user_embeddings(pretrained, user_sequences)  # (N, 256)
app_risk_prob = app_classifier.predict_proba(app_embeddings)

# 2. 征信模型
from credit_bureau_model import train_tabm, train_lightgbm
tabm_pred = tabm_model.predict(credit_features)
lgb_pred = lgb_model.predict(credit_features)
credit_risk_prob = 0.5 * tabm_pred + 0.5 * lgb_pred

# 3. 融合
fusion_pred, auc, ks = late_fusion(
    app_embeddings=app_embeddings,
    app_risk_prob=app_risk_prob,
    credit_risk_prob=credit_risk_prob,
    credit_features=None,
    y_true=labels,
    method='lgbm'  # 推荐
)

# 4. 阈值决策
threshold = 0.15  # 由 KS 校准确定
decision = (fusion_pred >= threshold).astype(int)
# 0 = 通过, 1 = 拒绝
"""