# File size: 5,069 Bytes | b2806bd
"""
Late Fusion: App序列向量 + 征信模型预测 → 最终风控决策
=========================================================
两个模型各自独立建模后,在决策层融合
方法: Late Fusion (拼接各自输出 → 简单分类器)
原因: App 序列和征信数据本质不同,early fusion 会相互干扰
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.stats import ks_2samp
import logging
logger = logging.getLogger(__name__)
_FUSION_METHODS = ('simple_avg', 'weighted_avg', 'stacking', 'lgbm')
_TRAIN_FRACTION = 0.8  # leading share of rows used to fit the stacking meta-model


def _stack_features(app_risk_prob, credit_risk_prob, app_embeddings):
    """Return the meta-feature matrix [app prob | credit prob | app embedding]."""
    return np.column_stack([
        app_risk_prob.reshape(-1, 1),
        credit_risk_prob.reshape(-1, 1),
        app_embeddings,
    ])


def _holdout_split(n):
    """Return the train/eval boundary index, rejecting an empty training part."""
    split = int(n * _TRAIN_FRACTION)
    if split < 1:
        raise ValueError(
            f"Stacking needs at least 2 samples for the train/eval split, got {n}"
        )
    return split


def _score_and_log(label, y_eval, fusion_pred):
    """Compute AUC and KS of fusion_pred against y_eval and log them under label."""
    auc = roc_auc_score(y_eval, fusion_pred)
    ks = ks_2samp(fusion_pred[y_eval == 1], fusion_pred[y_eval == 0]).statistic
    logger.info(f"{label}: AUC={auc:.4f}, KS={ks:.4f}")
    return auc, ks


def late_fusion(
    app_embeddings: np.ndarray,    # (n_users, 256) CoLES user vectors
    app_risk_prob: np.ndarray,     # (n_users,) App-model predicted probability
    credit_risk_prob: np.ndarray,  # (n_users,) credit-bureau-model predicted probability
    credit_features: np.ndarray,   # (n_users, n_features) raw credit features (optional)
    y_true: np.ndarray,            # (n_users,) ground-truth labels
    method: str = 'stacking',      # 'simple_avg', 'weighted_avg', 'stacking', 'lgbm'
):
    """Fuse the App-model and credit-model outputs and score the result.

    Fusion strategies:
        1. ``simple_avg``: plain average of the two probabilities.
        2. ``weighted_avg``: weighted average; the App weight is picked from a
           grid starting at 0.1 with step 0.05 (stop value 1.0) by maximising
           AUC.
        3. ``stacking``: logistic regression on both probabilities plus the
           App embedding.
        4. ``lgbm``: LightGBM on the same stacked features.

    Args:
        app_embeddings: Per-user embedding matrix. Only read by ``stacking``
            and ``lgbm``.
        app_risk_prob: App-model probabilities; flattened to 1-D.
        credit_risk_prob: Credit-model probabilities; flattened to 1-D.
        credit_features: Accepted for interface compatibility but not read by
            any strategy; ``None`` is fine.
        y_true: Binary labels (compared against 0 and 1 for the KS statistic);
            flattened to 1-D.
        method: One of the four strategy names above.

    Returns:
        ``(fusion_pred, auc, ks)``. For ``simple_avg`` and ``weighted_avg``
        ``fusion_pred`` has one entry per input row. For ``stacking`` and
        ``lgbm`` the meta-model is fit on the first 80% of rows (in the given
        order, no shuffling) and ``fusion_pred``/``auc``/``ks`` cover only the
        remaining 20%.

    Raises:
        ValueError: If ``method`` is unknown, if the probability arrays do not
            have the same length as ``y_true``, or if a stacking strategy is
            requested with too few samples to split.

    Note:
        ``weighted_avg`` tunes its weight on the same labels it is scored on,
        and ``lgbm`` early-stops on the same hold-out it is scored on, so the
        reported metrics for those two strategies are optimistic.
    """
    # Validate first: previously an unknown method fell through every branch
    # and crashed later with UnboundLocalError on `fusion_pred`.
    if method not in _FUSION_METHODS:
        raise ValueError(
            f"Unknown fusion method {method!r}; expected one of {_FUSION_METHODS}"
        )

    # Accept lists / pandas Series / (n, 1) columns; boolean masking and the
    # element-wise averages below need aligned 1-D arrays.
    app_risk_prob = np.asarray(app_risk_prob).ravel()
    credit_risk_prob = np.asarray(credit_risk_prob).ravel()
    y_true = np.asarray(y_true).ravel()

    n = len(y_true)
    if app_risk_prob.shape[0] != n or credit_risk_prob.shape[0] != n:
        raise ValueError(
            "Length mismatch: "
            f"app_risk_prob={app_risk_prob.shape[0]}, "
            f"credit_risk_prob={credit_risk_prob.shape[0]}, y_true={n}"
        )

    if method == 'stacking':
        X_stack = _stack_features(app_risk_prob, credit_risk_prob, app_embeddings)
        split = _holdout_split(n)
        lr = LogisticRegression(C=1.0, max_iter=1000, class_weight='balanced')
        lr.fit(X_stack[:split], y_true[:split])
        fusion_pred = lr.predict_proba(X_stack[split:])[:, 1]
        auc, ks = _score_and_log("Stacking (LR)", y_true[split:], fusion_pred)
        return fusion_pred, auc, ks

    if method == 'lgbm':
        # Imported lazily so the other strategies work without LightGBM.
        import lightgbm as lgb

        X_stack = _stack_features(app_risk_prob, credit_risk_prob, app_embeddings)
        split = _holdout_split(n)
        train_data = lgb.Dataset(X_stack[:split], label=y_true[:split])
        val_data = lgb.Dataset(X_stack[split:], label=y_true[split:])
        params = {
            'objective': 'binary', 'metric': 'auc',
            'learning_rate': 0.05, 'num_leaves': 31,
            'verbose': -1, 'n_jobs': -1,
        }
        model = lgb.train(params, train_data, num_boost_round=200,
                          valid_sets=[val_data],
                          callbacks=[lgb.early_stopping(30)])
        fusion_pred = model.predict(X_stack[split:])
        auc, ks = _score_and_log("Stacking (LightGBM)", y_true[split:], fusion_pred)
        return fusion_pred, auc, ks

    if method == 'simple_avg':
        fusion_pred = 0.5 * app_risk_prob + 0.5 * credit_risk_prob
    else:  # 'weighted_avg'
        best_auc = 0
        best_w = 0.5
        for w in np.arange(0.1, 1.0, 0.05):
            candidate = w * app_risk_prob + (1 - w) * credit_risk_prob
            candidate_auc = roc_auc_score(y_true, candidate)
            # Strict '>' keeps the first (smallest) weight on ties.
            if candidate_auc > best_auc:
                best_auc = candidate_auc
                best_w = w
        fusion_pred = best_w * app_risk_prob + (1 - best_w) * credit_risk_prob
        logger.info(f"Optimal weight: App={best_w:.2f}, Credit={1-best_w:.2f}")

    # Evaluate the averaged prediction on all rows.
    auc, ks = _score_and_log(f"Fusion ({method})", y_true, fusion_pred)
    return fusion_pred, auc, ks
# ============================================================
# 使用示例
# ============================================================
"""
完整工作流:
# 1. App 序列模型
from app_sequence_model import pretrain_coles, preprocess_app_sequence
pretrained = pretrain_coles(user_sequences)
app_embeddings = extract_user_embeddings(pretrained, user_sequences) # (N, 256)
app_risk_prob = app_classifier.predict_proba(app_embeddings)[:, 1]  # (N,)
# 2. 征信模型
from credit_bureau_model import train_tabm, train_lightgbm
tabm_pred = tabm_model.predict(credit_features)
lgb_pred = lgb_model.predict(credit_features)
credit_risk_prob = 0.5 * tabm_pred + 0.5 * lgb_pred
# 3. 融合
fusion_pred, auc, ks = late_fusion(
app_embeddings=app_embeddings,
app_risk_prob=app_risk_prob,
credit_risk_prob=credit_risk_prob,
credit_features=None,
y_true=labels,
method='lgbm' # 推荐
)
# 4. 阈值决策
threshold = 0.15 # 由 KS 校准确定
decision = (fusion_pred >= threshold).astype(int)
# 0 = 通过, 1 = 拒绝
"""
|