Add fusion model and research report
Browse files- fusion_model.py +144 -0
fusion_model.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Late Fusion: App序列向量 + 征信模型预测 → 最终风控决策
|
| 3 |
+
=========================================================
|
| 4 |
+
两个模型各自独立建模后,在决策层融合
|
| 5 |
+
|
| 6 |
+
方法: Late Fusion (拼接各自输出 → 简单分类器)
|
| 7 |
+
原因: App 序列和征信数据本质不同,early fusion 会相互干扰
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import pandas as pd
|
| 12 |
+
from sklearn.linear_model import LogisticRegression
|
| 13 |
+
from sklearn.metrics import roc_auc_score
|
| 14 |
+
from scipy.stats import ks_2samp
|
| 15 |
+
import logging
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Fusion strategies accepted by ``late_fusion``.
_FUSION_METHODS = ('simple_avg', 'weighted_avg', 'stacking', 'lgbm')


def _ks_statistic(labels, scores):
    """Return the two-sample KS statistic between positive- and negative-class scores."""
    return ks_2samp(scores[labels == 1], scores[labels == 0]).statistic


def _build_stack_features(app_risk_prob, credit_risk_prob, app_embeddings):
    """Stack both model probabilities and the App embedding columns into one matrix."""
    return np.column_stack([
        app_risk_prob.reshape(-1, 1),
        credit_risk_prob.reshape(-1, 1),
        app_embeddings,
    ])


def late_fusion(
    app_embeddings: np.ndarray,
    app_risk_prob: np.ndarray,
    credit_risk_prob: np.ndarray,
    credit_features: np.ndarray,
    y_true: np.ndarray,
    method: str = 'stacking',
):
    """Fuse the App-sequence model and the credit-bureau model at decision level.

    Strategies:

    1. ``simple_avg``:   plain average of the two predicted probabilities.
    2. ``weighted_avg``: weighted average; the weight is grid-searched to
       maximise AUC on the data passed in.
    3. ``stacking``:     logistic regression trained on both probabilities
       plus the App embedding.
    4. ``lgbm``:         LightGBM trained on the same stacked features.

    Args:
        app_embeddings: ``(n_users, d)`` user vectors from the App-sequence
            model. Only read by ``stacking`` and ``lgbm``.
        app_risk_prob: ``(n_users,)`` probability predicted by the App model.
        credit_risk_prob: ``(n_users,)`` probability predicted by the credit
            model.
        credit_features: raw credit features. Currently unused by every
            strategy; ``None`` is accepted.
        y_true: ``(n_users,)`` binary ground-truth labels (1 = positive).
        method: one of ``'simple_avg'``, ``'weighted_avg'``, ``'stacking'``,
            ``'lgbm'``.

    Returns:
        ``(fusion_pred, auc, ks)``. For ``simple_avg`` / ``weighted_avg`` the
        predictions cover all users. For ``stacking`` / ``lgbm`` the first 80%
        of rows (in input order) are used for training and ``fusion_pred``
        covers only the remaining 20% hold-out rows.

    Raises:
        ValueError: if ``method`` is not a known strategy, or the probability
            arrays and ``y_true`` do not have the same number of entries.

    Note:
        ``weighted_avg`` tunes its weight on the same data it is scored on,
        and ``lgbm`` early-stops on the same hold-out it is scored on, so the
        reported AUC/KS for those two strategies are optimistic.
    """
    # Fail fast: previously an unknown method fell through every branch and
    # crashed later with an UnboundLocalError on ``fusion_pred``.
    if method not in _FUSION_METHODS:
        raise ValueError(
            f"Unknown fusion method {method!r}; expected one of {_FUSION_METHODS}"
        )

    # Accept lists / Series / column vectors; flatten so that the arithmetic
    # below cannot silently broadcast (N, 1) against (N,) into (N, N).
    app_risk_prob = np.asarray(app_risk_prob).ravel()
    credit_risk_prob = np.asarray(credit_risk_prob).ravel()
    y_true = np.asarray(y_true).ravel()

    n = len(y_true)
    if len(app_risk_prob) != n or len(credit_risk_prob) != n:
        raise ValueError(
            "app_risk_prob, credit_risk_prob and y_true must have the same "
            f"number of entries; got {len(app_risk_prob)}, "
            f"{len(credit_risk_prob)} and {n}"
        )

    if method in ('simple_avg', 'weighted_avg'):
        if method == 'simple_avg':
            fusion_pred = 0.5 * app_risk_prob + 0.5 * credit_risk_prob
        else:
            # Grid-search the App weight; the first best-scoring weight wins.
            best_auc = 0
            best_w = 0.5
            for w in np.arange(0.1, 1.0, 0.05):
                pred = w * app_risk_prob + (1 - w) * credit_risk_prob
                auc = roc_auc_score(y_true, pred)
                if auc > best_auc:
                    best_auc = auc
                    best_w = w
            fusion_pred = best_w * app_risk_prob + (1 - best_w) * credit_risk_prob
            logger.info(
                "Optimal weight: App=%.2f, Credit=%.2f", best_w, 1 - best_w
            )

        # Evaluate on the full input.
        auc = roc_auc_score(y_true, fusion_pred)
        ks = _ks_statistic(y_true, fusion_pred)
        logger.info("Fusion (%s): AUC=%.4f, KS=%.4f", method, auc, ks)
        return fusion_pred, auc, ks

    # --- stacking strategies: sequential 80/20 train / hold-out split -------
    X_stack = _build_stack_features(
        app_risk_prob, credit_risk_prob, np.asarray(app_embeddings)
    )
    split = int(n * 0.8)
    X_train, X_eval = X_stack[:split], X_stack[split:]
    y_train, y_eval = y_true[:split], y_true[split:]

    if method == 'stacking':
        lr = LogisticRegression(C=1.0, max_iter=1000, class_weight='balanced')
        lr.fit(X_train, y_train)
        fusion_pred = lr.predict_proba(X_eval)[:, 1]
        stacker_name = 'LR'
    else:  # method == 'lgbm'
        # Imported lazily so LightGBM is only required for this strategy.
        import lightgbm as lgb

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_eval, label=y_eval)

        params = {
            'objective': 'binary', 'metric': 'auc',
            'learning_rate': 0.05, 'num_leaves': 31,
            'verbose': -1, 'n_jobs': -1,
        }

        model = lgb.train(params, train_data, num_boost_round=200,
                          valid_sets=[val_data],
                          callbacks=[lgb.early_stopping(30)])

        fusion_pred = model.predict(X_eval)
        stacker_name = 'LightGBM'

    # Evaluate on the hold-out rows only.
    auc = roc_auc_score(y_eval, fusion_pred)
    ks = _ks_statistic(y_eval, fusion_pred)
    logger.info("Stacking (%s): AUC=%.4f, KS=%.4f", stacker_name, auc, ks)
    return fusion_pred, auc, ks
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# ============================================================
|
| 113 |
+
# 使用示例
|
| 114 |
+
# ============================================================
|
| 115 |
+
"""
|
| 116 |
+
完整工作流:
|
| 117 |
+
|
| 118 |
+
# 1. App 序列模型
|
| 119 |
+
from app_sequence_model import pretrain_coles, preprocess_app_sequence
|
| 120 |
+
pretrained = pretrain_coles(user_sequences)
|
| 121 |
+
app_embeddings = extract_user_embeddings(pretrained, user_sequences) # (N, 256)
|
| 122 |
+
app_risk_prob = app_classifier.predict_proba(app_embeddings)[:, 1]  # (N,) positive-class probability
|
| 123 |
+
|
| 124 |
+
# 2. 征信模型
|
| 125 |
+
from credit_bureau_model import train_tabm, train_lightgbm
|
| 126 |
+
tabm_pred = tabm_model.predict(credit_features)
|
| 127 |
+
lgb_pred = lgb_model.predict(credit_features)
|
| 128 |
+
credit_risk_prob = 0.5 * tabm_pred + 0.5 * lgb_pred
|
| 129 |
+
|
| 130 |
+
# 3. 融合
|
| 131 |
+
fusion_pred, auc, ks = late_fusion(
|
| 132 |
+
app_embeddings=app_embeddings,
|
| 133 |
+
app_risk_prob=app_risk_prob,
|
| 134 |
+
credit_risk_prob=credit_risk_prob,
|
| 135 |
+
credit_features=None,
|
| 136 |
+
y_true=labels,
|
| 137 |
+
method='lgbm' # 推荐
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
# 4. 阈值决策
|
| 141 |
+
threshold = 0.15 # 由 KS 校准确定
|
| 142 |
+
decision = (fusion_pred >= threshold).astype(int)
|
| 143 |
+
# 0 = 通过, 1 = 拒绝
|
| 144 |
+
"""
|