yonghao committed on
Commit
b2806bd
·
verified ·
1 Parent(s): a7e77d8

Add fusion model and research report

Browse files
Files changed (1) hide show
  1. fusion_model.py +144 -0
fusion_model.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Late Fusion: App序列向量 + 征信模型预测 → 最终风控决策
3
+ =========================================================
4
+ 两个模型各自独立建模后,在决策层融合
5
+
6
+ 方法: Late Fusion (拼接各自输出 → 简单分类器)
7
+ 原因: App 序列和征信数据本质不同,early fusion 会相互干扰
8
+ """
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from sklearn.linear_model import LogisticRegression
13
+ from sklearn.metrics import roc_auc_score
14
+ from scipy.stats import ks_2samp
15
+ import logging
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
# Fusion strategies accepted by ``late_fusion``.
_FUSION_METHODS = ('simple_avg', 'weighted_avg', 'stacking', 'lgbm')


def _auc_and_ks(y, pred):
    """Return ``(auc, ks)`` for scores ``pred`` against binary labels ``y``."""
    auc = roc_auc_score(y, pred)
    # KS statistic between the score samples of the two classes.
    ks = ks_2samp(pred[y == 1], pred[y == 0]).statistic
    return auc, ks


def _stacked_split(app_embeddings, app_risk_prob, credit_risk_prob, y_true):
    """Build the stacking matrix and split it 80/20 in row order (no shuffle)."""
    X_stack = np.column_stack([
        app_risk_prob.reshape(-1, 1),
        credit_risk_prob.reshape(-1, 1),
        app_embeddings,
    ])
    split = int(len(y_true) * 0.8)
    return X_stack[:split], y_true[:split], X_stack[split:], y_true[split:]


def late_fusion(
    app_embeddings: np.ndarray,    # (n_users, 256) CoLES user vectors
    app_risk_prob: np.ndarray,     # (n_users,) App model predicted probability
    credit_risk_prob: np.ndarray,  # (n_users,) credit-bureau model predicted probability
    credit_features: np.ndarray,   # (n_users, n_features) raw credit features (optional)
    y_true: np.ndarray,            # (n_users,) ground-truth labels
    method: str = 'stacking',      # 'simple_avg', 'weighted_avg', 'stacking', 'lgbm'
):
    """Fuse the App-sequence model and the credit-bureau model at decision level.

    Strategies:

    1. ``simple_avg``: plain average of the two probabilities.
    2. ``weighted_avg``: weighted average; the App weight is grid-searched
       over ``np.arange(0.1, 1.0, 0.05)`` to maximise AUC.
    3. ``stacking``: logistic regression on both probabilities plus the App
       embedding.
    4. ``lgbm``: LightGBM on the same stacked features (described by the
       author as the strongest option).

    Args:
        app_embeddings: Per-user embedding matrix; only read by ``stacking``
            and ``lgbm``.
        app_risk_prob: App-model risk probability, one value per user.
        credit_risk_prob: Credit-model risk probability, one value per user.
        credit_features: Currently unused by every strategy; kept for
            interface compatibility (callers may pass ``None``).
        y_true: Binary labels; 1 is treated as the positive class for KS.
        method: One of ``'simple_avg'``, ``'weighted_avg'``, ``'stacking'``,
            ``'lgbm'``.

    Returns:
        ``(fusion_pred, auc, ks)``. For the averaging strategies
        ``fusion_pred`` covers all users. For ``stacking`` and ``lgbm`` it
        covers only the holdout, i.e. the last 20% of rows in input order.

    Raises:
        ValueError: If ``method`` is unknown, or if either probability input
            does not contain exactly one value per label.

    Notes:
        * ``weighted_avg`` picks its weight on the same labels it is then
          evaluated on, so the reported AUC/KS are optimistic.
        * ``lgbm`` early-stops on the holdout it reports metrics for, which is
          likewise optimistic.
        * The 80/20 split is positional; shuffle upstream if the rows are
          ordered by label or by time and that is not intended.
    """
    # Validate first so a typo yields a clear error instead of an
    # UnboundLocalError on ``fusion_pred`` further down.
    if method not in _FUSION_METHODS:
        raise ValueError(
            f"Unknown fusion method {method!r}; expected one of {_FUSION_METHODS}"
        )

    # Normalise to flat arrays: boolean masks such as ``y_true == 1`` are only
    # element-wise on ndarrays, and (N, 1) inputs would otherwise broadcast.
    y_true = np.asarray(y_true).reshape(-1)
    n = y_true.size
    app_risk_prob = np.asarray(app_risk_prob)
    credit_risk_prob = np.asarray(credit_risk_prob)
    for name, probs in (
        ('app_risk_prob', app_risk_prob),
        ('credit_risk_prob', credit_risk_prob),
    ):
        # Catches e.g. a raw (N, 2) ``predict_proba`` output being passed in.
        if probs.size != n:
            raise ValueError(
                f"{name} must hold one probability per label: "
                f"got shape {probs.shape} for {n} labels"
            )
    app_risk_prob = app_risk_prob.reshape(-1)
    credit_risk_prob = credit_risk_prob.reshape(-1)

    if method == 'simple_avg':
        fusion_pred = 0.5 * app_risk_prob + 0.5 * credit_risk_prob

    elif method == 'weighted_avg':
        best_auc = 0
        best_w = 0.5
        for w in np.arange(0.1, 1.0, 0.05):
            pred = w * app_risk_prob + (1 - w) * credit_risk_prob
            auc = roc_auc_score(y_true, pred)
            if auc > best_auc:
                best_auc = auc
                best_w = w
        fusion_pred = best_w * app_risk_prob + (1 - best_w) * credit_risk_prob
        logger.info(f"Optimal weight: App={best_w:.2f}, Credit={1-best_w:.2f}")

    elif method == 'stacking':
        X_train, y_train, X_eval, y_eval = _stacked_split(
            app_embeddings, app_risk_prob, credit_risk_prob, y_true
        )

        lr = LogisticRegression(C=1.0, max_iter=1000, class_weight='balanced')
        lr.fit(X_train, y_train)
        fusion_pred = lr.predict_proba(X_eval)[:, 1]

        auc, ks = _auc_and_ks(y_eval, fusion_pred)
        logger.info(f"Stacking (LR): AUC={auc:.4f}, KS={ks:.4f}")
        return fusion_pred, auc, ks

    else:  # method == 'lgbm'
        # Imported lazily so LightGBM is only required for this strategy.
        import lightgbm as lgb

        X_train, y_train, X_eval, y_eval = _stacked_split(
            app_embeddings, app_risk_prob, credit_risk_prob, y_true
        )

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_eval, label=y_eval)

        params = {
            'objective': 'binary', 'metric': 'auc',
            'learning_rate': 0.05, 'num_leaves': 31,
            'verbose': -1, 'n_jobs': -1,
        }

        model = lgb.train(params, train_data, num_boost_round=200,
                          valid_sets=[val_data],
                          callbacks=[lgb.early_stopping(30)])

        fusion_pred = model.predict(X_eval)
        auc, ks = _auc_and_ks(y_eval, fusion_pred)
        logger.info(f"Stacking (LightGBM): AUC={auc:.4f}, KS={ks:.4f}")
        return fusion_pred, auc, ks

    # Evaluation shared by the two averaging strategies (all users).
    auc, ks = _auc_and_ks(y_true, fusion_pred)
    logger.info(f"Fusion ({method}): AUC={auc:.4f}, KS={ks:.4f}")

    return fusion_pred, auc, ks
110
+
111
+
112
+ # ============================================================
113
+ # 使用示例
114
+ # ============================================================
115
+ """
116
+ 完整工作流:
117
+
118
+ # 1. App 序列模型
119
+ from app_sequence_model import pretrain_coles, preprocess_app_sequence
120
+ pretrained = pretrain_coles(user_sequences)
121
+ app_embeddings = extract_user_embeddings(pretrained, user_sequences) # (N, 256)
122
+ app_risk_prob = app_classifier.predict_proba(app_embeddings)[:, 1]  # (N,) positive-class probability
123
+
124
+ # 2. 征信模型
125
+ from credit_bureau_model import train_tabm, train_lightgbm
126
+ tabm_pred = tabm_model.predict(credit_features)
127
+ lgb_pred = lgb_model.predict(credit_features)
128
+ credit_risk_prob = 0.5 * tabm_pred + 0.5 * lgb_pred
129
+
130
+ # 3. 融合
131
+ fusion_pred, auc, ks = late_fusion(
132
+ app_embeddings=app_embeddings,
133
+ app_risk_prob=app_risk_prob,
134
+ credit_risk_prob=credit_risk_prob,
135
+ credit_features=None,
136
+ y_true=labels,
137
+ method='lgbm' # 推荐
138
+ )
139
+
140
+ # 4. 阈值决策
141
+ threshold = 0.15 # 由 KS 校准确定
142
+ decision = (fusion_pred >= threshold).astype(int)
143
+ # 0 = 通过, 1 = 拒绝
144
+ """