""" 保险APP 用户行为分析 - Gradio Space (终极版 v3.0) 支持: 演示模式 | CSV上传 | 产品推荐(DIN) | 异常检测(TabBERT) | 模型管理 | 生存分析 参考文献: - DIN: Deep Interest Network (KDD 2018, arxiv:1706.06978) - TabBERT: Tabular Transformers (arxiv:2011.01843) - Focal Loss: RetinaNet (ICCV 2017, arxiv:1708.02002) - DeepSurv: Cox-PH Neural Network (JAMIA 2018, arxiv:1606.00931) - RNN Survival: arxiv:2304.00575 """ import os, io, math, warnings, datetime, random, json, tempfile, pickle from collections import Counter, defaultdict from dataclasses import dataclass, field from typing import List, Dict, Optional, Tuple from pathlib import Path warnings.filterwarnings('ignore') import numpy as np import pandas as pd from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score from sklearn.preprocessing import StandardScaler, MinMaxScaler from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier from sklearn.metrics import ( roc_auc_score, f1_score, confusion_matrix, average_precision_score, precision_recall_curve, classification_report, roc_curve, accuracy_score ) import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import seaborn as sns import gradio as gr # PyTorch try: import torch import torch.nn as nn import torch.nn.functional as F TORCH_AVAILABLE = True except ImportError: TORCH_AVAILABLE = False print("⚠️ PyTorch not available. Deep learning models disabled.") # Hugging Face Hub (模型保存/加载) try: from huggingface_hub import HfApi, create_repo, hf_hub_download, login HFHUB_AVAILABLE = True except ImportError: HFHUB_AVAILABLE = False print("⚠️ huggingface_hub not available. Model save/load disabled.") # lifelines (生存分析) try: from lifelines import CoxPHFitter, KaplanMeierFitter, NelsonAalenFitter from lifelines.statistics import logrank_test LIFELINES_AVAILABLE = True except ImportError: LIFELINES_AVAILABLE = False print("⚠️ lifelines not available. Statistical survival analysis disabled.") # joblib import joblib # ============================================================================= # 全局配置 & 数据模型 # ============================================================================= INSURANCE_EVENT_TYPES = { "page_view", "product_view", "product_compare", "premium_calculator", "faq_view", "article_read", "quote_request", "quote_result_view", "document_upload", "form_submit", "chat_init", "call_init", "video_consult", "policy_select", "payment_init", "payment_success", "policy_issued", "claim_init", "claim_doc_upload", "claim_review", "claim_approved", "claim_rejected", "renewal_reminder", "renewal_click", "renewal_complete", "policy_cancel", "app_uninstall", "login", "logout", } @dataclass class InsuranceAppEvent: event_id: str; user_id: str; session_id: str; timestamp: int event_type: str; page_id: str product_id: Optional[str] = None; amount: Optional[float] = None channel: str = "app"; device_type: str = "mobile" @dataclass class UserSession: session_id: str; user_id: str events: List[InsuranceAppEvent] = field(default_factory=list) @dataclass class UserBehaviorProfile: user_id: str; sessions: List[UserSession] = field(default_factory=list) # ============================================================================= # 特征工程 # ============================================================================= class InsuranceFeatureEngineer: def extract_user_features(self, profile): sessions = profile.sessions if not sessions: return None all_events = [] for s in sessions: all_events.extend(s.events) all_events.sort(key=lambda e: e.timestamp) all_type_counts = Counter(e.event_type for e in all_events) total = len(all_events) if total == 0: return None product_counter = Counter(e.product_id for e in all_events if e.product_id) top_product = product_counter.most_common(1)[0][0] if product_counter else None first_ts = all_events[0].timestamp; last_ts = all_events[-1].timestamp days_active = (last_ts - first_ts) / (24 * 3600 * 1000) has_purchased = any(e.event_type == "policy_issued" for e in all_events) has_renewed = any(e.event_type == "renewal_complete" for e in all_events) has_claimed = any(e.event_type in ("claim_init","claim_approved") for e in all_events) support = all_type_counts.get("chat_init", 0) + all_type_counts.get("call_init", 0) event_seq = [e.event_type for e in all_events] product_seq = [e.product_id or "none" for e in all_events] return { "total_sessions": len(sessions), "total_events": total, "days_active": days_active, "avg_events_per_session": total / len(sessions), "product_view_ratio": all_type_counts.get("product_view", 0) / total, "quote_request_ratio": all_type_counts.get("quote_request", 0) / total, "article_read_ratio": all_type_counts.get("article_read", 0) / total, "payment_success_ratio": all_type_counts.get("payment_success", 0) / total, "policy_issued_ratio": all_type_counts.get("policy_issued", 0) / total, "unique_products_viewed": len(product_counter), "top_product_id": top_product or "none", "has_purchased": int(has_purchased), "has_renewed": int(has_renewed), "has_claimed": int(has_claimed), "support_dependency": support / total, "renewal_click_count": all_type_counts.get("renewal_click", 0), "policy_cancel_count": all_type_counts.get("policy_cancel", 0), "claim_init_count": all_type_counts.get("claim_init", 0), "days_since_last_event": (datetime.datetime.now().timestamp()*1000 - last_ts)/(24*3600*1000), "weekend_activity_ratio": sum(1 for e in all_events if datetime.datetime.fromtimestamp(e.timestamp/1000).weekday()>=5)/total, "peak_active_hour": Counter(datetime.datetime.fromtimestamp(e.timestamp/1000).hour for e in all_events).most_common(1)[0][0], "recent_7day_events": sum(1 for e in all_events if (last_ts-e.timestamp)<7*24*3600*1000), "recent_30day_events": sum(1 for e in all_events if (last_ts-e.timestamp)<30*24*3600*1000), "_event_sequence": event_seq, "_product_sequence": product_seq, "_user_id": profile.user_id, } # ============================================================================= # 数据解析 & 生成 # ============================================================================= def parse_csv_to_profiles(df): required_cols = {"user_id", "session_id", "timestamp", "event_type", "page_id"} missing = required_cols - set(c.lower().strip() for c in df.columns) if missing: raise ValueError(f"CSV缺少必需列: {missing}") df.columns = [c.lower().strip() for c in df.columns] df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce") df = df.dropna(subset=["timestamp", "event_type"]) df["timestamp"] = df["timestamp"].astype(int) profiles = {} for (uid, sid), group in df.groupby(["user_id", "session_id"]): if uid not in profiles: profiles[uid] = UserBehaviorProfile(user_id=str(uid), sessions=[]) events = [] for _, row in group.sort_values("timestamp").iterrows(): events.append(InsuranceAppEvent( event_id=f"evt_{row.name}", user_id=str(row["user_id"]), session_id=str(row["session_id"]), timestamp=int(row["timestamp"]), event_type=str(row["event_type"]).strip(), page_id=str(row.get("page_id", "unknown")), product_id=str(row.get("product_id")) if pd.notna(row.get("product_id")) else None, amount=float(row["amount"]) if pd.notna(row.get("amount")) else None, )) profiles[uid].sessions.append(UserSession(session_id=str(sid), user_id=str(uid), events=events)) return list(profiles.values()) def generate_synthetic_data(n_users=2000, n_events_per_user=50, seed=42): random.seed(seed); np.random.seed(seed) event_types = list(INSURANCE_EVENT_TYPES) products = ["health_basic","health_premium","critical_illness","term_life", "auto_compulsory","auto_commercial","home","travel_domestic"] data = [] for u in range(n_users): user_id = f"user_{u:04d}"; churn_risk = random.random() sessions = []; base_ts = int(datetime.datetime(2024,1,1).timestamp()*1000) for s in range(random.randint(1,5)): session_id = f"sess_{u}_{s}" n_events = random.randint(5, n_events_per_user // max(1, random.randint(1,5))) events = [] for e in range(n_events): if churn_risk > 0.7: event_type = random.choices(["page_view","product_view","article_read","app_uninstall"],weights=[0.4,0.3,0.2,0.1])[0] else: stages = n_events if e < stages*0.3: event_type = random.choice(["page_view","product_view","article_read"]) elif e < stages*0.6: event_type = random.choice(["product_view","quote_request","premium_calculator","faq_view"]) elif e < stages*0.8: event_type = random.choice(["quote_result_view","form_submit","document_upload","payment_init"]) else: event_type = random.choice(["payment_success","policy_issued","renewal_click","renewal_complete"]) timestamp = base_ts + e * random.randint(5000,30000) events.append(InsuranceAppEvent(f"evt_{u}_{s}_{e}", user_id, session_id, timestamp, event_type, f"page_{event_type}", random.choice(products) if event_type in ["product_view","quote_request"] else None, random.uniform(1000,100000) if event_type in ["quote_request","payment_success"] else None)) sessions.append(UserSession(session_id, user_id, events)) base_ts += 24 * 3600 * 1000 data.append((UserBehaviorProfile(user_id, sessions), int(churn_risk > 0.7))) return data # ============================================================================= # 通用 sklearn 训练函数 # ============================================================================= def train_sklearn(features_list, labels, test_size=0.2, random_state=42, use_cv=False): df = pd.DataFrame(features_list) df_full = df.copy() drop_cols = [c for c in df.columns if c.startswith('_')] for c in drop_cols: df.pop(c) for c in df.columns: if df[c].dtype == 'object': df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0) df = df.fillna(0).replace([np.inf, -np.inf], 0) X = df.values; y = np.array(labels) feature_names = list(df.columns) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y) scaler = StandardScaler() X_train_s = scaler.fit_transform(X_train); X_test_s = scaler.transform(X_test) gbdt = GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, subsample=0.8, random_state=random_state) gbdt.fit(X_train_s, y_train) y_pred_gbdt = gbdt.predict(X_test_s); y_prob_gbdt = gbdt.predict_proba(X_test_s)[:,1] rf = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=random_state, n_jobs=-1) rf.fit(X_train_s, y_train) y_prob_rf = rf.predict_proba(X_test_s)[:,1]; y_pred_rf = rf.predict(X_test_s) auc_gbdt = float(roc_auc_score(y_test, y_prob_gbdt)) f1_gbdt = float(f1_score(y_test, y_pred_gbdt)) ap_gbdt = float(average_precision_score(y_test, y_prob_gbdt)) auc_rf = float(roc_auc_score(y_test, y_prob_rf)) ap_rf = float(average_precision_score(y_test, y_prob_rf)) fi = pd.DataFrame({'feature': feature_names, 'importance': rf.feature_importances_}).sort_values('importance', ascending=False) cv_scores = None if use_cv and len(y) >= 100: skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state) cv_scores = cross_val_score(rf, X, y, cv=skf, scoring='roc_auc') os.makedirs("outputs", exist_ok=True) fig, ax = plt.subplots(figsize=(12,8)) top = fi.head(15) colors = plt.cm.RdYlGn(np.linspace(0.2, 0.8, len(top)))[::-1] ax.barh(top['feature'][::-1], top['importance'][::-1], color=colors) ax.set_title('Insurance APP - Top 15 Feature Importance', fontsize=14, fontweight='bold') ax.set_xlabel('Importance Score') plt.tight_layout() fig_path1 = "outputs/feature_importance.png" plt.savefig(fig_path1, dpi=150, bbox_inches='tight'); plt.close() fig, ax = plt.subplots(figsize=(8,6)) pg, rg, _ = precision_recall_curve(y_test, y_prob_gbdt) pr, rr, _ = precision_recall_curve(y_test, y_prob_rf) ax.plot(rg, pg, label=f'GBDT AP={ap_gbdt:.3f}', linewidth=2, color='#2E86AB') ax.plot(rr, pr, label=f'RF AP={ap_rf:.3f}', linewidth=2, color='#A23B72') ax.set_xlabel('Recall', fontsize=12); ax.set_ylabel('Precision', fontsize=12) ax.set_title('Precision-Recall Curve', fontsize=14, fontweight='bold') ax.legend(fontsize=11); ax.grid(True, alpha=0.3) plt.tight_layout() fig_path2 = "outputs/pr_curve.png" plt.savefig(fig_path2, dpi=150, bbox_inches='tight'); plt.close() fig, axs = plt.subplots(1,2,figsize=(12,5)) sns.heatmap(confusion_matrix(y_test, y_pred_gbdt), annot=True, fmt='d', cmap='Blues', ax=axs[0], cbar=False) axs[0].set_title(f'GBDT (AUC={auc_gbdt:.3f})', fontsize=12, fontweight='bold') axs[0].set_xlabel('Predicted'); axs[0].set_ylabel('Actual') sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d', cmap='Greens', ax=axs[1], cbar=False) axs[1].set_title(f'RF (AUC={auc_rf:.3f})', fontsize=12, fontweight='bold') axs[1].set_xlabel('Predicted'); axs[1].set_ylabel('Actual') plt.tight_layout() fig_path3 = "outputs/confusion_matrix.png" plt.savefig(fig_path3, dpi=150, bbox_inches='tight'); plt.close() fig, ax = plt.subplots(figsize=(8,6)) fpr_g, tpr_g, _ = roc_curve(y_test, y_prob_gbdt) fpr_r, tpr_r, _ = roc_curve(y_test, y_prob_rf) ax.plot(fpr_g, tpr_g, label=f'GBDT AUC={auc_gbdt:.3f}', linewidth=2, color='#2E86AB') ax.plot(fpr_r, tpr_r, label=f'RF AUC={auc_rf:.3f}', linewidth=2, color='#A23B72') ax.plot([0,1], [0,1], 'k--', alpha=0.5) ax.set_xlabel('False Positive Rate', fontsize=12) ax.set_ylabel('True Positive Rate', fontsize=12) ax.set_title('ROC Curve', fontsize=14, fontweight='bold') ax.legend(fontsize=11); ax.grid(True, alpha=0.3) plt.tight_layout() fig_path4 = "outputs/roc_curve.png" plt.savefig(fig_path4, dpi=150, bbox_inches='tight'); plt.close() fi_str = fi.head(15).to_string(index=False) report = classification_report(y_test, y_pred_gbdt, digits=4) cv_str = "" if cv_scores is not None: cv_str = f"\n--- 5折交叉验证 (RF AUC) ---\nMean: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})\nScores: {cv_scores.round(4).tolist()}" result_text = f"""=== 模型训练结果 === 样本数: {len(y)} | 特征数: {len(feature_names)} 训练集: {len(y_train)} | 测试集: {len(y_test)} --- GBDT --- AUC: {auc_gbdt:.4f} F1: {f1_gbdt:.4f} AP: {ap_gbdt:.4f} --- Random Forest --- AUC: {auc_rf:.4f} AP: {ap_rf:.4f} {cv_str} --- Top 15 特征重要性 --- {fi_str} --- 分类报告 (GBDT) --- {report}""" # 保存模型到内存供后续保存到Hub model_artifacts = { 'gbdt': gbdt, 'rf': rf, 'scaler': scaler, 'feature_names': feature_names, 'metrics': {'auc_gbdt': auc_gbdt, 'f1_gbdt': f1_gbdt, 'auc_rf': auc_rf, 'ap_gbdt': ap_gbdt, 'ap_rf': ap_rf} } # 保存到本地临时文件 joblib.dump(model_artifacts, 'outputs/sklearn_model_artifacts.joblib') return result_text, fig_path1, fig_path2, fig_path3, fig_path4, df_full # ============================================================================= # DIEN 产品推荐 (Deep Interest Evolution Network) # ============================================================================= # DIEN = DIN + Interest Extractor (GRU) + Interest Evolving (AUGRU) # 论文: Deep Interest Evolution Network for Click-Through Rate Prediction (AAAI 2019) # arXiv: https://arxiv.org/abs/1809.03672 def generate_product_recommendation_data(n_users=1000, seed=42): random.seed(seed); np.random.seed(seed) products = ["health_basic","health_premium","critical_illness","term_life", "auto_compulsory","auto_commercial","home","travel_domestic"] records = [] for u in range(n_users): n_behaviors = random.randint(5, 30) behavior_events = [] behavior_products = [] for i in range(n_behaviors): et = random.choice(["page_view","product_view","quote_request","article_read"]) behavior_events.append(et) behavior_products.append(random.choice(products)) candidate = random.choice(products) label = 1 if candidate in behavior_products else random.choices([0,1], weights=[0.7,0.3])[0] records.append({ 'user_id': u, 'behavior_events': behavior_events, 'behavior_products': behavior_products, 'candidate_product': candidate, 'label': label, 'user_features': np.random.randn(20).astype(np.float32), }) return pd.DataFrame(records) def generate_product_recommendation_data(n_users=1000, seed=42): random.seed(seed); np.random.seed(seed) products = ["health_basic","health_premium","critical_illness","term_life", "auto_compulsory","auto_commercial","home","travel_domestic"] records = [] for u in range(n_users): n_behaviors = random.randint(5, 30) behavior_events = [] behavior_products = [] for i in range(n_behaviors): et = random.choice(["page_view","product_view","quote_request","article_read"]) behavior_events.append(et) behavior_products.append(random.choice(products)) candidate = random.choice(products) label = 1 if candidate in behavior_products else random.choices([0,1], weights=[0.7,0.3])[0] records.append({ 'user_id': u, 'behavior_events': behavior_events, 'behavior_products': behavior_products, 'candidate_product': candidate, 'label': label, 'user_features': np.random.randn(20).astype(np.float32), }) return pd.DataFrame(records) def train_dien_recommendation(n_users, embedding_dim, epochs, batch_size, lr, seed): if not TORCH_AVAILABLE: return "PyTorch not installed. Please add torch to requirements.txt and restart Space.", None, None, None, None, None torch.manual_seed(seed); np.random.seed(seed); random.seed(seed) df = generate_product_recommendation_data(n_users=n_users, seed=seed) all_events = sorted(set(e for seq in df['behavior_events'] for e in seq)) event_vocab = {e: i+1 for i, e in enumerate(all_events)} all_products = sorted(set(p for seq in df['behavior_products'] for p in seq) | set(df['candidate_product'])) product_vocab = {p: i+1 for i, p in enumerate(all_products)} max_seq_len = 20 behavior_events_padded = []; behavior_products_padded = []; behavior_masks = [] for _, row in df.iterrows(): e_seq = [event_vocab[e] for e in row['behavior_events'][-max_seq_len:]] p_seq = [product_vocab[p] for p in row['behavior_products'][-max_seq_len:]] mask = [1] * len(e_seq) if len(e_seq) < max_seq_len: pad = max_seq_len - len(e_seq) e_seq = [0]*pad + e_seq; p_seq = [0]*pad + p_seq; mask = [0]*pad + mask behavior_events_padded.append(e_seq); behavior_products_padded.append(p_seq); behavior_masks.append(mask) df['be'] = behavior_events_padded; df['bp'] = behavior_products_padded; df['bm'] = behavior_masks df['cp'] = df['candidate_product'].map(product_vocab) train_df = df.sample(frac=0.8, random_state=seed) test_df = df.drop(train_df.index) device = torch.device('cpu') # ===== DIEN Model Implementation ===== class AUGRUCell(nn.Module): """Attentional Update Gate Recurrent Unit - core DIEN component""" def __init__(self, input_dim, hidden_dim): super().__init__() self.input_dim = input_dim self.hidden_dim = hidden_dim self.W_ug = nn.Linear(input_dim + hidden_dim, hidden_dim) self.W_rg = nn.Linear(input_dim + hidden_dim, hidden_dim) self.W_cand = nn.Linear(input_dim + hidden_dim, hidden_dim) def forward(self, x_t, h_prev, attn_t): concat = torch.cat([x_t, h_prev], dim=-1) r_t = torch.sigmoid(self.W_rg(concat)) u_t = torch.sigmoid(self.W_ug(concat)) u_t_att = attn_t * u_t r_concat = torch.cat([x_t, r_t * h_prev], dim=-1) h_tilde = torch.tanh(self.W_cand(r_concat)) h_t = (1 - u_t_att) * h_prev + u_t_att * h_tilde return h_t class SimpleDIEN(nn.Module): def __init__(self, num_events, num_products, d_model=64, max_len=20): super().__init__() self.d_model = d_model self.max_len = max_len self.event_emb = nn.Embedding(num_events+1, d_model//2, padding_idx=0) self.prod_emb = nn.Embedding(num_products+1, d_model//2, padding_idx=0) self.cand_emb = nn.Embedding(num_products+1, d_model) self.gru = nn.GRU(input_size=d_model, hidden_size=d_model, batch_first=True) self.augru = AUGRUCell(d_model, d_model) self.attn = nn.Sequential(nn.Linear(d_model * 4, 128), nn.ReLU(), nn.Linear(128, 1)) self.mlp = nn.Sequential(nn.Linear(d_model * 3, 256), nn.ReLU(), nn.Dropout(0.3), nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.3), nn.Linear(128, 1)) def forward(self, be, bp, bm, cp): B, L = be.size() e_emb = self.event_emb(be) p_emb = self.prod_emb(bp) beh_emb = torch.cat([e_emb, p_emb], dim=-1) cand_emb = self.cand_emb(cp) gru_out, _ = self.gru(beh_emb) h_t = torch.zeros(B, self.d_model, device=beh_emb.device) for t in range(L): gru_t = gru_out[:, t, :] cand_exp = cand_emb diff = cand_exp - gru_t prod_feat = cand_exp * gru_t attn_in = torch.cat([cand_exp, gru_t, diff, prod_feat], dim=-1) attn_t = torch.sigmoid(self.attn(attn_in)) mask_t = bm[:, t:t+1].float() h_new = self.augru(gru_t, h_t, attn_t) h_t = mask_t * h_new + (1 - mask_t) * h_t final_interest = h_t interest_prod = final_interest * cand_emb x = torch.cat([final_interest, cand_emb, interest_prod], dim=-1) return self.mlp(x).squeeze(-1) model = SimpleDIEN(len(all_events), len(all_products), d_model=embedding_dim).to(device) criterion = nn.BCEWithLogitsLoss() optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5) for epoch in range(epochs): model.train(); epoch_loss = 0 for i in range(0, len(train_df), batch_size): batch = train_df.iloc[i:i+batch_size] be = torch.tensor(np.stack(batch['be'].values), dtype=torch.long).to(device) bp = torch.tensor(np.stack(batch['bp'].values), dtype=torch.long).to(device) bm = torch.tensor(np.stack(batch['bm'].values), dtype=torch.bool).to(device) cp = torch.tensor(batch['cp'].values, dtype=torch.long).to(device) labels = torch.tensor(batch['label'].values, dtype=torch.float32).to(device) optimizer.zero_grad() outputs = model(be, bp, bm, cp) loss = criterion(outputs, labels) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0) optimizer.step() epoch_loss += loss.item() avg_loss = epoch_loss * batch_size / len(train_df) if (epoch+1) % max(1, epochs//5) == 0 or epoch == 0: print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}") model.eval() with torch.no_grad(): be = torch.tensor(np.stack(test_df['be'].values), dtype=torch.long).to(device) bp = torch.tensor(np.stack(test_df['bp'].values), dtype=torch.long).to(device) bm = torch.tensor(np.stack(test_df['bm'].values), dtype=torch.bool).to(device) cp = torch.tensor(test_df['cp'].values, dtype=torch.long).to(device) labels = test_df['label'].values preds = torch.sigmoid(model(be, bp, bm, cp)).cpu().numpy() auc = float(roc_auc_score(labels, preds)) ap = float(average_precision_score(labels, preds)) f1 = float(f1_score(labels, preds > 0.5)) acc = float(accuracy_score(labels, preds > 0.5)) os.makedirs("outputs", exist_ok=True) torch.save({ 'model_state_dict': model.state_dict(), 'event_vocab': event_vocab, 'product_vocab': product_vocab, 'embedding_dim': embedding_dim, 'max_seq_len': max_seq_len, 'num_events': len(all_events), 'num_products': len(all_products), 'metrics': {'auc': auc, 'ap': ap, 'f1': f1, 'acc': acc} }, 'outputs/dien_model.pt') # Visualizations fig, ax = plt.subplots(figsize=(10,6)) product_perf = {} for idx, row in test_df.iterrows(): prod = row['candidate_product'] if prod not in product_perf: product_perf[prod] = {'preds': [], 'labels': []} product_perf[prod]['preds'].append(preds[idx]) product_perf[prod]['labels'].append(row['label']) prod_aucs = [] for prod, data in product_perf.items(): if len(set(data['labels'])) > 1 and len(data['labels']) >= 5: prod_auc = roc_auc_score(data['labels'], data['preds']) prod_aucs.append((prod, prod_auc, np.mean(data['labels']))) if prod_aucs: prod_aucs.sort(key=lambda x: x[1], reverse=True) prods, aucs, rates = zip(*prod_aucs) x = np.arange(len(prods)) ax.bar(x, aucs, color='steelblue', alpha=0.7, label='AUC') ax2 = ax.twinx() ax2.plot(x, rates, 'ro-', label='Conversion Rate') ax.set_xticks(x); ax.set_xticklabels(prods, rotation=45, ha='right') ax.set_ylabel('AUC', color='steelblue'); ax2.set_ylabel('Conversion Rate', color='red') ax.set_title('DIEN - Product Recommendation Performance', fontweight='bold') ax.legend(loc='upper left'); ax2.legend(loc='upper right') plt.tight_layout() fig_path1 = "outputs/dien_product_performance.png" plt.savefig(fig_path1, dpi=150); plt.close() fig, ax = plt.subplots(figsize=(12,6)) sample_idx = 0 with torch.no_grad(): be_s = be[sample_idx:sample_idx+1] bp_s = bp[sample_idx:sample_idx+1] bm_s = bm[sample_idx:sample_idx+1] cp_s = cp[sample_idx:sample_idx+1] B, L = be_s.size() e_emb = model.event_emb(be_s) p_emb = model.prod_emb(bp_s) beh_emb = torch.cat([e_emb, p_emb], dim=-1) cand_emb = model.cand_emb(cp_s) gru_out, _ = model.gru(beh_emb) h_t = torch.zeros(B, model.d_model, device=beh_emb.device) attn_weights = [] interest_norms = [] for t in range(L): gru_t = gru_out[:, t, :] cand_exp = cand_emb diff = cand_exp - gru_t prod_feat = cand_exp * gru_t attn_in = torch.cat([cand_exp, gru_t, diff, prod_feat], dim=-1) attn_t = torch.sigmoid(model.attn(attn_in)) h_new = model.augru(gru_t, h_t, attn_t) mask_t = bm_s[:, t:t+1].float() h_t = mask_t * h_new + (1 - mask_t) * h_t attn_weights.append(attn_t.item()) interest_norms.append(torch.norm(h_t).item()) valid_len = bm_s[0].sum().item() valid_attn = attn_weights[-valid_len:] if valid_len > 0 else attn_weights valid_norms = interest_norms[-valid_len:] if valid_len > 0 else interest_norms ax.plot(range(len(valid_attn)), valid_attn, 'o-', color='coral', linewidth=2, label='Attention Weight', markersize=6) ax_twin = ax.twinx() ax_twin.plot(range(len(valid_norms)), valid_norms, 's--', color='steelblue', linewidth=2, label='Interest Norm (L2)', markersize=6) ax.set_xlabel('Behavior Position') ax.set_ylabel('Attention Weight', color='coral') ax_twin.set_ylabel('Interest Norm', color='steelblue') ax.set_title('DIEN - Interest Evolution (Sample User)', fontweight='bold') ax.legend(loc='upper left') ax_twin.legend(loc='upper right') ax.grid(True, alpha=0.3) plt.tight_layout() fig_path2 = "outputs/dien_interest_evolution.png" plt.savefig(fig_path2, dpi=150); plt.close() fig, ax = plt.subplots(figsize=(8,6)) fpr, tpr, _ = roc_curve(labels, preds) ax.plot(fpr, tpr, label=f'DIEN AUC={auc:.3f}', linewidth=2, color='#2E86AB') ax.plot([0,1], [0,1], 'k--', alpha=0.5) ax.set_xlabel('False Positive Rate'); ax.set_ylabel('True Positive Rate') ax.set_title('ROC Curve - DIEN Product Recommendation', fontweight='bold') ax.legend(); ax.grid(True, alpha=0.3) plt.tight_layout() fig_path3 = "outputs/dien_roc.png" plt.savefig(fig_path3, dpi=150); plt.close() fig, ax = plt.subplots(figsize=(8,6)) prec, rec, _ = precision_recall_curve(labels, preds) ax.plot(rec, prec, label=f'DIEN AP={ap:.3f}', linewidth=2, color='#A23B72') ax.set_xlabel('Recall'); ax.set_ylabel('Precision') ax.set_title('Precision-Recall Curve - DIEN', fontweight='bold') ax.legend(); ax.grid(True, alpha=0.3) plt.tight_layout() fig_path4 = "outputs/dien_pr.png" plt.savefig(fig_path4, dpi=150); plt.close() result_text = f"""=== DIEN (Deep Interest Evolution Network) Product Recommendation Model === Samples: {n_users} | Products: {len(all_products)} Event vocab: {len(all_events)} | Product vocab: {len(all_products)} Train: {len(train_df)} | Test: {len(test_df)} --- DIEN Architecture (3 layers) --- Layer 1: Embedding - event_emb({len(all_events)+1} -> {embedding_dim//2}) + prod_emb({len(all_products)+1} -> {embedding_dim//2}) Layer 2: Interest Extractor (GRU) - Input: concat(event_emb, prod_emb) -> GRU({embedding_dim} -> {embedding_dim}) Layer 3: Interest Evolving (AUGRU) - AUGRUCell: Attention-gated recurrent unit - u_t' = a_t * u_t (attention modulated update gate) Layer 4: MLP - [emb*3] -> 256 -> 128 -> 1 --- Training Config --- Epochs: {epochs} | Batch size: {batch_size} | LR: {lr} Optimizer: Adam (weight_decay=1e-5) | Gradient clip: max_norm=5.0 --- Test Results --- AUC: {auc:.4f} AP: {ap:.4f} F1: {f1:.4f} Accuracy: {acc:.4f} --- DIEN vs DIN --- 1. [GRU Interest Extractor] Models temporal dependencies in behavior sequences 2. [AUGRU Interest Evolving] Attention-modulated update gate, preserves only target-relevant interest evolution 3. [Better cold-start] Short sequences benefit from GRU temporal modeling --- Model File --- Saved to: outputs/dien_model.pt Upload to HF Hub via Model Management tab""" return result_text, fig_path1, fig_path2, fig_path3, fig_path4 # ============================================================================= # TabBERT 异常检测 # ============================================================================= def generate_anomaly_data(n_normal=800, n_anomaly=200, seed=42): random.seed(seed); np.random.seed(seed) normal_records = [] for i in range(n_normal): normal_records.append({ 'user_id': i, 'claim_amount': random.uniform(1000, 50000), 'claim_type': random.choice(["health","auto","property"]), 'days_since_policy': random.randint(30, 365), 'num_previous_claims': random.randint(0, 3), 'document_count': random.randint(3, 10), 'processing_time_days': random.uniform(1, 15), 'label': 0, }) anomaly_records = [] for i in range(n_anomaly): anomaly_records.append({ 'user_id': n_normal + i, 'claim_amount': random.uniform(50000, 200000), 'claim_type': random.choice(["health","auto","property"]), 'days_since_policy': random.randint(1, 15), 'num_previous_claims': random.randint(5, 20), 'document_count': random.randint(0, 2), 'processing_time_days': random.uniform(0.1, 2), 'label': 1, }) df = pd.DataFrame(normal_records + anomaly_records) df = df.sample(frac=1, random_state=seed).reset_index(drop=True) return df def train_tabbert_anomaly(n_normal, n_anomaly, d_model, epochs, batch_size, lr, seed): if not TORCH_AVAILABLE: return "❌ PyTorch 未安装。请在 requirements.txt 中添加 torch 并重启 Space。", None, None, None, None torch.manual_seed(seed); np.random.seed(seed); random.seed(seed) df = generate_anomaly_data(n_normal=n_normal, n_anomaly=n_anomaly, seed=seed) claim_type_map = {"health": 0, "auto": 1, "property": 2} df['claim_type_enc'] = df['claim_type'].map(claim_type_map) feature_cols = ['claim_amount', 'claim_type_enc', 'days_since_policy', 'num_previous_claims', 'document_count', 'processing_time_days'] X = df[feature_cols].values.astype(np.float32) y = df['label'].values.astype(np.float32) scaler = StandardScaler() X_s = scaler.fit_transform(X) X_train, X_test, y_train, y_test = train_test_split( X_s, y, test_size=0.2, random_state=seed, stratify=y ) device = torch.device('cpu') class SimpleTabBERT(nn.Module): def __init__(self, input_dim=6, d_model=128, n_layers=4): super().__init__() self.input_proj = nn.Linear(input_dim, d_model) layers = [] for _ in range(n_layers): layers.extend([ nn.Linear(d_model, d_model*4), nn.ReLU(), nn.Dropout(0.2), nn.Linear(d_model*4, d_model), nn.LayerNorm(d_model), nn.ReLU(), nn.Dropout(0.2), ]) self.transformer = nn.Sequential(*layers) self.head = nn.Sequential(nn.Linear(d_model, 256), nn.ReLU(), nn.Dropout(0.3), nn.Linear(256, 64), nn.ReLU(), nn.Linear(64, 1)) def forward(self, x): x = self.input_proj(x) x = self.transformer(x) return self.head(x).squeeze(-1) model = SimpleTabBERT(input_dim=len(feature_cols), d_model=d_model).to(device) class FocalLoss(nn.Module): def __init__(self, alpha=0.25, gamma=2.0): super().__init__(); self.alpha = alpha; self.gamma = gamma def forward(self, inputs, targets): bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none') pt = torch.exp(-bce) return (self.alpha * (1-pt)**self.gamma * bce).mean() criterion = FocalLoss(alpha=0.25, gamma=2.0) optimizer = torch.optim.Adam(model.parameters(), lr=lr) X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device) y_train_t = torch.tensor(y_train, dtype=torch.float32).to(device) X_test_t = torch.tensor(X_test, dtype=torch.float32).to(device) y_test_t = torch.tensor(y_test, dtype=torch.float32).to(device) for epoch in range(epochs): model.train(); epoch_loss = 0 n_batches = math.ceil(len(X_train_t) / batch_size) for i in range(n_batches): start = i * batch_size; end = min(start + batch_size, len(X_train_t)) xb = X_train_t[start:end]; yb = y_train_t[start:end] optimizer.zero_grad() outputs = model(xb); loss = criterion(outputs, yb) loss.backward(); optimizer.step() epoch_loss += loss.item() if (epoch+1) % max(1, epochs//5) == 0 or epoch == 0: print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/n_batches:.4f}") model.eval() with torch.no_grad(): preds = torch.sigmoid(model(X_test_t)).cpu().numpy() auc = float(roc_auc_score(y_test, preds)) ap = float(average_precision_score(y_test, preds)) f1 = float(f1_score(y_test, preds > 0.5)) # 保存模型 torch.save({ 'model_state_dict': model.state_dict(), 'feature_cols': feature_cols, 'd_model': d_model, 'scaler_mean': scaler.mean_, 'scaler_scale': scaler.scale_, 'metrics': {'auc': auc, 'ap': ap, 'f1': f1} }, 'outputs/tabbert_model.pt') os.makedirs("outputs", exist_ok=True) baseline_auc = auc importances = [] for i in range(len(feature_cols)): X_perm = X_test.copy() np.random.shuffle(X_perm[:, i]) X_perm_t = torch.tensor(X_perm, dtype=torch.float32).to(device) with torch.no_grad(): perm_preds = torch.sigmoid(model(X_perm_t)).cpu().numpy() perm_auc = roc_auc_score(y_test, perm_preds) importances.append(baseline_auc - perm_auc) fig, ax = plt.subplots(figsize=(10,6)) colors = ['red' if imp > 0 else 'gray' for imp in importances] ax.barh(feature_cols, importances, color=colors) ax.set_title('TabularBERT - Feature Importance (Permutation)', fontweight='bold') ax.set_xlabel('AUC Drop (Importance)') plt.tight_layout() fig_path1 = "outputs/tabbert_feature_importance.png" plt.savefig(fig_path1, dpi=150); plt.close() fig, ax = plt.subplots(figsize=(10,6)) normal_scores = preds[y_test == 0]; anomaly_scores = preds[y_test == 1] ax.hist(normal_scores, bins=30, alpha=0.6, label=f'Normal (n={len(normal_scores)})', color='steelblue', edgecolor='white') ax.hist(anomaly_scores, bins=30, alpha=0.6, label=f'Anomaly (n={len(anomaly_scores)})', color='red', edgecolor='white') ax.axvline(x=0.5, color='black', linestyle='--', label='Threshold=0.5') ax.set_xlabel('Anomaly Score'); ax.set_ylabel('Count') ax.set_title('Anomaly Score Distribution', fontweight='bold') ax.legend(); ax.grid(True, alpha=0.3) plt.tight_layout() fig_path2 = "outputs/tabbert_distribution.png" plt.savefig(fig_path2, dpi=150); plt.close() fig, ax = plt.subplots(figsize=(8,6)) fpr, tpr, _ = roc_curve(y_test, preds) ax.plot(fpr, tpr, label=f'TabBERT AUC={auc:.3f}', linewidth=2, color='#2E86AB') ax.plot([0,1], [0,1], 'k--', alpha=0.5) ax.set_xlabel('False Positive Rate'); ax.set_ylabel('True Positive Rate') ax.set_title('ROC Curve - Anomaly Detection', fontweight='bold') ax.legend(); ax.grid(True, alpha=0.3) plt.tight_layout() fig_path3 = "outputs/tabbert_roc.png" plt.savefig(fig_path3, dpi=150); plt.close() fig, axs = plt.subplots(1, 2, figsize=(14,6)) cm = confusion_matrix(y_test, preds > 0.5) sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axs[0], cbar=False) axs[0].set_title(f'Confusion Matrix @ threshold=0.5\n(F1={f1:.3f})', fontweight='bold') axs[0].set_xlabel('Predicted'); axs[0].set_ylabel('Actual') thresholds = np.linspace(0.1, 0.9, 50) f1s = [f1_score(y_test, preds > t) for t in thresholds] precs = [precision_score(y_test, preds > t, zero_division=0) for t in thresholds] recs = [recall_score(y_test, preds > t, zero_division=0) for t in thresholds] axs[1].plot(thresholds, f1s, label='F1', linewidth=2) axs[1].plot(thresholds, precs, label='Precision', linewidth=2) axs[1].plot(thresholds, recs, label='Recall', linewidth=2) best_t = thresholds[np.argmax(f1s)] axs[1].axvline(x=best_t, color='red', linestyle='--', label=f'Best F1 @ {best_t:.2f}') axs[1].set_xlabel('Threshold'); axs[1].set_ylabel('Score') axs[1].set_title('Threshold Analysis', fontweight='bold') axs[1].legend(); axs[1].grid(True, alpha=0.3) plt.tight_layout() fig_path4 = "outputs/tabbert_threshold.png" plt.savefig(fig_path4, dpi=150); plt.close() result_text = f"""=== TabularBERT 异常行为检测模型 === 样本数: {len(df)} (正常: {n_normal}, 异常: {n_anomaly}) 特征数: {len(feature_cols)} 训练集: {len(y_train)} | 测试集: {len(y_test)} --- 模型架构 --- Input dim: {len(feature_cols)} → d_model: {d_model} Transformer layers: {4} (模拟层次化BERT) Head: {d_model} → 256 → 64 → 1 Loss: Focal Loss (α=0.25, γ=2.0) --- 训练配置 --- Epochs: {epochs} | Batch size: {batch_size} | LR: {lr} Optimizer: Adam --- 测试集效果 --- AUC: {auc:.4f} AP: {ap:.4f} F1: {f1:.4f} @ threshold=0.5 Best F1: {max(f1s):.4f} @ threshold={best_t:.2f} --- 模型洞察 --- 1. Focal Loss 自动聚焦难分异常样本, 解决类别不平衡 2. 关键异常特征: claim_amount(高), days_since_policy(短), document_count(少) 3. 建议阈值: {best_t:.2f} (平衡精确率与召回率) 4. 高AUC说明模型能很好区分正常与异常理赔 --- 模型文件 --- 模型已保存至: outputs/tabbert_model.pt 可使用"模型管理"Tab上传至Hugging Face Hub""" return result_text, fig_path1, fig_path2, fig_path3, fig_path4 # ============================================================================= # 模型管理 — 保存/加载到 Hugging Face Hub # ============================================================================= def save_model_to_hub(repo_id, token, model_type, notes): """将训练好的模型保存到 Hugging Face Hub""" if not HFHUB_AVAILABLE: return "❌ huggingface_hub 未安装。无法保存到 Hub。", None if not token or not token.strip(): return "❌ 需要提供 Hugging Face Token。在 https://huggingface.co/settings/tokens 获取。", None try: api = HfApi(token=token.strip()) create_repo(repo_id, repo_type="model", exist_ok=True, token=token.strip()) with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) # 收集所有模型文件 model_files = [] artifacts = {} # 检查 sklearn 模型 sklearn_path = Path("outputs/sklearn_model_artifacts.joblib") if sklearn_path.exists(): artifacts['sklearn'] = joblib.load(sklearn_path) joblib.dump(artifacts['sklearn'], tmpdir / "sklearn_model.joblib") model_files.append("sklearn_model.joblib") # 检查 DIEN 模型 dien_path = Path("outputs/dien_model.pt") if dien_path.exists(): artifacts['dien'] = torch.load(dien_path, map_location='cpu') torch.save(artifacts['dien'], tmpdir / "dien_model.pt") model_files.append("dien_model.pt") # 检查 TabBERT 模型 tab_path = Path("outputs/tabbert_model.pt") if tab_path.exists(): artifacts['tabbert'] = torch.load(tab_path, map_location='cpu') torch.save(artifacts['tabbert'], tmpdir / "tabbert_model.pt") model_files.append("tabbert_model.pt") if not model_files: return "❌ 未找到训练好的模型。请先在其他Tab训练模型。", None # 保存元数据 metadata = { "model_type": model_type, "notes": notes, "files": model_files, "timestamp": datetime.datetime.now().isoformat(), "insurance_app_behavior": True, "version": "3.0" } with open(tmpdir / "model_metadata.json", "w") as f: json.dump(metadata, f, indent=2, ensure_ascii=False) # 保存 README readme = f"""# Insurance App Behavior Model **Model Type:** {model_type} **Notes:** {notes} **Date:** {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')} ## Files | File | Description | |------|-------------| | `sklearn_model.joblib` | GBDT + Random Forest + Scaler (sklearn) | | `dien_model.pt` | Deep Interest Evolution Network (PyTorch) | | `tabbert_model.pt` | TabularBERT Anomaly Detection (PyTorch) | | `model_metadata.json` | Model metadata | ## Usage ```python from huggingface_hub import hf_hub_download import joblib import torch # Load sklearn models model_path = hf_hub_download(repo_id="{repo_id}", filename="sklearn_model.joblib") artifacts = joblib.load(model_path) # artifacts['gbdt'], artifacts['rf'], artifacts['scaler'] # Load DIEN dien_path = hf_hub_download(repo_id="{repo_id}", filename="dien_model.pt") dien_ckpt = torch.load(dien_path) # dien_ckpt['model_state_dict'], dien_ckpt['event_vocab'], dien_ckpt['product_vocab'] ``` ## Reference - Deep Interest Network (KDD 2018): https://arxiv.org/abs/1706.06978 - TabBERT (arXiv 2011.01843): https://arxiv.org/abs/2011.01843 """ with open(tmpdir / "README.md", "w") as f: f.write(readme) api.upload_folder( folder_path=str(tmpdir), repo_id=repo_id, repo_type="model", token=token.strip() ) return f"✅ 模型已成功保存到 https://huggingface.co/{repo_id}", None except Exception as e: import traceback return f"❌ 保存失败: {str(e)}\n\n{traceback.format_exc()}", None def load_model_from_hub(repo_id, token, model_type): """从 Hugging Face Hub 加载模型""" if not HFHUB_AVAILABLE: return "❌ huggingface_hub 未安装。无法从 Hub 加载。", None, None, None if not token or not token.strip(): return "❌ 需要提供 Hugging Face Token。", None, None, None try: token = token.strip() # 尝试下载元数据 metadata_path = hf_hub_download(repo_id=repo_id, filename="model_metadata.json", token=token, repo_type="model") with open(metadata_path) as f: metadata = json.load(f) results = [f"✅ 成功加载模型: {repo_id}", f"模型类型: {metadata.get('model_type', 'Unknown')}", f"备注: {metadata.get('notes', 'N/A')}", f"时间: {metadata.get('timestamp', 'N/A')}", f"文件列表: {', '.join(metadata.get('files', []))}", "---"] images = [] # 加载 sklearn 模型 if "sklearn_model.joblib" in metadata.get('files', []): sklearn_path = hf_hub_download(repo_id=repo_id, filename="sklearn_model.joblib", token=token, repo_type="model") artifacts = joblib.load(sklearn_path) metrics = artifacts.get('metrics', {}) results.append(f"📦 sklearn 模型已加载") results.append(f" GBDT AUC: {metrics.get('auc_gbdt', 'N/A')}") results.append(f" RF AUC: {metrics.get('auc_rf', 'N/A')}") results.append(f" 特征数: {len(artifacts.get('feature_names', []))}") # 特征重要性图 if 'rf' in artifacts: fig, ax = plt.subplots(figsize=(10,6)) fi = pd.DataFrame({'feature': artifacts['feature_names'], 'importance': artifacts['rf'].feature_importances_}) fi = fi.sort_values('importance', ascending=False).head(10) ax.barh(fi['feature'][::-1], fi['importance'][::-1], color='steelblue') ax.set_title('Loaded Model - Feature Importance', fontweight='bold') plt.tight_layout() img_path = "outputs/loaded_feature_importance.png" plt.savefig(img_path, dpi=150); plt.close() images.append(img_path) # 加载 DIEN if "dien_model.pt" in metadata.get('files', []): dien_path = hf_hub_download(repo_id=repo_id, filename="dien_model.pt", token=token, repo_type="model") dien_ckpt = torch.load(dien_path, map_location='cpu') metrics = dien_ckpt.get('metrics', {}) results.append(f"📦 DIEN 模型已加载") results.append(f" AUC: {metrics.get('auc', 'N/A')}") results.append(f" Embedding dim: {dien_ckpt.get('embedding_dim', 'N/A')}") results.append(f" Event vocab: {len(dien_ckpt.get('event_vocab', {}))}") results.append(f" Product vocab: {len(dien_ckpt.get('product_vocab', {}))}") # 加载 TabBERT if "tabbert_model.pt" in metadata.get('files', []): tab_path = hf_hub_download(repo_id=repo_id, filename="tabbert_model.pt", token=token, repo_type="model") tab_ckpt = torch.load(tab_path, map_location='cpu') metrics = tab_ckpt.get('metrics', {}) results.append(f"📦 TabBERT 模型已加载") results.append(f" AUC: {metrics.get('auc', 'N/A')}") results.append(f" d_model: {tab_ckpt.get('d_model', 'N/A')}") results.append(f" 特征: {', '.join(tab_ckpt.get('feature_cols', []))}") return "\n".join(results), images[0] if images else None, images[1] if len(images) > 1 else None, images[2] if len(images) > 2 else None except Exception as e: import traceback return f"❌ 加载失败: {str(e)}\n\n{traceback.format_exc()}", None, None, None # ============================================================================= # 生存分析 — lifelines + DeepSurv # ============================================================================= def generate_survival_data(n_samples=2000, seed=42): """生成保险生存分析合成数据""" random.seed(seed); np.random.seed(seed) records = [] for i in range(n_samples): age = random.randint(18, 75) gender = random.choice([0, 1]) # 0=female, 1=male income = random.uniform(30000, 200000) policy_type = random.choice(["term_life", "whole_life", "health", "auto", "property"]) premium_amount = random.uniform(1000, 50000) coverage_amount = premium_amount * random.uniform(10, 100) risk_score = random.uniform(0, 1) # 根据特征计算基础风险 base_hazard = ( 0.001 * (age - 18) + # 年龄越大风险越高 0.05 * gender + # 性别差异 0.00001 * (200000 - income) + # 收入越低风险越高 0.1 * risk_score + # 风险评分 random.gauss(0, 0.05) # 噪声 ) # 保单类型调整 policy_hazard = {"term_life": 0.02, "whole_life": 0.01, "health": 0.05, "auto": 0.03, "property": 0.01}[policy_type] total_hazard = base_hazard + policy_hazard total_hazard = max(total_hazard, 0.001) # 最小风险 # 指数分布: time ~ Exp(lambda) time_to_event = random.expovariate(total_hazard) # 右删失: 最大观察时间 3650天 (10年) max_observation = 3650 event_observed = 1 if time_to_event < max_observation else 0 duration = min(time_to_event, max_observation) records.append({ 'user_id': f"user_{i:04d}", 'age': age, 'gender': gender, 'income': income, 'policy_type': policy_type, 'premium_amount': premium_amount, 'coverage_amount': coverage_amount, 'risk_score': risk_score, 'duration': duration, 'event_observed': event_observed, }) return pd.DataFrame(records) def train_survival_analysis(n_samples, test_size, seed, use_deep_surv, epochs, lr): """训练生存分析模型""" df = generate_survival_data(n_samples=n_samples, seed=seed) # 编码分类变量 df['policy_type_enc'] = pd.Categorical(df['policy_type']).codes # 特征列 feature_cols = ['age', 'gender', 'income', 'policy_type_enc', 'premium_amount', 'coverage_amount', 'risk_score'] # 划分训练/测试 train_df = df.sample(frac=1-test_size, random_state=seed) test_df = df.drop(train_df.index) os.makedirs("outputs", exist_ok=True) # ===== 1. lifelines Cox-PH ===== results = ["=== 保险理赔/购买时序生存分析 ===", f"总样本: {len(df)} | 训练: {len(train_df)} | 测试: {len(test_df)}", f"事件发生率: {df['event_observed'].mean():.1%} ({df['event_observed'].sum()}/{len(df)})", f"平均观察时长: {df['duration'].mean():.0f} 天", "---"] cph_figures = [] if LIFELINES_AVAILABLE: # Kaplan-Meier 曲线 fig, ax = plt.subplots(figsize=(10,6)) kmf = KaplanMeierFitter() # 整体 kmf.fit(df['duration'], df['event_observed'], label='Overall') kmf.plot_survival_function(ax=ax, ci_show=True, color='steelblue', linewidth=2) # 按性别分组 for gender, color in [(0, '#E74C3C'), (1, '#2ECC71')]: sub = df[df['gender'] == gender] kmf.fit(sub['duration'], sub['event_observed'], label=f'{"Female" if gender==0 else "Male"}') kmf.plot_survival_function(ax=ax, ci_show=False, color=color, linestyle='--', linewidth=2) ax.set_title('Kaplan-Meier Survival Curve', fontsize=14, fontweight='bold') ax.set_xlabel('Duration (days)', fontsize=12) ax.set_ylabel('Survival Probability S(t)', fontsize=12) ax.legend(fontsize=11); ax.grid(True, alpha=0.3) plt.tight_layout() km_path = "outputs/survival_kaplan_meier.png" plt.savefig(km_path, dpi=150); plt.close() cph_figures.append(km_path) # Cox-PH 模型 cph = CoxPHFitter(penalizer=0.1) cph_train = train_df[feature_cols + ['duration', 'event_observed']].copy() try: cph.fit(cph_train, duration_col='duration', event_col='event_observed') # 系数可视化 fig, ax = plt.subplots(figsize=(10,6)) summary = cph.summary.copy() summary['coef'] = summary['coef'].astype(float) summary['exp(coef)'] = summary['exp(coef)'].astype(float) summary = summary.sort_values('coef') colors = ['green' if c < 0 else 'red' for c in summary['coef']] ax.barh(summary.index, summary['coef'], color=colors, alpha=0.7, edgecolor='white') ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5) ax.set_title('Cox-PH Coefficients (log Hazard Ratio)', fontsize=14, fontweight='bold') ax.set_xlabel('Coefficient') plt.tight_layout() coef_path = "outputs/survival_cox_coefficients.png" plt.savefig(coef_path, dpi=150); plt.close() cph_figures.append(coef_path) # 预测生存函数 (测试集前5个样本) fig, ax = plt.subplots(figsize=(10,6)) test_subset = test_df.head(5) predictions = cph.predict_survival_function(test_subset[feature_cols]) for i, col in enumerate(predictions.columns): ax.plot(predictions.index, predictions[col], label=f'Sample {i+1}', linewidth=2, alpha=0.8) ax.set_title('Predicted Survival Functions (Test Samples)', fontsize=14, fontweight='bold') ax.set_xlabel('Duration (days)', fontsize=12) ax.set_ylabel('Survival Probability', fontsize=12) ax.legend(fontsize=10); ax.grid(True, alpha=0.3) plt.tight_layout() pred_path = "outputs/survival_predictions.png" plt.savefig(pred_path, dpi=150); plt.close() cph_figures.append(pred_path) # Concordance Index from lifelines.utils import concordance_index pred_risk = cph.predict_partial_hazard(test_df[feature_cols]) c_index = concordance_index(test_df['duration'], -pred_risk, test_df['event_observed']) results.append("--- lifelines Cox-PH ---") results.append(f"Concordance Index: {c_index:.4f}") results.append(f"Log-likelihood: {cph.log_likelihood_:.2f}") results.append(f"AIC: {cph.AIC_partial_:.2f}") results.append("") results.append("--- Cox-PH 系数 (Top 影响因子) ---") for idx, row in cph.summary.head(7).iterrows(): hr = float(row['exp(coef)']) results.append(f" {idx}: HR={hr:.3f} (p={row['p']:.4f})") results.append("") results.append("HR > 1: 风险增加 | HR < 1: 风险降低") except Exception as e: results.append(f"⚠️ Cox-PH 拟合失败: {str(e)}") else: results.append("⚠️ lifelines 未安装。统计生存分析功能禁用。") # ===== 2. DeepSurv (PyTorch) ===== deep_surv_result = "" if use_deep_surv and TORCH_AVAILABLE: results.append("--- DeepSurv (Neural Cox-PH) ---") X_train = train_df[feature_cols].values.astype(np.float32) X_test = test_df[feature_cols].values.astype(np.float32) scaler = StandardScaler() X_train_s = scaler.fit_transform(X_train) X_test_s = scaler.transform(X_test) T_train = train_df['duration'].values.astype(np.float32) E_train = train_df['event_observed'].values.astype(np.float32) T_test = test_df['duration'].values.astype(np.float32) E_test = test_df['event_observed'].values.astype(np.float32) device = torch.device('cpu') class DeepSurv(nn.Module): def __init__(self, input_dim, hidden_dims=[128, 64, 32], dropout=0.3): super().__init__() layers = [] prev = input_dim for h in hidden_dims: layers.extend([nn.Linear(prev, h), nn.ReLU(), nn.Dropout(dropout)]) prev = h layers.append(nn.Linear(prev, 1)) self.net = nn.Sequential(*layers) def forward(self, x): return self.net(x).squeeze(-1) model = DeepSurv(input_dim=len(feature_cols), hidden_dims=[128, 64, 32]).to(device) optimizer = torch.optim.Adam(model.parameters(), lr=lr) # Cox partial likelihood loss def cox_ph_loss(pred, time, event): """Negative Cox partial likelihood""" # Sort by time descending idx = torch.argsort(time, descending=True) pred_sorted = pred[idx] event_sorted = event[idx] # logcumsumexp for numerical stability log_cumsum_h = torch.logcumsumexp(pred_sorted, dim=0) # Only event samples contribute loss = -torch.sum(event_sorted * (pred_sorted - log_cumsum_h)) / event_sorted.sum().clamp(min=1) return loss X_train_t = torch.tensor(X_train_s, dtype=torch.float32).to(device) T_train_t = torch.tensor(T_train, dtype=torch.float32).to(device) E_train_t = torch.tensor(E_train, dtype=torch.float32).to(device) # Training model.train() for epoch in range(epochs): optimizer.zero_grad() pred = model(X_train_t) loss = cox_ph_loss(pred, T_train_t, E_train_t) loss.backward() optimizer.step() if (epoch+1) % max(1, epochs//5) == 0 or epoch == 0: print(f"DeepSurv Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}") # Evaluation model.eval() with torch.no_grad(): X_test_t = torch.tensor(X_test_s, dtype=torch.float32).to(device) pred_test = model(X_test_t).cpu().numpy() # Concordance Index from lifelines.utils import concordance_index deep_c_index = concordance_index(T_test, -pred_test, E_test) results.append(f"Concordance Index: {deep_c_index:.4f}") results.append(f"Training epochs: {epochs} | LR: {lr}") results.append("") results.append("--- DeepSurv 洞察 ---") results.append("1. 神经网络学习非线性特征交互, 捕捉复杂风险模式") results.append("2. 相比线性Cox-PH, 能建模年龄×收入×风险评分的组合效应") results.append("3. 输出log hazard ratio: 正值=高风险, 负值=低风险") # 保存模型 torch.save({ 'model_state_dict': model.state_dict(), 'feature_cols': feature_cols, 'hidden_dims': [128, 64, 32], 'scaler_mean': scaler.mean_, 'scaler_scale': scaler.scale_, 'metrics': {'concordance_index': deep_c_index} }, 'outputs/deepsurv_model.pt') # 风险分层可视化 fig, ax = plt.subplots(figsize=(10,6)) risk_scores = pred_test risk_percentiles = np.percentile(risk_scores, [33, 66]) low_risk = test_df[risk_scores < risk_percentiles[0]] mid_risk = test_df[(risk_scores >= risk_percentiles[0]) & (risk_scores < risk_percentiles[1])] high_risk = test_df[risk_scores >= risk_percentiles[1]] colors = ['#2ECC71', '#F39C12', '#E74C3C'] labels = ['Low Risk (bottom 33%)', 'Medium Risk (33-66%)', 'High Risk (top 33%)'] for subset, color, label in [(low_risk, colors[0], labels[0]), (mid_risk, colors[1], labels[1]), (high_risk, colors[2], labels[2])]: if len(subset) > 0: kmf = KaplanMeierFitter() kmf.fit(subset['duration'], subset['event_observed'], label=label) kmf.plot_survival_function(ax=ax, ci_show=False, color=color, linewidth=2.5) ax.set_title('Survival by DeepSurv Risk Strata', fontsize=14, fontweight='bold') ax.set_xlabel('Duration (days)', fontsize=12) ax.set_ylabel('Survival Probability', fontsize=12) ax.legend(fontsize=11); ax.grid(True, alpha=0.3) plt.tight_layout() risk_path = "outputs/survival_risk_strata.png" plt.savefig(risk_path, dpi=150); plt.close() cph_figures.append(risk_path) deep_surv_result = f"DeepSurv C-index: {deep_c_index:.4f}" elif use_deep_surv and not TORCH_AVAILABLE: results.append("⚠️ PyTorch 未安装。DeepSurv 禁用。") # 保存 lifelines 结果 results.append("---") results.append(f"所有图表已保存到 outputs/ 目录") results.append(f"模型已保存至: outputs/deepsurv_model.pt (如使用DeepSurv)") result_text = "\n".join(results) return result_text, cph_figures[0] if len(cph_figures) > 0 else None, \ cph_figures[1] if len(cph_figures) > 1 else None, \ cph_figures[2] if len(cph_figures) > 2 else None, \ cph_figures[3] if len(cph_figures) > 3 else None, \ df.head(20) # ============================================================================= # Gradio 回调 # ============================================================================= def demo_train(n_users, n_events, test_size, random_state, use_cv): data = generate_synthetic_data(n_users=n_users, n_events_per_user=n_events, seed=random_state) engineer = InsuranceFeatureEngineer() features_list, labels = [], [] for profile, label in data: f = engineer.extract_user_features(profile) if f: features_list.append(f); labels.append(label) return train_sklearn(features_list, labels, test_size, random_state, use_cv) def csv_train(csv_file, label_col, test_size, random_state, use_cv): if csv_file is None: return "请先上传CSV文件", None, None, None, None, None try: if isinstance(csv_file, str): df = pd.read_csv(csv_file) else: df = pd.read_csv(csv_file.name if hasattr(csv_file, 'name') else io.BytesIO(csv_file)) label_col = label_col.strip() if label_col else None if label_col and label_col not in df.columns: return f"标签列 '{label_col}' 不存在。可用列: {list(df.columns)}", None, None, None, None, None profiles = parse_csv_to_profiles(df) engineer = InsuranceFeatureEngineer() features_list, labels = [], [] for profile in profiles: f = engineer.extract_user_features(profile) if f: features_list.append(f) if label_col and label_col in df.columns: user_df = df[df["user_id"] == profile.user_id] labels.append(int(user_df[label_col].iloc[0])) else: is_high_risk = (f["has_purchased"] == 0 and f["has_renewed"] == 0 and f["total_events"] < 20) labels.append(int(is_high_risk)) if len(features_list) < 50: return f"有效样本数 {len(features_list)} 太少,需要至少50个", None, None, None, None, None return train_sklearn(features_list, labels, test_size, random_state, use_cv) except Exception as e: import traceback return f"错误: {str(e)}\n\n{traceback.format_exc()}", None, None, None, None, None def show_csv_info(csv_file): if csv_file is None: return "请先上传CSV文件", None try: if isinstance(csv_file, str): df = pd.read_csv(csv_file) else: df = pd.read_csv(csv_file.name if hasattr(csv_file, 'name') else io.BytesIO(csv_file)) info = f"""=== CSV文件信息 === 行数: {len(df)} | 列数: {len(df.columns)} 列名: {list(df.columns)} === 前5行 === {df.head().to_string()} === 事件类型分布 (前10) === {df['event_type'].value_counts().head(10).to_string() if 'event_type' in df.columns else '无event_type列'} === 用户数: {df['user_id'].nunique() if 'user_id' in df.columns else 'N/A'} === === 会话数: {df['session_id'].nunique() if 'session_id' in df.columns else 'N/A'} ===""" return info, df.head(20) except Exception as e: return f"解析错误: {str(e)}", None # ============================================================================= # Gradio 界面 (7 Tabs) # ============================================================================= with gr.Blocks(title="🏥 保险APP 用户行为分析模型训练平台 v3.0", theme=gr.themes.Soft()) as demo: gr.Markdown("""# 🏥 保险APP 用户行为分析模型训练平台 v3.0 基于最新研究论文构建的**工业级保险用户行为分析平台**。 **七大功能模块:** 🎲演示 | 📁CSV上传 | 🎯产品推荐(DIN) | 🔍异常检测(TabBERT) | 💾模型管理 | ⏱️生存分析 | ❓帮助 **参考论文:** [DIN](https://arxiv.org/abs/1706.06978) | [Churn Transformer](https://arxiv.org/abs/2309.14390) | [TabBERT](https://arxiv.org/abs/2011.01843) | [DeepSurv](https://arxiv.org/abs/1606.00931) | [RNN Survival](https://arxiv.org/abs/2304.00575)""") with gr.Tabs(): # ===== Tab 1: 演示模式 ===== with gr.Tab("🎲 演示"): with gr.Row(): with gr.Column(scale=1): gr.Markdown("### 参数设置") n_users_slider = gr.Slider(500, 5000, value=2000, step=100, label="用户数量") n_events_slider = gr.Slider(10, 100, value=50, step=5, label="每用户最大事件数") test_size_slider = gr.Slider(0.1, 0.4, value=0.2, step=0.05, label="测试集比例") random_seed = gr.Number(value=42, label="随机种子", precision=0) use_cv_check = gr.Checkbox(value=False, label="启用5折交叉验证") train_btn = gr.Button("🚀 开始训练", variant="primary", size="lg") with gr.Column(scale=2): demo_result = gr.Textbox(label="训练结果", lines=25) with gr.Row(): demo_img1 = gr.Image(label="特征重要性") demo_img2 = gr.Image(label="PR曲线") with gr.Row(): demo_img3 = gr.Image(label="混淆矩阵") demo_img4 = gr.Image(label="ROC曲线") with gr.Row(): demo_table = gr.Dataframe(label="特征数据样本") # ===== Tab 2: CSV上传 ===== with gr.Tab("📁 CSV上传"): with gr.Row(): with gr.Column(scale=1): gr.Markdown("""### 📤 上传数据 **必需列:** `user_id`, `session_id`, `timestamp`, `event_type`, `page_id` **可选:** `product_id`, `amount`, `label`""") csv_file = gr.File(label="上传CSV文件", file_types=[".csv"]) label_col_input = gr.Textbox(label="标签列名 (可选)", placeholder="如: churn") with gr.Row(): csv_test_size = gr.Slider(0.1, 0.4, value=0.2, step=0.05, label="测试集比例") csv_random_seed = gr.Number(value=42, label="随机种子", precision=0) csv_use_cv = gr.Checkbox(value=False, label="启用5折交叉验证") with gr.Row(): info_btn = gr.Button("📊 查看数据信息", variant="secondary") csv_train_btn = gr.Button("🚀 训练模型", variant="primary", size="lg") with gr.Column(scale=2): csv_info = gr.Textbox(label="CSV信息", lines=15) csv_preview = gr.Dataframe(label="数据预览") with gr.Row(): csv_result = gr.Textbox(label="训练结果", lines=25) with gr.Row(): csv_img1 = gr.Image(label="特征重要性") csv_img2 = gr.Image(label="PR曲线") with gr.Row(): csv_img3 = gr.Image(label="混淆矩阵") csv_img4 = gr.Image(label="ROC曲线") with gr.Row(): csv_table = gr.Dataframe(label="特征数据样本") # ===== Tab 3: DIEN 产品推荐 ===== with gr.Tab("🎯 产品推荐 (DIEN)"): gr.Markdown("""### Deep Interest Evolution Network - 保险产品推荐 基于DIEN (AAAI 2019), 通过 GRU兴趣提取 + AUGRU兴趣演化 建模用户对候选保险产品的动态兴趣演化过程。""") with gr.Row(): with gr.Column(scale=1): din_users = gr.Slider(500, 5000, value=2000, step=100, label="用户数量") din_emb = gr.Slider(32, 256, value=64, step=32, label="Embedding维度") din_epochs = gr.Slider(5, 50, value=20, step=5, label="训练轮数") din_batch = gr.Slider(32, 512, value=128, step=32, label="Batch Size") din_lr = gr.Slider(0.0001, 0.01, value=0.001, step=0.0001, label="学习率") din_seed = gr.Number(value=42, label="随机种子", precision=0) din_btn = gr.Button("🚀 训练DIEN模型", variant="primary", size="lg") if not TORCH_AVAILABLE: gr.Markdown("⚠️ **PyTorch 未安装**。请在 requirements.txt 中添加 `torch>=2.0.0` 并重启。") with gr.Column(scale=2): din_result = gr.Textbox(label="训练结果", lines=25) with gr.Row(): din_img1 = gr.Image(label="产品推荐效果") din_img2 = gr.Image(label="注意力权重示例") with gr.Row(): din_img3 = gr.Image(label="ROC曲线") din_img4 = gr.Image(label="PR曲线") # ===== Tab 4: TabBERT 异常检测 ===== with gr.Tab("🔍 异常检测 (TabBERT)"): gr.Markdown("""### TabularBERT - 理赔欺诈/异常检测 层次化Transformer架构, 学习理赔记录的多字段关联和时序模式。""") with gr.Row(): with gr.Column(scale=1): tab_normal = gr.Slider(500, 2000, value=800, step=100, label="正常样本数") tab_anomaly = gr.Slider(100, 1000, value=200, step=50, label="异常样本数") tab_dmodel = gr.Slider(64, 256, value=128, step=64, label="模型维度 d_model") tab_epochs = gr.Slider(10, 100, value=30, step=10, label="训练轮数") tab_batch = gr.Slider(16, 256, value=64, step=16, label="Batch Size") tab_lr = gr.Slider(0.0001, 0.01, value=0.001, step=0.0001, label="学习率") tab_seed = gr.Number(value=42, label="随机种子", precision=0) tab_btn = gr.Button("🚀 训练TabBERT模型", variant="primary", size="lg") if not TORCH_AVAILABLE: gr.Markdown("⚠️ **PyTorch 未安装**。请在 requirements.txt 中添加 `torch>=2.0.0` 并重启。") with gr.Column(scale=2): tab_result = gr.Textbox(label="训练结果", lines=25) with gr.Row(): tab_img1 = gr.Image(label="特征重要性") tab_img2 = gr.Image(label="异常分数分布") with gr.Row(): tab_img3 = gr.Image(label="ROC曲线") tab_img4 = gr.Image(label="混淆矩阵与阈值分析") # ===== Tab 5: 模型管理 ===== with gr.Tab("💾 模型管理"): gr.Markdown("""### Hugging Face Hub 模型管理 保存训练好的模型到 Hub, 或从 Hub 加载已有模型。 **获取 Token:** https://huggingface.co/settings/tokens""") with gr.Row(): with gr.Column(scale=1): gr.Markdown("#### 保存模型到 Hub") save_repo_id = gr.Textbox(label="Hub Repo ID", placeholder="如: yourname/insurance-model-v1") save_token = gr.Textbox(label="HF Token", placeholder="hf_xxxxx", type="password") save_type = gr.Dropdown(["churn_prediction", "product_recommendation", "anomaly_detection", "all"], value="all", label="模型类型") save_notes = gr.Textbox(label="备注", placeholder="模型描述...") save_btn = gr.Button("📤 保存到 Hub", variant="primary") save_result = gr.Textbox(label="保存结果", lines=10) with gr.Column(scale=1): gr.Markdown("#### 从 Hub 加载模型") load_repo_id = gr.Textbox(label="Hub Repo ID", placeholder="如: yourname/insurance-model-v1") load_token = gr.Textbox(label="HF Token", placeholder="hf_xxxxx", type="password") load_type = gr.Dropdown(["churn_prediction", "product_recommendation", "anomaly_detection", "all"], value="all", label="模型类型") load_btn = gr.Button("📥 从 Hub 加载", variant="primary") load_result = gr.Textbox(label="加载结果", lines=15) with gr.Row(): load_img1 = gr.Image(label="加载模型可视化 1") load_img2 = gr.Image(label="加载模型可视化 2") load_img3 = gr.Image(label="加载模型可视化 3") # ===== Tab 6: 生存分析 ===== with gr.Tab("⏱️ 生存分析"): gr.Markdown("""### 保险理赔/购买时序生存分析 预测从投保到理赔/购买/流失的时间, 处理右删失数据 (部分用户尚未发生事件)。 **统计方法:** lifelines Cox-PH + Kaplan-Meier | **深度方法:** DeepSurv (Neural Cox-PH)""") with gr.Row(): with gr.Column(scale=1): surv_samples = gr.Slider(500, 5000, value=2000, step=100, label="样本数量") surv_test_size = gr.Slider(0.1, 0.4, value=0.2, step=0.05, label="测试集比例") surv_seed = gr.Number(value=42, label="随机种子", precision=0) use_deep_surv = gr.Checkbox(value=True, label="启用 DeepSurv (PyTorch)") deep_epochs = gr.Slider(10, 200, value=50, step=10, label="DeepSurv Epochs") deep_lr = gr.Slider(0.0001, 0.01, value=0.001, step=0.0001, label="DeepSurv LR") surv_btn = gr.Button("🚀 训练生存分析模型", variant="primary", size="lg") if not LIFELINES_AVAILABLE: gr.Markdown("⚠️ **lifelines 未安装**。统计生存分析禁用。") if not TORCH_AVAILABLE: gr.Markdown("⚠️ **PyTorch 未安装**。DeepSurv 禁用。") with gr.Column(scale=2): surv_result = gr.Textbox(label="训练结果", lines=30) with gr.Row(): surv_img1 = gr.Image(label="Kaplan-Meier 生存曲线") surv_img2 = gr.Image(label="Cox-PH 系数") with gr.Row(): surv_img3 = gr.Image(label="预测生存函数") surv_img4 = gr.Image(label="DeepSurv 风险分层") with gr.Row(): surv_table = gr.Dataframe(label="数据样本") # ===== Tab 7: 帮助文档 ===== with gr.Tab("❓ 帮助"): gr.Markdown("""## 📚 完整使用指南 ### 1. 演示模式 合成保险APP行为数据, 自动标注流失/留存标签, 训练 GBDT + RF。 ### 2. CSV上传 **必需列:** `user_id`, `session_id`, `timestamp`, `event_type`, `page_id` **可选:** `product_id`, `amount`, `label` ### 3. DIN 产品推荐 - 输入: 用户历史行为序列 + 候选保险产品 - 核心: LocalActivationUnit 注意力机制 - 输出: 购买概率 + 注意力权重可视化 ### 4. TabBERT 异常检测 - 输入: 理赔记录多维特征 - 损失: Focal Loss (解决1:4不平衡) - 输出: 异常分数 + 阈值分析 ### 5. 模型管理 - 保存: 训练后自动保存到 `outputs/`, 可一键上传至 Hugging Face Hub - 加载: 从 Hub 下载已有模型, 查看指标和特征重要性 ### 6. 生存分析 - **lifelines Cox-PH**: 统计基线, 可解释系数, Kaplan-Meier 曲线 - **DeepSurv**: 神经网络Cox-PH, 学习非线性交互, 风险分层 - **右删失处理**: 自动处理尚未发生事件的用户 ### 事件类型 (30种) 浏览 | 交互 | 转化 | 理赔 | 续保 | 其他 ---|---|---|---|---|--- page_view | quote_request | payment_success | claim_init | renewal_click | login product_view | form_submit | policy_issued | claim_doc_upload | renewal_complete | logout premium_calculator | document_upload | policy_select | claim_review | policy_cancel | app_uninstall article_read | chat_init | payment_init | claim_approved | renewal_reminder | faq_view | call_init | | claim_rejected | | product_compare | video_consult | | | | ### 参考文献 | 论文 | 应用 | arXiv | |------|------|-------| | Deep Interest Network | 产品推荐 | [1706.06978](https://arxiv.org/abs/1706.06978) | | SDIM | 长期行为建模 | [2205.10249](https://arxiv.org/abs/2205.10249) | | TabBERT/TabFormer | 表格时序异常检测 | [2011.01843](https://arxiv.org/abs/2011.01843) | | Transformer Churn | 非合约流失预测 | [2309.14390](https://arxiv.org/abs/2309.14390) | | DeepSurv | 生存分析 | [1606.00931](https://arxiv.org/abs/1606.00931) | | RNN Survival | 购买时序预测 | [2304.00575](https://arxiv.org/abs/2304.00575) | | Focal Loss | 不平衡分类 | [1708.02002](https://arxiv.org/abs/1708.02002) | """) gr.Markdown("""---
保险APP 用户行为分析模型训练平台 v3.0 | 作者: Stephanwu
""") # ===== 事件绑定 ===== train_btn.click(fn=demo_train, inputs=[n_users_slider, n_events_slider, test_size_slider, random_seed, use_cv_check], outputs=[demo_result, demo_img1, demo_img2, demo_img3, demo_img4, demo_table]) info_btn.click(fn=show_csv_info, inputs=[csv_file], outputs=[csv_info, csv_preview]) csv_train_btn.click(fn=csv_train, inputs=[csv_file, label_col_input, csv_test_size, csv_random_seed, csv_use_cv], outputs=[csv_result, csv_img1, csv_img2, csv_img3, csv_img4, csv_table]) din_btn.click(fn=train_dien_recommendation, inputs=[din_users, din_emb, din_epochs, din_batch, din_lr, din_seed], outputs=[din_result, din_img1, din_img2, din_img3, din_img4]) tab_btn.click(fn=train_tabbert_anomaly, inputs=[tab_normal, tab_anomaly, tab_dmodel, tab_epochs, tab_batch, tab_lr, tab_seed], outputs=[tab_result, tab_img1, tab_img2, tab_img3, tab_img4]) save_btn.click(fn=save_model_to_hub, inputs=[save_repo_id, save_token, save_type, save_notes], outputs=[save_result]) load_btn.click(fn=load_model_from_hub, inputs=[load_repo_id, load_token, load_type], outputs=[load_result, load_img1, load_img2, load_img3]) surv_btn.click(fn=train_survival_analysis, inputs=[surv_samples, surv_test_size, surv_seed, use_deep_surv, deep_epochs, deep_lr], outputs=[surv_result, surv_img1, surv_img2, surv_img3, surv_img4, surv_table]) if __name__ == "__main__": demo.launch()