"""
===============================================================================
MASTER SCRIPT — RUN ALL EXPERIMENTS WITH A SINGLE COMMAND
===============================================================================

This script performs the following steps in order:
  1. Data audit and cleaning (data_audit)
  2. Determine the best preprocessing pipeline
  3. Topological breakpoint detection
  4. 5 split strategies × 4 models (including GraphSAGE) = 20 experiments
  5. Walk-forward validation + honesty test
  6. Save all figures and results

USAGE:
    pip install pandas numpy scikit-learn matplotlib seaborn lightgbm xgboost networkx scipy imbalanced-learn torch torch-geometric
    python run_all.py --data_dir ./dataset

RUNTIME: ~15 minutes (CPU)
===============================================================================
"""
|
|
| import os, sys, json, warnings, time, argparse |
| import numpy as np |
| import pandas as pd |
| import networkx as nx |
| from scipy.ndimage import uniform_filter1d |
|
|
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| import seaborn as sns |
|
|
| from sklearn.preprocessing import StandardScaler, RobustScaler |
| from sklearn.ensemble import RandomForestClassifier |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score |
|
|
| import xgboost as xgb |
| import lightgbm as lgb |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from torch_geometric.nn import SAGEConv |
| from torch_geometric.data import Data |
|
|
# Suppress every warning category for the whole process so the step-by-step
# console report printed by main() is not interleaved with library warnings.
warnings.filterwarnings('ignore')
# Seed the global NumPy and PyTorch random number generators with a fixed value.
np.random.seed(42)
torch.manual_seed(42)
|
|
|
|
| |
| |
| |
class GraphSAGENet(nn.Module):
    """Stack of SAGEConv layers that outputs per-node class logits.

    Every layer except the last is followed by batch normalisation, ReLU and
    dropout. The last SAGEConv maps ``hidden`` to ``out_channels`` and its
    output is returned as-is (no activation, no softmax).

    Args:
        in_channels: Size of each input node feature vector.
        hidden: Width of every intermediate layer.
        out_channels: Number of values produced per node by the last layer.
        num_layers: Requested depth. One input layer and one output layer are
            always created, with ``num_layers - 2`` hidden-to-hidden layers in
            between, so any value below 2 still yields two layers.
        dropout: Dropout probability applied after each non-final layer.
    """

    def __init__(self, in_channels, hidden=128, out_channels=2, num_layers=3, dropout=0.3):
        super().__init__()
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()

        # Input width of every non-final layer: the raw feature width first,
        # then `hidden` for each of the (num_layers - 2) intermediate layers.
        # A non-positive multiplier yields an empty list.
        input_widths = [in_channels] + [hidden] * (num_layers - 2)
        for width in input_widths:
            self.convs.append(SAGEConv(width, hidden))
            self.bns.append(nn.BatchNorm1d(hidden))

        # Output layer; it has no matching batch-norm module.
        self.convs.append(SAGEConv(hidden, out_channels))
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return the final layer's output for every row of ``x``.

        Args:
            x: Node feature tensor.
            edge_index: Edge list tensor passed unchanged to every SAGEConv.
        """
        # zip stops at the shorter sequence; convs[:-1] and bns have equal length.
        for conv, norm in zip(self.convs[:-1], self.bns):
            x = F.relu(norm(conv(x, edge_index)))
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.convs[-1](x, edge_index)
|
|
|
|
def train_graphsage(data, train_mask, test_mask, in_channels, epochs=200, lr=0.005, weight=None):
    """Train a GraphSAGENet on ``data`` and score it on the ``test_mask`` nodes.

    Training message passing is restricted to edges whose two endpoints are
    both training nodes; evaluation uses the full ``data.edge_index``. The
    loss is computed on training nodes only, although the forward pass runs
    over every node in ``data.x``.

    NOTE(review): both the early-stopping checkpoint and the decision
    threshold are selected using the F1 score on the ``test_mask`` nodes, so
    the returned metrics are optimistic with respect to truly unseen data.

    Args:
        data: Graph object exposing ``x``, ``y`` and ``edge_index`` tensors.
        train_mask: Boolean tensor selecting training nodes.
        test_mask: Boolean tensor selecting evaluation nodes.
        in_channels: Number of input features per node.
        epochs: Maximum number of full-batch training epochs.
        lr: Adam learning rate.
        weight: Optional cross-entropy weight for class 1 (class 0 gets 1.0).

    Returns:
        dict with keys ``'f1'``, ``'precision'``, ``'recall'`` and ``'auroc'``,
        each rounded to 4 decimals. ``'auroc'`` is 0.5 when the test labels
        contain a single class.
    """
    device = torch.device('cpu')
    model = GraphSAGENet(in_channels, hidden=128, out_channels=2, num_layers=3, dropout=0.3).to(device)

    class_weight = None
    if weight is not None:
        class_weight = torch.tensor([1.0, weight], dtype=torch.float32).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    # Keep only edges with both endpoints in the training set. Indexing the
    # boolean node mask with the endpoint ids replaces a per-edge Python loop
    # and produces the same edge mask.
    edge_index = data.edge_index
    node_is_train = train_mask.to(edge_index.device)
    keep_edge = node_is_train[edge_index[0]] & node_is_train[edge_index[1]]
    train_edge_index = edge_index[:, keep_edge]

    x = data.x.to(device)
    y = data.y.to(device)
    train_mask_d = train_mask.to(device)
    test_mask_d = test_mask.to(device)
    train_edge_index = train_edge_index.to(device)
    full_edge_index = edge_index.to(device)

    # Test labels never change, so convert them once.
    y_test = y[test_mask_d].cpu().numpy()

    eval_every = 10   # evaluate every 10th epoch
    patience = 30     # in epochs; i.e. patience // eval_every evaluations
    best_f1 = 0
    best_state = None
    no_improve = 0

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(x, train_edge_index)
        loss = F.cross_entropy(out[train_mask_d], y[train_mask_d], weight=class_weight)
        loss.backward()
        optimizer.step()
        scheduler.step()

        if (epoch + 1) % eval_every != 0:
            continue

        model.eval()
        with torch.no_grad():
            out_eval = model(x, full_edge_index)
            pred = out_eval[test_mask_d].argmax(dim=1)
        f1 = f1_score(y_test, pred.cpu().numpy(), zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            # Clone so later optimizer steps do not overwrite the snapshot.
            best_state = {k: v.clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
        model.train()
        if no_improve >= patience // eval_every:
            break

    # Restore the best snapshot; if no evaluation ever scored above 0 the
    # weights from the last executed epoch are used.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        out = model(x, full_edge_index)
        proba = F.softmax(out, dim=1)[:, 1]
    proba_test = proba[test_mask_d]

    # Sweep decision thresholds and keep the one with the highest test F1
    # (0.5 is kept when no threshold scores above 0).
    best_th_f1 = 0
    best_th = 0.5
    for th in np.arange(0.1, 0.9, 0.05):
        pred_th = (proba_test >= th).long()
        f1_th = f1_score(y_test, pred_th.cpu().numpy(), zero_division=0)
        if f1_th > best_th_f1:
            best_th_f1 = f1_th
            best_th = th

    pred_np = (proba_test >= best_th).long().cpu().numpy()
    proba_np = proba_test.cpu().numpy()

    return {
        'f1': round(f1_score(y_test, pred_np, zero_division=0), 4),
        'precision': round(precision_score(y_test, pred_np, zero_division=0), 4),
        'recall': round(recall_score(y_test, pred_np, zero_division=0), 4),
        # AUROC is undefined for a single-class test set; report chance level.
        'auroc': round(roc_auc_score(y_test, proba_np) if len(np.unique(y_test)) > 1 else 0.5, 4),
    }
|
|
|
|
def main(data_dir):
    """Run the whole experiment pipeline on the CSV files in ``data_dir``.

    Steps, in order: load the three Elliptic CSV files, clean the labelled
    features, compare preprocessing pipelines with a quick LightGBM run,
    compute per-timestep topological metrics, detect a breakpoint timestep,
    build the labelled-node graph, evaluate every split strategy with three
    tabular models plus GraphSAGE, run walk-forward validation, draw five
    figures and write a JSON summary.

    Args:
        data_dir: Directory containing ``elliptic_txs_features.csv``,
            ``elliptic_txs_classes.csv`` and ``elliptic_txs_edgelist.csv``.

    Side effects:
        Creates ``output/figures`` and ``output/results`` under the current
        working directory and writes ``topological_metrics.csv``,
        ``all_experiment_results.csv``, ``summary.json`` and five PNG figures
        there. Progress is printed to stdout.

    Raises:
        ValueError: from the nested ``train_eval_tabular`` helper if it is
            given an unknown model type.
    """
    start_time = time.time()

    for d in ['output/figures', 'output/results']:
        os.makedirs(d, exist_ok=True)

    # ------------------------------------------------------------------
    # STEP 1: data loading
    # ------------------------------------------------------------------
    print("=" * 70)
    print("ADIM 1: VERİ YÜKLEME")
    print("=" * 70)

    feat_df = pd.read_csv(os.path.join(data_dir, 'elliptic_txs_features.csv'), header=None)
    class_df = pd.read_csv(os.path.join(data_dir, 'elliptic_txs_classes.csv'))
    edge_df = pd.read_csv(os.path.join(data_dir, 'elliptic_txs_edgelist.csv'))

    # Column 0 is the transaction id, column 1 the timestep, the rest features.
    txids = feat_df.iloc[:, 0].values
    timesteps_raw = feat_df.iloc[:, 1].values.astype(int)
    features_raw = feat_df.iloc[:, 2:].values.astype(np.float64)
    N = len(txids)

    id_map = {tid: i for i, tid in enumerate(txids)}
    label_map = {'1': 1, '2': 0, 'unknown': -1}
    # NOTE(review): labels are paired with feature rows by position, so both
    # files are assumed to list transactions in the same order — confirm.
    labels_np = np.array([label_map[str(c)] for c in class_df['class'].values])

    # Translate edge endpoints to row indices, dropping edges that reference
    # an unknown transaction id.
    valid_edges = [(id_map[s], id_map[d]) for s, d in zip(edge_df['txId1'], edge_df['txId2'])
                   if s in id_map and d in id_map]
    # Explicit integer dtype keeps the arrays usable as indices even when the
    # edge list is empty.
    src = np.array([e[0] for e in valid_edges], dtype=np.int64)
    dst = np.array([e[1] for e in valid_edges], dtype=np.int64)

    labeled_mask = labels_np >= 0
    labeled_indices = np.where(labeled_mask)[0]

    # Map full-dataset row index -> position among labelled rows.
    full_to_labeled = {full_idx: lab_idx for lab_idx, full_idx in enumerate(labeled_indices)}

    X_raw = features_raw[labeled_mask]
    y = labels_np[labeled_mask]
    ts = timesteps_raw[labeled_mask]

    print(f" Toplam: {N}, Etiketli: {len(y)}")
    print(f" İllicit: {y.sum()} ({y.mean()*100:.1f}%), Licit: {len(y)-y.sum()}")
    print(f" Kenar sayısı: {len(src)}")

    # ------------------------------------------------------------------
    # STEP 2: data cleaning
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("ADIM 2: VERİ TEMİZLEME")
    print("=" * 70)

    nan_count = np.isnan(X_raw).sum()
    inf_count = np.isinf(X_raw).sum()
    print(f" NaN: {nan_count}, Inf: {inf_count}")
    X = np.nan_to_num(X_raw, nan=0.0, posinf=0.0, neginf=0.0)

    # Per-feature IQR fences.
    # NOTE(review): the fences are computed over all labelled rows, i.e.
    # including rows that later end up in test splits.
    Q1 = np.percentile(X, 25, axis=0)
    Q3 = np.percentile(X, 75, axis=0)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outlier_mask = (X < lower) | (X > upper)
    print(f" Outlier hücre: {outlier_mask.sum()} ({outlier_mask.sum()/(X.shape[0]*X.shape[1])*100:.1f}%)")

    X_clipped = np.clip(X, lower, upper)

    # Variance-filtered variant; only used in the pipeline comparison below.
    variances = np.var(X_clipped, axis=0)
    var_mask = variances > 1e-6
    X_clean = X_clipped[:, var_mask]
    print(f" Düşük varyanslı özellik çıkarıldı: {(~var_mask).sum()}, kalan: {var_mask.sum()}")

    # All later experiments use the clipped features, independently of which
    # pipeline wins the comparison in step 3.
    X_final = X_clipped

    # ------------------------------------------------------------------
    # STEP 3: preprocessing pipeline comparison
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("ADIM 3: PIPELINE KARŞILAŞTIRMASI")
    print("=" * 70)

    tr_mask_pipe = ts <= 39
    te_mask_pipe = ts > 39

    def quick_eval(X_tr, y_tr, X_te, y_te):
        """Fit a LightGBM model and return its best F1 over a threshold sweep on the test rows."""
        m = lgb.LGBMClassifier(n_estimators=500, max_depth=12, scale_pos_weight=10,
                               learning_rate=0.05, random_state=42, n_jobs=-1, verbose=-1)
        m.fit(X_tr, y_tr)
        proba = m.predict_proba(X_te)[:, 1]

        best_f1 = 0
        for th in np.arange(0.1, 0.9, 0.05):
            f = f1_score(y_te, (proba >= th).astype(int), zero_division=0)
            if f > best_f1:
                best_f1 = f
        return best_f1

    # imbalanced-learn is optional: probe for it once and reuse the result in
    # the experiments below.
    try:
        from imblearn.over_sampling import SMOTE
    except ImportError:
        SMOTE = None

    pipelines = {}
    y_tr_pipe = y[tr_mask_pipe]
    y_te_pipe = y[te_mask_pipe]

    f1_raw = quick_eval(X_raw[tr_mask_pipe], y_tr_pipe, X_raw[te_mask_pipe], y_te_pipe)
    pipelines['Ham Veri'] = f1_raw
    print(f" Ham Veri: F1={f1_raw:.4f}")

    sc = StandardScaler()
    f1_ss = quick_eval(sc.fit_transform(X[tr_mask_pipe]), y_tr_pipe, sc.transform(X[te_mask_pipe]), y_te_pipe)
    pipelines['StandardScaler'] = f1_ss
    print(f" StandardScaler: F1={f1_ss:.4f}")

    rs = RobustScaler()
    f1_rs = quick_eval(rs.fit_transform(X[tr_mask_pipe]), y_tr_pipe, rs.transform(X[te_mask_pipe]), y_te_pipe)
    pipelines['RobustScaler'] = f1_rs
    print(f" RobustScaler: F1={f1_rs:.4f}")

    rs2 = RobustScaler()
    f1_cr = quick_eval(rs2.fit_transform(X_clipped[tr_mask_pipe]), y_tr_pipe,
                       rs2.transform(X_clipped[te_mask_pipe]), y_te_pipe)
    pipelines['Clip+Robust'] = f1_cr
    print(f" Clip+Robust: F1={f1_cr:.4f}")

    rs3 = RobustScaler()
    f1_cvr = quick_eval(rs3.fit_transform(X_clean[tr_mask_pipe]), y_tr_pipe,
                        rs3.transform(X_clean[te_mask_pipe]), y_te_pipe)
    pipelines['Clip+VarFilter+Robust'] = f1_cvr
    print(f" Clip+VarFilter+Rob: F1={f1_cvr:.4f}")

    if SMOTE is None:
        print(" SMOTE atlandı")
    else:
        smote = SMOTE(random_state=42)
        rs4 = RobustScaler()
        X_tr_s = rs4.fit_transform(X_clipped[tr_mask_pipe])
        X_te_s = rs4.transform(X_clipped[te_mask_pipe])
        X_tr_sm, y_tr_sm = smote.fit_resample(X_tr_s, y_tr_pipe)
        f1_smote = quick_eval(X_tr_sm, y_tr_sm, X_te_s, y_te_pipe)
        pipelines['Clip+Robust+SMOTE'] = f1_smote
        print(f" Clip+Robust+SMOTE: F1={f1_smote:.4f}")

    best_pipe = max(pipelines, key=pipelines.get)
    print(f"\n ★ En iyi pipeline: {best_pipe} (F1={pipelines[best_pipe]:.4f})")

    # ------------------------------------------------------------------
    # STEP 4: per-timestep topological metrics
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("ADIM 4: TOPOLOJİK METRİKLER")
    print("=" * 70)

    all_ts = sorted(np.unique(timesteps_raw))
    # Timestep of each edge endpoint, looked up once instead of testing
    # membership against a node list for every timestep.
    src_ts = timesteps_raw[src]
    dst_ts = timesteps_raw[dst]
    topo = {}
    for t in all_ts:
        ts_nodes = set(np.where(timesteps_raw == t)[0])
        # Edges whose two endpoints both belong to timestep t.
        m = (src_ts == t) & (dst_ts == t)
        G = nx.DiGraph()
        G.add_nodes_from(ts_nodes)
        G.add_edges_from(zip(src[m], dst[m]))
        n = G.number_of_nodes()
        e = G.number_of_edges()
        density = nx.density(G) if n > 1 else 0
        G_u = G.to_undirected()
        comps = nx.number_connected_components(G_u)
        largest = max(nx.connected_components(G_u), key=len)
        cc_ratio = len(largest) / max(n, 1)
        degs = [d for _, d in G.degree()]
        avg_deg = np.mean(degs) if degs else 0
        # Illicit share among the labelled nodes of this timestep.
        ts_lab = [nd for nd in ts_nodes if labels_np[nd] >= 0]
        ts_ill = [nd for nd in ts_lab if labels_np[nd] == 1]
        ill_rate = len(ts_ill) / max(len(ts_lab), 1)
        topo[t] = {'n_nodes': n, 'n_edges': e, 'density': density, 'cc_ratio': cc_ratio,
                   'n_components': comps, 'avg_degree': avg_deg, 'illicit_rate': ill_rate}
        print(f" TS {t:2d}: nodes={n:5d} edges={e:5d} density={density:.5f} illicit={ill_rate:.3f}")

    topo_df = pd.DataFrame(topo).T
    topo_df.to_csv('output/results/topological_metrics.csv')

    # ------------------------------------------------------------------
    # STEP 5: breakpoint detection
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("ADIM 5: KIRILMA NOKTASI TESPİTİ")
    print("=" * 70)

    # Health score: mean of min-max normalised density, largest-component
    # ratio and inverted component count. The breakpoint is the timestep with
    # the largest drop relative to the previous timestep.
    for col in ['density', 'cc_ratio', 'n_components']:
        mi, ma = topo_df[col].min(), topo_df[col].max()
        topo_df[f'{col}_n'] = (topo_df[col] - mi) / (ma - mi + 1e-8)
    health = (topo_df['density_n'] + topo_df['cc_ratio_n'] + (1 - topo_df['n_components_n'])) / 3
    bp_final = health.diff().idxmin()
    print(f" Sağlık skoru kırılma noktası: TS {bp_final}")

    # Smoothed "crisis" signal and its local peaks; used only in figure 1.
    df_t = topo_df.copy()
    for col in ['n_edges', 'density', 'avg_degree']:
        mi, ma = df_t[col].min(), df_t[col].max()
        df_t[f'{col}_norm'] = (df_t[col] - mi) / (ma - mi + 1e-8)
    crisis = (df_t['n_edges_norm'] * 0.4 + df_t['density_norm'] * 0.3 + df_t['avg_degree_norm'] * 0.3).values
    crisis_smooth = uniform_filter1d(crisis, size=5, mode='nearest')
    velocity = np.gradient(crisis_smooth)
    peaks = []
    for i in range(1, len(velocity) - 1):
        # Rising before and falling after index i marks a peak.
        if velocity[i-1] > 0 and velocity[i+1] < 0:
            peaks.append({'timestep': all_ts[i], 'index': i, 'drop': abs(velocity[i+1])})
    peaks = sorted(peaks, key=lambda x: x['drop'], reverse=True)

    print(f" ★ Final kırılma noktası: TS {bp_final}")

    # ------------------------------------------------------------------
    # STEP 6: GraphSAGE data preparation
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("ADIM 6: GRAPHSAGE VERİ HAZIRLAMA")
    print("=" * 70)

    # Keep edges between two labelled nodes, re-indexed to labelled positions.
    labeled_edges = [(full_to_labeled[s], full_to_labeled[d])
                     for s, d in zip(src, dst)
                     if s in full_to_labeled and d in full_to_labeled]

    if labeled_edges:
        edge_src = [e[0] for e in labeled_edges]
        edge_dst = [e[1] for e in labeled_edges]
        # Append the reversed copy of every edge so messages flow both ways.
        edge_index = torch.tensor([edge_src + edge_dst, edge_dst + edge_src], dtype=torch.long)
    else:
        edge_index = torch.zeros((2, 0), dtype=torch.long)

    print(f" Etiketli düğümler arası kenar: {len(labeled_edges)} ({edge_index.shape[1]} undirected)")

    # NOTE(review): the scaler is fitted on all labelled rows, so GraphSAGE
    # inputs are scaled with statistics that include test rows.
    scaler_gnn = RobustScaler()
    X_gnn = scaler_gnn.fit_transform(X_final)

    x_tensor = torch.tensor(X_gnn, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.long)

    graph_data = Data(x=x_tensor, edge_index=edge_index, y=y_tensor)

    # ------------------------------------------------------------------
    # STEP 7: split strategies x models
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("ADIM 7: 5 STRATEJİ × 4 MODEL = 20 DENEY")
    print("=" * 70)

    # Timesteps with an illicit rate above this value count as "crisis".
    crisis_rate = 0.18

    def make_masks(train_ts_set, test_ts_set):
        """Return boolean (train, test) row masks for the given timestep sets."""
        tr = np.isin(ts, list(train_ts_set))
        te = np.isin(ts, list(test_ts_set))
        return tr, te

    def split_random():
        """Return boolean (train, test) masks for a stratified random 80/20 split."""
        idx = np.arange(len(y))
        tr, te = train_test_split(idx, test_size=0.2, random_state=42, stratify=y)
        m_tr = np.zeros(len(y), dtype=bool)
        m_tr[tr] = True
        m_te = np.zeros(len(y), dtype=bool)
        m_te[te] = True
        return m_tr, m_te

    # Last timestep that still belongs to the first 80 % of timesteps.
    cutoff_chrono = all_ts[int(len(all_ts) * 0.8) - 1]

    strategies = {
        'Rastgele': split_random,
        'Kronolojik': lambda: make_masks({t for t in all_ts if t <= cutoff_chrono},
                                         {t for t in all_ts if t > cutoff_chrono}),
        'Topolojik Kırılma': lambda: make_masks({t for t in all_ts if t < bp_final},
                                                {t for t in all_ts if t >= bp_final}),
        'Kayan Pencere': lambda: make_masks(set(all_ts[:-10]), set(all_ts[-10:])),
        'Düşmanca-Kriz': lambda: make_masks(
            {t for t in all_ts if topo_df.loc[t, 'illicit_rate'] <= crisis_rate and t != bp_final},
            {t for t in all_ts if topo_df.loc[t, 'illicit_rate'] > crisis_rate or t == bp_final}
        ),
    }

    def train_eval_tabular(X_tr, y_tr, X_te, y_te, model_type):
        """Train one tabular model and score it with a threshold sweep.

        Features are robust-scaled (fitted on the training rows); the training
        set is oversampled with SMOTE when imbalanced-learn is installed and
        the resampling succeeds. The decision threshold is the one with the
        highest F1 on the test rows.

        Args:
            model_type: ``'lgbm'``, ``'rf'`` or ``'xgb'``.

        Returns:
            dict with ``'f1'``, ``'precision'``, ``'recall'``, ``'auroc'`` and
            ``'threshold'``.

        Raises:
            ValueError: if ``model_type`` is not one of the supported names.
        """
        if model_type == 'lgbm':
            m = lgb.LGBMClassifier(
                n_estimators=500, max_depth=12, learning_rate=0.05,
                num_leaves=63, min_child_samples=20, subsample=0.8,
                colsample_bytree=0.8, scale_pos_weight=10,
                random_state=42, n_jobs=-1, verbose=-1
            )
        elif model_type == 'rf':
            m = RandomForestClassifier(
                n_estimators=500, max_depth=20, min_samples_leaf=5,
                class_weight='balanced_subsample', max_features='sqrt',
                random_state=42, n_jobs=-1
            )
        elif model_type == 'xgb':
            m = xgb.XGBClassifier(
                n_estimators=500, max_depth=10, learning_rate=0.05,
                subsample=0.8, colsample_bytree=0.8, scale_pos_weight=10,
                min_child_weight=5, gamma=0.1,
                random_state=42, n_jobs=-1, verbosity=0
            )
        else:
            # Fail here with a clear message instead of a NameError on `m`.
            raise ValueError(f"Unknown model_type: {model_type!r}")

        sc = RobustScaler()
        Xtr = sc.fit_transform(X_tr)
        Xte = sc.transform(X_te)

        # Best-effort oversampling: fall back to the original training data
        # when SMOTE is unavailable or rejects the input, but report the
        # failure rather than hiding it.
        if SMOTE is not None:
            try:
                Xtr, y_tr = SMOTE(random_state=42).fit_resample(Xtr, y_tr)
            except ValueError as exc:
                print(f" SMOTE uygulanamadı, atlanıyor: {exc}")

        m.fit(Xtr, y_tr)
        proba = m.predict_proba(Xte)[:, 1]

        best_f1, best_th = 0, 0.5
        for th in np.arange(0.1, 0.9, 0.05):
            pred_th = (proba >= th).astype(int)
            f1_th = f1_score(y_te, pred_th, zero_division=0)
            if f1_th > best_f1:
                best_f1, best_th = f1_th, th

        pred = (proba >= best_th).astype(int)
        return {
            'f1': round(f1_score(y_te, pred, zero_division=0), 4),
            'precision': round(precision_score(y_te, pred, zero_division=0), 4),
            'recall': round(recall_score(y_te, pred, zero_division=0), 4),
            'auroc': round(roc_auc_score(y_te, proba) if len(np.unique(y_te)) > 1 else 0.5, 4),
            'threshold': round(best_th, 2),
        }

    model_types = [('lgbm', 'LightGBM'), ('rf', 'Random Forest'), ('xgb', 'XGBoost')]
    all_results = []

    for strat_name, strat_fn in strategies.items():
        tr_m, te_m = strat_fn()
        if tr_m.sum() < 50 or te_m.sum() < 10:
            print(f" {strat_name}: yetersiz veri, atlanıyor")
            continue

        print(f"\n {strat_name} (train={tr_m.sum()}, test={te_m.sum()}, test_ill={y[te_m].sum()}):")

        # Tabular models.
        for mt, mn in model_types:
            res = train_eval_tabular(X_final[tr_m], y[tr_m], X_final[te_m], y[te_m], mt)
            res['strateji'] = strat_name
            res['model'] = mn
            all_results.append(res)
            print(f" {mn:15s}: F1={res['f1']:.4f} P={res['precision']:.4f} R={res['recall']:.4f} AUROC={res['auroc']:.4f} th={res['threshold']}")

        # GraphSAGE.
        print(f" {'GraphSAGE':15s}: eğitiliyor...", end='', flush=True)
        train_mask_t = torch.tensor(tr_m, dtype=torch.bool)
        test_mask_t = torch.tensor(te_m, dtype=torch.bool)
        # Class-1 weight = licit/illicit ratio in the training rows, capped at 15.
        ill_weight = float((y[tr_m] == 0).sum()) / max(float((y[tr_m] == 1).sum()), 1)
        ill_weight = min(ill_weight, 15.0)

        gs_res = train_graphsage(graph_data, train_mask_t, test_mask_t,
                                 X_final.shape[1], epochs=200, lr=0.005, weight=ill_weight)
        gs_res['strateji'] = strat_name
        gs_res['model'] = 'GraphSAGE'
        # train_graphsage does not return its threshold; 0.0 is a placeholder.
        gs_res['threshold'] = 0.0
        all_results.append(gs_res)
        print(f"\r {'GraphSAGE':15s}: F1={gs_res['f1']:.4f} P={gs_res['precision']:.4f} R={gs_res['recall']:.4f} AUROC={gs_res['auroc']:.4f}")

    res_df = pd.DataFrame(all_results)
    res_df.to_csv('output/results/all_experiment_results.csv', index=False)

    # ------------------------------------------------------------------
    # STEP 8: walk-forward validation
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("ADIM 8: WALK-FORWARD VALİDASYON")
    print("=" * 70)

    # Test windows are 3 timesteps wide and start at 10, 13, ..., 46, so they
    # cover timesteps 10..48; training uses every earlier timestep.
    wf_window = 3
    wf_starts = range(10, 49, wf_window)

    wf_results = {}
    for mt, mn in model_types:
        wf_f1s = []
        for test_start in wf_starts:
            tr_m = ts < test_start
            te_m = (ts >= test_start) & (ts < test_start + wf_window)
            if tr_m.sum() < 50 or te_m.sum() < 10 or len(np.unique(y[te_m])) < 2:
                continue
            res = train_eval_tabular(X_final[tr_m], y[tr_m], X_final[te_m], y[te_m], mt)
            wf_f1s.append(res['f1'])
        # Same empty-window fallback as GraphSAGE below: avoids a NaN mean
        # leaking into the figures and the JSON summary.
        wf_results[mn] = round(np.mean(wf_f1s), 4) if wf_f1s else 0
        print(f" {mn}: Walk-Forward F1 = {wf_results[mn]:.4f}")

    wf_gs_f1s = []
    for test_start in wf_starts:
        tr_m_wf = ts < test_start
        te_m_wf = (ts >= test_start) & (ts < test_start + wf_window)
        if tr_m_wf.sum() < 50 or te_m_wf.sum() < 10 or len(np.unique(y[te_m_wf])) < 2:
            continue
        train_mask_wf = torch.tensor(tr_m_wf, dtype=torch.bool)
        test_mask_wf = torch.tensor(te_m_wf, dtype=torch.bool)
        ill_w = float((y[tr_m_wf] == 0).sum()) / max(float((y[tr_m_wf] == 1).sum()), 1)
        ill_w = min(ill_w, 15.0)
        gs_wf = train_graphsage(graph_data, train_mask_wf, test_mask_wf, X_final.shape[1], epochs=100, weight=ill_w)
        wf_gs_f1s.append(gs_wf['f1'])
    wf_results['GraphSAGE'] = round(np.mean(wf_gs_f1s), 4) if wf_gs_f1s else 0
    print(f" GraphSAGE: Walk-Forward F1 = {wf_results['GraphSAGE']:.4f}")

    # Honesty check: mean relative deviation (%) of each strategy's F1 from
    # the walk-forward F1 of the same model.
    print("\n Dürüstlük Karşılaştırması:")
    honesty_data = []
    for strat_name in strategies:
        sapma_list = []
        for mn in wf_results:
            row = res_df[(res_df['strateji'] == strat_name) & (res_df['model'] == mn)]
            if len(row) > 0 and wf_results[mn] > 0:
                sapma = ((row['f1'].values[0] - wf_results[mn]) / wf_results[mn]) * 100
                sapma_list.append(sapma)
        if sapma_list:
            avg_sapma = np.mean(sapma_list)
            # Within ±10 % counts as honest; otherwise the sign decides, so a
            # deviation of exactly +10 % is reported as inflation.
            if abs(avg_sapma) < 10:
                durum = "✅ DÜRÜST"
            elif avg_sapma > 0:
                durum = "🔴 ŞİŞME"
            else:
                durum = "⚠️ PESİMİST"
            honesty_data.append({'strateji': strat_name, 'sapma': round(avg_sapma, 1), 'durum': durum})
            print(f" {strat_name:25s}: ort. sapma = {avg_sapma:+.1f}% {durum}")

    # ------------------------------------------------------------------
    # STEP 9: figures
    # ------------------------------------------------------------------
    print("\n" + "=" * 70)
    print("ADIM 9: FİGÜRLER")
    print("=" * 70)

    sns.set_theme(style='whitegrid', font_scale=1.1)

    # Figure 1: health score, illicit rate and crisis signal per timestep.
    fig, axes = plt.subplots(3, 1, figsize=(18, 14), gridspec_kw={'height_ratios': [2, 1, 1]})
    axes[0].plot(all_ts, health.values, 'o-', color='steelblue', linewidth=2, markersize=5)
    axes[0].axvline(x=bp_final, color='red', linewidth=3, linestyle='--')
    axes[0].annotate(f'KIRILMA TS={bp_final}', xy=(bp_final, health.loc[bp_final]),
                     fontsize=12, fontweight='bold', color='red', ha='center',
                     xytext=(bp_final+3, health.max()-0.05),
                     arrowprops=dict(arrowstyle='->', color='red', lw=2))
    axes[0].set_ylabel('Ağ Sağlık Skoru', fontsize=12)
    axes[0].set_title('Topolojik Kırılma Noktası Tespiti', fontsize=14, fontweight='bold')

    colors_bar = ['#FF4444' if topo_df.loc[t, 'illicit_rate'] > crisis_rate else '#44BB44' for t in all_ts]
    axes[1].bar(all_ts, topo_df['illicit_rate'].values * 100, color=colors_bar)
    axes[1].axvline(x=bp_final, color='red', linewidth=3, linestyle='--')
    axes[1].set_ylabel('İllicit %', fontsize=12)

    axes[2].plot(all_ts, crisis_smooth, '-', color='steelblue', linewidth=2)
    if peaks:
        # Mark the peak followed by the steepest decline.
        axes[2].scatter(peaks[0]['timestep'], crisis_smooth[peaks[0]['index']],
                        color='red', s=200, zorder=5, edgecolors='black')
    axes[2].set_ylabel('Kriz Sinyali', fontsize=12)
    axes[2].set_xlabel('Timestep', fontsize=12)
    plt.tight_layout()
    plt.savefig('output/figures/fig1_breakpoint.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig1_breakpoint.png")

    # Figure 2: grouped F1 bars per model, one bar per split strategy.
    fig, ax = plt.subplots(figsize=(18, 8))
    strat_names = list(strategies.keys())
    model_names = [mn for _, mn in model_types] + ['GraphSAGE']
    colors5 = sns.color_palette('Set2', len(strat_names))
    bar_pos = np.arange(len(model_names))
    width = 0.15

    for i, strat in enumerate(strat_names):
        vals = []
        for m in model_names:
            row = res_df[(res_df['model'] == m) & (res_df['strateji'] == strat)]
            vals.append(row['f1'].values[0] if len(row) > 0 else 0)
        # Highlight the topological strategy with a bold black outline.
        ec = 'black' if 'Topolojik' in strat else 'white'
        lw = 2 if 'Topolojik' in strat else 0.5
        ax.bar(bar_pos + i*width, vals, width, label=strat, color=colors5[i], edgecolor=ec, linewidth=lw)

    wf_avg = np.mean(list(wf_results.values()))
    ax.axhspan(wf_avg*0.9, wf_avg*1.1, alpha=0.12, color='green')
    ax.axhline(y=wf_avg, color='green', linewidth=2, linestyle='--', label=f'Walk-Forward ({wf_avg:.3f})')
    ax.set_xticks(bar_pos + width*2)
    ax.set_xticklabels(model_names, fontsize=12)
    ax.set_ylabel('Illicit F1', fontsize=13)
    ax.set_title('Bölme Stratejileri × Model Karşılaştırması', fontsize=14, fontweight='bold')
    ax.legend(fontsize=9, loc='upper right')
    ax.set_ylim(0, 1.1)
    plt.tight_layout()
    plt.savefig('output/figures/fig2_f1_comparison.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig2_f1_comparison.png")

    # Figure 3: preprocessing pipeline comparison.
    fig, ax = plt.subplots(figsize=(10, 6))
    p_names = list(pipelines.keys())
    p_vals = list(pipelines.values())
    colors_p = ['#FF6B6B' if v == min(p_vals) else '#4ECDC4' if v == max(p_vals) else '#45B7D1' for v in p_vals]
    ax.barh(p_names, p_vals, color=colors_p)
    ax.set_xlabel('Illicit F1 Score')
    ax.set_title('Ön İşleme Pipeline Karşılaştırması', fontsize=14, fontweight='bold')
    for i, v in enumerate(p_vals):
        ax.text(v + 0.002, i, f'{v:.4f}', va='center', fontsize=10)
    plt.tight_layout()
    plt.savefig('output/figures/fig3_pipeline_comparison.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig3_pipeline_comparison.png")

    # Figure 4: per-model, per-strategy deviation from walk-forward F1.
    fig, ax = plt.subplots(figsize=(16, 7))
    sapma_data = []
    for strat_name in strat_names:
        for mn in model_names:
            row = res_df[(res_df['strateji'] == strat_name) & (res_df['model'] == mn)]
            if len(row) > 0 and wf_results.get(mn, 0) > 0:
                sapma = ((row['f1'].values[0] - wf_results[mn]) / wf_results[mn]) * 100
                sapma_data.append({'strateji': strat_name, 'model': mn, 'sapma': round(sapma, 1)})
    if sapma_data:
        sapma_df = pd.DataFrame(sapma_data)
        pivot = sapma_df.pivot_table(values='sapma', index='model', columns='strateji')
        sns.heatmap(pivot, annot=True, fmt='.1f', cmap='RdYlGn_r', center=0, ax=ax,
                    linewidths=0.5, cbar_kws={'label': 'Walk-Forward Sapma (%)'})
        ax.set_title('Walk-Forward Dürüstlük Sapması (%) — 4 Model × 5 Strateji', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('output/figures/fig4_honesty.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig4_honesty.png")

    # Figure 5: how much the random split inflates F1 relative to the
    # chronological and topological splits.
    fig, ax = plt.subplots(figsize=(14, 6))
    inf_data = []
    for mn in model_names:
        row_rand = res_df[(res_df['model'] == mn) & (res_df['strateji'] == 'Rastgele')]
        row_chr = res_df[(res_df['model'] == mn) & (res_df['strateji'] == 'Kronolojik')]
        row_topo = res_df[(res_df['model'] == mn) & (res_df['strateji'] == 'Topolojik Kırılma')]
        if len(row_rand) > 0 and len(row_chr) > 0:
            rand_f1 = row_rand['f1'].values[0]
            chr_f1 = row_chr['f1'].values[0]
            topo_f1 = row_topo['f1'].values[0] if len(row_topo) > 0 else 0
            inf_data.append({
                'model': mn,
                # A zero baseline would give inf/NaN; report 0 like the
                # topological column does.
                'Rastgele vs Kronolojik': round((rand_f1 - chr_f1) / chr_f1 * 100, 1) if chr_f1 > 0 else 0,
                'Rastgele vs Topolojik': round((rand_f1 - topo_f1) / topo_f1 * 100, 1) if topo_f1 > 0 else 0,
            })
    if inf_data:
        inf_df = pd.DataFrame(inf_data).set_index('model')
        sns.heatmap(inf_df, annot=True, fmt='.1f', cmap='Reds', ax=ax, linewidths=0.5,
                    cbar_kws={'label': 'Şişme Oranı (%)'})
        ax.set_title('Rastgele Bölme Performans Şişmesi (%)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('output/figures/fig5_inflation.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig5_inflation.png")

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------
    elapsed = time.time() - start_time

    summary = {
        'veri': {'toplam': N, 'etiketli': len(y), 'illicit': int(y.sum()),
                 'ozellik': int(X_final.shape[1]), 'kenar': len(valid_edges)},
        'temizleme': {'nan': int(nan_count), 'inf': int(inf_count),
                      'outlier_pct': round(outlier_mask.sum()/(X.shape[0]*X.shape[1])*100, 2),
                      'cikarilan_ozellik': int((~var_mask).sum()), 'en_iyi_pipeline': best_pipe},
        'kirilma': {'saglik_yontemi': int(bp_final), 'final': int(bp_final)},
        'walk_forward': wf_results,
        'sonuclar': res_df.to_dict(orient='records'),
        'pipeline_karsilastirma': {k: round(v, 4) for k, v in pipelines.items()},
        'durustukluk': honesty_data,
        'sure_dakika': round(elapsed / 60, 1),
    }

    # ensure_ascii=False writes Turkish letters and emoji verbatim, so the
    # file must be opened as UTF-8 instead of the platform default encoding.
    with open('output/results/summary.json', 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print("\n" + "=" * 70)
    print(f"TAMAMLANDI! (Süre: {elapsed/60:.1f} dakika)")
    print("=" * 70)

    print(f"\n ═══ SONUÇ TABLOSU (Illicit F1) ═══")
    pivot_f1 = res_df.pivot_table(values='f1', index='model', columns='strateji')
    print(pivot_f1.to_string())

    print(f"\n ═══ WALK-FORWARD REFERANS ═══")
    for mn, f1 in wf_results.items():
        print(f" {mn}: {f1:.4f}")

    print(f"\n Çıktılar: output/results/ ve output/figures/")
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: the only option is the dataset directory,
    # which defaults to ./dataset.
    cli = argparse.ArgumentParser()
    cli.add_argument('--data_dir', type=str, default='./dataset')
    main(cli.parse_args().data_dir)
|
|