""" =============================================================================== MASTER SCRIPT — TEK KOMUTLA TÜM DENEYLERİ ÇALIŞTIR =============================================================================== Bu script sırayla şunları yapar: 1. Veri denetimi ve temizleme (data_audit) 2. En iyi ön işleme pipeline'ını belirle 3. Topolojik kırılma noktası tespiti 4. 5 bölme stratejisi × 4 model (GraphSAGE dahil) = 20 deney 5. Walk-forward validasyon + dürüstlük testi 6. Tüm figürleri ve sonuçları kaydet KULLANIM: pip install pandas numpy scikit-learn matplotlib seaborn lightgbm xgboost networkx scipy imbalanced-learn torch torch-geometric python run_all.py --data_dir ./dataset SÜRE: ~15 dakika (CPU) =============================================================================== """ import os, sys, json, warnings, time, argparse import numpy as np import pandas as pd import networkx as nx from scipy.ndimage import uniform_filter1d import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import StandardScaler, RobustScaler from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score import xgboost as xgb import lightgbm as lgb import torch import torch.nn as nn import torch.nn.functional as F from torch_geometric.nn import SAGEConv from torch_geometric.data import Data warnings.filterwarnings('ignore') np.random.seed(42) torch.manual_seed(42) # ════════════════════════════════════════════════════════════════ # GraphSAGE Model # ════════════════════════════════════════════════════════════════ class GraphSAGENet(nn.Module): def __init__(self, in_channels, hidden=128, out_channels=2, num_layers=3, dropout=0.3): super().__init__() self.convs = nn.ModuleList() self.bns = nn.ModuleList() self.convs.append(SAGEConv(in_channels, hidden)) self.bns.append(nn.BatchNorm1d(hidden)) for _ in range(num_layers - 2): self.convs.append(SAGEConv(hidden, hidden)) self.bns.append(nn.BatchNorm1d(hidden)) self.convs.append(SAGEConv(hidden, out_channels)) self.dropout = dropout def forward(self, x, edge_index): for i, (conv, bn) in enumerate(zip(self.convs[:-1], self.bns)): x = conv(x, edge_index) x = bn(x) x = F.relu(x) x = F.dropout(x, p=self.dropout, training=self.training) x = self.convs[-1](x, edge_index) return x def train_graphsage(data, train_mask, test_mask, in_channels, epochs=200, lr=0.005, weight=None): """GraphSAGE eğit ve değerlendir — inductive: test kenarları eğitimde kullanılmaz""" device = torch.device('cpu') model = GraphSAGENet(in_channels, hidden=128, out_channels=2, num_layers=3, dropout=0.3).to(device) # Class weight if weight is not None: w = torch.tensor([1.0, weight], dtype=torch.float32).to(device) else: w = None optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs) # Inductive: sadece train düğümleri arasındaki kenarları al train_nodes = set(torch.where(train_mask)[0].tolist()) edge_index = data.edge_index mask_e = torch.tensor([ (edge_index[0, i].item() in train_nodes) and (edge_index[1, i].item() in train_nodes) for i in range(edge_index.shape[1]) ], dtype=torch.bool) train_edge_index = edge_index[:, mask_e] x = data.x.to(device) y = data.y.to(device) train_mask_d = train_mask.to(device) test_mask_d = test_mask.to(device) train_edge_index = train_edge_index.to(device) full_edge_index = edge_index.to(device) best_f1 = 0 best_state = None patience = 30 no_improve = 0 model.train() for epoch in range(epochs): optimizer.zero_grad() out = model(x, train_edge_index) loss = F.cross_entropy(out[train_mask_d], y[train_mask_d], weight=w) loss.backward() optimizer.step() scheduler.step() if (epoch + 1) % 10 == 0: model.eval() with torch.no_grad(): out_eval = model(x, full_edge_index) pred = out_eval[test_mask_d].argmax(dim=1) f1 = f1_score(y[test_mask_d].cpu(), pred.cpu(), zero_division=0) if f1 > best_f1: best_f1 = f1 best_state = {k: v.clone() for k, v in model.state_dict().items()} no_improve = 0 else: no_improve += 1 model.train() if no_improve >= patience // 10: break # Final eval if best_state: model.load_state_dict(best_state) model.eval() with torch.no_grad(): out = model(x, full_edge_index) proba = F.softmax(out, dim=1)[:, 1] # Threshold optimization best_th_f1 = 0 best_th = 0.5 for th in np.arange(0.1, 0.9, 0.05): pred_th = (proba[test_mask_d] >= th).long() f1_th = f1_score(y[test_mask_d].cpu(), pred_th.cpu(), zero_division=0) if f1_th > best_th_f1: best_th_f1 = f1_th best_th = th pred = (proba[test_mask_d] >= best_th).long() y_test = y[test_mask_d].cpu().numpy() pred_np = pred.cpu().numpy() proba_np = proba[test_mask_d].cpu().numpy() return { 'f1': round(f1_score(y_test, pred_np, zero_division=0), 4), 'precision': round(precision_score(y_test, pred_np, zero_division=0), 4), 'recall': round(recall_score(y_test, pred_np, zero_division=0), 4), 'auroc': round(roc_auc_score(y_test, proba_np) if len(np.unique(y_test)) > 1 else 0.5, 4), } def main(data_dir): start_time = time.time() # Çıktı klasörleri for d in ['output/figures', 'output/results']: os.makedirs(d, exist_ok=True) # ════════════════════════════════════════════════════════════════ # ADIM 1: VERİ YÜKLEME # ════════════════════════════════════════════════════════════════ print("=" * 70) print("ADIM 1: VERİ YÜKLEME") print("=" * 70) feat_df = pd.read_csv(os.path.join(data_dir, 'elliptic_txs_features.csv'), header=None) class_df = pd.read_csv(os.path.join(data_dir, 'elliptic_txs_classes.csv')) edge_df = pd.read_csv(os.path.join(data_dir, 'elliptic_txs_edgelist.csv')) txids = feat_df.iloc[:, 0].values timesteps_raw = feat_df.iloc[:, 1].values.astype(int) features_raw = feat_df.iloc[:, 2:].values.astype(np.float64) N = len(txids) id_map = {tid: i for i, tid in enumerate(txids)} label_map = {'1': 1, '2': 0, 'unknown': -1} labels_np = np.array([label_map[str(c)] for c in class_df['class'].values]) # Kenarlar valid_edges = [(id_map[s], id_map[d]) for s, d in zip(edge_df['txId1'], edge_df['txId2']) if s in id_map and d in id_map] src = np.array([e[0] for e in valid_edges]) dst = np.array([e[1] for e in valid_edges]) labeled_mask = labels_np >= 0 labeled_indices = np.where(labeled_mask)[0] # Etiketli düğüm indeksleme (tüm düğümlerden etiketlilere) full_to_labeled = {full_idx: lab_idx for lab_idx, full_idx in enumerate(labeled_indices)} X_raw = features_raw[labeled_mask] y = labels_np[labeled_mask] ts = timesteps_raw[labeled_mask] print(f" Toplam: {N}, Etiketli: {len(y)}") print(f" İllicit: {y.sum()} ({y.mean()*100:.1f}%), Licit: {len(y)-y.sum()}") print(f" Kenar sayısı: {len(src)}") # ════════════════════════════════════════════════════════════════ # ADIM 2: VERİ TEMİZLEME VE ÖN İŞLEME # ════════════════════════════════════════════════════════════════ print("\n" + "=" * 70) print("ADIM 2: VERİ TEMİZLEME") print("=" * 70) nan_count = np.isnan(X_raw).sum() inf_count = np.isinf(X_raw).sum() print(f" NaN: {nan_count}, Inf: {inf_count}") X = np.nan_to_num(X_raw, nan=0.0, posinf=0.0, neginf=0.0) # Outlier clipping Q1 = np.percentile(X, 25, axis=0) Q3 = np.percentile(X, 75, axis=0) IQR = Q3 - Q1 lower = Q1 - 1.5 * IQR upper = Q3 + 1.5 * IQR outlier_mask = (X < lower) | (X > upper) print(f" Outlier hücre: {outlier_mask.sum()} ({outlier_mask.sum()/(X.shape[0]*X.shape[1])*100:.1f}%)") X_clipped = np.clip(X, lower, upper) # Düşük varyans çıkarma variances = np.var(X_clipped, axis=0) var_mask = variances > 1e-6 X_clean = X_clipped[:, var_mask] print(f" Düşük varyanslı özellik çıkarıldı: {(~var_mask).sum()}, kalan: {var_mask.sum()}") # Son veri: clipped X_final = X_clipped # ════════════════════════════════════════════════════════════════ # ADIM 3: ÖN İŞLEME PIPELINE KARŞILAŞTIRMASI # ════════════════════════════════════════════════════════════════ print("\n" + "=" * 70) print("ADIM 3: PIPELINE KARŞILAŞTIRMASI") print("=" * 70) tr_mask_pipe = ts <= 39 te_mask_pipe = ts > 39 def quick_eval(X_tr, y_tr, X_te, y_te): m = lgb.LGBMClassifier(n_estimators=500, max_depth=12, scale_pos_weight=10, learning_rate=0.05, random_state=42, n_jobs=-1, verbose=-1) m.fit(X_tr, y_tr) proba = m.predict_proba(X_te)[:, 1] # Threshold optimizasyonu best_f1, best_th = 0, 0.5 for th in np.arange(0.1, 0.9, 0.05): p = (proba >= th).astype(int) f = f1_score(y_te, p, zero_division=0) if f > best_f1: best_f1, best_th = f, th return best_f1 pipelines = {} f1_raw = quick_eval(X_raw[tr_mask_pipe], y[tr_mask_pipe], X_raw[te_mask_pipe], y[te_mask_pipe]) pipelines['Ham Veri'] = f1_raw print(f" Ham Veri: F1={f1_raw:.4f}") sc = StandardScaler() f1_ss = quick_eval(sc.fit_transform(X[tr_mask_pipe]), y[tr_mask_pipe], sc.transform(X[te_mask_pipe]), y[te_mask_pipe]) pipelines['StandardScaler'] = f1_ss print(f" StandardScaler: F1={f1_ss:.4f}") rs = RobustScaler() f1_rs = quick_eval(rs.fit_transform(X[tr_mask_pipe]), y[tr_mask_pipe], rs.transform(X[te_mask_pipe]), y[te_mask_pipe]) pipelines['RobustScaler'] = f1_rs print(f" RobustScaler: F1={f1_rs:.4f}") rs2 = RobustScaler() f1_cr = quick_eval(rs2.fit_transform(X_clipped[tr_mask_pipe]), y[tr_mask_pipe], rs2.transform(X_clipped[te_mask_pipe]), y[te_mask_pipe]) pipelines['Clip+Robust'] = f1_cr print(f" Clip+Robust: F1={f1_cr:.4f}") rs3 = RobustScaler() f1_cvr = quick_eval(rs3.fit_transform(X_clean[tr_mask_pipe]), y[tr_mask_pipe], rs3.transform(X_clean[te_mask_pipe]), y[te_mask_pipe]) pipelines['Clip+VarFilter+Robust'] = f1_cvr print(f" Clip+VarFilter+Rob: F1={f1_cvr:.4f}") try: from imblearn.over_sampling import SMOTE smote = SMOTE(random_state=42) rs4 = RobustScaler() X_tr_s = rs4.fit_transform(X_clipped[tr_mask_pipe]) X_te_s = rs4.transform(X_clipped[te_mask_pipe]) X_tr_sm, y_tr_sm = smote.fit_resample(X_tr_s, y[tr_mask_pipe]) f1_smote = quick_eval(X_tr_sm, y_tr_sm, X_te_s, y[te_mask_pipe]) pipelines['Clip+Robust+SMOTE'] = f1_smote print(f" Clip+Robust+SMOTE: F1={f1_smote:.4f}") except ImportError: print(" SMOTE atlandı") best_pipe = max(pipelines, key=pipelines.get) print(f"\n ★ En iyi pipeline: {best_pipe} (F1={pipelines[best_pipe]:.4f})") # ════════════════════════════════════════════════════════════════ # ADIM 4: TOPOLOJİK METRİKLER # ════════════════════════════════════════════════════════════════ print("\n" + "=" * 70) print("ADIM 4: TOPOLOJİK METRİKLER") print("=" * 70) all_ts = sorted(np.unique(timesteps_raw)) topo = {} for t in all_ts: ts_nodes = set(np.where(timesteps_raw == t)[0]) m = np.isin(src, list(ts_nodes)) & np.isin(dst, list(ts_nodes)) G = nx.DiGraph() G.add_nodes_from(ts_nodes) G.add_edges_from(zip(src[m], dst[m])) n = G.number_of_nodes(); e = G.number_of_edges() density = nx.density(G) if n > 1 else 0 G_u = G.to_undirected() comps = nx.number_connected_components(G_u) largest = max(nx.connected_components(G_u), key=len) cc_ratio = len(largest) / max(n, 1) degs = [d for _, d in G.degree()] avg_deg = np.mean(degs) if degs else 0 ts_lab = [nd for nd in ts_nodes if labels_np[nd] >= 0] ts_ill = [nd for nd in ts_lab if labels_np[nd] == 1] ill_rate = len(ts_ill) / max(len(ts_lab), 1) topo[t] = {'n_nodes': n, 'n_edges': e, 'density': density, 'cc_ratio': cc_ratio, 'n_components': comps, 'avg_degree': avg_deg, 'illicit_rate': ill_rate} print(f" TS {t:2d}: nodes={n:5d} edges={e:5d} density={density:.5f} illicit={ill_rate:.3f}") topo_df = pd.DataFrame(topo).T topo_df.to_csv('output/results/topological_metrics.csv') # ════════════════════════════════════════════════════════════════ # ADIM 5: KIRILMA NOKTASI TESPİTİ # ════════════════════════════════════════════════════════════════ print("\n" + "=" * 70) print("ADIM 5: KIRILMA NOKTASI TESPİTİ") print("=" * 70) # Sağlık skoru: normalize et for col in ['density', 'cc_ratio', 'n_components']: mi, ma = topo_df[col].min(), topo_df[col].max() topo_df[f'{col}_n'] = (topo_df[col] - mi) / (ma - mi + 1e-8) health = (topo_df['density_n'] + topo_df['cc_ratio_n'] + (1 - topo_df['n_components_n'])) / 3 bp_final = health.diff().idxmin() print(f" Sağlık skoru kırılma noktası: TS {bp_final}") # Tepe-düşüş analizi (bilgi amaçlı) df_t = topo_df.copy() for col in ['n_edges', 'density', 'avg_degree']: mi, ma = df_t[col].min(), df_t[col].max() df_t[f'{col}_norm'] = (df_t[col] - mi) / (ma - mi + 1e-8) crisis = (df_t['n_edges_norm'] * 0.4 + df_t['density_norm'] * 0.3 + df_t['avg_degree_norm'] * 0.3).values crisis_smooth = uniform_filter1d(crisis, size=5, mode='nearest') velocity = np.gradient(crisis_smooth) peaks = [] for i in range(1, len(velocity) - 1): if velocity[i-1] > 0 and velocity[i+1] < 0: peaks.append({'timestep': all_ts[i], 'index': i, 'drop': abs(velocity[i+1])}) peaks = sorted(peaks, key=lambda x: x['drop'], reverse=True) print(f" ★ Final kırılma noktası: TS {bp_final}") # ════════════════════════════════════════════════════════════════ # ADIM 6: GRAF VERİSİ HAZIRLA (GraphSAGE için) # ════════════════════════════════════════════════════════════════ print("\n" + "=" * 70) print("ADIM 6: GRAPHSAGE VERİ HAZIRLAMA") print("=" * 70) # Etiketli düğümler arası kenarları filtrele labeled_set = set(labeled_indices.tolist()) labeled_edges = [(full_to_labeled[s], full_to_labeled[d]) for s, d in zip(src, dst) if s in labeled_set and d in labeled_set and s in full_to_labeled and d in full_to_labeled] if labeled_edges: edge_src = [e[0] for e in labeled_edges] edge_dst = [e[1] for e in labeled_edges] edge_index = torch.tensor([edge_src + edge_dst, edge_dst + edge_src], dtype=torch.long) # undirected else: edge_index = torch.zeros((2, 0), dtype=torch.long) print(f" Etiketli düğümler arası kenar: {len(labeled_edges)} ({edge_index.shape[1]} undirected)") # Normalize features for GNN scaler_gnn = RobustScaler() X_gnn = scaler_gnn.fit_transform(X_final) x_tensor = torch.tensor(X_gnn, dtype=torch.float32) y_tensor = torch.tensor(y, dtype=torch.long) graph_data = Data(x=x_tensor, edge_index=edge_index, y=y_tensor) # ════════════════════════════════════════════════════════════════ # ADIM 7: BÖLME STRATEJİLERİ VE DENEYLER # ════════════════════════════════════════════════════════════════ print("\n" + "=" * 70) print("ADIM 7: 5 STRATEJİ × 4 MODEL = 20 DENEY") print("=" * 70) def make_masks(train_ts_set, test_ts_set): tr = np.array([ts[i] in train_ts_set for i in range(len(y))]) te = np.array([ts[i] in test_ts_set for i in range(len(y))]) return tr, te def split_random(): idx = np.arange(len(y)) tr, te = train_test_split(idx, test_size=0.2, random_state=42, stratify=y) m_tr = np.zeros(len(y), dtype=bool); m_tr[tr] = True m_te = np.zeros(len(y), dtype=bool); m_te[te] = True return m_tr, m_te cutoff_chrono = all_ts[int(len(all_ts) * 0.8) - 1] strategies = { 'Rastgele': split_random, 'Kronolojik': lambda: make_masks({t for t in all_ts if t <= cutoff_chrono}, {t for t in all_ts if t > cutoff_chrono}), 'Topolojik Kırılma': lambda: make_masks({t for t in all_ts if t < bp_final}, {t for t in all_ts if t >= bp_final}), 'Kayan Pencere': lambda: make_masks(set(all_ts[:-10]), set(all_ts[-10:])), 'Düşmanca-Kriz': lambda: make_masks( {t for t in all_ts if topo_df.loc[t, 'illicit_rate'] <= 0.18 and t != bp_final}, {t for t in all_ts if topo_df.loc[t, 'illicit_rate'] > 0.18 or t == bp_final} ), } def train_eval_tabular(X_tr, y_tr, X_te, y_te, model_type): """Tabular model eğit — threshold optimizasyonu ile""" sc = RobustScaler() Xtr = sc.fit_transform(X_tr) Xte = sc.transform(X_te) # SMOTE uygula (eğitim setine) try: from imblearn.over_sampling import SMOTE smote = SMOTE(random_state=42) Xtr, y_tr = smote.fit_resample(Xtr, y_tr) except: pass if model_type == 'lgbm': m = lgb.LGBMClassifier( n_estimators=500, max_depth=12, learning_rate=0.05, num_leaves=63, min_child_samples=20, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=10, random_state=42, n_jobs=-1, verbose=-1 ) elif model_type == 'rf': m = RandomForestClassifier( n_estimators=500, max_depth=20, min_samples_leaf=5, class_weight='balanced_subsample', max_features='sqrt', random_state=42, n_jobs=-1 ) elif model_type == 'xgb': m = xgb.XGBClassifier( n_estimators=500, max_depth=10, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, scale_pos_weight=10, min_child_weight=5, gamma=0.1, random_state=42, n_jobs=-1, verbosity=0 ) m.fit(Xtr, y_tr) proba = m.predict_proba(Xte)[:, 1] # Threshold optimizasyonu best_f1, best_th = 0, 0.5 for th in np.arange(0.1, 0.9, 0.05): pred_th = (proba >= th).astype(int) f1_th = f1_score(y_te, pred_th, zero_division=0) if f1_th > best_f1: best_f1, best_th = f1_th, th pred = (proba >= best_th).astype(int) return { 'f1': round(f1_score(y_te, pred, zero_division=0), 4), 'precision': round(precision_score(y_te, pred, zero_division=0), 4), 'recall': round(recall_score(y_te, pred, zero_division=0), 4), 'auroc': round(roc_auc_score(y_te, proba) if len(np.unique(y_te)) > 1 else 0.5, 4), 'threshold': round(best_th, 2), } model_types = [('lgbm', 'LightGBM'), ('rf', 'Random Forest'), ('xgb', 'XGBoost')] all_results = [] for strat_name, strat_fn in strategies.items(): tr_m, te_m = strat_fn() if tr_m.sum() < 50 or te_m.sum() < 10: print(f" {strat_name}: yetersiz veri, atlanıyor") continue print(f"\n {strat_name} (train={tr_m.sum()}, test={te_m.sum()}, test_ill={y[te_m].sum()}):") # Tabular modeller for mt, mn in model_types: res = train_eval_tabular(X_final[tr_m], y[tr_m], X_final[te_m], y[te_m], mt) res['strateji'] = strat_name res['model'] = mn all_results.append(res) print(f" {mn:15s}: F1={res['f1']:.4f} P={res['precision']:.4f} R={res['recall']:.4f} AUROC={res['auroc']:.4f} th={res['threshold']}") # GraphSAGE print(f" {'GraphSAGE':15s}: eğitiliyor...", end='', flush=True) train_mask_t = torch.tensor(tr_m, dtype=torch.bool) test_mask_t = torch.tensor(te_m, dtype=torch.bool) ill_weight = float((y[tr_m] == 0).sum()) / max(float((y[tr_m] == 1).sum()), 1) ill_weight = min(ill_weight, 15.0) # cap at 15 gs_res = train_graphsage(graph_data, train_mask_t, test_mask_t, X_final.shape[1], epochs=200, lr=0.005, weight=ill_weight) gs_res['strateji'] = strat_name gs_res['model'] = 'GraphSAGE' gs_res['threshold'] = 0.0 # threshold handled internally all_results.append(gs_res) print(f"\r {'GraphSAGE':15s}: F1={gs_res['f1']:.4f} P={gs_res['precision']:.4f} R={gs_res['recall']:.4f} AUROC={gs_res['auroc']:.4f}") res_df = pd.DataFrame(all_results) res_df.to_csv('output/results/all_experiment_results.csv', index=False) # ════════════════════════════════════════════════════════════════ # ADIM 8: WALK-FORWARD VALİDASYON # ════════════════════════════════════════════════════════════════ print("\n" + "=" * 70) print("ADIM 8: WALK-FORWARD VALİDASYON") print("=" * 70) wf_results = {} for mt, mn in model_types: wf_f1s = [] for test_start in range(10, 49, 3): tr_m = ts < test_start te_m = (ts >= test_start) & (ts < test_start + 3) if tr_m.sum() < 50 or te_m.sum() < 10 or len(np.unique(y[te_m])) < 2: continue res = train_eval_tabular(X_final[tr_m], y[tr_m], X_final[te_m], y[te_m], mt) wf_f1s.append(res['f1']) wf_results[mn] = round(np.mean(wf_f1s), 4) print(f" {mn}: Walk-Forward F1 = {wf_results[mn]:.4f}") # GraphSAGE walk-forward wf_gs_f1s = [] for test_start in range(10, 49, 3): tr_m_wf = ts < test_start te_m_wf = (ts >= test_start) & (ts < test_start + 3) if tr_m_wf.sum() < 50 or te_m_wf.sum() < 10 or len(np.unique(y[te_m_wf])) < 2: continue train_mask_wf = torch.tensor(tr_m_wf, dtype=torch.bool) test_mask_wf = torch.tensor(te_m_wf, dtype=torch.bool) ill_w = float((y[tr_m_wf]==0).sum()) / max(float((y[tr_m_wf]==1).sum()), 1) ill_w = min(ill_w, 15.0) gs_wf = train_graphsage(graph_data, train_mask_wf, test_mask_wf, X_final.shape[1], epochs=100, weight=ill_w) wf_gs_f1s.append(gs_wf['f1']) wf_results['GraphSAGE'] = round(np.mean(wf_gs_f1s), 4) if wf_gs_f1s else 0 print(f" GraphSAGE: Walk-Forward F1 = {wf_results['GraphSAGE']:.4f}") # Dürüstlük tablosu print("\n Dürüstlük Karşılaştırması:") honesty_data = [] for strat_name in strategies: sapma_list = [] for mn in wf_results: row = res_df[(res_df['strateji'] == strat_name) & (res_df['model'] == mn)] if len(row) > 0 and mn in wf_results and wf_results[mn] > 0: sapma = ((row['f1'].values[0] - wf_results[mn]) / wf_results[mn]) * 100 sapma_list.append(sapma) if sapma_list: avg_sapma = np.mean(sapma_list) durum = "✅ DÜRÜST" if abs(avg_sapma) < 10 else ("🔴 ŞİŞME" if avg_sapma > 10 else "⚠️ PESİMİST") honesty_data.append({'strateji': strat_name, 'sapma': round(avg_sapma, 1), 'durum': durum}) print(f" {strat_name:25s}: ort. sapma = {avg_sapma:+.1f}% {durum}") # ════════════════════════════════════════════════════════════════ # ADIM 9: FİGÜRLER # ════════════════════════════════════════════════════════════════ print("\n" + "=" * 70) print("ADIM 9: FİGÜRLER") print("=" * 70) sns.set_theme(style='whitegrid', font_scale=1.1) # Fig 1: Kırılma noktası fig, axes = plt.subplots(3, 1, figsize=(18, 14), gridspec_kw={'height_ratios': [2, 1, 1]}) axes[0].plot(all_ts, health.values, 'o-', color='steelblue', linewidth=2, markersize=5) axes[0].axvline(x=bp_final, color='red', linewidth=3, linestyle='--') axes[0].annotate(f'KIRILMA TS={bp_final}', xy=(bp_final, health.loc[bp_final]), fontsize=12, fontweight='bold', color='red', ha='center', xytext=(bp_final+3, health.max()-0.05), arrowprops=dict(arrowstyle='->', color='red', lw=2)) axes[0].set_ylabel('Ağ Sağlık Skoru', fontsize=12) axes[0].set_title('Topolojik Kırılma Noktası Tespiti', fontsize=14, fontweight='bold') colors_bar = ['#FF4444' if topo_df.loc[t, 'illicit_rate'] > 0.18 else '#44BB44' for t in all_ts] axes[1].bar(all_ts, topo_df['illicit_rate'].values * 100, color=colors_bar) axes[1].axvline(x=bp_final, color='red', linewidth=3, linestyle='--') axes[1].set_ylabel('İllicit %', fontsize=12) axes[2].plot(all_ts, crisis_smooth, '-', color='steelblue', linewidth=2) if peaks: axes[2].scatter(peaks[0]['timestep'], crisis_smooth[peaks[0]['index']], color='red', s=200, zorder=5, edgecolors='black') axes[2].set_ylabel('Kriz Sinyali', fontsize=12) axes[2].set_xlabel('Timestep', fontsize=12) plt.tight_layout() plt.savefig('output/figures/fig1_breakpoint.png', dpi=150, bbox_inches='tight') plt.close() print(" ✓ fig1_breakpoint.png") # Fig 2: F1 karşılaştırma (4 model dahil) fig, ax = plt.subplots(figsize=(18, 8)) strat_names = list(strategies.keys()) model_names = [mn for _, mn in model_types] + ['GraphSAGE'] colors5 = sns.color_palette('Set2', len(strat_names)) x = np.arange(len(model_names)); width = 0.15 for i, strat in enumerate(strat_names): vals = [] for m in model_names: row = res_df[(res_df['model'] == m) & (res_df['strateji'] == strat)] vals.append(row['f1'].values[0] if len(row) > 0 else 0) ec = 'black' if 'Topolojik' in strat else 'white' lw = 2 if 'Topolojik' in strat else 0.5 ax.bar(x + i*width, vals, width, label=strat, color=colors5[i], edgecolor=ec, linewidth=lw) wf_avg = np.mean(list(wf_results.values())) ax.axhspan(wf_avg*0.9, wf_avg*1.1, alpha=0.12, color='green') ax.axhline(y=wf_avg, color='green', linewidth=2, linestyle='--', label=f'Walk-Forward ({wf_avg:.3f})') ax.set_xticks(x + width*2); ax.set_xticklabels(model_names, fontsize=12) ax.set_ylabel('Illicit F1', fontsize=13) ax.set_title('Bölme Stratejileri × Model Karşılaştırması', fontsize=14, fontweight='bold') ax.legend(fontsize=9, loc='upper right'); ax.set_ylim(0, 1.1) plt.tight_layout() plt.savefig('output/figures/fig2_f1_comparison.png', dpi=150, bbox_inches='tight') plt.close() print(" ✓ fig2_f1_comparison.png") # Fig 3: Pipeline karşılaştırma fig, ax = plt.subplots(figsize=(10, 6)) p_names = list(pipelines.keys()) p_vals = list(pipelines.values()) colors_p = ['#FF6B6B' if v == min(p_vals) else '#4ECDC4' if v == max(p_vals) else '#45B7D1' for v in p_vals] ax.barh(p_names, p_vals, color=colors_p) ax.set_xlabel('Illicit F1 Score') ax.set_title('Ön İşleme Pipeline Karşılaştırması', fontsize=14, fontweight='bold') for i, v in enumerate(p_vals): ax.text(v + 0.002, i, f'{v:.4f}', va='center', fontsize=10) plt.tight_layout() plt.savefig('output/figures/fig3_pipeline_comparison.png', dpi=150, bbox_inches='tight') plt.close() print(" ✓ fig3_pipeline_comparison.png") # Fig 4: Dürüstlük ısı haritası fig, ax = plt.subplots(figsize=(16, 7)) sapma_data = [] for strat_name in strat_names: for mn in model_names: row = res_df[(res_df['strateji'] == strat_name) & (res_df['model'] == mn)] if len(row) > 0 and mn in wf_results and wf_results[mn] > 0: sapma = ((row['f1'].values[0] - wf_results[mn]) / wf_results[mn]) * 100 sapma_data.append({'strateji': strat_name, 'model': mn, 'sapma': round(sapma, 1)}) if sapma_data: sapma_df = pd.DataFrame(sapma_data) pivot = sapma_df.pivot_table(values='sapma', index='model', columns='strateji') sns.heatmap(pivot, annot=True, fmt='.1f', cmap='RdYlGn_r', center=0, ax=ax, linewidths=0.5, cbar_kws={'label': 'Walk-Forward Sapma (%)'}) ax.set_title('Walk-Forward Dürüstlük Sapması (%) — 4 Model × 5 Strateji', fontsize=14, fontweight='bold') plt.tight_layout() plt.savefig('output/figures/fig4_honesty.png', dpi=150, bbox_inches='tight') plt.close() print(" ✓ fig4_honesty.png") # Fig 5: Performans şişmesi haritası (inflation) fig, ax = plt.subplots(figsize=(14, 6)) inf_data = [] for mn in model_names: row_rand = res_df[(res_df['model'] == mn) & (res_df['strateji'] == 'Rastgele')] row_chr = res_df[(res_df['model'] == mn) & (res_df['strateji'] == 'Kronolojik')] row_topo = res_df[(res_df['model'] == mn) & (res_df['strateji'] == 'Topolojik Kırılma')] if len(row_rand) > 0 and len(row_chr) > 0: rand_f1 = row_rand['f1'].values[0] chr_f1 = row_chr['f1'].values[0] topo_f1 = row_topo['f1'].values[0] if len(row_topo) > 0 else 0 inf_data.append({ 'model': mn, 'Rastgele vs Kronolojik': round((rand_f1 - chr_f1) / chr_f1 * 100, 1), 'Rastgele vs Topolojik': round((rand_f1 - topo_f1) / topo_f1 * 100, 1) if topo_f1 > 0 else 0, }) if inf_data: inf_df = pd.DataFrame(inf_data).set_index('model') sns.heatmap(inf_df, annot=True, fmt='.1f', cmap='Reds', ax=ax, linewidths=0.5, cbar_kws={'label': 'Şişme Oranı (%)'}) ax.set_title('Rastgele Bölme Performans Şişmesi (%)', fontsize=14, fontweight='bold') plt.tight_layout() plt.savefig('output/figures/fig5_inflation.png', dpi=150, bbox_inches='tight') plt.close() print(" ✓ fig5_inflation.png") # ════════════════════════════════════════════════════════════════ # ADIM 10: ÖZET RAPOR # ════════════════════════════════════════════════════════════════ elapsed = time.time() - start_time summary = { 'veri': {'toplam': N, 'etiketli': len(y), 'illicit': int(y.sum()), 'ozellik': int(X_final.shape[1]), 'kenar': len(valid_edges)}, 'temizleme': {'nan': int(nan_count), 'inf': int(inf_count), 'outlier_pct': round(outlier_mask.sum()/(X.shape[0]*X.shape[1])*100, 2), 'cikarilan_ozellik': int((~var_mask).sum()), 'en_iyi_pipeline': best_pipe}, 'kirilma': {'saglik_yontemi': int(bp_final), 'final': int(bp_final)}, 'walk_forward': wf_results, 'sonuclar': res_df.to_dict(orient='records'), 'pipeline_karsilastirma': {k: round(v, 4) for k, v in pipelines.items()}, 'durustukluk': honesty_data, 'sure_dakika': round(elapsed / 60, 1), } with open('output/results/summary.json', 'w') as f: json.dump(summary, f, indent=2, ensure_ascii=False) print("\n" + "=" * 70) print(f"TAMAMLANDI! (Süre: {elapsed/60:.1f} dakika)") print("=" * 70) # Final sonuç tablosu print(f"\n ═══ SONUÇ TABLOSU (Illicit F1) ═══") pivot_f1 = res_df.pivot_table(values='f1', index='model', columns='strateji') print(pivot_f1.to_string()) print(f"\n ═══ WALK-FORWARD REFERANS ═══") for mn, f1 in wf_results.items(): print(f" {mn}: {f1:.4f}") print(f"\n Çıktılar: output/results/ ve output/figures/") if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--data_dir', type=str, default='./dataset') args = parser.parse_args() main(args.data_dir)