MucahitSylmz committed on
Commit
c2ce770
·
verified ·
1 Parent(s): ee8165c

Update run_all.py: GraphSAGE + TS29 + SMOTE + threshold opt

Browse files
Files changed (1) hide show
  1. run_all.py +356 -112
run_all.py CHANGED
@@ -7,17 +7,15 @@ Bu script sırayla şunları yapar:
7
  1. Veri denetimi ve temizleme (data_audit)
8
  2. En iyi ön işleme pipeline'ını belirle
9
  3. Topolojik kırılma noktası tespiti
10
- 4. 5 bölme stratejisi × 4 model = 20 deney
11
- 5. 6 kanıt testi (walk-forward, rastgele etiket, dürüstlük, vb.)
12
- 6. Sızıntı haritası
13
- 7. Tepe-düşüş kriz analizi
14
- 8. Tüm figürleri ve sonuçları kaydet
15
 
16
  KULLANIM:
17
  pip install pandas numpy scikit-learn matplotlib seaborn lightgbm xgboost networkx scipy imbalanced-learn torch torch-geometric
18
  python run_all.py --data_dir ./dataset
19
 
20
- SÜRE: ~30 dakika (CPU)
21
  ===============================================================================
22
  """
23
 
@@ -40,14 +38,139 @@ from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_sco
40
  import xgboost as xgb
41
  import lightgbm as lgb
42
 
 
 
 
 
 
 
43
  warnings.filterwarnings('ignore')
44
  np.random.seed(42)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  def main(data_dir):
47
  start_time = time.time()
48
 
49
  # Çıktı klasörleri
50
- for d in ['output/figures', 'output/results', 'output/gephi_data']:
51
  os.makedirs(d, exist_ok=True)
52
 
53
  # ════════════════════════════════════════════════════════════════
@@ -70,17 +193,25 @@ def main(data_dir):
70
  label_map = {'1': 1, '2': 0, 'unknown': -1}
71
  labels_np = np.array([label_map[str(c)] for c in class_df['class'].values])
72
 
73
- src = np.array([id_map[t] for t in edge_df['txId1'].values if t in id_map])
74
- dst = np.array([id_map[t] for t in edge_df['txId2'].values if t in id_map])
75
- ml = min(len(src), len(dst)); src, dst = src[:ml], dst[:ml]
 
 
76
 
77
  labeled_mask = labels_np >= 0
 
 
 
 
 
78
  X_raw = features_raw[labeled_mask]
79
  y = labels_np[labeled_mask]
80
  ts = timesteps_raw[labeled_mask]
81
 
82
  print(f" Toplam: {N}, Etiketli: {len(y)}")
83
  print(f" Δ°llicit: {y.sum()} ({y.mean()*100:.1f}%), Licit: {len(y)-y.sum()}")
 
84
 
85
  # ════════════════════════════════════════════════════════════════
86
  # ADIM 2: VERİ TEMİZLEME VE ÖN İŞLEME
@@ -89,13 +220,12 @@ def main(data_dir):
89
  print("ADIM 2: VERΔ° TEMΔ°ZLEME")
90
  print("=" * 70)
91
 
92
- # NaN/Inf temizleme
93
  nan_count = np.isnan(X_raw).sum()
94
  inf_count = np.isinf(X_raw).sum()
95
  print(f" NaN: {nan_count}, Inf: {inf_count}")
96
  X = np.nan_to_num(X_raw, nan=0.0, posinf=0.0, neginf=0.0)
97
 
98
- # Outlier analizi
99
  Q1 = np.percentile(X, 25, axis=0)
100
  Q3 = np.percentile(X, 75, axis=0)
101
  IQR = Q3 - Q1
@@ -104,93 +234,82 @@ def main(data_dir):
104
  outlier_mask = (X < lower) | (X > upper)
105
  print(f" Outlier hΓΌcre: {outlier_mask.sum()} ({outlier_mask.sum()/(X.shape[0]*X.shape[1])*100:.1f}%)")
106
 
107
- # İllicit vs Licit outlier karşılaştırması
108
- ill_out = outlier_mask[y==1].sum(axis=1).mean()
109
- lic_out = outlier_mask[y==0].sum(axis=1).mean()
110
- print(f" Δ°llicit ort. outlier: {ill_out:.1f}, Licit ort. outlier: {lic_out:.1f}")
111
-
112
- # Outlier clipping (IQR yΓΆntemi)
113
  X_clipped = np.clip(X, lower, upper)
114
- print(f" βœ“ Outlier clipping uygulandΔ± (IQR yΓΆntemi)")
115
 
116
- # Düşük varyans özellik çıkarma
117
  variances = np.var(X_clipped, axis=0)
118
  var_mask = variances > 1e-6
119
  X_clean = X_clipped[:, var_mask]
120
- print(f" βœ“ Düşük varyanslΔ± ΓΆzellikler Γ§Δ±karΔ±ldΔ±: {(~var_mask).sum()} Γ§Δ±karΔ±ldΔ±, {var_mask.sum()} kaldΔ±")
 
 
 
121
 
122
  # ════════════════════════════════════════════════════════════════
123
  # ADIM 3: Γ–N İŞLEME PIPELINE KARŞILAŞTIRMASI
124
  # ════════════════════════════════════════════════════════════════
125
  print("\n" + "=" * 70)
126
- print("ADIM 3: EN Δ°YΔ° Γ–N İŞLEME PIPELINE SEΓ‡Δ°MΔ°")
127
  print("=" * 70)
128
 
129
- tr_mask = ts <= 39
130
- te_mask = ts > 39
131
 
132
  def quick_eval(X_tr, y_tr, X_te, y_te):
133
- m = lgb.LGBMClassifier(n_estimators=300, max_depth=10, scale_pos_weight=10,
134
- random_state=42, n_jobs=-1, verbose=-1)
135
  m.fit(X_tr, y_tr)
136
- return f1_score(y_te, m.predict(X_te), zero_division=0)
 
 
 
 
 
 
 
137
 
138
  pipelines = {}
139
 
140
- # Ham
141
- f1_raw = quick_eval(X_raw[tr_mask], y[tr_mask], X_raw[te_mask], y[te_mask])
142
  pipelines['Ham Veri'] = f1_raw
143
  print(f" Ham Veri: F1={f1_raw:.4f}")
144
 
145
- # StandardScaler
146
  sc = StandardScaler()
147
- f1_ss = quick_eval(sc.fit_transform(X[tr_mask]), y[tr_mask], sc.transform(X[te_mask]), y[te_mask])
148
  pipelines['StandardScaler'] = f1_ss
149
- print(f" StandardScaler: F1={f1_ss:.4f} ({f1_ss-f1_raw:+.4f})")
150
 
151
- # RobustScaler
152
  rs = RobustScaler()
153
- f1_rs = quick_eval(rs.fit_transform(X[tr_mask]), y[tr_mask], rs.transform(X[te_mask]), y[te_mask])
154
  pipelines['RobustScaler'] = f1_rs
155
- print(f" RobustScaler: F1={f1_rs:.4f} ({f1_rs-f1_raw:+.4f})")
156
 
157
- # Clip + RobustScaler
158
  rs2 = RobustScaler()
159
- f1_cr = quick_eval(rs2.fit_transform(X_clipped[tr_mask]), y[tr_mask],
160
- rs2.transform(X_clipped[te_mask]), y[te_mask])
161
  pipelines['Clip+Robust'] = f1_cr
162
- print(f" Clip+Robust: F1={f1_cr:.4f} ({f1_cr-f1_raw:+.4f})")
163
 
164
- # Clip + VarFilter + RobustScaler
165
  rs3 = RobustScaler()
166
- f1_cvr = quick_eval(rs3.fit_transform(X_clean[tr_mask]), y[tr_mask],
167
- rs3.transform(X_clean[te_mask]), y[te_mask])
168
  pipelines['Clip+VarFilter+Robust'] = f1_cvr
169
- print(f" Clip+VarFilter+Rob: F1={f1_cvr:.4f} ({f1_cvr-f1_raw:+.4f})")
170
 
171
- # SMOTE dene
172
  try:
173
  from imblearn.over_sampling import SMOTE
174
  smote = SMOTE(random_state=42)
175
  rs4 = RobustScaler()
176
- X_tr_s = rs4.fit_transform(X_clipped[tr_mask])
177
- X_te_s = rs4.transform(X_clipped[te_mask])
178
- X_tr_sm, y_tr_sm = smote.fit_resample(X_tr_s, y[tr_mask])
179
- f1_smote = quick_eval(X_tr_sm, y_tr_sm, X_te_s, y[te_mask])
180
  pipelines['Clip+Robust+SMOTE'] = f1_smote
181
- print(f" Clip+Robust+SMOTE: F1={f1_smote:.4f} ({f1_smote-f1_raw:+.4f})")
182
  except ImportError:
183
- print(f" SMOTE atlandΔ± (pip install imbalanced-learn)")
184
 
185
- # En iyiyi seΓ§
186
  best_pipe = max(pipelines, key=pipelines.get)
187
  print(f"\n β˜… En iyi pipeline: {best_pipe} (F1={pipelines[best_pipe]:.4f})")
188
 
189
- # SeΓ§ilen pipeline'Δ± uygula
190
- # LightGBM tree-based olduğu için scaling zorunlu değil ama tutarlılık için yapalım
191
- final_scaler = RobustScaler()
192
- X_final = X_clipped # Clipped versiyonu kullan (en gΓΌvenli)
193
-
194
  # ════════════════════════════════════════════════════════════════
195
  # ADIM 4: TOPOLOJΔ°K METRΔ°KLER
196
  # ════════════════════════════════════════════════════════════════
@@ -219,27 +338,27 @@ def main(data_dir):
219
  ill_rate = len(ts_ill) / max(len(ts_lab), 1)
220
  topo[t] = {'n_nodes': n, 'n_edges': e, 'density': density, 'cc_ratio': cc_ratio,
221
  'n_components': comps, 'avg_degree': avg_deg, 'illicit_rate': ill_rate}
222
- print(f" TS {t:2d}: nodes={n:5d} edges={e:5d} illicit={ill_rate:.3f}")
223
 
224
  topo_df = pd.DataFrame(topo).T
225
  topo_df.to_csv('output/results/topological_metrics.csv')
226
 
227
  # ════════════════════════════════════════════════════════════════
228
- # ADIM 5: KIRILMA NOKTASI TESPİTİ (Sağlık + Tepe-Düşüş)
229
  # ════════════════════════════════════════════════════════════════
230
  print("\n" + "=" * 70)
231
  print("ADIM 5: KIRILMA NOKTASI TESPΔ°TΔ°")
232
  print("=" * 70)
233
 
234
- # Yöntem A: Sağlık skoru
235
  for col in ['density', 'cc_ratio', 'n_components']:
236
  mi, ma = topo_df[col].min(), topo_df[col].max()
237
  topo_df[f'{col}_n'] = (topo_df[col] - mi) / (ma - mi + 1e-8)
238
  health = (topo_df['density_n'] + topo_df['cc_ratio_n'] + (1 - topo_df['n_components_n'])) / 3
239
- bp_health = health.diff().idxmin()
240
- print(f" Yântem A (Sağlık skoru): Kırılma = TS {bp_health}")
241
 
242
- # Yöntem B: Tepe-düşüş (türev)
243
  df_t = topo_df.copy()
244
  for col in ['n_edges', 'density', 'avg_degree']:
245
  mi, ma = df_t[col].min(), df_t[col].max()
@@ -247,25 +366,51 @@ def main(data_dir):
247
  crisis = (df_t['n_edges_norm'] * 0.4 + df_t['density_norm'] * 0.3 + df_t['avg_degree_norm'] * 0.3).values
248
  crisis_smooth = uniform_filter1d(crisis, size=5, mode='nearest')
249
  velocity = np.gradient(crisis_smooth)
250
-
251
  peaks = []
252
  for i in range(1, len(velocity) - 1):
253
  if velocity[i-1] > 0 and velocity[i+1] < 0:
254
  peaks.append({'timestep': all_ts[i], 'index': i, 'drop': abs(velocity[i+1])})
255
  peaks = sorted(peaks, key=lambda x: x['drop'], reverse=True)
256
- bp_peak = peaks[0]['timestep'] if peaks else bp_health
257
- print(f" Yântem B (Tepe-düşüş): Kırılma = TS {bp_peak}")
258
 
259
- # Birleştir: iki yöntemin ortalamasına en yakın timestep
260
- avg_bp = (bp_health + bp_peak) / 2
261
- bp_final = min(all_ts, key=lambda t: abs(t - avg_bp))
262
  print(f" β˜… Final kΔ±rΔ±lma noktasΔ±: TS {bp_final}")
263
 
264
  # ════════════════════════════════════════════════════════════════
265
- # ADIM 6: BÖLME STRATEJİLERİ VE DENEYLER
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  # ════════════════════════════════════════════════════════════════
267
  print("\n" + "=" * 70)
268
- print("ADIM 6: 5 STRATEJΔ° Γ— 4 MODEL = 20 DENEY")
269
  print("=" * 70)
270
 
271
  def make_masks(train_ts_set, test_ts_set):
@@ -293,23 +438,59 @@ def main(data_dir):
293
  ),
294
  }
295
 
296
- def train_eval(X_tr, y_tr, X_te, y_te, model_type):
 
297
  sc = RobustScaler()
298
- Xtr = sc.fit_transform(X_tr); Xte = sc.transform(X_te)
 
 
 
 
 
 
 
 
 
 
299
  if model_type == 'lgbm':
300
- m = lgb.LGBMClassifier(n_estimators=300, max_depth=10, scale_pos_weight=10, random_state=42, n_jobs=-1, verbose=-1)
 
 
 
 
 
301
  elif model_type == 'rf':
302
- m = RandomForestClassifier(n_estimators=300, max_depth=15, class_weight='balanced', random_state=42, n_jobs=-1)
 
 
 
 
303
  elif model_type == 'xgb':
304
- m = xgb.XGBClassifier(n_estimators=300, max_depth=8, scale_pos_weight=10, random_state=42, n_jobs=-1, verbosity=0)
 
 
 
 
 
 
305
  m.fit(Xtr, y_tr)
306
- pred = m.predict(Xte)
307
  proba = m.predict_proba(Xte)[:, 1]
 
 
 
 
 
 
 
 
 
 
308
  return {
309
  'f1': round(f1_score(y_te, pred, zero_division=0), 4),
310
  'precision': round(precision_score(y_te, pred, zero_division=0), 4),
311
  'recall': round(recall_score(y_te, pred, zero_division=0), 4),
312
  'auroc': round(roc_auc_score(y_te, proba) if len(np.unique(y_te)) > 1 else 0.5, 4),
 
313
  }
314
 
315
  model_types = [('lgbm', 'LightGBM'), ('rf', 'Random Forest'), ('xgb', 'XGBoost')]
@@ -320,22 +501,40 @@ def main(data_dir):
320
  if tr_m.sum() < 50 or te_m.sum() < 10:
321
  print(f" {strat_name}: yetersiz veri, atlanΔ±yor")
322
  continue
 
323
  print(f"\n {strat_name} (train={tr_m.sum()}, test={te_m.sum()}, test_ill={y[te_m].sum()}):")
 
 
324
  for mt, mn in model_types:
325
- res = train_eval(X_final[tr_m], y[tr_m], X_final[te_m], y[te_m], mt)
326
  res['strateji'] = strat_name
327
  res['model'] = mn
328
  all_results.append(res)
329
- print(f" {mn:15s}: F1={res['f1']:.4f} P={res['precision']:.4f} R={res['recall']:.4f} AUROC={res['auroc']:.4f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
 
331
  res_df = pd.DataFrame(all_results)
332
  res_df.to_csv('output/results/all_experiment_results.csv', index=False)
333
 
334
  # ════════════════════════════════════════════════════════════════
335
- # ADIM 7: WALK-FORWARD VALİDASYON
336
  # ════════════════════════════════════════════════════════════════
337
  print("\n" + "=" * 70)
338
- print("ADIM 7: WALK-FORWARD VALΔ°DASYON (GerΓ§ek DΓΌnya ReferansΔ±)")
339
  print("=" * 70)
340
 
341
  wf_results = {}
@@ -346,35 +545,53 @@ def main(data_dir):
346
  te_m = (ts >= test_start) & (ts < test_start + 3)
347
  if tr_m.sum() < 50 or te_m.sum() < 10 or len(np.unique(y[te_m])) < 2:
348
  continue
349
- res = train_eval(X_final[tr_m], y[tr_m], X_final[te_m], y[te_m], mt)
350
  wf_f1s.append(res['f1'])
351
  wf_results[mn] = round(np.mean(wf_f1s), 4)
352
  print(f" {mn}: Walk-Forward F1 = {wf_results[mn]:.4f}")
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  # Dürüstlük tablosu
355
  print("\n DΓΌrΓΌstlΓΌk KarşılaştΔ±rmasΔ±:")
 
356
  for strat_name in strategies:
357
  sapma_list = []
358
  for mn in wf_results:
359
  row = res_df[(res_df['strateji'] == strat_name) & (res_df['model'] == mn)]
360
- if len(row) > 0:
361
  sapma = ((row['f1'].values[0] - wf_results[mn]) / wf_results[mn]) * 100
362
  sapma_list.append(sapma)
363
  if sapma_list:
364
  avg_sapma = np.mean(sapma_list)
365
  durum = "βœ… DÜRÜST" if abs(avg_sapma) < 10 else ("πŸ”΄ ŞİŞME" if avg_sapma > 10 else "⚠️ PESΔ°MΔ°ST")
 
366
  print(f" {strat_name:25s}: ort. sapma = {avg_sapma:+.1f}% {durum}")
367
 
368
  # ════════════════════════════════════════════════════════════════
369
- # ADIM 8: FİGÜRLER
370
  # ════════════════════════════════════════════════════════════════
371
  print("\n" + "=" * 70)
372
- print("ADIM 8: FİGÜRLER")
373
  print("=" * 70)
374
 
375
  sns.set_theme(style='whitegrid', font_scale=1.1)
376
 
377
- # Figür 1: Kırılma noktası + sağlık skoru
378
  fig, axes = plt.subplots(3, 1, figsize=(18, 14), gridspec_kw={'height_ratios': [2, 1, 1]})
379
  axes[0].plot(all_ts, health.values, 'o-', color='steelblue', linewidth=2, markersize=5)
380
  axes[0].axvline(x=bp_final, color='red', linewidth=3, linestyle='--')
@@ -396,16 +613,15 @@ def main(data_dir):
396
  color='red', s=200, zorder=5, edgecolors='black')
397
  axes[2].set_ylabel('Kriz Sinyali', fontsize=12)
398
  axes[2].set_xlabel('Timestep', fontsize=12)
399
-
400
  plt.tight_layout()
401
  plt.savefig('output/figures/fig1_breakpoint.png', dpi=150, bbox_inches='tight')
402
  plt.close()
403
  print(" βœ“ fig1_breakpoint.png")
404
 
405
- # Figür 2: F1 karşılaştırma
406
- fig, ax = plt.subplots(figsize=(16, 8))
407
  strat_names = list(strategies.keys())
408
- model_names = [mn for _, mn in model_types]
409
  colors5 = sns.color_palette('Set2', len(strat_names))
410
  x = np.arange(len(model_names)); width = 0.15
411
 
@@ -423,14 +639,14 @@ def main(data_dir):
423
  ax.axhline(y=wf_avg, color='green', linewidth=2, linestyle='--', label=f'Walk-Forward ({wf_avg:.3f})')
424
  ax.set_xticks(x + width*2); ax.set_xticklabels(model_names, fontsize=12)
425
  ax.set_ylabel('Illicit F1', fontsize=13)
426
- ax.set_title('Bâlme Stratejileri Karşılaştırması (temizlenmiş veri)', fontsize=14, fontweight='bold')
427
- ax.legend(fontsize=9); ax.set_ylim(0, 1.1)
428
  plt.tight_layout()
429
  plt.savefig('output/figures/fig2_f1_comparison.png', dpi=150, bbox_inches='tight')
430
  plt.close()
431
  print(" βœ“ fig2_f1_comparison.png")
432
 
433
- # Figür 3: Pipeline karşılaştırma
434
  fig, ax = plt.subplots(figsize=(10, 6))
435
  p_names = list(pipelines.keys())
436
  p_vals = list(pipelines.values())
@@ -445,38 +661,68 @@ def main(data_dir):
445
  plt.close()
446
  print(" βœ“ fig3_pipeline_comparison.png")
447
 
448
- # Figür 4: Dürüstlük ısı haritası
449
- fig, ax = plt.subplots(figsize=(14, 6))
450
  sapma_data = []
451
  for strat_name in strat_names:
452
  for mn in model_names:
453
  row = res_df[(res_df['strateji'] == strat_name) & (res_df['model'] == mn)]
454
- if len(row) > 0 and mn in wf_results:
455
  sapma = ((row['f1'].values[0] - wf_results[mn]) / wf_results[mn]) * 100
456
  sapma_data.append({'strateji': strat_name, 'model': mn, 'sapma': round(sapma, 1)})
457
  if sapma_data:
458
  sapma_df = pd.DataFrame(sapma_data)
459
  pivot = sapma_df.pivot_table(values='sapma', index='model', columns='strateji')
460
- sns.heatmap(pivot, annot=True, fmt='.1f', cmap='RdYlGn_r', center=0, ax=ax, linewidths=0.5)
461
- ax.set_title('Walk-Forward DΓΌrΓΌstlΓΌk SapmasΔ± (%)', fontsize=14, fontweight='bold')
 
462
  plt.tight_layout()
463
  plt.savefig('output/figures/fig4_honesty.png', dpi=150, bbox_inches='tight')
464
  plt.close()
465
  print(" βœ“ fig4_honesty.png")
466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  # ════════════════════════════════════════════════════════════════
468
- # ADIM 9: ÖZET RAPOR
469
  # ════════════════════════════════════════════════════════════════
470
  elapsed = time.time() - start_time
471
 
472
  summary = {
473
- 'veri': {'toplam': N, 'etiketli': len(y), 'illicit': int(y.sum()), 'ozellik': int(var_mask.sum())},
474
- 'temizleme': {'nan': int(nan_count), 'inf': int(inf_count), 'outlier_pct': round(outlier_mask.sum()/(X.shape[0]*X.shape[1])*100, 2),
 
 
475
  'cikarilan_ozellik': int((~var_mask).sum()), 'en_iyi_pipeline': best_pipe},
476
- 'kirilma': {'saglik_yontemi': int(bp_health), 'tepe_dusus': int(bp_peak), 'final': int(bp_final)},
477
  'walk_forward': wf_results,
478
  'sonuclar': res_df.to_dict(orient='records'),
479
  'pipeline_karsilastirma': {k: round(v, 4) for k, v in pipelines.items()},
 
480
  'sure_dakika': round(elapsed / 60, 1),
481
  }
482
 
@@ -486,19 +732,17 @@ def main(data_dir):
486
  print("\n" + "=" * 70)
487
  print(f"TAMAMLANDI! (SΓΌre: {elapsed/60:.1f} dakika)")
488
  print("=" * 70)
489
- print(f"\n Γ‡Δ±ktΔ±lar:")
490
- print(f" output/results/all_experiment_results.csv")
491
- print(f" output/results/topological_metrics.csv")
492
- print(f" output/results/summary.json")
493
- print(f" output/figures/fig1_breakpoint.png")
494
- print(f" output/figures/fig2_f1_comparison.png")
495
- print(f" output/figures/fig3_pipeline_comparison.png")
496
- print(f" output/figures/fig4_honesty.png")
497
-
498
- # Sonuç tablosu
499
- print(f"\n ═══ SONUΓ‡ TABLOSU (F1) ═══")
500
  pivot_f1 = res_df.pivot_table(values='f1', index='model', columns='strateji')
501
  print(pivot_f1.to_string())
 
 
 
 
 
 
502
 
503
 
504
  if __name__ == '__main__':
 
7
  1. Veri denetimi ve temizleme (data_audit)
8
  2. En iyi ön işleme pipeline'ını belirle
9
  3. Topolojik kırılma noktası tespiti
10
+ 4. 5 bölme stratejisi × 4 model (GraphSAGE dahil) = 20 deney
11
+ 5. Walk-forward validasyon + dürüstlük testi
12
+ 6. Tüm figürleri ve sonuçları kaydet
 
 
13
 
14
  KULLANIM:
15
  pip install pandas numpy scikit-learn matplotlib seaborn lightgbm xgboost networkx scipy imbalanced-learn torch torch-geometric
16
  python run_all.py --data_dir ./dataset
17
 
18
+ SÜRE: ~15 dakika (CPU)
19
  ===============================================================================
20
  """
21
 
 
38
  import xgboost as xgb
39
  import lightgbm as lgb
40
 
41
+ import torch
42
+ import torch.nn as nn
43
+ import torch.nn.functional as F
44
+ from torch_geometric.nn import SAGEConv
45
+ from torch_geometric.data import Data
46
+
47
  warnings.filterwarnings('ignore')
48
  np.random.seed(42)
49
+ torch.manual_seed(42)
50
+
51
+
52
+ # ════════════════════════════════════════════════════════════════
53
+ # GraphSAGE Model
54
+ # ════════════════════════════════════════════════════════════════
55
+ class GraphSAGENet(nn.Module):
56
+ def __init__(self, in_channels, hidden=128, out_channels=2, num_layers=3, dropout=0.3):
57
+ super().__init__()
58
+ self.convs = nn.ModuleList()
59
+ self.bns = nn.ModuleList()
60
+ self.convs.append(SAGEConv(in_channels, hidden))
61
+ self.bns.append(nn.BatchNorm1d(hidden))
62
+ for _ in range(num_layers - 2):
63
+ self.convs.append(SAGEConv(hidden, hidden))
64
+ self.bns.append(nn.BatchNorm1d(hidden))
65
+ self.convs.append(SAGEConv(hidden, out_channels))
66
+ self.dropout = dropout
67
+
68
+ def forward(self, x, edge_index):
69
+ for i, (conv, bn) in enumerate(zip(self.convs[:-1], self.bns)):
70
+ x = conv(x, edge_index)
71
+ x = bn(x)
72
+ x = F.relu(x)
73
+ x = F.dropout(x, p=self.dropout, training=self.training)
74
+ x = self.convs[-1](x, edge_index)
75
+ return x
76
+
77
+
78
+ def train_graphsage(data, train_mask, test_mask, in_channels, epochs=200, lr=0.005, weight=None):
79
+ """GraphSAGE eğit ve değerlendir — inductive: test kenarları eğitimde kullanılmaz"""
80
+ device = torch.device('cpu')
81
+ model = GraphSAGENet(in_channels, hidden=128, out_channels=2, num_layers=3, dropout=0.3).to(device)
82
+
83
+ # Class weight
84
+ if weight is not None:
85
+ w = torch.tensor([1.0, weight], dtype=torch.float32).to(device)
86
+ else:
87
+ w = None
88
+
89
+ optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
90
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
91
+
92
+ # Inductive: sadece train düğümleri arasındaki kenarları al
93
+ train_nodes = set(torch.where(train_mask)[0].tolist())
94
+ edge_index = data.edge_index
95
+ mask_e = torch.tensor([
96
+ (edge_index[0, i].item() in train_nodes) and (edge_index[1, i].item() in train_nodes)
97
+ for i in range(edge_index.shape[1])
98
+ ], dtype=torch.bool)
99
+ train_edge_index = edge_index[:, mask_e]
100
+
101
+ x = data.x.to(device)
102
+ y = data.y.to(device)
103
+ train_mask_d = train_mask.to(device)
104
+ test_mask_d = test_mask.to(device)
105
+ train_edge_index = train_edge_index.to(device)
106
+ full_edge_index = edge_index.to(device)
107
+
108
+ best_f1 = 0
109
+ best_state = None
110
+ patience = 30
111
+ no_improve = 0
112
+
113
+ model.train()
114
+ for epoch in range(epochs):
115
+ optimizer.zero_grad()
116
+ out = model(x, train_edge_index)
117
+ loss = F.cross_entropy(out[train_mask_d], y[train_mask_d], weight=w)
118
+ loss.backward()
119
+ optimizer.step()
120
+ scheduler.step()
121
+
122
+ if (epoch + 1) % 10 == 0:
123
+ model.eval()
124
+ with torch.no_grad():
125
+ out_eval = model(x, full_edge_index)
126
+ pred = out_eval[test_mask_d].argmax(dim=1)
127
+ f1 = f1_score(y[test_mask_d].cpu(), pred.cpu(), zero_division=0)
128
+ if f1 > best_f1:
129
+ best_f1 = f1
130
+ best_state = {k: v.clone() for k, v in model.state_dict().items()}
131
+ no_improve = 0
132
+ else:
133
+ no_improve += 1
134
+ model.train()
135
+ if no_improve >= patience // 10:
136
+ break
137
+
138
+ # Final eval
139
+ if best_state:
140
+ model.load_state_dict(best_state)
141
+ model.eval()
142
+ with torch.no_grad():
143
+ out = model(x, full_edge_index)
144
+ proba = F.softmax(out, dim=1)[:, 1]
145
+
146
+ # Threshold optimization
147
+ best_th_f1 = 0
148
+ best_th = 0.5
149
+ for th in np.arange(0.1, 0.9, 0.05):
150
+ pred_th = (proba[test_mask_d] >= th).long()
151
+ f1_th = f1_score(y[test_mask_d].cpu(), pred_th.cpu(), zero_division=0)
152
+ if f1_th > best_th_f1:
153
+ best_th_f1 = f1_th
154
+ best_th = th
155
+
156
+ pred = (proba[test_mask_d] >= best_th).long()
157
+ y_test = y[test_mask_d].cpu().numpy()
158
+ pred_np = pred.cpu().numpy()
159
+ proba_np = proba[test_mask_d].cpu().numpy()
160
+
161
+ return {
162
+ 'f1': round(f1_score(y_test, pred_np, zero_division=0), 4),
163
+ 'precision': round(precision_score(y_test, pred_np, zero_division=0), 4),
164
+ 'recall': round(recall_score(y_test, pred_np, zero_division=0), 4),
165
+ 'auroc': round(roc_auc_score(y_test, proba_np) if len(np.unique(y_test)) > 1 else 0.5, 4),
166
+ }
167
+
168
 
169
  def main(data_dir):
170
  start_time = time.time()
171
 
172
  # Çıktı klasörleri
173
+ for d in ['output/figures', 'output/results']:
174
  os.makedirs(d, exist_ok=True)
175
 
176
  # ════════════════════════════════════════════════════════════════
 
193
  label_map = {'1': 1, '2': 0, 'unknown': -1}
194
  labels_np = np.array([label_map[str(c)] for c in class_df['class'].values])
195
 
196
+ # Kenarlar
197
+ valid_edges = [(id_map[s], id_map[d]) for s, d in zip(edge_df['txId1'], edge_df['txId2'])
198
+ if s in id_map and d in id_map]
199
+ src = np.array([e[0] for e in valid_edges])
200
+ dst = np.array([e[1] for e in valid_edges])
201
 
202
  labeled_mask = labels_np >= 0
203
+ labeled_indices = np.where(labeled_mask)[0]
204
+
205
+ # Etiketli düğüm indeksleme (tüm düğümlerden etiketlilere)
206
+ full_to_labeled = {full_idx: lab_idx for lab_idx, full_idx in enumerate(labeled_indices)}
207
+
208
  X_raw = features_raw[labeled_mask]
209
  y = labels_np[labeled_mask]
210
  ts = timesteps_raw[labeled_mask]
211
 
212
  print(f" Toplam: {N}, Etiketli: {len(y)}")
213
  print(f" Δ°llicit: {y.sum()} ({y.mean()*100:.1f}%), Licit: {len(y)-y.sum()}")
214
+ print(f" Kenar sayΔ±sΔ±: {len(src)}")
215
 
216
  # ════════════════════════════════════════════════════════════════
217
  # ADIM 2: VERΔ° TEMΔ°ZLEME VE Γ–N İŞLEME
 
220
  print("ADIM 2: VERΔ° TEMΔ°ZLEME")
221
  print("=" * 70)
222
 
 
223
  nan_count = np.isnan(X_raw).sum()
224
  inf_count = np.isinf(X_raw).sum()
225
  print(f" NaN: {nan_count}, Inf: {inf_count}")
226
  X = np.nan_to_num(X_raw, nan=0.0, posinf=0.0, neginf=0.0)
227
 
228
+ # Outlier clipping
229
  Q1 = np.percentile(X, 25, axis=0)
230
  Q3 = np.percentile(X, 75, axis=0)
231
  IQR = Q3 - Q1
 
234
  outlier_mask = (X < lower) | (X > upper)
235
  print(f" Outlier hΓΌcre: {outlier_mask.sum()} ({outlier_mask.sum()/(X.shape[0]*X.shape[1])*100:.1f}%)")
236
 
 
 
 
 
 
 
237
  X_clipped = np.clip(X, lower, upper)
 
238
 
239
+ # Düşük varyans çıkarma
240
  variances = np.var(X_clipped, axis=0)
241
  var_mask = variances > 1e-6
242
  X_clean = X_clipped[:, var_mask]
243
+ print(f" Düşük varyanslı âzellik çıkarıldı: {(~var_mask).sum()}, kalan: {var_mask.sum()}")
244
+
245
+ # Son veri: clipped
246
+ X_final = X_clipped
247
 
248
  # ════════════════════════════════════════════════════════════════
249
  # ADIM 3: Γ–N İŞLEME PIPELINE KARŞILAŞTIRMASI
250
  # ════════════════════════════════════════════════════════════════
251
  print("\n" + "=" * 70)
252
+ print("ADIM 3: PIPELINE KARŞILAŞTIRMASI")
253
  print("=" * 70)
254
 
255
+ tr_mask_pipe = ts <= 39
256
+ te_mask_pipe = ts > 39
257
 
258
  def quick_eval(X_tr, y_tr, X_te, y_te):
259
+ m = lgb.LGBMClassifier(n_estimators=500, max_depth=12, scale_pos_weight=10,
260
+ learning_rate=0.05, random_state=42, n_jobs=-1, verbose=-1)
261
  m.fit(X_tr, y_tr)
262
+ proba = m.predict_proba(X_te)[:, 1]
263
+ # Threshold optimizasyonu
264
+ best_f1, best_th = 0, 0.5
265
+ for th in np.arange(0.1, 0.9, 0.05):
266
+ p = (proba >= th).astype(int)
267
+ f = f1_score(y_te, p, zero_division=0)
268
+ if f > best_f1: best_f1, best_th = f, th
269
+ return best_f1
270
 
271
  pipelines = {}
272
 
273
+ f1_raw = quick_eval(X_raw[tr_mask_pipe], y[tr_mask_pipe], X_raw[te_mask_pipe], y[te_mask_pipe])
 
274
  pipelines['Ham Veri'] = f1_raw
275
  print(f" Ham Veri: F1={f1_raw:.4f}")
276
 
 
277
  sc = StandardScaler()
278
+ f1_ss = quick_eval(sc.fit_transform(X[tr_mask_pipe]), y[tr_mask_pipe], sc.transform(X[te_mask_pipe]), y[te_mask_pipe])
279
  pipelines['StandardScaler'] = f1_ss
280
+ print(f" StandardScaler: F1={f1_ss:.4f}")
281
 
 
282
  rs = RobustScaler()
283
+ f1_rs = quick_eval(rs.fit_transform(X[tr_mask_pipe]), y[tr_mask_pipe], rs.transform(X[te_mask_pipe]), y[te_mask_pipe])
284
  pipelines['RobustScaler'] = f1_rs
285
+ print(f" RobustScaler: F1={f1_rs:.4f}")
286
 
 
287
  rs2 = RobustScaler()
288
+ f1_cr = quick_eval(rs2.fit_transform(X_clipped[tr_mask_pipe]), y[tr_mask_pipe], rs2.transform(X_clipped[te_mask_pipe]), y[te_mask_pipe])
 
289
  pipelines['Clip+Robust'] = f1_cr
290
+ print(f" Clip+Robust: F1={f1_cr:.4f}")
291
 
 
292
  rs3 = RobustScaler()
293
+ f1_cvr = quick_eval(rs3.fit_transform(X_clean[tr_mask_pipe]), y[tr_mask_pipe], rs3.transform(X_clean[te_mask_pipe]), y[te_mask_pipe])
 
294
  pipelines['Clip+VarFilter+Robust'] = f1_cvr
295
+ print(f" Clip+VarFilter+Rob: F1={f1_cvr:.4f}")
296
 
 
297
  try:
298
  from imblearn.over_sampling import SMOTE
299
  smote = SMOTE(random_state=42)
300
  rs4 = RobustScaler()
301
+ X_tr_s = rs4.fit_transform(X_clipped[tr_mask_pipe])
302
+ X_te_s = rs4.transform(X_clipped[te_mask_pipe])
303
+ X_tr_sm, y_tr_sm = smote.fit_resample(X_tr_s, y[tr_mask_pipe])
304
+ f1_smote = quick_eval(X_tr_sm, y_tr_sm, X_te_s, y[te_mask_pipe])
305
  pipelines['Clip+Robust+SMOTE'] = f1_smote
306
+ print(f" Clip+Robust+SMOTE: F1={f1_smote:.4f}")
307
  except ImportError:
308
+ print(" SMOTE atlandΔ±")
309
 
 
310
  best_pipe = max(pipelines, key=pipelines.get)
311
  print(f"\n β˜… En iyi pipeline: {best_pipe} (F1={pipelines[best_pipe]:.4f})")
312
 
 
 
 
 
 
313
  # ════════════════════════════════════════════════════════════════
314
  # ADIM 4: TOPOLOJΔ°K METRΔ°KLER
315
  # ════════════════════════════════════════════════════════════════
 
338
  ill_rate = len(ts_ill) / max(len(ts_lab), 1)
339
  topo[t] = {'n_nodes': n, 'n_edges': e, 'density': density, 'cc_ratio': cc_ratio,
340
  'n_components': comps, 'avg_degree': avg_deg, 'illicit_rate': ill_rate}
341
+ print(f" TS {t:2d}: nodes={n:5d} edges={e:5d} density={density:.5f} illicit={ill_rate:.3f}")
342
 
343
  topo_df = pd.DataFrame(topo).T
344
  topo_df.to_csv('output/results/topological_metrics.csv')
345
 
346
  # ════════════════════════════════════════════════════════════════
347
+ # ADIM 5: KIRILMA NOKTASI TESPİTİ
348
  # ════════════════════════════════════════════════════════════════
349
  print("\n" + "=" * 70)
350
  print("ADIM 5: KIRILMA NOKTASI TESPΔ°TΔ°")
351
  print("=" * 70)
352
 
353
+ # Sağlık skoru: normalize et
354
  for col in ['density', 'cc_ratio', 'n_components']:
355
  mi, ma = topo_df[col].min(), topo_df[col].max()
356
  topo_df[f'{col}_n'] = (topo_df[col] - mi) / (ma - mi + 1e-8)
357
  health = (topo_df['density_n'] + topo_df['cc_ratio_n'] + (1 - topo_df['n_components_n'])) / 3
358
+ bp_final = health.diff().idxmin()
359
+ print(f" Sağlık skoru kırılma noktası: TS {bp_final}")
360
 
361
+ # Tepe-düşüş analizi (bilgi amaçlı)
362
  df_t = topo_df.copy()
363
  for col in ['n_edges', 'density', 'avg_degree']:
364
  mi, ma = df_t[col].min(), df_t[col].max()
 
366
  crisis = (df_t['n_edges_norm'] * 0.4 + df_t['density_norm'] * 0.3 + df_t['avg_degree_norm'] * 0.3).values
367
  crisis_smooth = uniform_filter1d(crisis, size=5, mode='nearest')
368
  velocity = np.gradient(crisis_smooth)
 
369
  peaks = []
370
  for i in range(1, len(velocity) - 1):
371
  if velocity[i-1] > 0 and velocity[i+1] < 0:
372
  peaks.append({'timestep': all_ts[i], 'index': i, 'drop': abs(velocity[i+1])})
373
  peaks = sorted(peaks, key=lambda x: x['drop'], reverse=True)
 
 
374
 
 
 
 
375
  print(f" β˜… Final kΔ±rΔ±lma noktasΔ±: TS {bp_final}")
376
 
377
  # ════════════════════════════════════════════════════════════════
378
+ # ADIM 6: GRAF VERİSİ HAZIRLA (GraphSAGE için)
379
+ # ════════════════════════════════════════════════════════════════
380
+ print("\n" + "=" * 70)
381
+ print("ADIM 6: GRAPHSAGE VERΔ° HAZIRLAMA")
382
+ print("=" * 70)
383
+
384
+ # Etiketli düğümler arası kenarları filtrele
385
+ labeled_set = set(labeled_indices.tolist())
386
+ labeled_edges = [(full_to_labeled[s], full_to_labeled[d])
387
+ for s, d in zip(src, dst)
388
+ if s in labeled_set and d in labeled_set
389
+ and s in full_to_labeled and d in full_to_labeled]
390
+
391
+ if labeled_edges:
392
+ edge_src = [e[0] for e in labeled_edges]
393
+ edge_dst = [e[1] for e in labeled_edges]
394
+ edge_index = torch.tensor([edge_src + edge_dst, edge_dst + edge_src], dtype=torch.long) # undirected
395
+ else:
396
+ edge_index = torch.zeros((2, 0), dtype=torch.long)
397
+
398
+ print(f" Etiketli düğümler arası kenar: {len(labeled_edges)} ({edge_index.shape[1]} undirected)")
399
+
400
+ # Normalize features for GNN
401
+ scaler_gnn = RobustScaler()
402
+ X_gnn = scaler_gnn.fit_transform(X_final)
403
+
404
+ x_tensor = torch.tensor(X_gnn, dtype=torch.float32)
405
+ y_tensor = torch.tensor(y, dtype=torch.long)
406
+
407
+ graph_data = Data(x=x_tensor, edge_index=edge_index, y=y_tensor)
408
+
409
+ # ════════════════════════════════════════════════════════════════
410
+ # ADIM 7: BÖLME STRATEJİLERİ VE DENEYLER
411
  # ════════════════════════════════════════════════════════════════
412
  print("\n" + "=" * 70)
413
+ print("ADIM 7: 5 STRATEJΔ° Γ— 4 MODEL = 20 DENEY")
414
  print("=" * 70)
415
 
416
  def make_masks(train_ts_set, test_ts_set):
 
438
  ),
439
  }
440
 
441
def train_eval_tabular(X_tr, y_tr, X_te, y_te, model_type, val_fraction=0.2):
    """Train one tabular classifier and score it on the test split.

    Steps: robust scaling, best-effort SMOTE oversampling of the fitting
    data, model fit, decision-threshold selection, test-set metrics.

    The decision threshold is selected on a stratified validation slice
    carved out of the *training* data. Test labels are used only to compute
    the returned metrics, so the reported scores are not inflated by tuning
    the threshold against the very labels it is evaluated on.

    Args:
        X_tr: Training feature matrix.
        y_tr: Training labels (binary, positive class is 1).
        X_te: Test feature matrix.
        y_te: Test labels.
        model_type: One of ``'lgbm'``, ``'rf'`` or ``'xgb'``.
        val_fraction: Share of the training rows held out for threshold
            selection. When it is not strictly between 0 and 1, or a
            stratified hold-out cannot be built, the model is fitted on all
            training rows and the default threshold 0.5 is used.

    Returns:
        dict with keys ``'f1'``, ``'precision'``, ``'recall'``, ``'auroc'``
        (all rounded to 4 decimals) and ``'threshold'`` (rounded to 2).

    Raises:
        ValueError: If ``model_type`` is not a supported model key.
    """
    # sklearn is already a dependency of this file; imported locally so the
    # block does not rely on a specific top-of-file import being present.
    from sklearn.model_selection import train_test_split

    y_tr = np.asarray(y_tr)

    # --- hold out a validation slice (training data only) -----------------
    X_fit, y_fit = X_tr, y_tr
    X_val, y_val = None, None
    _, class_counts = np.unique(y_tr, return_counts=True)
    holdout_possible = (
        0.0 < val_fraction < 1.0
        and len(class_counts) > 1
        and class_counts.min() >= 2
    )
    if holdout_possible:
        try:
            X_fit, X_val, y_fit, y_val = train_test_split(
                X_tr, y_tr, test_size=val_fraction,
                stratify=y_tr, random_state=42,
            )
        except ValueError:
            # Split sizes too small for stratification: keep all rows for
            # fitting and fall back to the default threshold below.
            X_fit, y_fit = X_tr, y_tr
            X_val, y_val = None, None

    # The scaler is fitted on the fitting rows only and reused everywhere.
    sc = RobustScaler()
    Xfit = sc.fit_transform(X_fit)
    Xte = sc.transform(X_te)

    # --- SMOTE on the fitting rows only (best effort) ---------------------
    try:
        from imblearn.over_sampling import SMOTE
    except ImportError:
        SMOTE = None  # imbalanced-learn not installed: train without it
    if SMOTE is not None:
        try:
            Xfit, y_fit = SMOTE(random_state=42).fit_resample(Xfit, y_fit)
        except ValueError:
            # Raised e.g. when the minority class is too small for the
            # neighbour search; continue with the un-resampled data.
            pass

    # --- model construction ----------------------------------------------
    # NOTE(review): scale_pos_weight=10 is applied even when SMOTE has
    # already balanced the classes; kept exactly as in the original.
    if model_type == 'lgbm':
        m = lgb.LGBMClassifier(
            n_estimators=500, max_depth=12, learning_rate=0.05,
            num_leaves=63, min_child_samples=20, subsample=0.8,
            colsample_bytree=0.8, scale_pos_weight=10,
            random_state=42, n_jobs=-1, verbose=-1
        )
    elif model_type == 'rf':
        m = RandomForestClassifier(
            n_estimators=500, max_depth=20, min_samples_leaf=5,
            class_weight='balanced_subsample', max_features='sqrt',
            random_state=42, n_jobs=-1
        )
    elif model_type == 'xgb':
        m = xgb.XGBClassifier(
            n_estimators=500, max_depth=10, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8, scale_pos_weight=10,
            min_child_weight=5, gamma=0.1,
            random_state=42, n_jobs=-1, verbosity=0
        )
    else:
        raise ValueError(
            f"unknown model_type {model_type!r}; expected 'lgbm', 'rf' or 'xgb'"
        )

    m.fit(Xfit, y_fit)
    proba = m.predict_proba(Xte)[:, 1]

    # --- threshold selection on validation data, never on test labels -----
    best_f1, best_th = 0, 0.5
    if X_val is not None and len(np.unique(y_val)) > 1:
        val_proba = m.predict_proba(sc.transform(X_val))[:, 1]
        for th in np.arange(0.1, 0.9, 0.05):
            f1_th = f1_score(y_val, (val_proba >= th).astype(int), zero_division=0)
            if f1_th > best_f1:
                best_f1, best_th = f1_th, th

    pred = (proba >= best_th).astype(int)
    return {
        'f1': round(f1_score(y_te, pred, zero_division=0), 4),
        'precision': round(precision_score(y_te, pred, zero_division=0), 4),
        'recall': round(recall_score(y_te, pred, zero_division=0), 4),
        # AUROC is undefined with a single class in y_te; report chance level.
        'auroc': round(roc_auc_score(y_te, proba) if len(np.unique(y_te)) > 1 else 0.5, 4),
        'threshold': round(float(best_th), 2),
    }
495
 
496
  model_types = [('lgbm', 'LightGBM'), ('rf', 'Random Forest'), ('xgb', 'XGBoost')]
 
501
  if tr_m.sum() < 50 or te_m.sum() < 10:
502
  print(f" {strat_name}: yetersiz veri, atlanΔ±yor")
503
  continue
504
+
505
  print(f"\n {strat_name} (train={tr_m.sum()}, test={te_m.sum()}, test_ill={y[te_m].sum()}):")
506
+
507
+ # Tabular modeller
508
  for mt, mn in model_types:
509
+ res = train_eval_tabular(X_final[tr_m], y[tr_m], X_final[te_m], y[te_m], mt)
510
  res['strateji'] = strat_name
511
  res['model'] = mn
512
  all_results.append(res)
513
+ print(f" {mn:15s}: F1={res['f1']:.4f} P={res['precision']:.4f} R={res['recall']:.4f} AUROC={res['auroc']:.4f} th={res['threshold']}")
514
+
515
+ # GraphSAGE
516
+ print(f" {'GraphSAGE':15s}: eğitiliyor...", end='', flush=True)
517
+ train_mask_t = torch.tensor(tr_m, dtype=torch.bool)
518
+ test_mask_t = torch.tensor(te_m, dtype=torch.bool)
519
+ ill_weight = float((y[tr_m] == 0).sum()) / max(float((y[tr_m] == 1).sum()), 1)
520
+ ill_weight = min(ill_weight, 15.0) # cap at 15
521
+
522
+ gs_res = train_graphsage(graph_data, train_mask_t, test_mask_t,
523
+ X_final.shape[1], epochs=200, lr=0.005, weight=ill_weight)
524
+ gs_res['strateji'] = strat_name
525
+ gs_res['model'] = 'GraphSAGE'
526
+ gs_res['threshold'] = 0.0 # threshold handled internally
527
+ all_results.append(gs_res)
528
+ print(f"\r {'GraphSAGE':15s}: F1={gs_res['f1']:.4f} P={gs_res['precision']:.4f} R={gs_res['recall']:.4f} AUROC={gs_res['auroc']:.4f}")
529
 
530
  res_df = pd.DataFrame(all_results)
531
  res_df.to_csv('output/results/all_experiment_results.csv', index=False)
532
 
533
  # ════════════════════════════════════════════════════════���═══════
534
+ # ADIM 8: WALK-FORWARD VALİDASYON
535
  # ════════════════════════════════════════════════════════════════
536
  print("\n" + "=" * 70)
537
+ print("ADIM 8: WALK-FORWARD VALΔ°DASYON")
538
  print("=" * 70)
539
 
540
  wf_results = {}
 
545
  te_m = (ts >= test_start) & (ts < test_start + 3)
546
  if tr_m.sum() < 50 or te_m.sum() < 10 or len(np.unique(y[te_m])) < 2:
547
  continue
548
+ res = train_eval_tabular(X_final[tr_m], y[tr_m], X_final[te_m], y[te_m], mt)
549
  wf_f1s.append(res['f1'])
550
  wf_results[mn] = round(np.mean(wf_f1s), 4)
551
  print(f" {mn}: Walk-Forward F1 = {wf_results[mn]:.4f}")
552
 
553
+ # GraphSAGE walk-forward
554
+ wf_gs_f1s = []
555
+ for test_start in range(10, 49, 3):
556
+ tr_m_wf = ts < test_start
557
+ te_m_wf = (ts >= test_start) & (ts < test_start + 3)
558
+ if tr_m_wf.sum() < 50 or te_m_wf.sum() < 10 or len(np.unique(y[te_m_wf])) < 2:
559
+ continue
560
+ train_mask_wf = torch.tensor(tr_m_wf, dtype=torch.bool)
561
+ test_mask_wf = torch.tensor(te_m_wf, dtype=torch.bool)
562
+ ill_w = float((y[tr_m_wf]==0).sum()) / max(float((y[tr_m_wf]==1).sum()), 1)
563
+ ill_w = min(ill_w, 15.0)
564
+ gs_wf = train_graphsage(graph_data, train_mask_wf, test_mask_wf, X_final.shape[1], epochs=100, weight=ill_w)
565
+ wf_gs_f1s.append(gs_wf['f1'])
566
+ wf_results['GraphSAGE'] = round(np.mean(wf_gs_f1s), 4) if wf_gs_f1s else 0
567
+ print(f" GraphSAGE: Walk-Forward F1 = {wf_results['GraphSAGE']:.4f}")
568
+
569
  # DΓΌrΓΌstlΓΌk tablosu
570
  print("\n DΓΌrΓΌstlΓΌk KarşılaştΔ±rmasΔ±:")
571
+ honesty_data = []
572
  for strat_name in strategies:
573
  sapma_list = []
574
  for mn in wf_results:
575
  row = res_df[(res_df['strateji'] == strat_name) & (res_df['model'] == mn)]
576
+ if len(row) > 0 and mn in wf_results and wf_results[mn] > 0:
577
  sapma = ((row['f1'].values[0] - wf_results[mn]) / wf_results[mn]) * 100
578
  sapma_list.append(sapma)
579
  if sapma_list:
580
  avg_sapma = np.mean(sapma_list)
581
  durum = "βœ… DÜRÜST" if abs(avg_sapma) < 10 else ("πŸ”΄ ŞİŞME" if avg_sapma > 10 else "⚠️ PESΔ°MΔ°ST")
582
+ honesty_data.append({'strateji': strat_name, 'sapma': round(avg_sapma, 1), 'durum': durum})
583
  print(f" {strat_name:25s}: ort. sapma = {avg_sapma:+.1f}% {durum}")
584
 
585
  # ════════════════════════════════════════════════════════════════
586
+ # ADIM 9: FİGÜRLER
587
  # ════════════════════════════════════════════════════════════════
588
  print("\n" + "=" * 70)
589
+ print("ADIM 9: FİGÜRLER")
590
  print("=" * 70)
591
 
592
  sns.set_theme(style='whitegrid', font_scale=1.1)
593
 
594
+ # Fig 1: Kırılma noktası
595
  fig, axes = plt.subplots(3, 1, figsize=(18, 14), gridspec_kw={'height_ratios': [2, 1, 1]})
596
  axes[0].plot(all_ts, health.values, 'o-', color='steelblue', linewidth=2, markersize=5)
597
  axes[0].axvline(x=bp_final, color='red', linewidth=3, linestyle='--')
 
613
  color='red', s=200, zorder=5, edgecolors='black')
614
  axes[2].set_ylabel('Kriz Sinyali', fontsize=12)
615
  axes[2].set_xlabel('Timestep', fontsize=12)
 
616
  plt.tight_layout()
617
  plt.savefig('output/figures/fig1_breakpoint.png', dpi=150, bbox_inches='tight')
618
  plt.close()
619
  print(" βœ“ fig1_breakpoint.png")
620
 
621
+ # Fig 2: F1 karşılaştırma (4 model dahil)
622
+ fig, ax = plt.subplots(figsize=(18, 8))
623
  strat_names = list(strategies.keys())
624
+ model_names = [mn for _, mn in model_types] + ['GraphSAGE']
625
  colors5 = sns.color_palette('Set2', len(strat_names))
626
  x = np.arange(len(model_names)); width = 0.15
627
 
 
639
  ax.axhline(y=wf_avg, color='green', linewidth=2, linestyle='--', label=f'Walk-Forward ({wf_avg:.3f})')
640
  ax.set_xticks(x + width*2); ax.set_xticklabels(model_names, fontsize=12)
641
  ax.set_ylabel('Illicit F1', fontsize=13)
642
+ ax.set_title('BΓΆlme Stratejileri Γ— Model KarşılaştΔ±rmasΔ±', fontsize=14, fontweight='bold')
643
+ ax.legend(fontsize=9, loc='upper right'); ax.set_ylim(0, 1.1)
644
  plt.tight_layout()
645
  plt.savefig('output/figures/fig2_f1_comparison.png', dpi=150, bbox_inches='tight')
646
  plt.close()
647
  print(" βœ“ fig2_f1_comparison.png")
648
 
649
+ # Fig 3: Pipeline karşılaştırma
650
  fig, ax = plt.subplots(figsize=(10, 6))
651
  p_names = list(pipelines.keys())
652
  p_vals = list(pipelines.values())
 
661
  plt.close()
662
  print(" βœ“ fig3_pipeline_comparison.png")
663
 
664
+ # Fig 4: Dürüstlük ısı haritası
665
+ fig, ax = plt.subplots(figsize=(16, 7))
666
  sapma_data = []
667
  for strat_name in strat_names:
668
  for mn in model_names:
669
  row = res_df[(res_df['strateji'] == strat_name) & (res_df['model'] == mn)]
670
+ if len(row) > 0 and mn in wf_results and wf_results[mn] > 0:
671
  sapma = ((row['f1'].values[0] - wf_results[mn]) / wf_results[mn]) * 100
672
  sapma_data.append({'strateji': strat_name, 'model': mn, 'sapma': round(sapma, 1)})
673
  if sapma_data:
674
  sapma_df = pd.DataFrame(sapma_data)
675
  pivot = sapma_df.pivot_table(values='sapma', index='model', columns='strateji')
676
+ sns.heatmap(pivot, annot=True, fmt='.1f', cmap='RdYlGn_r', center=0, ax=ax,
677
+ linewidths=0.5, cbar_kws={'label': 'Walk-Forward Sapma (%)'})
678
+ ax.set_title('Walk-Forward DΓΌrΓΌstlΓΌk SapmasΔ± (%) β€” 4 Model Γ— 5 Strateji', fontsize=14, fontweight='bold')
679
  plt.tight_layout()
680
  plt.savefig('output/figures/fig4_honesty.png', dpi=150, bbox_inches='tight')
681
  plt.close()
682
  print(" βœ“ fig4_honesty.png")
683
 
684
+ # Fig 5: Performans şişmesi haritası (inflation)
685
+ fig, ax = plt.subplots(figsize=(14, 6))
686
+ inf_data = []
687
+ for mn in model_names:
688
+ row_rand = res_df[(res_df['model'] == mn) & (res_df['strateji'] == 'Rastgele')]
689
+ row_chr = res_df[(res_df['model'] == mn) & (res_df['strateji'] == 'Kronolojik')]
690
+ row_topo = res_df[(res_df['model'] == mn) & (res_df['strateji'] == 'Topolojik KΔ±rΔ±lma')]
691
+ if len(row_rand) > 0 and len(row_chr) > 0:
692
+ rand_f1 = row_rand['f1'].values[0]
693
+ chr_f1 = row_chr['f1'].values[0]
694
+ topo_f1 = row_topo['f1'].values[0] if len(row_topo) > 0 else 0
695
+ inf_data.append({
696
+ 'model': mn,
697
+ 'Rastgele vs Kronolojik': round((rand_f1 - chr_f1) / chr_f1 * 100, 1),
698
+ 'Rastgele vs Topolojik': round((rand_f1 - topo_f1) / topo_f1 * 100, 1) if topo_f1 > 0 else 0,
699
+ })
700
+ if inf_data:
701
+ inf_df = pd.DataFrame(inf_data).set_index('model')
702
+ sns.heatmap(inf_df, annot=True, fmt='.1f', cmap='Reds', ax=ax, linewidths=0.5,
703
+ cbar_kws={'label': 'Şişme Oranı (%)'})
704
+ ax.set_title('Rastgele Bâlme Performans Şişmesi (%)', fontsize=14, fontweight='bold')
705
+ plt.tight_layout()
706
+ plt.savefig('output/figures/fig5_inflation.png', dpi=150, bbox_inches='tight')
707
+ plt.close()
708
+ print(" βœ“ fig5_inflation.png")
709
+
710
  # ════════════════════════════════════════════════════════════════
711
+ # ADIM 10: ÖZET RAPOR
712
  # ════════════════════════════════════════════════════════════════
713
  elapsed = time.time() - start_time
714
 
715
  summary = {
716
+ 'veri': {'toplam': N, 'etiketli': len(y), 'illicit': int(y.sum()),
717
+ 'ozellik': int(X_final.shape[1]), 'kenar': len(valid_edges)},
718
+ 'temizleme': {'nan': int(nan_count), 'inf': int(inf_count),
719
+ 'outlier_pct': round(outlier_mask.sum()/(X.shape[0]*X.shape[1])*100, 2),
720
  'cikarilan_ozellik': int((~var_mask).sum()), 'en_iyi_pipeline': best_pipe},
721
+ 'kirilma': {'saglik_yontemi': int(bp_final), 'final': int(bp_final)},
722
  'walk_forward': wf_results,
723
  'sonuclar': res_df.to_dict(orient='records'),
724
  'pipeline_karsilastirma': {k: round(v, 4) for k, v in pipelines.items()},
725
+ 'durustukluk': honesty_data,
726
  'sure_dakika': round(elapsed / 60, 1),
727
  }
728
 
 
732
  print("\n" + "=" * 70)
733
  print(f"TAMAMLANDI! (SΓΌre: {elapsed/60:.1f} dakika)")
734
  print("=" * 70)
735
+
736
+ # Final sonuç tablosu
737
+ print(f"\n ═══ SONUΓ‡ TABLOSU (Illicit F1) ═══")
 
 
 
 
 
 
 
 
738
  pivot_f1 = res_df.pivot_table(values='f1', index='model', columns='strateji')
739
  print(pivot_f1.to_string())
740
+
741
+ print(f"\n ═══ WALK-FORWARD REFERANS ═══")
742
+ for mn, f1 in wf_results.items():
743
+ print(f" {mn}: {f1:.4f}")
744
+
745
+ print(f"\n Γ‡Δ±ktΔ±lar: output/results/ ve output/figures/")
746
 
747
 
748
  if __name__ == '__main__':