# MucahitSylmz's picture
# Update run_all.py: GraphSAGE + TS29 + SMOTE + threshold opt
# c2ce770 verified
"""
===============================================================================
MASTER SCRIPT — TEK KOMUTLA TÜM DENEYLERİ ÇALIŞTIR
===============================================================================
Bu script sırayla şunları yapar:
1. Veri denetimi ve temizleme (data_audit)
2. En iyi ön işleme pipeline'ını belirle
3. Topolojik kırılma noktası tespiti
4. 5 bölme stratejisi × 4 model (GraphSAGE dahil) = 20 deney
5. Walk-forward validasyon + dürüstlük testi
6. Tüm figürleri ve sonuçları kaydet
KULLANIM:
pip install pandas numpy scikit-learn matplotlib seaborn lightgbm xgboost networkx scipy imbalanced-learn torch torch-geometric
python run_all.py --data_dir ./dataset
SÜRE: ~15 dakika (CPU)
===============================================================================
"""
import os, sys, json, warnings, time, argparse
import numpy as np
import pandas as pd
import networkx as nx
from scipy.ndimage import uniform_filter1d
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
import xgboost as xgb
import lightgbm as lgb
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.data import Data
warnings.filterwarnings('ignore')
np.random.seed(42)
torch.manual_seed(42)
# ════════════════════════════════════════════════════════════════
# GraphSAGE Model
# ════════════════════════════════════════════════════════════════
class GraphSAGENet(nn.Module):
    """Stack of SAGEConv layers with batch norm, ReLU and dropout in between.

    The network is ``SAGEConv(in_channels, hidden)``, followed by
    ``num_layers - 2`` layers of ``SAGEConv(hidden, hidden)``, followed by an
    output ``SAGEConv(hidden, out_channels)``.  Every layer except the last
    one is paired with a ``BatchNorm1d(hidden)``.  ``forward`` returns the raw
    output of the last layer (no softmax is applied here).
    """

    def __init__(self, in_channels, hidden=128, out_channels=2, num_layers=3, dropout=0.3):
        super().__init__()
        self.dropout = dropout

        # Input width followed by one entry per hidden-output layer; there is
        # always at least one such layer, even when num_layers < 2.
        hidden_outputs = 1 + max(num_layers - 2, 0)
        widths = [in_channels] + [hidden] * hidden_outputs

        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            self.convs.append(SAGEConv(fan_in, fan_out))
            self.bns.append(nn.BatchNorm1d(fan_out))

        # Output layer: it has no batch norm of its own.
        self.convs.append(SAGEConv(hidden, out_channels))

    def forward(self, x, edge_index):
        """Run all layers on ``x`` / ``edge_index`` and return the last layer's output."""
        *body, head = self.convs
        h = x
        for conv, norm in zip(body, self.bns):
            h = F.relu(norm(conv(h, edge_index)))
            h = F.dropout(h, p=self.dropout, training=self.training)
        return head(h, edge_index)
def train_graphsage(data, train_mask, test_mask, in_channels, epochs=200, lr=0.005, weight=None,
                    val_mask=None):
    """Train a GraphSAGE classifier and report metrics on the test nodes.

    Training is inductive: message passing during the optimisation steps only
    uses edges whose two endpoints are both training nodes.  Evaluation uses
    the complete ``data.edge_index``.

    NOTE: when ``val_mask`` is None (the default, and the original behaviour)
    both the checkpoint selection and the decision-threshold sweep are scored
    on ``test_mask``, so the returned metrics are tuned on the test nodes and
    are optimistic.  Pass a separate ``val_mask`` to select on held-out nodes
    instead; the returned metrics are always computed on ``test_mask``.

    Args:
        data: graph object exposing ``x``, ``y`` and ``edge_index`` tensors.
        train_mask: boolean node mask used for the loss and the edge filter.
        test_mask: boolean node mask the returned metrics are computed on.
        in_channels: number of input features per node.
        epochs: maximum number of optimisation steps.
        lr: Adam learning rate.
        weight: optional loss weight for class 1 (class 0 gets weight 1.0).
        val_mask: optional boolean node mask used for checkpoint selection and
            threshold tuning; falls back to ``test_mask`` when omitted.

    Returns:
        dict with ``f1``, ``precision``, ``recall`` and ``auroc`` rounded to
        four decimals.  ``auroc`` is 0.5 when the test nodes hold one class.
    """
    device = torch.device('cpu')
    model = GraphSAGENet(in_channels, hidden=128, out_channels=2, num_layers=3, dropout=0.3).to(device)

    # Class weights for the cross-entropy loss.
    w = None
    if weight is not None:
        w = torch.tensor([1.0, weight], dtype=torch.float32).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    x = data.x.to(device)
    y = data.y.to(device)
    train_mask_d = train_mask.to(device)
    test_mask_d = test_mask.to(device)
    sel_mask_d = test_mask_d if val_mask is None else val_mask.to(device)
    full_edge_index = data.edge_index.to(device)

    # Inductive setting: keep an edge only if both endpoints are training
    # nodes.  A mask lookup replaces the former per-edge Python loop.
    in_train = train_mask_d.bool()
    keep_edge = in_train[full_edge_index[0]] & in_train[full_edge_index[1]]
    train_edge_index = full_edge_index[:, keep_edge]

    best_f1 = 0
    best_state = None
    patience = 30      # in epochs
    eval_every = 10    # the model is scored every `eval_every` epochs
    no_improve = 0
    y_sel = y[sel_mask_d].cpu()

    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        out = model(x, train_edge_index)
        loss = F.cross_entropy(out[train_mask_d], y[train_mask_d], weight=w)
        loss.backward()
        optimizer.step()
        scheduler.step()

        if (epoch + 1) % eval_every != 0:
            continue

        model.eval()
        with torch.no_grad():
            pred = model(x, full_edge_index)[sel_mask_d].argmax(dim=1)
        f1 = f1_score(y_sel, pred.cpu(), zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_state = {k: v.clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
        model.train()
        # Patience is expressed in epochs, the counter in evaluations.
        if no_improve >= patience // eval_every:
            break

    # Final evaluation with the best checkpoint (if any evaluation beat F1 = 0).
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        proba = F.softmax(model(x, full_edge_index), dim=1)[:, 1]

    # Threshold optimisation on the selection nodes.
    proba_sel = proba[sel_mask_d]
    best_th_f1 = 0
    best_th = 0.5
    for th in np.arange(0.1, 0.9, 0.05):
        pred_th = (proba_sel >= th).long()
        f1_th = f1_score(y_sel, pred_th.cpu(), zero_division=0)
        if f1_th > best_th_f1:
            best_th_f1 = f1_th
            best_th = th

    proba_test = proba[test_mask_d]
    y_test = y[test_mask_d].cpu().numpy()
    pred_np = (proba_test >= best_th).long().cpu().numpy()
    proba_np = proba_test.cpu().numpy()
    return {
        'f1': round(f1_score(y_test, pred_np, zero_division=0), 4),
        'precision': round(precision_score(y_test, pred_np, zero_division=0), 4),
        'recall': round(recall_score(y_test, pred_np, zero_division=0), 4),
        'auroc': round(roc_auc_score(y_test, proba_np) if len(np.unique(y_test)) > 1 else 0.5, 4),
    }
def main(data_dir):
    """Run the whole experiment pipeline on the data set found in ``data_dir``.

    Steps: load the three CSV files, clean the features, compare
    preprocessing pipelines, compute per-timestep topology metrics, detect
    the topological breakpoint, build the graph for GraphSAGE, run every
    split strategy against every model, run walk-forward validation, draw
    the figures and write a JSON summary.

    Args:
        data_dir: directory containing ``elliptic_txs_features.csv``,
            ``elliptic_txs_classes.csv`` and ``elliptic_txs_edgelist.csv``.

    Side effects:
        Creates ``output/figures`` and ``output/results`` (relative to the
        working directory), writes CSV/JSON/PNG files there and prints
        progress to stdout.

    Raises:
        ValueError: if a tabular model type other than ``lgbm``, ``rf`` or
            ``xgb`` is requested from the inner training helper.
    """
    start_time = time.time()

    # Output folders
    for d in ['output/figures', 'output/results']:
        os.makedirs(d, exist_ok=True)

    # imbalanced-learn is optional: resolve it once, skip SMOTE if missing.
    try:
        from imblearn.over_sampling import SMOTE
    except ImportError:
        SMOTE = None

    # ════════════════════════════════════════════════════════════════
    # STEP 1: DATA LOADING
    # ════════════════════════════════════════════════════════════════
    print("=" * 70)
    print("ADIM 1: VERİ YÜKLEME")
    print("=" * 70)
    feat_df = pd.read_csv(os.path.join(data_dir, 'elliptic_txs_features.csv'), header=None)
    class_df = pd.read_csv(os.path.join(data_dir, 'elliptic_txs_classes.csv'))
    edge_df = pd.read_csv(os.path.join(data_dir, 'elliptic_txs_edgelist.csv'))

    # Column 0 is the transaction id, column 1 the timestep, the rest features.
    txids = feat_df.iloc[:, 0].values
    timesteps_raw = feat_df.iloc[:, 1].values.astype(int)
    features_raw = feat_df.iloc[:, 2:].values.astype(np.float64)
    N = len(txids)
    id_map = {tid: i for i, tid in enumerate(txids)}

    # Class '1' -> illicit (1), '2' -> licit (0), 'unknown' -> unlabeled (-1).
    # NOTE(review): labels are taken positionally, so this assumes the class
    # file lists transactions in the same order as the feature file — confirm.
    label_map = {'1': 1, '2': 0, 'unknown': -1}
    labels_np = np.array([label_map[str(c)] for c in class_df['class'].values])

    # Edges, re-indexed to row positions; edges with unknown ids are dropped.
    valid_edges = [(id_map[s], id_map[d]) for s, d in zip(edge_df['txId1'], edge_df['txId2'])
                   if s in id_map and d in id_map]
    src = np.array([e[0] for e in valid_edges])
    dst = np.array([e[1] for e in valid_edges])

    labeled_mask = labels_np >= 0
    labeled_indices = np.where(labeled_mask)[0]
    # Map from row position among all nodes to position among labeled nodes.
    full_to_labeled = {full_idx: lab_idx for lab_idx, full_idx in enumerate(labeled_indices)}
    X_raw = features_raw[labeled_mask]
    y = labels_np[labeled_mask]
    ts = timesteps_raw[labeled_mask]
    print(f" Toplam: {N}, Etiketli: {len(y)}")
    print(f" İllicit: {y.sum()} ({y.mean()*100:.1f}%), Licit: {len(y)-y.sum()}")
    print(f" Kenar sayısı: {len(src)}")

    # ════════════════════════════════════════════════════════════════
    # STEP 2: DATA CLEANING AND PREPROCESSING
    # ════════════════════════════════════════════════════════════════
    print("\n" + "=" * 70)
    print("ADIM 2: VERİ TEMİZLEME")
    print("=" * 70)
    nan_count = np.isnan(X_raw).sum()
    inf_count = np.isinf(X_raw).sum()
    print(f" NaN: {nan_count}, Inf: {inf_count}")
    X = np.nan_to_num(X_raw, nan=0.0, posinf=0.0, neginf=0.0)

    # Outlier clipping with the 1.5 * IQR rule, per feature.
    # NOTE: the quartiles are computed over all labeled rows, i.e. including
    # rows that later end up in test splits.
    Q1 = np.percentile(X, 25, axis=0)
    Q3 = np.percentile(X, 75, axis=0)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outlier_mask = (X < lower) | (X > upper)
    outlier_pct = outlier_mask.sum() / (X.shape[0] * X.shape[1]) * 100
    print(f" Outlier hücre: {outlier_mask.sum()} ({outlier_pct:.1f}%)")
    X_clipped = np.clip(X, lower, upper)

    # Low-variance feature removal (only used by one pipeline variant below).
    variances = np.var(X_clipped, axis=0)
    var_mask = variances > 1e-6
    X_clean = X_clipped[:, var_mask]
    print(f" Düşük varyanslı özellik çıkarıldı: {(~var_mask).sum()}, kalan: {var_mask.sum()}")

    # Features used by every later step: the clipped matrix, regardless of
    # which pipeline wins the comparison in step 3.
    X_final = X_clipped

    # ════════════════════════════════════════════════════════════════
    # STEP 3: PREPROCESSING PIPELINE COMPARISON
    # ════════════════════════════════════════════════════════════════
    print("\n" + "=" * 70)
    print("ADIM 3: PIPELINE KARŞILAŞTIRMASI")
    print("=" * 70)
    tr_mask_pipe = ts <= 39
    te_mask_pipe = ts > 39
    y_tr_pipe = y[tr_mask_pipe]
    y_te_pipe = y[te_mask_pipe]

    def quick_eval(X_tr, y_tr, X_te, y_te):
        """Fit a LightGBM model and return its best F1 over a threshold sweep."""
        m = lgb.LGBMClassifier(n_estimators=500, max_depth=12, scale_pos_weight=10,
                               learning_rate=0.05, random_state=42, n_jobs=-1, verbose=-1)
        m.fit(X_tr, y_tr)
        proba = m.predict_proba(X_te)[:, 1]
        # Threshold optimisation (scored on the evaluation rows themselves).
        best_f1 = 0
        for th in np.arange(0.1, 0.9, 0.05):
            f = f1_score(y_te, (proba >= th).astype(int), zero_division=0)
            if f > best_f1:
                best_f1 = f
        return best_f1

    def scaled_eval(scaler, feats):
        """Fit ``scaler`` on the training rows of ``feats`` and run quick_eval."""
        return quick_eval(scaler.fit_transform(feats[tr_mask_pipe]), y_tr_pipe,
                          scaler.transform(feats[te_mask_pipe]), y_te_pipe)

    pipelines = {}
    f1_raw = quick_eval(X_raw[tr_mask_pipe], y_tr_pipe, X_raw[te_mask_pipe], y_te_pipe)
    pipelines['Ham Veri'] = f1_raw
    print(f" Ham Veri: F1={f1_raw:.4f}")

    f1_ss = scaled_eval(StandardScaler(), X)
    pipelines['StandardScaler'] = f1_ss
    print(f" StandardScaler: F1={f1_ss:.4f}")

    f1_rs = scaled_eval(RobustScaler(), X)
    pipelines['RobustScaler'] = f1_rs
    print(f" RobustScaler: F1={f1_rs:.4f}")

    f1_cr = scaled_eval(RobustScaler(), X_clipped)
    pipelines['Clip+Robust'] = f1_cr
    print(f" Clip+Robust: F1={f1_cr:.4f}")

    f1_cvr = scaled_eval(RobustScaler(), X_clean)
    pipelines['Clip+VarFilter+Robust'] = f1_cvr
    print(f" Clip+VarFilter+Rob: F1={f1_cvr:.4f}")

    if SMOTE is None:
        print(" SMOTE atlandı")
    else:
        rs4 = RobustScaler()
        X_tr_s = rs4.fit_transform(X_clipped[tr_mask_pipe])
        X_te_s = rs4.transform(X_clipped[te_mask_pipe])
        X_tr_sm, y_tr_sm = SMOTE(random_state=42).fit_resample(X_tr_s, y_tr_pipe)
        f1_smote = quick_eval(X_tr_sm, y_tr_sm, X_te_s, y_te_pipe)
        pipelines['Clip+Robust+SMOTE'] = f1_smote
        print(f" Clip+Robust+SMOTE: F1={f1_smote:.4f}")

    best_pipe = max(pipelines, key=pipelines.get)
    print(f"\n ★ En iyi pipeline: {best_pipe} (F1={pipelines[best_pipe]:.4f})")

    # ════════════════════════════════════════════════════════════════
    # STEP 4: TOPOLOGICAL METRICS (one directed graph per timestep)
    # ════════════════════════════════════════════════════════════════
    print("\n" + "=" * 70)
    print("ADIM 4: TOPOLOJİK METRİKLER")
    print("=" * 70)
    all_ts = sorted(np.unique(timesteps_raw))
    topo = {}
    for t in all_ts:
        ts_idx = np.where(timesteps_raw == t)[0]
        ts_nodes = set(ts_idx)
        # Keep edges whose two endpoints both belong to this timestep.
        m = np.isin(src, ts_idx) & np.isin(dst, ts_idx)
        G = nx.DiGraph()
        G.add_nodes_from(ts_nodes)
        G.add_edges_from(zip(src[m], dst[m]))
        n = G.number_of_nodes()
        e = G.number_of_edges()
        density = nx.density(G) if n > 1 else 0
        G_u = G.to_undirected()
        comps = nx.number_connected_components(G_u)
        largest = max(nx.connected_components(G_u), key=len)
        cc_ratio = len(largest) / max(n, 1)
        degs = [d for _, d in G.degree()]
        avg_deg = np.mean(degs) if degs else 0
        # Share of illicit nodes among the labeled nodes of this timestep.
        ts_labels = labels_np[ts_idx]
        n_lab = int((ts_labels >= 0).sum())
        n_ill = int((ts_labels == 1).sum())
        ill_rate = n_ill / max(n_lab, 1)
        topo[t] = {'n_nodes': n, 'n_edges': e, 'density': density, 'cc_ratio': cc_ratio,
                   'n_components': comps, 'avg_degree': avg_deg, 'illicit_rate': ill_rate}
        print(f" TS {t:2d}: nodes={n:5d} edges={e:5d} density={density:.5f} illicit={ill_rate:.3f}")
    topo_df = pd.DataFrame(topo).T
    topo_df.to_csv('output/results/topological_metrics.csv')

    # ════════════════════════════════════════════════════════════════
    # STEP 5: BREAKPOINT DETECTION
    # ════════════════════════════════════════════════════════════════
    print("\n" + "=" * 70)
    print("ADIM 5: KIRILMA NOKTASI TESPİTİ")
    print("=" * 70)

    def minmax(series):
        """Min-max normalise a column; the epsilon avoids division by zero."""
        mi, ma = series.min(), series.max()
        return (series - mi) / (ma - mi + 1e-8)

    # Health score: mean of normalised density, largest-component ratio and
    # the inverted component count.  The breakpoint is the timestep with the
    # largest drop compared with the previous timestep.
    health = (minmax(topo_df['density']) + minmax(topo_df['cc_ratio'])
              + (1 - minmax(topo_df['n_components']))) / 3
    bp_final = health.diff().idxmin()
    print(f" Sağlık skoru kırılma noktası: TS {bp_final}")

    # Peak-and-drop analysis (informational only; used in figure 1).
    crisis = (minmax(topo_df['n_edges']) * 0.4 + minmax(topo_df['density']) * 0.3
              + minmax(topo_df['avg_degree']) * 0.3).values
    crisis_smooth = uniform_filter1d(crisis, size=5, mode='nearest')
    velocity = np.gradient(crisis_smooth)
    peaks = []
    for i in range(1, len(velocity) - 1):
        # A peak: rising just before, falling just after.
        if velocity[i-1] > 0 and velocity[i+1] < 0:
            peaks.append({'timestep': all_ts[i], 'index': i, 'drop': abs(velocity[i+1])})
    peaks = sorted(peaks, key=lambda p: p['drop'], reverse=True)
    print(f" ★ Final kırılma noktası: TS {bp_final}")

    # ════════════════════════════════════════════════════════════════
    # STEP 6: GRAPH DATA FOR GRAPHSAGE
    # ════════════════════════════════════════════════════════════════
    print("\n" + "=" * 70)
    print("ADIM 6: GRAPHSAGE VERİ HAZIRLAMA")
    print("=" * 70)
    # Keep only edges between two labeled nodes, re-indexed to labeled space.
    labeled_edges = [(full_to_labeled[s], full_to_labeled[d])
                     for s, d in zip(src, dst)
                     if s in full_to_labeled and d in full_to_labeled]
    if labeled_edges:
        edge_src = [e[0] for e in labeled_edges]
        edge_dst = [e[1] for e in labeled_edges]
        # Each edge is added in both directions (undirected graph).
        edge_index = torch.tensor([edge_src + edge_dst, edge_dst + edge_src], dtype=torch.long)
    else:
        edge_index = torch.zeros((2, 0), dtype=torch.long)
    print(f" Etiketli düğümler arası kenar: {len(labeled_edges)} ({edge_index.shape[1]} undirected)")

    # Normalise features for the GNN.
    # NOTE: the scaler is fitted on all labeled rows, not per training split.
    scaler_gnn = RobustScaler()
    X_gnn = scaler_gnn.fit_transform(X_final)
    x_tensor = torch.tensor(X_gnn, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.long)
    graph_data = Data(x=x_tensor, edge_index=edge_index, y=y_tensor)

    # ════════════════════════════════════════════════════════════════
    # STEP 7: SPLIT STRATEGIES AND EXPERIMENTS
    # ════════════════════════════════════════════════════════════════
    print("\n" + "=" * 70)
    print("ADIM 7: 5 STRATEJİ × 4 MODEL = 20 DENEY")
    print("=" * 70)

    def make_masks(train_ts_set, test_ts_set):
        """Return boolean train/test masks selecting rows by timestep membership."""
        tr = np.isin(ts, list(train_ts_set))
        te = np.isin(ts, list(test_ts_set))
        return tr, te

    def split_random():
        """Stratified random 80/20 split, ignoring time."""
        idx = np.arange(len(y))
        tr, te = train_test_split(idx, test_size=0.2, random_state=42, stratify=y)
        m_tr = np.zeros(len(y), dtype=bool)
        m_tr[tr] = True
        m_te = np.zeros(len(y), dtype=bool)
        m_te[te] = True
        return m_tr, m_te

    # Last timestep of the first 80% of the timeline.
    cutoff_chrono = all_ts[int(len(all_ts) * 0.8) - 1]
    strategies = {
        'Rastgele': split_random,
        'Kronolojik': lambda: make_masks({t for t in all_ts if t <= cutoff_chrono},
                                         {t for t in all_ts if t > cutoff_chrono}),
        'Topolojik Kırılma': lambda: make_masks({t for t in all_ts if t < bp_final},
                                                {t for t in all_ts if t >= bp_final}),
        'Kayan Pencere': lambda: make_masks(set(all_ts[:-10]), set(all_ts[-10:])),
        # Test on the high-illicit-rate timesteps plus the breakpoint itself.
        'Düşmanca-Kriz': lambda: make_masks(
            {t for t in all_ts if topo_df.loc[t, 'illicit_rate'] <= 0.18 and t != bp_final},
            {t for t in all_ts if topo_df.loc[t, 'illicit_rate'] > 0.18 or t == bp_final}
        ),
    }

    def build_tabular_model(model_type):
        """Create the unfitted estimator for ``model_type`` ('lgbm', 'rf' or 'xgb')."""
        if model_type == 'lgbm':
            return lgb.LGBMClassifier(
                n_estimators=500, max_depth=12, learning_rate=0.05,
                num_leaves=63, min_child_samples=20, subsample=0.8,
                colsample_bytree=0.8, scale_pos_weight=10,
                random_state=42, n_jobs=-1, verbose=-1
            )
        if model_type == 'rf':
            return RandomForestClassifier(
                n_estimators=500, max_depth=20, min_samples_leaf=5,
                class_weight='balanced_subsample', max_features='sqrt',
                random_state=42, n_jobs=-1
            )
        if model_type == 'xgb':
            return xgb.XGBClassifier(
                n_estimators=500, max_depth=10, learning_rate=0.05,
                subsample=0.8, colsample_bytree=0.8, scale_pos_weight=10,
                min_child_weight=5, gamma=0.1,
                random_state=42, n_jobs=-1, verbosity=0
            )
        # Previously an unknown type surfaced later as a NameError.
        raise ValueError(f"unknown model_type: {model_type!r}")

    def train_eval_tabular(X_tr, y_tr, X_te, y_te, model_type):
        """Train a tabular model and evaluate it with threshold optimisation.

        The scaler is fitted on the training rows only.  SMOTE is applied to
        the training rows on a best-effort basis.  The decision threshold is
        the one maximising F1 on the evaluation rows themselves.
        """
        m = build_tabular_model(model_type)
        sc = RobustScaler()
        Xtr = sc.fit_transform(X_tr)
        Xte = sc.transform(X_te)

        # SMOTE on the training set.  It rejects some inputs (for example too
        # few minority samples); in that case train on the original rows and
        # say so, instead of silently swallowing every exception.
        if SMOTE is not None:
            try:
                Xtr, y_tr = SMOTE(random_state=42).fit_resample(Xtr, y_tr)
            except ValueError as err:
                print(f" SMOTE atlandı: {err}")

        m.fit(Xtr, y_tr)
        proba = m.predict_proba(Xte)[:, 1]

        # Threshold optimisation
        best_f1, best_th = 0, 0.5
        for th in np.arange(0.1, 0.9, 0.05):
            f1_th = f1_score(y_te, (proba >= th).astype(int), zero_division=0)
            if f1_th > best_f1:
                best_f1, best_th = f1_th, th
        pred = (proba >= best_th).astype(int)
        return {
            'f1': round(f1_score(y_te, pred, zero_division=0), 4),
            'precision': round(precision_score(y_te, pred, zero_division=0), 4),
            'recall': round(recall_score(y_te, pred, zero_division=0), 4),
            'auroc': round(roc_auc_score(y_te, proba) if len(np.unique(y_te)) > 1 else 0.5, 4),
            'threshold': round(best_th, 2),
        }

    def illicit_weight(train_rows):
        """Licit/illicit ratio of the training rows, capped at 15."""
        n_licit = float((y[train_rows] == 0).sum())
        n_illicit = float((y[train_rows] == 1).sum())
        return min(n_licit / max(n_illicit, 1), 15.0)

    model_types = [('lgbm', 'LightGBM'), ('rf', 'Random Forest'), ('xgb', 'XGBoost')]
    all_results = []
    for strat_name, strat_fn in strategies.items():
        tr_m, te_m = strat_fn()
        if tr_m.sum() < 50 or te_m.sum() < 10:
            print(f" {strat_name}: yetersiz veri, atlanıyor")
            continue
        print(f"\n {strat_name} (train={tr_m.sum()}, test={te_m.sum()}, test_ill={y[te_m].sum()}):")

        # Tabular models
        for mt, mn in model_types:
            res = train_eval_tabular(X_final[tr_m], y[tr_m], X_final[te_m], y[te_m], mt)
            res['strateji'] = strat_name
            res['model'] = mn
            all_results.append(res)
            print(f" {mn:15s}: F1={res['f1']:.4f} P={res['precision']:.4f} R={res['recall']:.4f} AUROC={res['auroc']:.4f} th={res['threshold']}")

        # GraphSAGE
        print(f" {'GraphSAGE':15s}: eğitiliyor...", end='', flush=True)
        train_mask_t = torch.tensor(tr_m, dtype=torch.bool)
        test_mask_t = torch.tensor(te_m, dtype=torch.bool)
        gs_res = train_graphsage(graph_data, train_mask_t, test_mask_t,
                                 X_final.shape[1], epochs=200, lr=0.005,
                                 weight=illicit_weight(tr_m))
        gs_res['strateji'] = strat_name
        gs_res['model'] = 'GraphSAGE'
        gs_res['threshold'] = 0.0  # threshold handled internally
        all_results.append(gs_res)
        print(f"\r {'GraphSAGE':15s}: F1={gs_res['f1']:.4f} P={gs_res['precision']:.4f} R={gs_res['recall']:.4f} AUROC={gs_res['auroc']:.4f}")

    res_df = pd.DataFrame(all_results)
    res_df.to_csv('output/results/all_experiment_results.csv', index=False)

    def f1_of(model_name, strat_name):
        """F1 of one (model, strategy) experiment, or None if it was not run."""
        row = res_df[(res_df['strateji'] == strat_name) & (res_df['model'] == model_name)]
        return row['f1'].values[0] if len(row) > 0 else None

    # ════════════════════════════════════════════════════════════════
    # STEP 8: WALK-FORWARD VALIDATION
    # ════════════════════════════════════════════════════════════════
    print("\n" + "=" * 70)
    print("ADIM 8: WALK-FORWARD VALİDASYON")
    print("=" * 70)

    # Train on everything before `test_start`, test on the next 3 timesteps.
    # Windows with too little data or a single test class are skipped.  The
    # same windows are used for the tabular models and for GraphSAGE.
    wf_windows = []
    for test_start in range(10, 49, 3):
        wf_tr = ts < test_start
        wf_te = (ts >= test_start) & (ts < test_start + 3)
        if wf_tr.sum() < 50 or wf_te.sum() < 10 or len(np.unique(y[wf_te])) < 2:
            continue
        wf_windows.append((wf_tr, wf_te))

    wf_results = {}
    for mt, mn in model_types:
        wf_f1s = [
            train_eval_tabular(X_final[wf_tr], y[wf_tr], X_final[wf_te], y[wf_te], mt)['f1']
            for wf_tr, wf_te in wf_windows
        ]
        # Guard against "no usable window" (mean of an empty list is NaN),
        # matching the GraphSAGE branch below.
        wf_results[mn] = round(np.mean(wf_f1s), 4) if wf_f1s else 0
        print(f" {mn}: Walk-Forward F1 = {wf_results[mn]:.4f}")

    # GraphSAGE walk-forward
    wf_gs_f1s = []
    for wf_tr, wf_te in wf_windows:
        train_mask_wf = torch.tensor(wf_tr, dtype=torch.bool)
        test_mask_wf = torch.tensor(wf_te, dtype=torch.bool)
        gs_wf = train_graphsage(graph_data, train_mask_wf, test_mask_wf, X_final.shape[1],
                                epochs=100, weight=illicit_weight(wf_tr))
        wf_gs_f1s.append(gs_wf['f1'])
    wf_results['GraphSAGE'] = round(np.mean(wf_gs_f1s), 4) if wf_gs_f1s else 0
    print(f" GraphSAGE: Walk-Forward F1 = {wf_results['GraphSAGE']:.4f}")

    def deviation_pct(model_name, strat_name):
        """Relative deviation (%) of an experiment's F1 from the model's
        walk-forward F1, or None when either number is unavailable."""
        wf_f1 = wf_results.get(model_name, 0)
        f1 = f1_of(model_name, strat_name)
        if f1 is None or not wf_f1 > 0:
            return None
        return ((f1 - wf_f1) / wf_f1) * 100

    # Honesty table: average deviation per strategy.
    print("\n Dürüstlük Karşılaştırması:")
    honesty_data = []
    for strat_name in strategies:
        sapma_list = [dev for dev in (deviation_pct(mn, strat_name) for mn in wf_results)
                      if dev is not None]
        if sapma_list:
            avg_sapma = np.mean(sapma_list)
            # Within ±10% counts as honest; above as inflated, below as
            # pessimistic.  Exactly +10% is classified as inflated.
            if abs(avg_sapma) < 10:
                durum = "✅ DÜRÜST"
            elif avg_sapma >= 10:
                durum = "🔴 ŞİŞME"
            else:
                durum = "⚠️ PESİMİST"
            honesty_data.append({'strateji': strat_name, 'sapma': round(avg_sapma, 1), 'durum': durum})
            print(f" {strat_name:25s}: ort. sapma = {avg_sapma:+.1f}% {durum}")

    # ════════════════════════════════════════════════════════════════
    # STEP 9: FIGURES
    # ════════════════════════════════════════════════════════════════
    print("\n" + "=" * 70)
    print("ADIM 9: FİGÜRLER")
    print("=" * 70)
    sns.set_theme(style='whitegrid', font_scale=1.1)

    # Fig 1: breakpoint
    fig, axes = plt.subplots(3, 1, figsize=(18, 14), gridspec_kw={'height_ratios': [2, 1, 1]})
    axes[0].plot(all_ts, health.values, 'o-', color='steelblue', linewidth=2, markersize=5)
    axes[0].axvline(x=bp_final, color='red', linewidth=3, linestyle='--')
    axes[0].annotate(f'KIRILMA TS={bp_final}', xy=(bp_final, health.loc[bp_final]),
                     fontsize=12, fontweight='bold', color='red', ha='center',
                     xytext=(bp_final+3, health.max()-0.05),
                     arrowprops=dict(arrowstyle='->', color='red', lw=2))
    axes[0].set_ylabel('Ağ Sağlık Skoru', fontsize=12)
    axes[0].set_title('Topolojik Kırılma Noktası Tespiti', fontsize=14, fontweight='bold')
    colors_bar = ['#FF4444' if topo_df.loc[t, 'illicit_rate'] > 0.18 else '#44BB44' for t in all_ts]
    axes[1].bar(all_ts, topo_df['illicit_rate'].values * 100, color=colors_bar)
    axes[1].axvline(x=bp_final, color='red', linewidth=3, linestyle='--')
    axes[1].set_ylabel('İllicit %', fontsize=12)
    axes[2].plot(all_ts, crisis_smooth, '-', color='steelblue', linewidth=2)
    if peaks:
        # Mark the peak followed by the steepest drop.
        axes[2].scatter(peaks[0]['timestep'], crisis_smooth[peaks[0]['index']],
                        color='red', s=200, zorder=5, edgecolors='black')
    axes[2].set_ylabel('Kriz Sinyali', fontsize=12)
    axes[2].set_xlabel('Timestep', fontsize=12)
    plt.tight_layout()
    plt.savefig('output/figures/fig1_breakpoint.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig1_breakpoint.png")

    # Fig 2: F1 comparison (all 4 models)
    fig, ax = plt.subplots(figsize=(18, 8))
    strat_names = list(strategies.keys())
    model_names = [mn for _, mn in model_types] + ['GraphSAGE']
    colors5 = sns.color_palette('Set2', len(strat_names))
    x_pos = np.arange(len(model_names))
    width = 0.15
    for i, strat in enumerate(strat_names):
        vals = []
        for mdl in model_names:
            f1_val = f1_of(mdl, strat)
            vals.append(f1_val if f1_val is not None else 0)
        # The topological-breakpoint strategy gets a highlighted outline.
        ec = 'black' if 'Topolojik' in strat else 'white'
        lw = 2 if 'Topolojik' in strat else 0.5
        ax.bar(x_pos + i*width, vals, width, label=strat, color=colors5[i], edgecolor=ec, linewidth=lw)
    wf_avg = np.mean(list(wf_results.values()))
    ax.axhspan(wf_avg*0.9, wf_avg*1.1, alpha=0.12, color='green')
    ax.axhline(y=wf_avg, color='green', linewidth=2, linestyle='--', label=f'Walk-Forward ({wf_avg:.3f})')
    ax.set_xticks(x_pos + width*2)
    ax.set_xticklabels(model_names, fontsize=12)
    ax.set_ylabel('Illicit F1', fontsize=13)
    ax.set_title('Bölme Stratejileri × Model Karşılaştırması', fontsize=14, fontweight='bold')
    ax.legend(fontsize=9, loc='upper right')
    ax.set_ylim(0, 1.1)
    plt.tight_layout()
    plt.savefig('output/figures/fig2_f1_comparison.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig2_f1_comparison.png")

    # Fig 3: pipeline comparison (worst bar red, best bar teal)
    fig, ax = plt.subplots(figsize=(10, 6))
    p_names = list(pipelines.keys())
    p_vals = list(pipelines.values())
    p_min, p_max = min(p_vals), max(p_vals)
    colors_p = ['#FF6B6B' if v == p_min else '#4ECDC4' if v == p_max else '#45B7D1' for v in p_vals]
    ax.barh(p_names, p_vals, color=colors_p)
    ax.set_xlabel('Illicit F1 Score')
    ax.set_title('Ön İşleme Pipeline Karşılaştırması', fontsize=14, fontweight='bold')
    for i, v in enumerate(p_vals):
        ax.text(v + 0.002, i, f'{v:.4f}', va='center', fontsize=10)
    plt.tight_layout()
    plt.savefig('output/figures/fig3_pipeline_comparison.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig3_pipeline_comparison.png")

    # Fig 4: honesty heat map (deviation from walk-forward per model/strategy)
    fig, ax = plt.subplots(figsize=(16, 7))
    sapma_data = []
    for strat_name in strat_names:
        for mn in model_names:
            sapma = deviation_pct(mn, strat_name)
            if sapma is not None:
                sapma_data.append({'strateji': strat_name, 'model': mn, 'sapma': round(sapma, 1)})
    if sapma_data:
        sapma_df = pd.DataFrame(sapma_data)
        pivot = sapma_df.pivot_table(values='sapma', index='model', columns='strateji')
        sns.heatmap(pivot, annot=True, fmt='.1f', cmap='RdYlGn_r', center=0, ax=ax,
                    linewidths=0.5, cbar_kws={'label': 'Walk-Forward Sapma (%)'})
        ax.set_title('Walk-Forward Dürüstlük Sapması (%) — 4 Model × 5 Strateji', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('output/figures/fig4_honesty.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig4_honesty.png")

    # Fig 5: performance inflation of the random split vs. time-aware splits
    fig, ax = plt.subplots(figsize=(14, 6))
    inf_data = []
    for mn in model_names:
        rand_f1 = f1_of(mn, 'Rastgele')
        chr_f1 = f1_of(mn, 'Kronolojik')
        topo_f1 = f1_of(mn, 'Topolojik Kırılma')
        if rand_f1 is None or chr_f1 is None:
            continue
        if topo_f1 is None:
            topo_f1 = 0
        # Both ratios fall back to 0 when the reference F1 is 0, instead of
        # dividing by zero.
        inf_data.append({
            'model': mn,
            'Rastgele vs Kronolojik': round((rand_f1 - chr_f1) / chr_f1 * 100, 1) if chr_f1 > 0 else 0,
            'Rastgele vs Topolojik': round((rand_f1 - topo_f1) / topo_f1 * 100, 1) if topo_f1 > 0 else 0,
        })
    if inf_data:
        inf_df = pd.DataFrame(inf_data).set_index('model')
        sns.heatmap(inf_df, annot=True, fmt='.1f', cmap='Reds', ax=ax, linewidths=0.5,
                    cbar_kws={'label': 'Şişme Oranı (%)'})
        ax.set_title('Rastgele Bölme Performans Şişmesi (%)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig('output/figures/fig5_inflation.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(" ✓ fig5_inflation.png")

    # ════════════════════════════════════════════════════════════════
    # STEP 10: SUMMARY REPORT
    # ════════════════════════════════════════════════════════════════
    elapsed = time.time() - start_time
    summary = {
        'veri': {'toplam': N, 'etiketli': len(y), 'illicit': int(y.sum()),
                 'ozellik': int(X_final.shape[1]), 'kenar': len(valid_edges)},
        'temizleme': {'nan': int(nan_count), 'inf': int(inf_count),
                      'outlier_pct': round(outlier_pct, 2),
                      'cikarilan_ozellik': int((~var_mask).sum()), 'en_iyi_pipeline': best_pipe},
        'kirilma': {'saglik_yontemi': int(bp_final), 'final': int(bp_final)},
        'walk_forward': wf_results,
        'sonuclar': res_df.to_dict(orient='records'),
        'pipeline_karsilastirma': {k: round(v, 4) for k, v in pipelines.items()},
        'durustukluk': honesty_data,
        'sure_dakika': round(elapsed / 60, 1),
    }
    # ensure_ascii=False writes Turkish letters and emoji verbatim, so the
    # file must be opened as UTF-8 rather than with the platform default.
    with open('output/results/summary.json', 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print("\n" + "=" * 70)
    print(f"TAMAMLANDI! (Süre: {elapsed/60:.1f} dakika)")
    print("=" * 70)

    # Final result table
    print(f"\n ═══ SONUÇ TABLOSU (Illicit F1) ═══")
    pivot_f1 = res_df.pivot_table(values='f1', index='model', columns='strateji')
    print(pivot_f1.to_string())
    print(f"\n ═══ WALK-FORWARD REFERANS ═══")
    for mn, f1 in wf_results.items():
        print(f" {mn}: {f1:.4f}")
    print(f"\n Çıktılar: output/results/ ve output/figures/")
if __name__ == '__main__':
    # Command-line entry point: the only option is the data set directory.
    cli = argparse.ArgumentParser()
    cli.add_argument('--data_dir', type=str, default='./dataset')
    main(cli.parse_args().data_dir)