| """ |
| models/static_gnn.py |
| ==================== |
| Static GNN Baseline: GraphSAGE with Snapshot Batching |
| |
| Architecture |
| ------------ |
| Events are binned into N time-snapshots (equal-count bins). |
| For each snapshot: |
| - Build a static homogeneous graph from the events in that bin |
| - Run 2-layer GraphSAGE to produce node embeddings |
| - Aggregate per-node embeddings across all snapshots (mean pooling) |
| A node classifier head is trained on the pooled embeddings. |
| |
| This model has NO temporal memory between snapshots. It is the strongest |
| "static" baseline: it sees the full graph structure but cannot reason about |
| the ordering of events within or across snapshots. |
| |
| Note: SAGEConv is used (from torch_geometric). Falls back gracefully when |
| a node has no edges in a snapshot (embedding stays at zero for that snapshot). |
| """ |
|
|
| from __future__ import annotations |
|
|
| from typing import List |
|
|
| import numpy as np |
| import pandas as pd |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from torch_geometric.nn import SAGEConv |
|
|
| from models.base import TemporalModel |
| from src.graph.graph_builder import build_edge_features |
|
|
# Oracle / label-derived columns that must never be visible to the model.
# fit() and predict() assert that none of these appear in their input frames,
# guarding against accidental label leakage from upstream data pipelines.
_BLOCKED_COLS = frozenset({
    "motif_hit_count", "motif_source", "trigger_event_idx", "label_event_idx",
    "label_delay", "is_fallback_label", "fraud_source",
    "twin_role", "twin_label", "twin_pair_id", "template_id",
    "dynamic_fraud_state", "motif_chain_state", "motif_strength",
})
|
|
|
|
|
|
| |
| |
| |
|
|
class _SAGEEncoder(nn.Module):
    """Two-layer GraphSAGE encoder with a LayerNorm after each convolution.

    Layer 1 is followed by LayerNorm + ReLU; layer 2 by LayerNorm only, so
    the output embedding is not non-negativity constrained.
    """

    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, hidden_dim)
        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Return per-node embeddings of shape (num_nodes, hidden_dim)."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(self.norm1(hidden))
        out = self.conv2(hidden, edge_index)
        return self.norm2(out)
|
|
|
|
| |
| |
| |
|
|
class StaticGNNWrapper(TemporalModel):
    """GraphSAGE with time-snapshot aggregation. No temporal memory.

    Events are binned into ``n_snapshots`` equal-count snapshots; a 2-layer
    GraphSAGE encoder is trained with an edge-level fraud objective on each
    snapshot, node embeddings are mean-pooled across snapshots, and a small
    MLP head is then fit on node-level labels.
    """

    def __init__(
        self,
        hidden_dim: int = 64,
        n_snapshots: int = 10,
        device: str = "cpu",
    ):
        self.hidden_dim = hidden_dim
        self.n_snapshots = n_snapshots
        self.device = torch.device(device)

        # Populated by fit().
        self._encoder: _SAGEEncoder | None = None
        self._node_clf: nn.Sequential | None = None
        self._norm_stats: dict | None = None  # edge-feature mean/std for reuse at predict time
        self._n_nodes: int = 0
        self._node_emb_agg: torch.Tensor | None = None  # snapshot-pooled train embeddings
        self._in_dim: int = 0

    @property
    def name(self) -> str:
        return "StaticGNN"

    @property
    def is_temporal(self) -> bool:
        return False

    def _build_snapshots(
        self, df: pd.DataFrame, ef_np: np.ndarray
    ) -> List[tuple]:
        """Bin time-sorted events into equal-count snapshots.

        Returns a list of ``(edge_index_t, edge_attr_t, lo, hi)`` tuples,
        where ``[lo, hi)`` is the row range (after time-sorting) covered by
        the bin. Storing the range here keeps the training loop's label
        slicing in lockstep with the binning arithmetic.

        Bins that would be empty (possible when ``len(df) < n_snapshots``)
        are skipped, so downstream losses never see zero edges — an empty
        BCE reduction would produce NaN and poison training.
        """
        df = df.sort_values("timestamp").reset_index(drop=True)
        n = len(df)
        bin_size = max(1, n // self.n_snapshots)

        snapshots = []
        for b in range(self.n_snapshots):
            lo = b * bin_size
            # Last bin absorbs the remainder; all bins are clamped to n.
            hi = n if b == self.n_snapshots - 1 else min(lo + bin_size, n)
            if lo >= hi:  # empty bin: nothing to train on
                continue
            sub_u = df["sender_id"].values[lo:hi].astype(np.int64)
            sub_v = df["receiver_id"].values[lo:hi].astype(np.int64)

            edge_index = torch.tensor(np.vstack([sub_u, sub_v]), dtype=torch.long)
            edge_attr = torch.tensor(ef_np[lo:hi], dtype=torch.float32)
            snapshots.append((edge_index, edge_attr, lo, hi))
        return snapshots

    def fit(self, df_train: pd.DataFrame, num_epochs: int = 3) -> None:
        """Train the SAGE encoder (edge objective), then the node head.

        Parameters
        ----------
        df_train : event table with at least ``timestamp``, ``sender_id``,
            ``receiver_id`` and ``is_fraud`` columns.
        num_epochs : passes over the full snapshot sequence for the encoder.
        """
        leaked = _BLOCKED_COLS & set(df_train.columns)
        assert not leaked, f"Oracle columns leaked into StaticGNN.fit(): {leaked}"
        df_train = df_train.sort_values("timestamp").reset_index(drop=True)

        ef_np = build_edge_features(df_train).astype(np.float32)
        edge_dim = ef_np.shape[1]
        self._in_dim = edge_dim

        # Standardize edge features; stats are frozen and reused at predict time.
        ea_mean = ef_np.mean(axis=0)
        ea_std = ef_np.std(axis=0) + 1e-6
        ef_np = (ef_np - ea_mean) / ea_std
        self._norm_stats = {"ea_mean": ea_mean, "ea_std": ea_std}

        all_ids = np.union1d(df_train["sender_id"].values, df_train["receiver_id"].values)
        n_nodes = int(all_ids.max()) + 1
        self._n_nodes = n_nodes

        device = self.device

        encoder = _SAGEEncoder(in_dim=edge_dim, hidden_dim=self.hidden_dim).to(device)
        self._encoder = encoder

        node_clf = nn.Sequential(
            nn.Linear(self.hidden_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ).to(device)
        self._node_clf = node_clf

        snapshots = self._build_snapshots(df_train, ef_np)

        # Class-imbalance weighting for the edge-level BCE loss (capped at 10x).
        y_all = torch.tensor(df_train["is_fraud"].values, dtype=torch.float32)
        raw_pw = (y_all == 0).sum() / ((y_all == 1).sum() + 1e-6)
        pos_weight = torch.clamp(raw_pw, max=10.0).to(device)

        loss_fn_edge = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        opt = torch.optim.Adam(
            list(encoder.parameters()) + list(node_clf.parameters()),
            lr=1e-3,
        )

        node_feat = self._build_node_feat(df_train, ef_np, n_nodes)
        x_full = torch.tensor(node_feat, dtype=torch.float32, device=device)

        for epoch in range(num_epochs):
            encoder.train()
            node_clf.train()
            total_loss = 0.0
            emb_accum = torch.zeros(n_nodes, self.hidden_dim, device=device)
            snap_cnt = torch.zeros(n_nodes, dtype=torch.float32, device=device)

            for edge_index, _edge_attr, lo, hi in snapshots:
                edge_index = edge_index.to(device)

                # Labels come straight from the stored row range, so they
                # always line up with this snapshot's edges.
                y_snap = y_all[lo:hi].to(device)

                h = encoder(x_full, edge_index)

                # Edge score = dot product of endpoint embeddings, clamped
                # for numerical stability of the logistic loss.
                h_src = h[edge_index[0]]
                h_dst = h[edge_index[1]]
                edge_logits = torch.clamp((h_src * h_dst).sum(dim=-1), -10, 10)
                loss = loss_fn_edge(edge_logits, y_snap)

                opt.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1.0)
                opt.step()
                total_loss += loss.item()

                # Accumulate post-step embeddings for snapshot mean-pooling.
                with torch.no_grad():
                    emb_accum += h.detach()
                    snap_cnt += 1.0

            emb_pooled = emb_accum / snap_cnt.unsqueeze(1).clamp(min=1.0)
            self._node_emb_agg = emb_pooled.clone()

            print(f"[StaticGNN] Epoch {epoch + 1}/{num_epochs} Loss: {total_loss:.4f}")

        # Fit the node classifier head on the pooled (final-epoch) embeddings.
        self._train_node_clf(df_train)

    def _compute_prefix_embeddings(self, df_prefix: pd.DataFrame) -> torch.Tensor:
        """Compute node embeddings for a causal prefix graph.

        Uses the frozen train-time normalization stats; the whole prefix is
        encoded as one static graph (no snapshot pooling here).
        """
        device = self.device
        ns = self._norm_stats

        df_prefix = df_prefix.sort_values("timestamp").reset_index(drop=True)
        ef_np = build_edge_features(df_prefix).astype(np.float32)
        ef_np = (ef_np - ns["ea_mean"]) / ns["ea_std"]

        all_ids = np.union1d(df_prefix["sender_id"].values, df_prefix["receiver_id"].values)
        # Keep at least the train-time node count so train-era ids stay valid.
        n_nodes = max(int(all_ids.max()) + 1, self._n_nodes)
        node_feat = self._build_node_feat(df_prefix, ef_np, n_nodes)
        x = torch.tensor(node_feat, dtype=torch.float32, device=device)
        edge_index = torch.tensor(
            np.vstack([df_prefix["sender_id"].values, df_prefix["receiver_id"].values]),
            dtype=torch.long, device=device,
        )

        self._encoder.eval()
        with torch.no_grad():
            return self._encoder(x, edge_index)

    def _build_node_feat(
        self, df: pd.DataFrame, ef_np: np.ndarray, n_nodes: int
    ) -> np.ndarray:
        """Aggregate edge features to sender nodes (mean).

        Nodes that never appear as a sender keep an all-zero feature row.
        """
        feat = np.zeros((n_nodes, ef_np.shape[1]), dtype=np.float32)
        cnt = np.zeros(n_nodes, dtype=np.float32)
        sids = df["sender_id"].values.astype(np.int64)
        # np.add.at handles repeated sender ids correctly (unbuffered add).
        np.add.at(feat, sids, ef_np)
        np.add.at(cnt, sids, 1.0)
        cnt = np.maximum(cnt, 1.0)
        return feat / cnt[:, None]

    def _fit_clf(
        self, node_emb: torch.Tensor, y: torch.Tensor, num_epochs: int
    ) -> None:
        """Shared full-batch training loop for the node classifier head.

        Re-weights positives by the (capped) negative/positive ratio and
        leaves the head in eval mode afterwards.
        """
        pw = torch.clamp((y == 0).sum() / ((y == 1).sum() + 1e-6), max=10.0)
        loss_fn = nn.BCEWithLogitsLoss(pos_weight=pw)
        opt = torch.optim.Adam(self._node_clf.parameters(), lr=1e-3)
        self._node_clf.train()
        for _ in range(num_epochs):
            logits = self._node_clf(node_emb).squeeze(-1)
            loss = loss_fn(logits, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
        self._node_clf.eval()

    def _train_node_clf(self, df_train: pd.DataFrame, num_epochs: int = 150) -> None:
        """Fine-tune node classifier on node-level fraud labels (training split)."""
        device = self.device
        emb = self._node_emb_agg
        all_nodes = sorted(df_train["sender_id"].unique())
        eval_t = torch.tensor(all_nodes, dtype=torch.long, device=device)

        # Node label = max over the node's sent events (any fraud -> fraud node).
        y_map = df_train.groupby("sender_id")["is_fraud"].max()
        y_np = np.array([y_map.get(n, 0) for n in all_nodes], dtype=np.float32)
        y = torch.tensor(y_np, device=device)

        node_emb = emb[eval_t].detach()
        self._fit_clf(node_emb, y, num_epochs)

    def predict(self, df_eval: pd.DataFrame, eval_nodes: List[int]) -> np.ndarray:
        """Return fraud probabilities for ``eval_nodes`` given ``df_eval`` events.

        The eval events are encoded as one static graph using the frozen
        train-time feature normalization.
        """
        assert self._encoder is not None, "Call fit() first."
        leaked = _BLOCKED_COLS & set(df_eval.columns)
        assert not leaked, f"Oracle columns leaked into StaticGNN.predict(): {leaked}"
        device = self.device

        ns = self._norm_stats

        df_eval = df_eval.sort_values("timestamp").reset_index(drop=True)
        ef_np = build_edge_features(df_eval).astype(np.float32)
        ef_np = (ef_np - ns["ea_mean"]) / ns["ea_std"]

        all_ids = np.union1d(df_eval["sender_id"].values, df_eval["receiver_id"].values)
        n_nodes = max(int(all_ids.max()) + 1, self._n_nodes)

        node_feat = self._build_node_feat(df_eval, ef_np, n_nodes)
        x = torch.tensor(node_feat, dtype=torch.float32, device=device)

        edge_index = torch.tensor(
            np.vstack([df_eval["sender_id"].values, df_eval["receiver_id"].values]),
            dtype=torch.long, device=device,
        )

        self._encoder.eval()
        with torch.no_grad():
            h = self._encoder(x, edge_index)

        eval_t = torch.tensor(eval_nodes, dtype=torch.long, device=device)
        node_emb = h[eval_t]

        with torch.no_grad():
            probs = torch.sigmoid(self._node_clf(node_emb).squeeze(-1)).cpu().numpy()
        return probs.astype(np.float32)

    def reset_memory(self) -> None:
        """No-op: StaticGNN has no temporal memory."""
        pass

    def train_node_classifier(
        self, eval_nodes: List[int], y_labels: np.ndarray, num_epochs: int = 150
    ) -> None:
        """Re-train node classifier with fresh labels (for horizon sweep)."""
        device = self.device
        eval_t = torch.tensor(eval_nodes, dtype=torch.long, device=device)
        node_emb = self._node_emb_agg[eval_t].detach()
        y = torch.tensor(y_labels, dtype=torch.float32, device=device)
        self._fit_clf(node_emb, y, num_epochs)

    def train_node_classifier_on_prefix(
        self,
        df_prefix: pd.DataFrame,
        eval_nodes: List[int],
        y_labels: np.ndarray,
        num_epochs: int = 150,
    ) -> None:
        """Train the node classifier on embeddings computed from a causal prefix."""
        device = self.device
        prefix_emb = self._compute_prefix_embeddings(df_prefix)
        eval_t = torch.tensor(eval_nodes, dtype=torch.long, device=device)
        node_emb = prefix_emb[eval_t].detach()
        y = torch.tensor(y_labels, dtype=torch.float32, device=device)
        self._fit_clf(node_emb, y, num_epochs)
|
|