| """Vendored model classes + inference bundle for the Gradio Space. | |
| Self-contained — does not import from the engine repo so the Space can | |
| deploy from `VynFi/je-fraud-gnn` without pulling the full SyntheticData | |
| codebase. | |
| """ | |
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.nn import SAGEConv
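
# Constants baked into the edge-feature encoding: ROUND_LEVELS lists the
# amounts treated as "round", and BUSINESS_PROCESSES fixes the one-hot
# column order for the business-process flag.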
ROUND_LEVELS = np.array([1_000.0, 5_000.0, 10_000.0, 25_000.0, 50_000.0, 100_000.0])
BUSINESS_PROCESSES = ["P2P", "O2C", "R2R", "H2R", "A2R"]


# ─── Model classes (must match training scripts byte-for-byte) ───────────────
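# EdgeFraudGNN: two mean-aggregation GraphSAGE layers produce node embeddings;
# an MLP head scores each edge from the concatenated source/destination
# embeddings plus the edge attributes.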
class EdgeFraudGNN(nn.Module):
    def __init__(
        self,
        node_in: int,
        edge_in: int,
        hidden: int = 64,
        out: int = 64,
        head_hidden: int = 128,
        dropout: float = 0.2,
    ) -> None:
        super().__init__()
        self.conv1 = SAGEConv(node_in, hidden, aggr="mean")
        self.conv2 = SAGEConv(hidden, out, aggr="mean")
        self.dropout = dropout
        self.head = nn.Sequential(
            nn.Linear(2 * out + edge_in, head_hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden, 1),
        )

    def encode(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        h = F.relu(self.conv1(x, edge_index))
        h = F.dropout(h, p=self.dropout, training=self.training)
        h = self.conv2(h, edge_index)
        return h

    def edge_logits(self, h, edge_index, edge_attr):
        src, dst = edge_index
        z = torch.cat([h[src], h[dst], edge_attr], dim=-1)
        return self.head(z).squeeze(-1)
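

# The anomaly detector is an attribute-reconstruction graph autoencoder:
# SageEncoder embeds nodes, AttrDecoder reconstructs edge attributes from the
# two endpoint embeddings, and AttrGAE wires them together. Edges with high
# reconstruction error are treated as anomalous (see anomaly_score_edges).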
class SageEncoder(nn.Module):
    def __init__(self, in_dim: int, hidden: int = 64, out: int = 32, dropout: float = 0.2) -> None:
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden, aggr="mean")
        self.conv2 = SAGEConv(hidden, out, aggr="mean")
        self.dropout = dropout

    def forward(self, x, edge_index):
        h = F.relu(self.conv1(x, edge_index))
        h = F.dropout(h, p=self.dropout, training=self.training)
        return self.conv2(h, edge_index)


class AttrDecoder(nn.Module):
    def __init__(self, z_dim: int, edge_attr_dim: int, hidden: int = 128, dropout: float = 0.2) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 * z_dim, hidden),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, edge_attr_dim),
        )

    def forward(self, z, edge_index):
        src, dst = edge_index
        return self.net(torch.cat([z[src], z[dst]], dim=-1))


class AttrGAE(nn.Module):
    def __init__(self, in_dim: int, edge_attr_dim: int, hidden: int = 64, out: int = 32, dropout: float = 0.2) -> None:
        super().__init__()
        self.encoder = SageEncoder(in_dim=in_dim, hidden=hidden, out=out, dropout=dropout)
        self.decoder = AttrDecoder(z_dim=out, edge_attr_dim=edge_attr_dim, hidden=hidden * 2, dropout=dropout)

    def forward(self, x, edge_index, target_edges):
        z = self.encoder(x, edge_index)
        return self.decoder(z, target_edges)


# ─── Inference bundle ────────────────────────────────────────────────────────
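# InferenceBundle holds the two trained models plus everything needed to turn
# raw journal-entry rows into scaled tensors: the account -> node index, the
# fitted scalers, the raw node features, and the graph's edge_index.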
@dataclass
class InferenceBundle:
    fraud_model: EdgeFraudGNN
    anomaly_model: AttrGAE
    node_index: dict[str, int]
    edge_attr_scaler_mean: np.ndarray
    edge_attr_scaler_scale: np.ndarray
    node_feature_scaler_mean: np.ndarray
    node_feature_scaler_scale: np.ndarray
    node_features_raw: np.ndarray
    edge_index: np.ndarray
    feature_columns: dict[str, list[str]]
    fraud_threshold: float
    metadata: dict[str, Any]

    def node_features_scaled(self) -> torch.Tensor:
        x = (self.node_features_raw - self.node_feature_scaler_mean) / self.node_feature_scaler_scale
        return torch.from_numpy(x.astype(np.float32))

    def reverse_node_index(self) -> dict[int, str]:
        return {v: k for k, v in self.node_index.items()}

    def encode_edges(
        self,
        from_account,
        to_account,
        amount,
        business_process,
        posting_date,
        confidence=None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
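        """Build model-ready edge tensors for a batch of journal-entry lines.

        Maps account numbers to node ids and assembles the edge-attribute
        layout the models were trained on (log amount, round-amount features,
        confidence, business-process one-hot, cyclical date features), then
        applies the stored standard scaler.
        """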
        n = len(from_account)
        if confidence is None:
            confidence = [1.0] * n
        df = pd.DataFrame(
            {
                "from_account": [str(a) for a in from_account],
                "to_account": [str(a) for a in to_account],
                "amount": amount,
                "business_process": business_process,
                "posting_date": pd.to_datetime(posting_date, errors="coerce"),
                "confidence": confidence,
            }
        )
        unknown = set(df["from_account"]) | set(df["to_account"])
        unknown -= set(self.node_index.keys())
        if unknown:
            raise ValueError(f"unknown account number(s): {sorted(unknown)}")
        src = df["from_account"].map(self.node_index).to_numpy(dtype=np.int64)
        dst = df["to_account"].map(self.node_index).to_numpy(dtype=np.int64)
        edge_index = np.stack([src, dst], axis=0)
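        # Amount features: log1p(amount), a near-round-amount flag, log distance
        # to the nearest round level, and a per-level indicator for which round
        # level (if any) the amount sits within 1.0 of.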
| a = df["amount"].astype(float).to_numpy() | |
| log_amt = np.log1p(a).astype(np.float32) | |
| diffs = np.abs(a[:, None] - ROUND_LEVELS[None, :]) | |
| nearest = diffs.min(axis=1) | |
| is_round = (nearest < 1.0).astype(np.float32) | |
| log_dist = np.log1p(nearest).astype(np.float32) | |
| nearest_idx = diffs.argmin(axis=1) | |
| per_level = np.zeros((n, len(ROUND_LEVELS)), dtype=np.float32) | |
| is_close = nearest < 1.0 | |
| per_level[is_close, nearest_idx[is_close]] = 1.0 | |
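        # Business-process one-hot, reindexed to the fixed BUSINESS_PROCESSES
        # column order so unseen categories cannot shift the layout.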
        bp_oh = (
            pd.get_dummies(df["business_process"].fillna("UNK"), prefix="bp")
            .reindex(columns=[f"bp_{p}" for p in BUSINESS_PROCESSES], fill_value=0)
            .astype(np.float32)
            .to_numpy()
        )
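        # Cyclical posting-date features (day of year, ISO week, day of week)
        # plus a weekend flag; unparseable dates fall back to neutral defaults.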
| dt = df["posting_date"] | |
| doy = dt.dt.dayofyear.fillna(1).to_numpy() | |
| woy = dt.dt.isocalendar().week.astype(int).to_numpy() | |
| dow = dt.dt.dayofweek.fillna(0).to_numpy() | |
| is_weekend = (dow >= 5).astype(np.float32) | |
| date_feats = np.stack( | |
| [ | |
| np.sin(2 * np.pi * doy / 366), | |
| np.cos(2 * np.pi * doy / 366), | |
| np.sin(2 * np.pi * woy / 53), | |
| np.cos(2 * np.pi * woy / 53), | |
| np.sin(2 * np.pi * dow / 7), | |
| np.cos(2 * np.pi * dow / 7), | |
| is_weekend, | |
| ], | |
| axis=1, | |
| ).astype(np.float32) | |
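        # Concatenate all edge features in the order the stored scaler expects,
        # then standardise with the fitted mean/scale.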
        confidence_arr = df["confidence"].astype(float).to_numpy().reshape(-1, 1).astype(np.float32)
        edge_attr = np.concatenate(
            [
                log_amt[:, None],
                is_round[:, None],
                log_dist[:, None],
                per_level,
                confidence_arr,
                bp_oh,
                date_feats,
            ],
            axis=1,
        )
        edge_attr_scaled = (
            (edge_attr - self.edge_attr_scaler_mean) / self.edge_attr_scaler_scale
        ).astype(np.float32)
        return torch.from_numpy(edge_index), torch.from_numpy(edge_attr_scaled)
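
    # Both scoring helpers run the models in eval mode over the stored graph
    # and score only the requested target edges.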
    def predict_fraud(self, **kwargs) -> np.ndarray:
        target_edge_index, target_edge_attr = self.encode_edges(**kwargs)
        graph_edge_index = torch.from_numpy(self.edge_index)
        x = self.node_features_scaled()
        self.fraud_model.train(False)
        with torch.no_grad():
            h = self.fraud_model.encode(x, graph_edge_index)
            logits = self.fraud_model.edge_logits(h, target_edge_index, target_edge_attr)
            return torch.sigmoid(logits).cpu().numpy()

    def anomaly_score_edges(self, **kwargs) -> np.ndarray:
        target_edge_index, target_edge_attr = self.encode_edges(**kwargs)
        graph_edge_index = torch.from_numpy(self.edge_index)
        x = self.node_features_scaled()
        self.anomaly_model.train(False)
        with torch.no_grad():
            recon = self.anomaly_model(x, graph_edge_index, target_edge_index)
            return ((recon - target_edge_attr) ** 2).mean(dim=-1).cpu().numpy()
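

# Artifacts expected in model_dir: je_fraud_gnn.pt, je_anomaly_gae.pt,
# preprocessor.pt, and metadata.json.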
def load_bundle(model_dir: Path | str) -> InferenceBundle:
    model_dir = Path(model_dir)
    fraud_payload = torch.load(model_dir / "je_fraud_gnn.pt", weights_only=False, map_location="cpu")
    anomaly_payload = torch.load(model_dir / "je_anomaly_gae.pt", weights_only=False, map_location="cpu")
    preprocessor = torch.load(model_dir / "preprocessor.pt", weights_only=False, map_location="cpu")
    metadata = json.loads((model_dir / "metadata.json").read_text())
    fraud_model = EdgeFraudGNN(**fraud_payload["model_config"])
    fraud_model.load_state_dict(fraud_payload["model_state_dict"])
    fraud_model.train(False)
    anomaly_model = AttrGAE(**anomaly_payload["model_config"])
    anomaly_model.load_state_dict(anomaly_payload["model_state_dict"])
    anomaly_model.train(False)
    return InferenceBundle(
        fraud_model=fraud_model,
        anomaly_model=anomaly_model,
        node_index=preprocessor["node_index"],
        edge_attr_scaler_mean=np.asarray(preprocessor["edge_attr_scaler_mean"], dtype=np.float32),
        edge_attr_scaler_scale=np.asarray(preprocessor["edge_attr_scaler_scale"], dtype=np.float32),
        node_feature_scaler_mean=np.asarray(preprocessor["node_feature_scaler_mean"], dtype=np.float32),
        node_feature_scaler_scale=np.asarray(preprocessor["node_feature_scaler_scale"], dtype=np.float32),
        node_features_raw=np.asarray(preprocessor["node_features_raw"], dtype=np.float32),
        edge_index=np.asarray(preprocessor["edge_index"], dtype=np.int64),
        feature_columns=preprocessor["feature_columns"],
        fraud_threshold=float(metadata.get("fraud_threshold", 0.5)),
        metadata=metadata,
    )
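

# Illustrative usage sketch (not part of the Space app). Assumes the exported
# artifacts sit in a local "model/" directory and that the account numbers
# below are hypothetical keys present in the bundle's node_index.
if __name__ == "__main__":
    bundle = load_bundle("model")
    scores = bundle.predict_fraud(
        from_account=["100000", "200000"],  # hypothetical account numbers
        to_account=["300000", "400000"],
        amount=[25_000.0, 137.42],
        business_process=["P2P", "O2C"],
        posting_date=["2024-03-29", "2024-03-30"],
    )
    anomalies = bundle.anomaly_score_edges(
        from_account=["100000", "200000"],
        to_account=["300000", "400000"],
        amount=[25_000.0, 137.42],
        business_process=["P2P", "O2C"],
        posting_date=["2024-03-29", "2024-03-30"],
    )
    for p, a in zip(scores, anomalies):
        flag = "FRAUD" if p >= bundle.fraud_threshold else "ok"
        print(f"fraud_prob={p:.3f} ({flag})  anomaly_mse={a:.4f}")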