# File size: 4,002 Bytes

# a3682cf

"""
models/base.py
==============
Abstract base class for all temporal fraud models.

All models MUST:
  - Accept a raw DataFrame event stream (sorted by timestamp)
  - Maintain internal memory (or not, for static models)
  - Return node-level fraud probabilities for a specified set of eval_nodes
  - Support reset_memory() for temporal ablation experiments
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import List

import numpy as np
import pandas as pd


class TemporalModel(ABC):
    """
    Common interface shared by every temporal and static fraud detection model.

    Concrete models must implement ``name``, ``fit``, ``predict`` and
    ``reset_memory``; the class cannot be instantiated until all four
    abstract members are provided.

    Data contract
    -------------
    df_train / df_eval must contain at minimum:
        sender_id    int   — source node
        receiver_id  int   — destination node
        timestamp    float — unix seconds, sorted ascending
        is_fraud     int   — edge-level binary label (0/1)
        dynamic_fraud_state  float — hidden EMA state (available for mechanistic analysis but
                                     MUST NOT be used as a feature)

    Every model is handed the complete DataFrame so it can derive whatever
    internal features it needs. Honouring the data leakage constraint
    (dynamic_fraud_state never enters the feature set) is the model's own
    responsibility; this base class does not enforce it.
    """

    # ------------------------------------------------------------------ #
    # Abstract interface                                                   #
    # ------------------------------------------------------------------ #

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable model identifier used in CSV/plot outputs."""

    @abstractmethod
    def fit(self, df_train: pd.DataFrame, num_epochs: int = 3) -> None:
        """
        Train the model on a chronologically ordered event stream.

        Parameters
        ----------
        df_train : pd.DataFrame
            All events available for training (sorted by timestamp).
        num_epochs : int
            Number of passes over the training data.
        """

    @abstractmethod
    def predict(self, df_eval: pd.DataFrame, eval_nodes: List[int]) -> np.ndarray:
        """
        Score eval_nodes with fraud probabilities.

        Implementations may run a warm-up memory pass over the df_eval
        events before scoring, but that pass may read timestamps/IDs only —
        NOT fraud labels.

        Parameters
        ----------
        df_eval : pd.DataFrame
            Events in the evaluation window.
        eval_nodes : List[int]
            Sender IDs of nodes to score, in order.

        Returns
        -------
        probs : np.ndarray, shape (len(eval_nodes),), dtype float32
            Fraud probability in [0, 1] for each node.
        """

    @abstractmethod
    def reset_memory(self) -> None:
        """
        Zero out all internal memory / hidden states.

        The temporal ablation experiment calls this to measure how much a
        model relies on accumulated temporal history vs. static structure.
        For static models (XGBoost, StaticGNN) this is a no-op.
        """

    # ------------------------------------------------------------------ #
    # Optional properties                                                  #
    # ------------------------------------------------------------------ #

    @property
    def is_temporal(self) -> bool:
        """
        True for models that maintain temporal memory across events.

        The base implementation always returns True; a subclass that keeps
        no temporal memory has to override this property itself.
        """
        return True

    # ------------------------------------------------------------------ #
    # Shared helpers                                                       #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _safe_auc(y_true: np.ndarray, y_score: np.ndarray) -> float:
        """
        ROC-AUC that falls back to 0.5 when fewer than two classes are present.

        Parameters
        ----------
        y_true : np.ndarray
            Ground-truth labels.
        y_score : np.ndarray
            Predicted scores, passed to ``roc_auc_score`` together with y_true.

        Returns
        -------
        auc : float
            0.5 if y_true holds fewer than two distinct values (this includes
            empty input); otherwise the result of
            ``sklearn.metrics.roc_auc_score(y_true, y_score)``.
        """
        # Settle the degenerate case first: it needs only NumPy, so the
        # scikit-learn import below is never triggered on this path.
        if np.unique(y_true).size < 2:
            return 0.5

        # Function-scope import: scikit-learn is loaded only when an AUC is
        # actually computed.
        from sklearn.metrics import roc_auc_score

        return float(roc_auc_score(y_true, y_score))