"""
models/tgat.py
==============
Temporal Graph Attention Network (TGAT)
Xu et al., "Inductive Representation Learning on Temporal Graphs" (ICLR 2020)

Architecture
------------
- Sinusoidal time encoding (reuses src/tgn/time_encoding.py)
- Per-node ring buffer of K most recent temporal neighbors
- Multi-head scaled dot-product attention over temporal neighborhood
- GRU-cell aggregator updates node memory after each event
- Node classifier head: memory → fraud probability

Event processing (streaming, chronological):
  For each edge (u, v, t, edge_feat):
    1. Retrieve last K neighbors of u from buffer → {(t_i, h_i, e_i)}
    2. Build query: Q = W_q(cat(h_u, φ(0)))          [current state at t]
       Build keys:  K = W_k(cat(h_i, φ(t−t_i)))      [neighbor state at t_i]
       Build vals:  V = W_v(cat(h_i, e_i, φ(t−t_i))) [neighbor context]
    3. attn = softmax(Q K^T / √d), z = attn·V
    4. h_u ← GRU(merge(cat(z, h_u)), h_u)   [update sender memory]
    5. Symmetrically update h_v by attending over v's own neighbor buffer
    6. Append (t, h_v, e) to u's buffer and (t, h_u, e) to v's buffer
"""

from __future__ import annotations

from collections import defaultdict
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from models.base import TemporalModel
from models.tgn_wrapper import _make_users_df
from src.graph.graph_builder import build_edge_features
from src.tgn.time_encoding import TimeEncoding


# ------------------------------------------------------------------ #
# Core TGAT nn.Module                                                 #
# ------------------------------------------------------------------ #

class _TGATModule(nn.Module):
    """
    Trainable part of TGAT: temporal attention, memory update and a node head.

    Parameters
    ----------
    memory_dim : width of a node memory vector; also the attention output width
    edge_dim   : width of one edge-feature vector
    time_dim   : size handed to ``TimeEncoding``; the projections are sized on
                 the assumption that the encoder emits ``2 * time_dim`` features
    num_heads  : number of attention heads; must divide ``memory_dim``
    """

    def __init__(
        self,
        memory_dim: int,
        edge_dim: int,
        time_dim: int,
        num_heads: int,
    ):
        super().__init__()
        self.memory_dim = memory_dim
        self.time_enc = TimeEncoding(time_dim)

        # Widths of the concatenated inputs fed to each projection.
        time_width = 2 * time_dim
        query_width = memory_dim + time_width              # h_u   || φ(0)
        key_width = memory_dim + time_width                # h_nbr || φ(dt)
        value_width = memory_dim + edge_dim + time_width   # h_nbr || e || φ(dt)

        # Attention output has the same width as the memory.
        self.attn_dim = memory_dim
        self.num_heads = num_heads
        assert self.attn_dim % num_heads == 0, "attn_dim must be divisible by num_heads"

        self.W_q = nn.Linear(query_width, self.attn_dim, bias=False)
        self.W_k = nn.Linear(key_width, self.attn_dim, bias=False)
        self.W_v = nn.Linear(value_width, self.attn_dim, bias=False)

        # 1 / sqrt(per-head width), applied to the raw dot products.
        head_width = self.attn_dim // num_heads
        self.scale = head_width ** -0.5

        # Fuse the attended summary with the current memory, then gate it in.
        self.merge = nn.Linear(self.attn_dim + memory_dim, memory_dim)
        self.gru = nn.GRUCell(memory_dim, memory_dim)

        # Node-level head: memory → single logit.
        head_layers = [
            nn.Linear(memory_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def attend(
        self,
        h_u: torch.Tensor,      # (B, memory_dim)  — current node state
        h_nbrs: torch.Tensor,   # (B, K, memory_dim)
        e_nbrs: torch.Tensor,   # (B, K, edge_dim)
        dt_nbrs: torch.Tensor,  # (B, K)            — time deltas
        mask: torch.Tensor,     # (B, K) bool        — True = valid
    ) -> torch.Tensor:
        """
        Multi-head scaled dot-product attention of each node over its neighbors.

        Returns a ``(B, attn_dim)`` tensor. A row whose neighbors are all
        masked out comes back as zeros.
        """
        batch, n_nbrs = dt_nbrs.shape
        heads = self.num_heads
        head_width = self.attn_dim // heads
        flat_rows = batch * n_nbrs

        # Time features: the query sits at delta 0, neighbors at their own delta.
        time_now = self.time_enc(torch.zeros(batch, device=h_u.device))
        time_past = self.time_enc(dt_nbrs.reshape(-1)).reshape(batch, n_nbrs, -1)

        # One query vector per node, split into heads: (B, H, d_h).
        query = self.W_q(torch.cat([h_u, time_now], dim=-1))
        query = query.view(batch, heads, head_width)

        # Neighbor inputs flattened to (B*K, ·) so each projection is one matmul.
        nbr_state = h_nbrs.reshape(flat_rows, -1)
        nbr_time = time_past.reshape(flat_rows, -1)
        nbr_edge = e_nbrs.reshape(flat_rows, -1)

        # Keys and values, both rearranged to (B, H, K, d_h).
        keys = self.W_k(torch.cat([nbr_state, nbr_time], dim=-1))
        keys = keys.view(batch, n_nbrs, heads, head_width).permute(0, 2, 1, 3)

        values = self.W_v(torch.cat([nbr_state, nbr_edge, nbr_time], dim=-1))
        values = values.view(batch, n_nbrs, heads, head_width).permute(0, 2, 1, 3)

        # (B, H, 1, d_h) @ (B, H, d_h, K) → (B, H, K)
        scores = torch.matmul(query.unsqueeze(2), keys.transpose(-2, -1)).squeeze(2)
        scores = scores * self.scale

        # Padding slots get -inf so they receive zero weight.
        if mask is not None:
            padding = ~mask.unsqueeze(1)  # (B, 1, K), broadcast over heads
            scores = scores.masked_fill(padding, float("-inf"))

        weights = torch.softmax(scores, dim=-1)
        # Softmax over a fully masked row is NaN; turn that into zero weights.
        weights = torch.nan_to_num(weights, nan=0.0)

        # Weighted sum over neighbors, then concatenate the heads again.
        pooled = (weights.unsqueeze(-1) * values).sum(dim=2)  # (B, H, d_h)
        return pooled.reshape(batch, self.attn_dim)

    def update(self, h_u: torch.Tensor, z: torch.Tensor) -> torch.Tensor:
        """Merge attention output ``z`` with ``h_u`` and apply the GRU cell with ``h_u`` as hidden state."""
        gru_input = self.merge(torch.cat([z, h_u], dim=-1))
        new_state = self.gru(gru_input, h_u)
        return new_state

    def classify(self, memory: torch.Tensor) -> torch.Tensor:
        """Run the node head on ``memory`` and drop the trailing singleton dimension."""
        logits = self.classifier(memory)
        return logits.squeeze(-1)


# ------------------------------------------------------------------ #
# TGAT Streamer (event-level memory management)                       #
# ------------------------------------------------------------------ #

class _TGATStreamer:
    """
    Maintains per-node memory and temporal neighbor buffers.
    Processes events in a batched manner (approximate — same-batch
    events use pre-batch memory state, standard practice for scalability).

    State
    -----
    memory    : (n_nodes, memory_dim) tensor on ``device``; rows are always
                written detached, so no autograd graph survives a batch
    nbr_times : per-node list of event times (Python floats)
    nbr_h     : per-node list of the *other* endpoint's embedding (CPU tensors)
    nbr_e     : per-node list of edge-feature vectors (CPU tensors)

    Every per-node list holds at most ``n_neighbors`` entries, oldest first.
    Node ids are used directly as indices into these tables; nothing in this
    class grows them.
    """

    def __init__(
        self,
        module: _TGATModule,
        n_nodes: int,
        memory_dim: int,
        edge_dim: int,
        n_neighbors: int,
        device: torch.device,
    ):
        self.module = module
        self.memory_dim = memory_dim
        self.edge_dim = edge_dim
        self.n_neighbors = n_neighbors
        self.device = device

        # Node memory: (n_nodes, memory_dim)
        self.memory = torch.zeros(n_nodes, memory_dim, device=device)

        # Per-node neighbor buffers, kept as plain Python lists and trimmed to
        # n_neighbors entries on every append.
        self.nbr_times: List[List[float]] = [[] for _ in range(n_nodes)]
        self.nbr_h: List[List[torch.Tensor]] = [[] for _ in range(n_nodes)]
        self.nbr_e: List[List[torch.Tensor]] = [[] for _ in range(n_nodes)]

    def _write_memory_rows(
        self,
        node_ids: torch.Tensor,
        values: torch.Tensor,
    ) -> None:
        """Deterministic last-write-wins update for repeated node ids in a batch."""
        detached = values.detach()
        # One host transfer for all ids instead of an .item() call per row.
        # Rows are written in order so a later duplicate overwrites an earlier one.
        for row, nid in enumerate(node_ids.tolist()):
            self.memory[nid] = detached[row]

    def _empty_neighbor_tensors(self, batch: int, device: torch.device):
        """Allocate all-zero ``(h, e, dt, mask)`` padding tensors for ``batch`` nodes."""
        k = self.n_neighbors
        h_out = torch.zeros(batch, k, self.memory_dim, device=device)
        e_out = torch.zeros(batch, k, self.edge_dim, device=device)
        dt_out = torch.zeros(batch, k, device=device)
        mask = torch.zeros(batch, k, dtype=torch.bool, device=device)
        return h_out, e_out, dt_out, mask

    def _get_neighbor_tensors(
        self, node_ids: torch.Tensor
    ):
        """
        Returns all-zero padded (h_nbrs, e_nbrs, dt_nbrs, mask) for a batch of nodes.

        No buffer lookup happens here: every slot is marked invalid. Use
        ``_fill_neighbor_batch`` to obtain the stored neighbors.
        """
        return self._empty_neighbor_tensors(len(node_ids), self.device)

    def _fill_neighbor_batch(
        self,
        node_ids: torch.Tensor,
        current_times: torch.Tensor,
    ):
        """
        Fills neighbor tensors for a batch, using the stored per-node buffers.

        For each node the most recent ``n_neighbors`` buffered entries occupy
        the leading slots (oldest first); remaining slots stay zero with
        ``mask == False``. ``dt`` is ``current_time - stored_time`` clipped at 0.

        Returns ``(h_nbrs, e_nbrs, dt_nbrs, mask)`` on ``self.device`` with
        shapes (B, K, memory_dim), (B, K, edge_dim), (B, K), (B, K).
        """
        k = self.n_neighbors
        # Buffered rows live on the CPU, so assemble there and move the
        # finished tensors to the target device in one transfer each.
        h_out, e_out, dt_out, mask = self._empty_neighbor_tensors(
            len(node_ids), torch.device("cpu")
        )

        pairs = zip(node_ids.tolist(), current_times.tolist())
        for row, (nid, t_cur) in enumerate(pairs):
            buf_t = self.nbr_times[nid]
            n_use = min(len(buf_t), k)
            if n_use == 0:
                continue
            start = len(buf_t) - n_use
            h_out[row, :n_use] = torch.stack(self.nbr_h[nid][start:])
            e_out[row, :n_use] = torch.stack(self.nbr_e[nid][start:])
            dt_out[row, :n_use] = torch.tensor(
                [max(0.0, t_cur - float(t_old)) for t_old in buf_t[start:]],
                dtype=dt_out.dtype,
            )
            mask[row, :n_use] = True

        device = self.device
        return h_out.to(device), e_out.to(device), dt_out.to(device), mask.to(device)

    def _update_buffers(
        self,
        node_ids_np: np.ndarray,
        times_np: np.ndarray,
        h_others: torch.Tensor,  # (N, mem_dim) — embedding of the other node
        edge_feats: torch.Tensor,  # (N, edge_dim)
    ):
        """Add events to per-node neighbor buffers (detached)."""
        # Single device→host transfer for the whole batch.
        h_cpu = h_others.detach().cpu()
        e_cpu = edge_feats.detach().cpu()
        limit = self.n_neighbors
        for i, nid in enumerate(node_ids_np):
            times = self.nbr_times[nid]
            states = self.nbr_h[nid]
            edges = self.nbr_e[nid]
            times.append(float(times_np[i]))
            # clone(): each stored row owns its storage instead of being a view
            # that keeps the whole batch tensor alive.
            states.append(h_cpu[i].clone())
            edges.append(e_cpu[i].clone())
            # Drop the oldest entry once the buffer is over capacity.
            if len(times) > limit:
                times.pop(0)
                states.pop(0)
                edges.pop(0)

    def _refresh(self, node_ids: torch.Tensor, times: torch.Tensor) -> torch.Tensor:
        """Attend over each node's buffered neighbors and return its updated state."""
        h = self.memory[node_ids].clone()
        h_nbrs, e_nbrs, dt, mask = self._fill_neighbor_batch(node_ids, times)
        z = self.module.attend(h, h_nbrs, e_nbrs, dt, mask)
        return self.module.update(h.detach(), z)

    def process_batch(
        self,
        u_ids: torch.Tensor,    # (B,)
        v_ids: torch.Tensor,    # (B,)
        times: torch.Tensor,    # (B,)  normalised
        edge_feats: torch.Tensor,  # (B, edge_dim)
        compute_grad: bool = True,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Process a batch of events and update memory and neighbor buffers.

        Both endpoints are updated from the pre-batch memory state, so events
        inside one batch do not see each other's effect.

        Parameters
        ----------
        compute_grad : when False the forward pass runs with autograd disabled.
                       When True, the caller's current grad mode is used
                       (gradients are never forced on inside ``no_grad``).

        Returns
        -------
        (h_u_new, h_v_new) : updated embeddings of the two endpoints, each
                             (B, memory_dim). They carry gradients only when
                             autograd was active for the forward pass.
        """
        grad_on = compute_grad and torch.is_grad_enabled()
        with torch.set_grad_enabled(grad_on):
            h_u_new = self._refresh(u_ids, times)
            h_v_new = self._refresh(v_ids, times)

        # Write back in a deterministic order when a node appears multiple times;
        # receivers are written last.
        self._write_memory_rows(u_ids, h_u_new)
        self._write_memory_rows(v_ids, h_v_new)

        # Each endpoint records the other endpoint's new embedding as its neighbor.
        u_np = u_ids.cpu().numpy()
        v_np = v_ids.cpu().numpy()
        t_np = times.detach().cpu().numpy()
        self._update_buffers(u_np, t_np, h_v_new, edge_feats)
        self._update_buffers(v_np, t_np, h_u_new, edge_feats)

        return h_u_new, h_v_new

    def reset(self):
        """Zero the memory and empty every neighbor buffer."""
        self.memory.zero_()
        n_nodes = self.memory.shape[0]
        self.nbr_times = [[] for _ in range(n_nodes)]
        self.nbr_h = [[] for _ in range(n_nodes)]
        self.nbr_e = [[] for _ in range(n_nodes)]


# ------------------------------------------------------------------ #
# TGATWrapper (TemporalModel interface)                               #
# ------------------------------------------------------------------ #

class TGATWrapper(TemporalModel):
    """
    TGAT wrapped behind the unified TemporalModel interface.

    Typical order of calls: ``fit`` (trains the attention module with an
    edge-level fraud loss), ``train_node_classifier`` (fits the node head on
    the resulting memory), then ``predict``.

    NOTE: ``fit`` finishes by creating a fresh, *untrained* node head, and
    ``predict`` creates one lazily if none exists. Until
    ``train_node_classifier`` has been called, ``predict`` therefore scores
    nodes with randomly initialised weights.
    """

    # Events per streamed mini-batch (training and inference).
    _BATCH_SIZE = 512
    # Timestamps are min-max normalised with the training range, then multiplied by this.
    _TIME_SCALE = 5.0
    # Upper bound on the negative/positive ratio used as BCE ``pos_weight``.
    _MAX_POS_WEIGHT = 10.0

    def __init__(
        self,
        memory_dim: int = 64,
        time_dim: int = 8,
        num_heads: int = 4,
        n_neighbors: int = 10,
        device: str = "cpu",
    ):
        self.memory_dim = memory_dim
        self.time_dim = time_dim
        self.num_heads = num_heads
        self.n_neighbors = n_neighbors
        self.device = torch.device(device)

        # Everything below is populated by fit().
        self._module: _TGATModule | None = None
        self._streamer: _TGATStreamer | None = None
        self._norm_stats: dict | None = None
        self._n_nodes: int = 0
        self._edge_dim: int = 0
        self._edge_clf: nn.Module | None = None
        self._node_clf: nn.Module | None = None

    @property
    def name(self) -> str:
        return "TGAT"

    # ------------------------------------------------------------------ #
    # Internal helpers                                                    #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _make_head(in_dim: int) -> nn.Sequential:
        """Two-layer MLP (in_dim → 64 → 1) used for both classifier heads."""
        return nn.Sequential(
            nn.Linear(in_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def _ensure_node_clf(self) -> nn.Module:
        """Return the node head, creating an untrained one if none exists yet."""
        if getattr(self, "_node_clf", None) is None:
            self._node_clf = self._make_head(self.memory_dim).to(self.device)
        return self._node_clf

    def _pos_weight(self, y: torch.Tensor) -> torch.Tensor:
        """Negative/positive count ratio of ``y``, capped at ``_MAX_POS_WEIGHT``."""
        ratio = (y == 0).sum() / ((y == 1).sum() + 1e-6)
        return torch.clamp(ratio, max=self._MAX_POS_WEIGHT)

    @staticmethod
    def _max_node_id(df: pd.DataFrame) -> int:
        """Largest sender/receiver id in ``df``, or -1 when ``df`` has no rows."""
        if len(df) == 0:
            return -1
        return int(max(df["sender_id"].max(), df["receiver_id"].max()))

    def _scaled_times(self, df: pd.DataFrame) -> np.ndarray:
        """
        Normalise ``df["timestamp"]`` with the training range and scale it.

        The subtraction is done in float64 and only the result is cast to
        float32: casting raw timestamps to float32 first would merge distinct
        values once they exceed float32's 24-bit integer precision.
        """
        ns = self._norm_stats
        t_vals = df["timestamp"].values.astype(np.float64)
        span = ns["t_max"] - ns["t_min"] + 1e-6
        t_norm = (t_vals - ns["t_min"]) / span
        return (t_norm * self._TIME_SCALE).astype(np.float32)

    def _normalised_edge_features(self, df: pd.DataFrame) -> np.ndarray:
        """Edge features of ``df`` standardised with the training mean/std."""
        ns = self._norm_stats
        feats = build_edge_features(df).astype(np.float32)
        return (feats - ns["ea_mean"]) / ns["ea_std"]

    @staticmethod
    def _grow_streamer(streamer: _TGATStreamer, n_nodes: int) -> None:
        """Extend the streamer's memory and buffers so ids below ``n_nodes`` are valid."""
        memory = streamer.memory
        extra_rows = n_nodes - memory.shape[0]
        if extra_rows > 0:
            # New nodes start with zero memory, like every node after a reset.
            padding = memory.new_zeros(extra_rows, memory.shape[1])
            streamer.memory = torch.cat([memory, padding], dim=0)
        for buffers in (streamer.nbr_times, streamer.nbr_h, streamer.nbr_e):
            buffers.extend([] for _ in range(n_nodes - len(buffers)))

    def _node_memory(self, node_ids) -> torch.Tensor:
        """
        Gather detached memory rows for ``node_ids``.

        Ids outside ``[0, n_rows)`` have never taken part in an event, so they
        get an all-zero embedding rather than some other node's memory.
        """
        memory = self._streamer.memory
        ids = torch.as_tensor(
            np.asarray(list(node_ids), dtype=np.int64), device=memory.device
        )
        known = (ids >= 0) & (ids < memory.shape[0])
        emb = memory.new_zeros(ids.shape[0], memory.shape[1])
        emb[known] = memory[ids[known]].detach()
        return emb

    # ------------------------------------------------------------------ #

    def fit(self, df_train: pd.DataFrame, num_epochs: int = 3) -> None:
        """
        Train the TGAT module on chronologically ordered events.

        ``df_train`` must provide ``timestamp``, ``sender_id``, ``receiver_id``
        and ``is_fraud``. Training uses an edge-level BCE loss on the
        concatenation of both endpoint embeddings and the edge features.
        Memory is reset at the start of every epoch, so after ``fit`` the
        streamer holds the state produced by the final pass.

        Raises
        ------
        ValueError : if ``df_train`` has no rows.
        """
        if len(df_train) == 0:
            raise ValueError("df_train must contain at least one event.")
        df_train = df_train.sort_values("timestamp").reset_index(drop=True)

        # Edge features, standardised per column.
        edge_feats_np = build_edge_features(df_train)  # (N, edge_dim)
        edge_dim = edge_feats_np.shape[1]
        self._edge_dim = edge_dim
        ea_mean = edge_feats_np.mean(axis=0)
        ea_std = edge_feats_np.std(axis=0) + 1e-6
        edge_feats_np = (edge_feats_np - ea_mean) / ea_std

        # Timestamp range is measured in float64 (see _scaled_times).
        t_vals = df_train["timestamp"].values.astype(np.float64)
        self._norm_stats = {
            "ea_mean": ea_mean, "ea_std": ea_std,
            "t_min": t_vals.min(), "t_max": t_vals.max(),
        }
        t_scaled = self._scaled_times(df_train)

        # Node universe: ids index the memory table directly.
        n_nodes = self._max_node_id(df_train) + 1
        self._n_nodes = n_nodes

        # Build module and streamer
        module = _TGATModule(
            memory_dim=self.memory_dim,
            edge_dim=edge_dim,
            time_dim=self.time_dim,
            num_heads=self.num_heads,
        ).to(self.device)
        self._module = module

        streamer = _TGATStreamer(
            module=module,
            n_nodes=n_nodes,
            memory_dim=self.memory_dim,
            edge_dim=edge_dim,
            n_neighbors=self.n_neighbors,
            device=self.device,
        )
        self._streamer = streamer

        # Labels (edge-level) and per-event inputs, kept on the CPU and moved per batch.
        y = torch.tensor(df_train["is_fraud"].values, dtype=torch.float32)
        u_ids = torch.tensor(df_train["sender_id"].values, dtype=torch.long)
        v_ids = torch.tensor(df_train["receiver_id"].values, dtype=torch.long)
        ef_all = torch.tensor(edge_feats_np, dtype=torch.float32)
        t_all = torch.tensor(t_scaled, dtype=torch.float32)

        loss_fn = nn.BCEWithLogitsLoss(pos_weight=self._pos_weight(y).to(self.device))

        # Edge-level head: proxy training signal for the attention module.
        # The node head is trained separately in train_node_classifier().
        edge_classifier = self._make_head(self.memory_dim * 2 + edge_dim).to(self.device)
        self._edge_clf = edge_classifier
        optimiser = torch.optim.Adam(
            [
                {"params": module.parameters()},
                {"params": edge_classifier.parameters()},
            ],
            lr=1e-3,
        )

        n_events = len(df_train)
        for epoch in range(num_epochs):
            # Re-initialise memory each epoch to avoid over-fitting to order
            streamer.reset()
            total_loss = 0.0

            for start in range(0, n_events, self._BATCH_SIZE):
                stop = min(start + self._BATCH_SIZE, n_events)
                u_b = u_ids[start:stop].to(self.device)
                v_b = v_ids[start:stop].to(self.device)
                t_b = t_all[start:stop].to(self.device)
                ef_b = ef_all[start:stop].to(self.device)
                y_b = y[start:stop].to(self.device)

                h_u, h_v = streamer.process_batch(u_b, v_b, t_b, ef_b)

                edge_in = torch.cat([h_u, h_v, ef_b], dim=-1)
                logits = edge_classifier(edge_in).squeeze(-1)
                logits = torch.clamp(logits, -10, 10)

                loss = loss_fn(logits, y_b)
                optimiser.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(module.parameters(), 1.0)
                optimiser.step()

                total_loss += loss.item()

            print(f"[TGAT] Epoch {epoch + 1}/{num_epochs}  Loss: {total_loss:.4f}")

        # Fresh node head; it stays untrained until train_node_classifier() runs.
        self._node_clf = self._make_head(self.memory_dim).to(self.device)

    # ------------------------------------------------------------------ #

    def predict(self, df_eval: pd.DataFrame, eval_nodes: List[int]) -> np.ndarray:
        """
        Stream ``df_eval`` through the model, then score ``eval_nodes``.

        The events are applied on top of the streamer's current state (call
        ``reset_memory`` first for a clean start). Node ids in ``df_eval`` that
        were not seen during ``fit`` are given zero-initialised memory rows
        instead of raising an IndexError. Entries of ``eval_nodes`` that never
        appeared in any event are scored from a zero embedding.

        Returns a float32 array of probabilities, one per entry of ``eval_nodes``.
        """
        assert self._streamer is not None, "Call fit() first."
        df_eval = df_eval.sort_values("timestamp").reset_index(drop=True)
        n_events = len(df_eval)

        self._module.eval()
        if n_events:
            # Make room for nodes that first appear in the evaluation window.
            self._grow_streamer(self._streamer, self._max_node_id(df_eval) + 1)
            self._n_nodes = self._streamer.memory.shape[0]

            u_ids = torch.tensor(df_eval["sender_id"].values, dtype=torch.long)
            v_ids = torch.tensor(df_eval["receiver_id"].values, dtype=torch.long)
            ef_t = torch.tensor(self._normalised_edge_features(df_eval), dtype=torch.float32)
            t_t = torch.tensor(self._scaled_times(df_eval), dtype=torch.float32)

            with torch.no_grad():
                for start in range(0, n_events, self._BATCH_SIZE):
                    stop = min(start + self._BATCH_SIZE, n_events)
                    self._streamer.process_batch(
                        u_ids[start:stop].to(self.device),
                        v_ids[start:stop].to(self.device),
                        t_t[start:stop].to(self.device),
                        ef_t[start:stop].to(self.device),
                        compute_grad=False,
                    )

        node_emb = self._node_memory(eval_nodes)
        node_clf = self._ensure_node_clf()

        with torch.no_grad():
            logits = node_clf(node_emb).squeeze(-1)
            probs = torch.sigmoid(logits).cpu().numpy()

        return probs.astype(np.float32)

    def extract_prefix_embeddings(
        self,
        df_eval: pd.DataFrame,
        examples: pd.DataFrame,
    ) -> np.ndarray:
        """
        Replay ``df_eval`` one event at a time and snapshot sender memory.

        A private, freshly zeroed streamer is used, so the state held by this
        wrapper is not modified. For every row of ``examples`` the sender's
        memory is captured right after the event whose
        ``(sender_id, local_event_idx)`` equals the example's
        ``(sender_id, eval_local_event_idx)``. If ``df_eval`` has no
        ``local_event_idx`` column, the per-sender running count in timestamp
        order is used. Examples without a matching event keep a zero row.

        Returns a float32 array of shape ``(len(examples), memory_dim)``.
        """
        assert self._module is not None, "Call fit() first."
        if examples.empty:
            return np.zeros((0, self.memory_dim), dtype=np.float32)

        out = np.zeros((len(examples), self.memory_dim), dtype=np.float32)
        if len(df_eval) == 0:
            # Nothing to replay: no example can be matched.
            return out

        df_eval = df_eval.sort_values("timestamp").reset_index(drop=True)
        if "local_event_idx" in df_eval.columns:
            local_idx = df_eval["local_event_idx"].to_numpy()
        else:
            local_idx = df_eval.groupby("sender_id").cumcount().to_numpy()

        # (sender, local event index) → rows of ``out`` waiting for that snapshot.
        capture_map: dict[tuple[int, int], list[int]] = {}
        for ex_idx, row in enumerate(examples.itertuples(index=False)):
            key = (int(row.sender_id), int(row.eval_local_event_idx))
            capture_map.setdefault(key, []).append(ex_idx)

        streamer = _TGATStreamer(
            module=self._module,
            n_nodes=max(self._n_nodes, self._max_node_id(df_eval) + 1),
            memory_dim=self.memory_dim,
            edge_dim=self._edge_dim,
            n_neighbors=self.n_neighbors,
            device=self.device,
        )

        senders = df_eval["sender_id"].to_numpy()
        receivers = df_eval["receiver_id"].to_numpy()
        ef_all = torch.tensor(self._normalised_edge_features(df_eval), dtype=torch.float32)
        t_all = torch.tensor(self._scaled_times(df_eval), dtype=torch.float32)

        self._module.eval()
        with torch.no_grad():
            for idx in range(len(df_eval)):
                sender = int(senders[idx])
                u = torch.tensor([sender], dtype=torch.long, device=self.device)
                v = torch.tensor([int(receivers[idx])], dtype=torch.long, device=self.device)
                t = t_all[idx:idx + 1].to(self.device)
                ef = ef_all[idx:idx + 1].to(self.device)
                streamer.process_batch(u, v, t, ef, compute_grad=False)

                targets = capture_map.get((sender, int(local_idx[idx])))
                if targets:
                    emb = streamer.memory[sender].detach().cpu().numpy().astype(np.float32)
                    out[targets] = emb

        return out

    # ------------------------------------------------------------------ #

    def reset_memory(self) -> None:
        """Zero node memory and clear all neighbor buffers; a no-op before ``fit``."""
        if self._streamer is not None:
            self._streamer.reset()

    # ------------------------------------------------------------------ #

    def train_node_classifier(
        self,
        eval_nodes: List[int],
        y_labels: np.ndarray,
        num_epochs: int = 150,
    ) -> None:
        """
        Fit the node head on the streamer's current memory.

        Uses full-batch Adam on a class-weighted BCE loss. The embeddings are
        detached, so only the node head is updated. Ids in ``eval_nodes``
        outside the memory table are treated as zero embeddings.
        """
        assert self._streamer is not None, "Call fit() first."
        node_clf = self._ensure_node_clf()
        node_emb = self._node_memory(eval_nodes)
        y = torch.tensor(y_labels, dtype=torch.float32, device=self.device)

        loss_fn = nn.BCEWithLogitsLoss(pos_weight=self._pos_weight(y))
        opt = torch.optim.Adam(node_clf.parameters(), lr=1e-3)

        node_clf.train()
        for _ in range(num_epochs):
            logits = node_clf(node_emb).squeeze(-1)
            loss = loss_fn(logits, y)
            opt.zero_grad()
            loss.backward()
            opt.step()
        node_clf.eval()