"""
src/nlp_features.py
──────────────────────────────────────────────────────────────────────────────
NLP / Sentence-BERT embedding pipeline for alternative credit signals.
 
Provides:
  • FinancialNarrativeBuilder  — synthesises a text description per applicant
  • SBERTEmbedder              — encodes texts → embeddings → PCA reduction
  • NLPFeaturePipeline         — end-to-end fit/transform orchestrator
 
In production, FinancialNarrativeBuilder would be replaced by real user
survey or app-usage text. Here we synthesise from tabular signals to
demonstrate the pipeline architecture.
 
Usage:
    from src.nlp_features import NLPFeaturePipeline
    nlp = NLPFeaturePipeline(cfg)
    train_nlp_df = nlp.fit_transform(train_df)
    test_nlp_df  = nlp.transform(test_df)
──────────────────────────────────────────────────────────────────────────────
"""
 
from __future__ import annotations
 
import os
import gc
import warnings
from typing import List, Optional
 
import numpy as np
import pandas as pd
import joblib
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
 
# NOTE: import-time side effect — installs a global "ignore" filter, so every
# Python warning raised after this module is imported is suppressed for the
# whole process, not only warnings originating in this file.
warnings.filterwarnings("ignore")
 
 
# ─── Narrative templates ──────────────────────────────────────────────────────
 
# Candidate phrases per financial-literacy tier ("high" / "medium" / "low").
# FinancialNarrativeBuilder picks the tier from the mean external score and
# samples one phrase from that tier's list; the phrase is embedded after the
# word "Client" and followed by a full stop, hence no leading capital or
# trailing punctuation here.
_LITERACY_LEVELS = {
    "high": [
        "demonstrates strong financial planning habits and consistently pays obligations on time",
        "shows excellent budgeting discipline and proactively manages debt obligations",
        "has a clear savings strategy and maintains low revolving credit utilisation",
    ],
    "medium": [
        "shows moderate financial awareness with occasional delayed payments",
        "manages debt adequately but has limited long-term financial planning",
        "meets minimum payment requirements but rarely pays ahead of schedule",
    ],
    "low": [
        "has limited formal financial experience and irregular payment patterns",
        "relies heavily on informal credit channels and lacks credit history",
        "demonstrates financial stress indicators with frequent payment shortfalls",
    ],
}

# Employment sentences keyed by tenure bucket. The "long", "medium" and
# "short" entries are str.format templates with a single `years` field;
# "none" is a fixed sentence without placeholders.
_EMPLOYMENT_TEMPLATES = {
    "long":   "Has stable employment of {years:.1f} years with the current employer.",
    "medium": "Currently employed for {years:.1f} years; career trajectory appears stable.",
    "short":  "Recently started employment ({years:.1f} years); income may not be fully stabilised.",
    "none":   "No current formal employment; income source requires verification.",
}

# Asset-ownership sentences keyed by the tuple (owns_car, owns_realty).
_ASSET_SENTENCES = {
    (True,  True):  "Applicant owns both a vehicle and residential property, indicating established assets.",
    (True,  False): "Applicant owns a vehicle but rents accommodation.",
    (False, True):  "Applicant owns residential property, a strong collateral signal.",
    (False, False): "No registered asset ownership; relies solely on income for repayment.",
}
 
 
class FinancialNarrativeBuilder:
    """
    Constructs a structured financial narrative text per applicant row.

    Literacy phrases are sampled from a single seeded RandomState owned by
    the instance, so the phrase chosen for a row depends on how many
    narratives this instance has already built.

    Missing values are tolerated: count-like fields that are None/NaN fall
    back to the same default used when the column is absent, ownership flags
    that are None/NaN count as "not owned", and textual flags such as
    "Y"/"N" are interpreted rather than treated as always-true strings.

    Parameters
    ----------
    ext_high_threshold  : EXT_SOURCE mean at or above which → high financial literacy
    ext_low_threshold   : EXT_SOURCE mean below which → low financial literacy
    include_enquiry     : whether to include credit enquiry paragraph
    include_bureau      : whether to include bureau summary paragraph
    random_template_seed: reproducibility for template sampling
    """

    # Composite score used when none of the EXT_SOURCE_* values is available;
    # same value as the per-column default applied when a column is absent.
    _NEUTRAL_EXT_SCORE = 0.5

    # Case-insensitive string spellings that mean "flag not set".
    _FALSE_FLAG_STRINGS = frozenset({"", "n", "no", "false", "f", "0"})

    def __init__(
        self,
        ext_high_threshold:   float = 0.60,
        ext_low_threshold:    float = 0.40,
        include_enquiry:      bool  = True,
        include_bureau:       bool  = True,
        random_template_seed: int   = 42,
    ):
        self.ext_high     = ext_high_threshold
        self.ext_low      = ext_low_threshold
        self.incl_enquiry = include_enquiry
        self.incl_bureau  = include_bureau
        self._rng         = np.random.RandomState(random_template_seed)

    # ── Value coercion helpers ──────────────────────────────────────────

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True when a scalar is None, NaN or another pandas NA marker."""
        flag = pd.isna(value)
        # pd.isna returns an array for list-likes; those are never "missing".
        return bool(flag) if isinstance(flag, (bool, np.bool_)) else False

    @classmethod
    def _as_count(cls, value, default: int) -> int:
        """Coerce a count-like value to int, using `default` for missing values.

        int(NaN) raises ValueError, which would otherwise abort narrative
        building for any row with an unfilled count column.
        """
        if cls._is_missing(value):
            return default
        return int(value)

    @classmethod
    def _as_flag(cls, value) -> bool:
        """Interpret an ownership flag given as 0/1, bool, or text such as "Y"/"N".

        Missing values count as False. Plain bool() would treat both "N"
        and NaN as True.
        """
        if cls._is_missing(value):
            return False
        if isinstance(value, str):
            return value.strip().lower() not in cls._FALSE_FLAG_STRINGS
        return bool(value)

    # ── Sentence builders ───────────────────────────────────────────────

    def _literacy_sentence(self, ext_mean: float) -> str:
        """Sample one literacy phrase from the tier selected by `ext_mean`."""
        if ext_mean >= self.ext_high:
            pool = _LITERACY_LEVELS["high"]
        elif ext_mean >= self.ext_low:
            pool = _LITERACY_LEVELS["medium"]
        else:
            pool = _LITERACY_LEVELS["low"]
        # RandomState.choice returns numpy.str_; hand back a plain str.
        return str(self._rng.choice(pool))

    def _employment_sentence(self, emp_years: float) -> str:
        """Return the employment sentence for a tenure expressed in years."""
        if emp_years > 10:
            tmpl = _EMPLOYMENT_TEMPLATES["long"]
        elif emp_years > 2:
            tmpl = _EMPLOYMENT_TEMPLATES["medium"]
        elif emp_years > 0:
            tmpl = _EMPLOYMENT_TEMPLATES["short"]
        else:
            return _EMPLOYMENT_TEMPLATES["none"]
        return tmpl.format(years=emp_years)

    def build_one(self, row: pd.Series) -> str:
        """
        Build a single narrative string from one applicant row.

        Parameters
        ----------
        row : mapping-like object supporting ``.get(key, default)``; absent
              keys fall back to the defaults written below

        Returns
        -------
        The narrative: non-empty sentences joined by single spaces.
        """

        income      = float(row.get("AMT_INCOME_TOTAL",  150_000))
        credit      = float(row.get("AMT_CREDIT",        300_000))
        age         = abs(float(row.get("DAYS_BIRTH",    -35*365))) / 365
        # The sign is flipped before clamping, so positive DAYS_EMPLOYED
        # values clamp to 0 years and produce the "none" employment sentence.
        emp_years   = max(0, -float(row.get("DAYS_EMPLOYED", -3*365))) / 365
        ext1        = float(row.get("EXT_SOURCE_1",      0.5))
        ext2        = float(row.get("EXT_SOURCE_2",      0.5))
        ext3        = float(row.get("EXT_SOURCE_3",      0.5))
        ext_scores  = [ext1, ext2, ext3]
        if np.isnan(ext_scores).all():
            # nanmean of all-NaN is NaN, which compares False against both
            # thresholds and would silently select the "low" tier.
            ext_mean = self._NEUTRAL_EXT_SCORE
        else:
            ext_mean = float(np.nanmean(ext_scores))
        has_realty  = self._as_flag(row.get("FLAG_OWN_REALTY", 0))
        has_car     = self._as_flag(row.get("FLAG_OWN_CAR",    0))
        n_children  = self._as_count(row.get("CNT_CHILDREN",    0), 0)
        fam_size    = self._as_count(row.get("CNT_FAM_MEMBERS", 2), 2)
        # +1 avoids division by zero for a zero declared income.
        credit_income = credit / (income + 1)

        parts: List[str] = []

        # ── Core financial summary ──────────────────────────────────────
        parts.append(
            f"Applicant is {age:.0f} years old with a declared annual income of "
            f"{income:,.0f} currency units. "
            f"Requesting a credit facility of {credit:,.0f} units, "
            f"representing a credit-to-income ratio of {credit_income:.2f}x."
        )

        # ── Financial literacy level ────────────────────────────────────
        parts.append(f"Client {self._literacy_sentence(ext_mean)}.")

        # ── Employment ─────────────────────────────────────────────────
        parts.append(self._employment_sentence(emp_years))

        # ── Asset ownership ────────────────────────────────────────────
        parts.append(_ASSET_SENTENCES.get((has_car, has_realty), ""))

        # ── Family context ─────────────────────────────────────────────
        if n_children > 0:
            parts.append(
                f"Applicant has {n_children} dependent child{'ren' if n_children>1 else ''} "
                f"in a household of {fam_size}."
            )
        else:
            parts.append(f"No dependents; household size of {fam_size}.")

        # ── Credit bureau summary ──────────────────────────────────────
        if self.incl_bureau:
            bureau_count  = self._as_count(row.get("BUREAU_COUNT", 0), 0)
            active_count  = self._as_count(row.get("BUREAU_ACTIVE_COUNT", 0), 0)
            if bureau_count > 0:
                parts.append(
                    f"Bureau records show {bureau_count} historical credit lines, "
                    f"of which {active_count} are currently active."
                )
            else:
                parts.append(
                    "No external bureau credit history found — applicant is credit-invisible."
                )

        # ── Enquiry signals ────────────────────────────────────────────
        if self.incl_enquiry:
            enquiries = self._as_count(row.get("TOTAL_ENQUIRIES", 0), 0)
            if enquiries > 5:
                parts.append(
                    f"High credit enquiry volume ({enquiries} enquiries) may indicate "
                    f"credit-seeking stress or rate shopping."
                )
            elif enquiries > 0:
                parts.append(f"Moderate enquiry activity ({enquiries} enquiries recorded).")
            else:
                parts.append("No recent credit enquiries recorded.")

        # ── External score summary ─────────────────────────────────────
        parts.append(
            f"External creditworthiness assessments: "
            f"bureau={ext1:.2f}, behavioural={ext2:.2f}, alternative={ext3:.2f} "
            f"(composite={ext_mean:.2f})."
        )

        # Drop empty fragments (e.g. an unmatched asset key) before joining.
        return " ".join(p for p in parts if p)

    def build_batch(self, df: pd.DataFrame, verbose: bool = True) -> List[str]:
        """
        Build narratives for an entire DataFrame.

        Parameters
        ----------
        df      : one applicant per row
        verbose : print a progress line every 50,000 rows

        Returns
        -------
        List of narrative strings in the row order of `df`.
        """
        narratives: List[str] = []
        n = len(df)
        for done, (_, row) in enumerate(df.iterrows(), start=1):
            narratives.append(self.build_one(row))
            if verbose and done % 50_000 == 0:
                print(f"  Narratives built: {done:,}/{n:,}")
        return narratives
 
 
# ─── SBERT Embedder ───────────────────────────────────────────────────────────
 
class SBERTEmbedder:
    """
    Encodes text narratives with Sentence-BERT and optionally reduces
    dimensionality with PCA.

    The SBERT model is loaded lazily on the first encode and dropped again
    after every fit_transform/transform call, so each call pays the model
    load cost again.

    Parameters
    ----------
    model_name     : HuggingFace SBERT model name
    n_components   : PCA output dimension (None = no PCA)
    batch_size     : encoding batch size
    normalize      : L2-normalise embeddings before PCA
    device         : "cpu" | "cuda" | "mps" (auto if None)
    """

    def __init__(
        self,
        model_name:   str           = "all-MiniLM-L6-v2",
        n_components: Optional[int] = 32,
        batch_size:   int           = 512,
        normalize:    bool          = True,
        device:       Optional[str] = None,
    ):
        self.model_name   = model_name
        self.n_components = n_components
        self.batch_size   = batch_size
        self.normalize    = normalize
        self.device       = device
        self.pca: Optional[PCA] = None
        self._model: Optional[SentenceTransformer] = None

    def _load_model(self):
        """Instantiate the SentenceTransformer once; no-op if already loaded."""
        if self._model is None:
            # Only pass `device` when set so the library can auto-select otherwise.
            kwargs = {"device": self.device} if self.device else {}
            self._model = SentenceTransformer(self.model_name, **kwargs)
            print(f"✅ SBERT loaded: {self.model_name} "
                  f"(dim={self._model.get_sentence_embedding_dimension()})")

    def _release_model(self) -> None:
        """Drop the model reference and run a garbage-collection pass."""
        self._model = None
        gc.collect()

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode texts into a NumPy array of sentence embeddings."""
        self._load_model()
        return self._model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=True,
            normalize_embeddings=self.normalize,
            convert_to_numpy=True,
        )

    def fit_transform(self, texts: List[str]) -> np.ndarray:
        """
        Encode + fit PCA on train texts.

        Returns the embeddings, PCA-reduced when `n_components` is set. The
        fitted PCA may have fewer components than requested because it is
        capped by the number of texts and the raw embedding width.

        Raises
        ------
        ValueError : if `texts` is empty
        """
        if len(texts) == 0:
            raise ValueError("fit_transform requires at least one text to fit PCA on.")

        print(f"🤖 Encoding {len(texts):,} texts with SBERT...")
        try:
            emb = self._encode(texts)
        finally:
            # Release the model even when encoding fails.
            self._release_model()
        print(f"   Raw embedding shape: {emb.shape}")

        if self.n_components:
            # PCA cannot produce more components than samples or features.
            n = min(self.n_components, emb.shape[0], emb.shape[1])
            self.pca = PCA(n_components=n, random_state=42)
            emb = self.pca.fit_transform(emb)
            print(f"   After PCA({n}): {emb.shape} | "
                  f"Explained variance: {self.pca.explained_variance_ratio_.sum():.3f}")

        return emb

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Encode + apply fitted PCA on new texts.

        Raises
        ------
        RuntimeError : if `n_components` is set but no PCA has been fitted or
                       loaded; returning raw embeddings in that case would
                       silently change the output width.
        """
        if self.n_components and self.pca is None:
            raise RuntimeError(
                "PCA is not fitted. Call fit_transform() or load_pca() before transform()."
            )

        print(f"🤖 Encoding {len(texts):,} texts (transform)...")
        try:
            emb = self._encode(texts)
        finally:
            self._release_model()
        if self.pca is not None:
            emb = self.pca.transform(emb)
        return emb

    def save(self, path: str):
        """Persist PCA object to `path`; does nothing when no PCA is fitted."""
        if self.pca is None:
            return
        parent = os.path.dirname(path)
        if parent:
            # joblib.dump does not create missing directories.
            os.makedirs(parent, exist_ok=True)
        joblib.dump(self.pca, path)
        print(f"✅ PCA saved → {path}")

    def load_pca(self, path: str):
        """Load a previously saved PCA object."""
        # SECURITY NOTE: joblib.load unpickles the file and can execute
        # arbitrary code; only load artefacts from a trusted location.
        self.pca = joblib.load(path)
        print(f"✅ PCA loaded ← {path}")
 
 
# ─── End-to-end NLP pipeline ─────────────────────────────────────────────────
 
class NLPFeaturePipeline:
    """
    Orchestrates FinancialNarrativeBuilder + SBERTEmbedder.

    The fitted PCA is persisted as ``pca.pkl`` under ``cfg.MODEL_DIR`` and is
    reloaded from there on demand by `transform` and `build_single_row`.

    Parameters
    ----------
    cfg          : project config dataclass (only ``MODEL_DIR`` is read here)
    model_name   : SBERT model name
    n_components : PCA components
    batch_size   : SBERT batch size
    """

    _PCA_FILENAME = "pca.pkl"

    def __init__(
        self,
        cfg,
        model_name:   str = "all-MiniLM-L6-v2",
        n_components: int = 32,
        batch_size:   int = 512,
    ):
        self.cfg           = cfg
        self.n_components  = n_components
        self.narrator      = FinancialNarrativeBuilder()
        self.embedder      = SBERTEmbedder(
            model_name   = model_name,
            n_components = n_components,
            batch_size   = batch_size,
        )
        # Provisional names; refreshed from the actual embedding width each
        # time a frame is produced.
        self._emb_col_names = self._column_names(n_components or 0)

    @staticmethod
    def _column_names(width: int) -> List[str]:
        """Return the NLP_EMB_<i> column names for an embedding of `width` columns."""
        return [f"NLP_EMB_{i}" for i in range(width)]

    def _pca_path(self) -> str:
        """Location of the persisted PCA artefact."""
        return os.path.join(self.cfg.MODEL_DIR, self._PCA_FILENAME)

    def _ensure_pca(self) -> None:
        """Load the persisted PCA if the embedder has none yet.

        Raises
        ------
        FileNotFoundError : if a PCA is required but no artefact exists on disk
        """
        if not self.n_components or self.embedder.pca is not None:
            return
        pca_path = self._pca_path()
        if not os.path.exists(pca_path):
            raise FileNotFoundError(
                f"PCA not found at {pca_path}. Run fit_transform first."
            )
        self.embedder.load_pca(pca_path)

    def _to_frame(self, emb: np.ndarray, index=None) -> pd.DataFrame:
        """Wrap a 2-D embedding array in a DataFrame with NLP_EMB_* columns.

        Column names follow the array's actual width, because the embedder
        may fit fewer PCA components than requested on small training sets.
        """
        self._emb_col_names = self._column_names(emb.shape[1])
        return pd.DataFrame(emb, columns=self._emb_col_names, index=index)

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build NLP features for training data.
        Fits PCA internally and persists it under cfg.MODEL_DIR.
        Returns DataFrame with NLP_EMB_* columns, indexed like `df`.
        """
        texts = self.narrator.build_batch(df)
        emb   = self.embedder.fit_transform(texts)
        if self.cfg.MODEL_DIR:
            # Make sure the artefact directory exists before saving into it.
            os.makedirs(self.cfg.MODEL_DIR, exist_ok=True)
        self.embedder.save(self._pca_path())
        return self._to_frame(emb, index=df.index)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build NLP features for new / test data.
        Uses already-fitted PCA, loading it from disk if necessary.
        Returns DataFrame with NLP_EMB_* columns, indexed like `df`.

        Raises
        ------
        FileNotFoundError : if no PCA is in memory and none is saved on disk
        """
        self._ensure_pca()
        texts = self.narrator.build_batch(df, verbose=False)
        emb   = self.embedder.transform(texts)
        return self._to_frame(emb, index=df.index)

    def build_single_row(self, feature_dict: dict) -> pd.DataFrame:
        """
        Build NLP features for a single applicant (inference).

        Loads the persisted PCA first when none is in memory, so the method
        also works in a fresh process.

        Parameters
        ----------
        feature_dict : raw applicant features as a dict

        Returns
        -------
        DataFrame with NLP_EMB_* columns (1 row)

        Raises
        ------
        FileNotFoundError : if no PCA is in memory and none is saved on disk
        """
        self._ensure_pca()
        row  = pd.Series(feature_dict)
        text = self.narrator.build_one(row)
        emb  = self.embedder.transform([text])
        return self._to_frame(emb)