suvradeepp's picture
Upload 89 files
cea1951 verified
"""
src/nlp_features.py
──────────────────────────────────────────────────────────────────────────────
NLP / Sentence-BERT embedding pipeline for alternative credit signals.
Provides:
• FinancialNarrativeBuilder — synthesises a text description per applicant
• SBERTEmbedder — encodes texts → embeddings → PCA reduction
• NLPFeaturePipeline — end-to-end fit/transform orchestrator
In production, FinancialNarrativeBuilder would be replaced by real user
survey or app-usage text. Here we synthesise from tabular signals to
demonstrate the pipeline architecture.
Usage:
from src.nlp_features import NLPFeaturePipeline
nlp = NLPFeaturePipeline(cfg)
train_nlp_df = nlp.fit_transform(train_df)
test_nlp_df = nlp.transform(test_df)
──────────────────────────────────────────────────────────────────────────────
"""
from __future__ import annotations
import os
import gc
import warnings
from typing import List, Optional
import numpy as np
import pandas as pd
import joblib
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
warnings.filterwarnings("ignore")
# ─── Narrative templates ──────────────────────────────────────────────────────
# Pools of interchangeable phrases, keyed by literacy band ("high" / "medium" /
# "low").  FinancialNarrativeBuilder._literacy_sentence picks one phrase at
# random from the matching pool, and build_one embeds it as "Client <phrase>."
# -- so each entry is a sentence fragment with no capital letter and no
# trailing full stop.
_LITERACY_LEVELS = {
    "high": [
        "demonstrates strong financial planning habits and consistently pays obligations on time",
        "shows excellent budgeting discipline and proactively manages debt obligations",
        "has a clear savings strategy and maintains low revolving credit utilisation",
    ],
    "medium": [
        "shows moderate financial awareness with occasional delayed payments",
        "manages debt adequately but has limited long-term financial planning",
        "meets minimum payment requirements but rarely pays ahead of schedule",
    ],
    "low": [
        "has limited formal financial experience and irregular payment patterns",
        "relies heavily on informal credit channels and lacks credit history",
        "demonstrates financial stress indicators with frequent payment shortfalls",
    ],
}
# Complete sentences keyed by employment-tenure band.  The "long", "medium" and
# "short" templates carry a ``{years:.1f}`` placeholder that is filled with
# ``str.format(years=...)``; "none" has no placeholder and is used verbatim.
_EMPLOYMENT_TEMPLATES = {
    "long": "Has stable employment of {years:.1f} years with the current employer.",
    "medium": "Currently employed for {years:.1f} years; career trajectory appears stable.",
    "short": "Recently started employment ({years:.1f} years); income may not be fully stabilised.",
    "none": "No current formal employment; income source requires verification.",
}
# Complete sentences keyed by the tuple ``(has_car, has_realty)`` -- note the
# order: vehicle flag first, property flag second.
_ASSET_SENTENCES = {
    (True, True): "Applicant owns both a vehicle and residential property, indicating established assets.",
    (True, False): "Applicant owns a vehicle but rents accommodation.",
    (False, True): "Applicant owns residential property, a strong collateral signal.",
    (False, False): "No registered asset ownership; relies solely on income for repayment.",
}
class FinancialNarrativeBuilder:
    """
    Constructs a structured financial narrative text per applicant row.

    A row may be a ``pd.Series`` or a plain ``dict``: only ``.get(key, default)``
    is used.  Absent, null (``None`` / NaN / ``pd.NA``) and non-numeric values
    fall back to neutral defaults instead of raising.  An individually missing
    EXT_SOURCE_* score is left out of the composite (it is still rendered as
    ``nan`` in the score summary); when every score is missing the composite
    falls back to the neutral 0.5.

    NOTE: the literacy phrase is drawn from one shared random stream, so the
    phrase chosen for a given row depends on how many rows were built before it.

    Parameters
    ----------
    ext_high_threshold  : EXT_SOURCE mean at or above which → high financial literacy
    ext_low_threshold   : EXT_SOURCE mean below which → low financial literacy
    include_enquiry     : whether to include credit enquiry paragraph
    include_bureau      : whether to include bureau summary paragraph
    random_template_seed: reproducibility for template sampling
    """

    def __init__(
        self,
        ext_high_threshold: float = 0.60,
        ext_low_threshold: float = 0.40,
        include_enquiry: bool = True,
        include_bureau: bool = True,
        random_template_seed: int = 42,
    ):
        self.ext_high = ext_high_threshold
        self.ext_low = ext_low_threshold
        self.incl_enquiry = include_enquiry
        self.incl_bureau = include_bureau
        self._rng = np.random.RandomState(random_template_seed)

    # ── Safe field readers ──────────────────────────────────────────────
    @staticmethod
    def _num(row, key: str, default: float, if_null: Optional[float] = None) -> float:
        """
        Read ``row[key]`` as a float.

        Parameters
        ----------
        row     : mapping-like object exposing ``.get(key, default)``
        key     : field name
        default : value used when the key is absent
        if_null : value used when the key is present but null, non-numeric or
                  non-finite; defaults to ``default``
        """
        fallback = float(default if if_null is None else if_null)
        try:
            number = float(row.get(key, default))
        except (TypeError, ValueError):
            # None, pd.NA, unparsable strings, ...
            return fallback
        return number if np.isfinite(number) else fallback

    @staticmethod
    def _count(row, key: str, default: int) -> int:
        """Read ``row[key]`` as an int; absent / null values yield ``default``."""
        # int(float('nan')) raises ValueError, so null handling must come first.
        return int(FinancialNarrativeBuilder._num(row, key, default))

    @staticmethod
    def _flag(row, key: str) -> bool:
        """
        Read ``row[key]`` as an ownership flag.

        Accepts numeric 0/1 as well as textual "Y"/"N" style values.  Absent
        or null values count as ``False`` (``bool(nan)`` would be ``True``).
        """
        value = row.get(key, 0)
        if isinstance(value, str):
            return value.strip().upper() in ("Y", "YES", "TRUE", "T", "1")
        return FinancialNarrativeBuilder._num({key: value}, key, 0) != 0.0

    @staticmethod
    def _composite_score(scores: List[float], neutral: float = 0.5) -> float:
        """Mean of the non-NaN ``scores``; ``neutral`` when none are available."""
        known = [s for s in scores if not np.isnan(s)]
        return float(np.mean(known)) if known else neutral

    # ── Sentence pickers ────────────────────────────────────────────────
    def _literacy_sentence(self, ext_mean: float) -> str:
        """Pick a random phrase from the literacy band that ``ext_mean`` falls in."""
        if ext_mean >= self.ext_high:
            pool = _LITERACY_LEVELS["high"]
        elif ext_mean >= self.ext_low:
            pool = _LITERACY_LEVELS["medium"]
        else:
            pool = _LITERACY_LEVELS["low"]
        # RandomState.choice returns np.str_; hand back a plain str.
        return str(self._rng.choice(pool))

    def _employment_sentence(self, emp_years: float) -> str:
        """Return the employment sentence for a tenure of ``emp_years`` years."""
        if emp_years > 10:
            tmpl = _EMPLOYMENT_TEMPLATES["long"]
        elif emp_years > 2:
            tmpl = _EMPLOYMENT_TEMPLATES["medium"]
        elif emp_years > 0:
            tmpl = _EMPLOYMENT_TEMPLATES["short"]
        else:
            # The "none" template has no placeholder to fill.
            return _EMPLOYMENT_TEMPLATES["none"]
        return tmpl.format(years=emp_years)

    # ── Public API ──────────────────────────────────────────────────────
    def build_one(self, row: pd.Series) -> str:
        """
        Build a single narrative string from one applicant row.

        Parameters
        ----------
        row : applicant features (``pd.Series`` or dict); every field is optional

        Returns
        -------
        The narrative paragraphs joined by single spaces.
        """
        income = self._num(row, "AMT_INCOME_TOTAL", 150_000)
        credit = self._num(row, "AMT_CREDIT", 300_000)
        age = abs(self._num(row, "DAYS_BIRTH", -35 * 365)) / 365
        # DAYS_EMPLOYED is negative for employed applicants; a positive or
        # null value is treated as "not employed" (0 years).
        emp_years = max(0, -self._num(row, "DAYS_EMPLOYED", -3 * 365, if_null=0)) / 365

        # Present-but-null scores stay NaN so they are excluded from the mean.
        ext1 = self._num(row, "EXT_SOURCE_1", 0.5, if_null=np.nan)
        ext2 = self._num(row, "EXT_SOURCE_2", 0.5, if_null=np.nan)
        ext3 = self._num(row, "EXT_SOURCE_3", 0.5, if_null=np.nan)
        ext_mean = self._composite_score([ext1, ext2, ext3])

        has_realty = self._flag(row, "FLAG_OWN_REALTY")
        has_car = self._flag(row, "FLAG_OWN_CAR")
        n_children = self._count(row, "CNT_CHILDREN", 0)
        fam_size = self._count(row, "CNT_FAM_MEMBERS", 2)
        # +1 guards against division by zero for a declared income of 0.
        credit_income = credit / (income + 1)

        parts: List[str] = []

        # ── Core financial summary ──────────────────────────────────────
        parts.append(
            f"Applicant is {age:.0f} years old with a declared annual income of "
            f"{income:,.0f} currency units. "
            f"Requesting a credit facility of {credit:,.0f} units, "
            f"representing a credit-to-income ratio of {credit_income:.2f}x."
        )

        # ── Financial literacy level ────────────────────────────────────
        parts.append(f"Client {self._literacy_sentence(ext_mean)}.")

        # ── Employment ─────────────────────────────────────────────────
        parts.append(self._employment_sentence(emp_years))

        # ── Asset ownership ────────────────────────────────────────────
        parts.append(_ASSET_SENTENCES.get((has_car, has_realty), ""))

        # ── Family context ─────────────────────────────────────────────
        if n_children > 0:
            parts.append(
                f"Applicant has {n_children} dependent child{'ren' if n_children > 1 else ''} "
                f"in a household of {fam_size}."
            )
        else:
            parts.append(f"No dependents; household size of {fam_size}.")

        # ── Credit bureau summary ──────────────────────────────────────
        if self.incl_bureau:
            bureau_count = self._count(row, "BUREAU_COUNT", 0)
            active_count = self._count(row, "BUREAU_ACTIVE_COUNT", 0)
            if bureau_count > 0:
                parts.append(
                    f"Bureau records show {bureau_count} historical credit lines, "
                    f"of which {active_count} are currently active."
                )
            else:
                parts.append(
                    "No external bureau credit history found — applicant is credit-invisible."
                )

        # ── Enquiry signals ────────────────────────────────────────────
        if self.incl_enquiry:
            enquiries = self._count(row, "TOTAL_ENQUIRIES", 0)
            if enquiries > 5:
                parts.append(
                    f"High credit enquiry volume ({enquiries} enquiries) may indicate "
                    f"credit-seeking stress or rate shopping."
                )
            elif enquiries > 0:
                parts.append(f"Moderate enquiry activity ({enquiries} enquiries recorded).")
            else:
                parts.append("No recent credit enquiries recorded.")

        # ── External score summary ─────────────────────────────────────
        parts.append(
            f"External creditworthiness assessments: "
            f"bureau={ext1:.2f}, behavioural={ext2:.2f}, alternative={ext3:.2f} "
            f"(composite={ext_mean:.2f})."
        )

        # Drop empty fragments (e.g. an unmatched asset key) before joining.
        return " ".join(p for p in parts if p)

    def build_batch(self, df: pd.DataFrame, verbose: bool = True) -> List[str]:
        """
        Build narratives for an entire DataFrame.

        Parameters
        ----------
        df      : one applicant per row
        verbose : print a progress line every 50,000 rows

        Returns
        -------
        One narrative per row, in row order.
        """
        narratives: List[str] = []
        n = len(df)
        for i, (_, row) in enumerate(df.iterrows()):
            narratives.append(self.build_one(row))
            if verbose and (i + 1) % 50_000 == 0:
                print(f" Narratives built: {i+1:,}/{n:,}")
        return narratives
# ─── SBERT Embedder ───────────────────────────────────────────────────────────
class SBERTEmbedder:
    """
    Encodes text narratives with Sentence-BERT and optionally reduces
    dimensionality with PCA.

    Parameters
    ----------
    model_name        : HuggingFace SBERT model name
    n_components      : PCA output dimension (None = no PCA)
    batch_size        : encoding batch size
    normalize         : L2-normalise embeddings before PCA
    device            : "cpu" | "cuda" | "mps" (auto if None)
    keep_model_loaded : keep the SBERT model in memory between calls.  The
                        default (False) releases it after every encode, which
                        saves memory but forces a reload on the next call;
                        set True for repeated small calls.
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        n_components: Optional[int] = 32,
        batch_size: int = 512,
        normalize: bool = True,
        device: Optional[str] = None,
        keep_model_loaded: bool = False,
    ):
        self.model_name = model_name
        self.n_components = n_components
        self.batch_size = batch_size
        self.normalize = normalize
        self.device = device
        self.keep_model_loaded = keep_model_loaded
        self.pca: Optional[PCA] = None
        self._model: Optional[SentenceTransformer] = None

    def _load_model(self):
        """Instantiate the SBERT model on first use."""
        if self._model is None:
            # Only pass `device` when set, so the library can auto-select.
            kwargs = {"device": self.device} if self.device else {}
            self._model = SentenceTransformer(self.model_name, **kwargs)
            print(f"✅ SBERT loaded: {self.model_name} "
                  f"(dim={self._model.get_sentence_embedding_dimension()})")

    def _release_model(self) -> None:
        """Drop the model reference unless ``keep_model_loaded`` is set."""
        if self.keep_model_loaded:
            return
        self._model = None
        gc.collect()

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` into a 2-D numpy array, loading the model if needed."""
        self._load_model()
        return self._model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=True,
            normalize_embeddings=self.normalize,
            convert_to_numpy=True,
        )

    def fit_transform(self, texts: List[str]) -> np.ndarray:
        """
        Encode + fit PCA on train texts.

        Returns the PCA-reduced embeddings, or the raw embeddings when
        ``n_components`` is falsy.  The PCA width is clamped to
        ``min(n_components, n_texts, embedding_dim)``.
        """
        print(f"🤖 Encoding {len(texts):,} texts with SBERT...")
        try:
            emb = self._encode(texts)
        finally:
            # Free the model before PCA allocates, and even if encoding fails.
            self._release_model()
        print(f" Raw embedding shape: {emb.shape}")
        if self.n_components:
            n = min(self.n_components, emb.shape[0], emb.shape[1])
            self.pca = PCA(n_components=n, random_state=42)
            emb = self.pca.fit_transform(emb)
            print(f" After PCA({n}): {emb.shape} | "
                  f"Explained variance: {self.pca.explained_variance_ratio_.sum():.3f}")
        return emb

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Encode + apply fitted PCA on new texts.

        Raises
        ------
        RuntimeError : PCA reduction was requested (``n_components`` set) but
                       no PCA has been fitted or loaded.  Checked before the
                       model is loaded so the failure is immediate.
        """
        if self.n_components and self.pca is None:
            raise RuntimeError(
                "PCA has not been fitted or loaded; call fit_transform() or "
                "load_pca() before transform()."
            )
        print(f"🤖 Encoding {len(texts):,} texts (transform)...")
        try:
            emb = self._encode(texts)
        finally:
            self._release_model()
        if self.pca is not None:
            emb = self.pca.transform(emb)
        return emb

    def save(self, path: str):
        """Persist PCA object to ``path`` (no-op when no PCA has been fitted)."""
        if self.pca is None:
            return
        directory = os.path.dirname(path)
        if directory:
            # Create the target directory so the fitted PCA is not lost.
            os.makedirs(directory, exist_ok=True)
        joblib.dump(self.pca, path)
        print(f"✅ PCA saved → {path}")

    def load_pca(self, path: str):
        """Load a previously saved PCA object."""
        # SECURITY: joblib.load unpickles arbitrary objects -- only load files
        # this pipeline wrote itself, never untrusted input.
        self.pca = joblib.load(path)
        print(f"✅ PCA loaded ← {path}")
# ─── End-to-end NLP pipeline ─────────────────────────────────────────────────
class NLPFeaturePipeline:
    """
    Orchestrates FinancialNarrativeBuilder + SBERTEmbedder.

    The fitted PCA is persisted to ``<cfg.MODEL_DIR>/pca.pkl`` by
    ``fit_transform`` and lazily reloaded from there by ``transform`` and
    ``build_single_row``.  Output columns are named ``NLP_EMB_0 .. NLP_EMB_{k-1}``
    where ``k`` is the actual embedding width returned by the embedder.

    Parameters
    ----------
    cfg          : project config object; only ``cfg.MODEL_DIR`` is read here
    model_name   : SBERT model name
    n_components : PCA components (falsy = no PCA)
    batch_size   : SBERT batch size
    """

    def __init__(
        self,
        cfg,
        model_name: str = "all-MiniLM-L6-v2",
        n_components: int = 32,
        batch_size: int = 512,
    ):
        self.cfg = cfg
        self.n_components = n_components
        self.narrator = FinancialNarrativeBuilder()
        self.embedder = SBERTEmbedder(
            model_name=model_name,
            n_components=n_components,
            batch_size=batch_size,
        )
        # Provisional names; corrected in _to_frame if the actual width differs.
        self._emb_col_names = self._column_names(n_components or 0)

    # ── Internal helpers ────────────────────────────────────────────────
    @staticmethod
    def _column_names(width: int) -> List[str]:
        """Return ``["NLP_EMB_0", ..., "NLP_EMB_{width-1}"]``."""
        return [f"NLP_EMB_{i}" for i in range(width)]

    def _pca_path(self) -> str:
        """Location of the persisted PCA object."""
        return os.path.join(self.cfg.MODEL_DIR, "pca.pkl")

    def _ensure_pca(self) -> None:
        """
        Make sure the embedder has a PCA when one is required.

        Raises
        ------
        FileNotFoundError : PCA is required but neither fitted nor on disk.
        """
        if self.embedder.pca is not None or not self.embedder.n_components:
            return
        pca_path = self._pca_path()
        if not os.path.exists(pca_path):
            raise FileNotFoundError(
                f"PCA not found at {pca_path}. Run fit_transform first."
            )
        self.embedder.load_pca(pca_path)

    def _to_frame(self, emb: np.ndarray, index=None) -> pd.DataFrame:
        """Wrap an ``(n_rows, k)`` embedding matrix in a DataFrame with NLP_EMB_* columns."""
        # The embedder may return fewer columns than requested (PCA width is
        # clamped to the sample count) or a loaded PCA may differ in width.
        if len(self._emb_col_names) != emb.shape[1]:
            self._emb_col_names = self._column_names(emb.shape[1])
        return pd.DataFrame(emb, columns=self._emb_col_names, index=index)

    # ── Public API ──────────────────────────────────────────────────────
    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build NLP features for training data.
        Fits PCA internally and saves it under ``cfg.MODEL_DIR``.
        Returns DataFrame with NLP_EMB_* columns, indexed like ``df``.
        """
        # Create the output directory up front so a missing folder fails
        # (or is fixed) before the expensive encoding step, not after it.
        os.makedirs(self.cfg.MODEL_DIR, exist_ok=True)
        texts = self.narrator.build_batch(df)
        emb = self.embedder.fit_transform(texts)
        self.embedder.save(self._pca_path())
        return self._to_frame(emb, index=df.index)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build NLP features for new / test data.
        Uses already-fitted PCA (loaded from disk if necessary).
        Returns DataFrame with NLP_EMB_* columns, indexed like ``df``.

        Raises
        ------
        FileNotFoundError : PCA is required but neither fitted nor on disk.
        """
        self._ensure_pca()
        texts = self.narrator.build_batch(df, verbose=False)
        emb = self.embedder.transform(texts)
        return self._to_frame(emb, index=df.index)

    def build_single_row(self, feature_dict: dict) -> pd.DataFrame:
        """
        Build NLP features for a single applicant (inference).

        Parameters
        ----------
        feature_dict : raw applicant features as a dict

        Returns
        -------
        DataFrame with NLP_EMB_* columns (1 row)

        Raises
        ------
        FileNotFoundError : PCA is required but neither fitted nor on disk.
        """
        # Same PCA bootstrap as transform(), so a fresh pipeline works at inference.
        self._ensure_pca()
        row = pd.Series(feature_dict)
        text = self.narrator.build_one(row)
        emb = self.embedder.transform([text])
        return self._to_frame(emb)