Spaces:

suvradeepp
/

Explainable-Credit-Risk-Modeling-with-Schduling

Configuration error

App Files Files Community

Explainable-Credit-Risk-Modeling-with-Schduling / src /nlp_features.py

suvradeepp

Upload 89 files

cea1951 verified 18 days ago

raw

history blame contribute delete

16.3 kB

	"""
	src/nlp_features.py
	──────────────────────────────────────────────────────────────────────────────
	NLP / Sentence-BERT embedding pipeline for alternative credit signals.

	Provides:
	• FinancialNarrativeBuilder — synthesises a text description per applicant
	• SBERTEmbedder — encodes texts → embeddings → PCA reduction
	• NLPFeaturePipeline — end-to-end fit/transform orchestrator

	In production, FinancialNarrativeBuilder would be replaced by real user
	survey or app-usage text. Here we synthesise from tabular signals to
	demonstrate the pipeline architecture.

	Usage:
	from src.nlp_features import NLPFeaturePipeline
	nlp = NLPFeaturePipeline(cfg)
	train_nlp_df = nlp.fit_transform(train_df)
	test_nlp_df = nlp.transform(test_df)
	──────────────────────────────────────────────────────────────────────────────
	"""

	from __future__ import annotations

	import os
	import gc
	import warnings
	from typing import List, Optional

	import numpy as np
	import pandas as pd
	import joblib
	from sklearn.decomposition import PCA
	from sentence_transformers import SentenceTransformer

	# Installs a catch-all "ignore" filter at import time, so every warning
	# raised after this module is imported is suppressed process-wide, not
	# only warnings originating from this file.
	# NOTE(review): presumably meant to quiet noisy third-party warnings —
	# confirm that hiding deprecation/runtime warnings everywhere is intended.
	warnings.filterwarnings("ignore")


	# ─── Narrative templates ──────────────────────────────────────────────────────

	# Candidate sentence fragments per financial-literacy tier. One fragment is
	# sampled per applicant and embedded into "Client <fragment>.".
	_LITERACY_LEVELS = dict(
	    high=[
	        "demonstrates strong financial planning habits and consistently pays obligations on time",
	        "shows excellent budgeting discipline and proactively manages debt obligations",
	        "has a clear savings strategy and maintains low revolving credit utilisation",
	    ],
	    medium=[
	        "shows moderate financial awareness with occasional delayed payments",
	        "manages debt adequately but has limited long-term financial planning",
	        "meets minimum payment requirements but rarely pays ahead of schedule",
	    ],
	    low=[
	        "has limited formal financial experience and irregular payment patterns",
	        "relies heavily on informal credit channels and lacks credit history",
	        "demonstrates financial stress indicators with frequent payment shortfalls",
	    ],
	)

	# Employment sentences keyed by tenure bucket. Every template except "none"
	# expects a ``years`` format argument.
	_EMPLOYMENT_TEMPLATES = dict(
	    long="Has stable employment of {years:.1f} years with the current employer.",
	    medium="Currently employed for {years:.1f} years; career trajectory appears stable.",
	    short="Recently started employment ({years:.1f} years); income may not be fully stabilised.",
	    none="No current formal employment; income source requires verification.",
	)

	# Asset sentences keyed by the pair (owns car, owns realty).
	_ASSET_SENTENCES = {
	    (owns_car, owns_realty): sentence
	    for owns_car, owns_realty, sentence in (
	        (True, True, "Applicant owns both a vehicle and residential property, indicating established assets."),
	        (True, False, "Applicant owns a vehicle but rents accommodation."),
	        (False, True, "Applicant owns residential property, a strong collateral signal."),
	        (False, False, "No registered asset ownership; relies solely on income for repayment."),
	    )
	}


	class FinancialNarrativeBuilder:
	    """
	    Constructs a structured financial narrative text per applicant row.

	    Missing inputs (absent key, ``None`` or NaN) are handled explicitly
	    instead of crashing ``int()`` / ``float()``:

	    * numeric and count fields fall back to the default used when the
	      column is absent;
	    * a missing ``DAYS_EMPLOYED`` value is read as "no current employment";
	    * ownership flags read as "not owned", and text flags such as
	      ``"Y"`` / ``"N"`` are interpreted rather than all being truthy;
	    * individual ``EXT_SOURCE_*`` scores stay NaN and are skipped when
	      averaging; if all three are missing the composite falls back to 0.5.

	    Literacy templates are sampled from one stateful ``RandomState``, so
	    the fragment chosen for a row depends on how many narratives were
	    built before it.

	    Parameters
	    ----------
	    ext_high_threshold : EXT_SOURCE_MEAN above which → high financial literacy
	    ext_low_threshold : EXT_SOURCE_MEAN below which → low financial literacy
	    include_enquiry : whether to include credit enquiry paragraph
	    include_bureau : whether to include bureau summary paragraph
	    random_template_seed: reproducibility for template sampling
	    """

	    # Text spellings read as "not owned" when a flag column holds strings.
	    _FALSE_FLAG_TOKENS = frozenset({"", "0", "n", "no", "f", "false"})

	    def __init__(
	        self,
	        ext_high_threshold: float = 0.60,
	        ext_low_threshold: float = 0.40,
	        include_enquiry: bool = True,
	        include_bureau: bool = True,
	        random_template_seed: int = 42,
	    ):
	        self.ext_high = ext_high_threshold
	        self.ext_low = ext_low_threshold
	        self.incl_enquiry = include_enquiry
	        self.incl_bureau = include_bureau
	        self._rng = np.random.RandomState(random_template_seed)

	    # ── Missing-value aware coercion helpers ────────────────────────────

	    @staticmethod
	    def _is_missing(value) -> bool:
	        """Return True when *value* is None or a NaN/NA scalar."""
	        if value is None:
	            return True
	        try:
	            return bool(pd.isna(value))
	        except (TypeError, ValueError):
	            # Non-scalar values have no single missing/not-missing answer.
	            return False

	    @classmethod
	    def _as_float(cls, value, fallback: float) -> float:
	        """Convert *value* to float, using *fallback* when it is missing."""
	        return float(fallback) if cls._is_missing(value) else float(value)

	    @classmethod
	    def _as_int(cls, value, fallback: int) -> int:
	        """Convert *value* to int, using *fallback* when it is missing."""
	        return int(fallback) if cls._is_missing(value) else int(value)

	    @classmethod
	    def _as_flag(cls, value) -> bool:
	        """Interpret an ownership flag given as 0/1, bool or text ("Y"/"N")."""
	        if cls._is_missing(value):
	            return False
	        if isinstance(value, str):
	            return value.strip().lower() not in cls._FALSE_FLAG_TOKENS
	        return bool(value)

	    # ── Sentence selection ──────────────────────────────────────────────

	    def _literacy_sentence(self, ext_mean: float) -> str:
	        """Sample a literacy fragment from the tier that *ext_mean* falls in."""
	        if ext_mean >= self.ext_high:
	            pool = _LITERACY_LEVELS["high"]
	        elif ext_mean >= self.ext_low:
	            pool = _LITERACY_LEVELS["medium"]
	        else:
	            pool = _LITERACY_LEVELS["low"]
	        # RandomState.choice returns a numpy string; hand back a plain str.
	        return str(self._rng.choice(pool))

	    def _employment_sentence(self, emp_years: float) -> str:
	        """Return the employment sentence for a tenure of *emp_years* years."""
	        if emp_years > 10:
	            tmpl = _EMPLOYMENT_TEMPLATES["long"]
	        elif emp_years > 2:
	            tmpl = _EMPLOYMENT_TEMPLATES["medium"]
	        elif emp_years > 0:
	            tmpl = _EMPLOYMENT_TEMPLATES["short"]
	        else:
	            return _EMPLOYMENT_TEMPLATES["none"]
	        return tmpl.format(years=emp_years)

	    # ── Narrative construction ──────────────────────────────────────────

	    def build_one(self, row: pd.Series) -> str:
	        """Build a single narrative string from one applicant row.

	        *row* only needs a dict-style ``get(key, default)``; every field is
	        optional and falls back to a default when absent or missing.
	        """

	        def num(key: str, default: float) -> float:
	            return self._as_float(row.get(key, default), default)

	        def count(key: str, default: int) -> int:
	            return self._as_int(row.get(key, default), default)

	        income = num("AMT_INCOME_TOTAL", 150_000)
	        credit = num("AMT_CREDIT", 300_000)
	        age = abs(num("DAYS_BIRTH", -35 * 365)) / 365
	        # Absent column → 3 years; present-but-missing value → 0 days, which
	        # yields the "no current formal employment" sentence. Positive day
	        # counts are clamped to zero years by max().
	        days_employed = self._as_float(row.get("DAYS_EMPLOYED", -3 * 365), 0.0)
	        emp_years = max(0, -days_employed) / 365

	        # Scores keep NaN when missing so they are skipped by nanmean.
	        ext1, ext2, ext3 = (
	            self._as_float(row.get(key, 0.5), float("nan"))
	            for key in ("EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3")
	        )
	        if np.isnan([ext1, ext2, ext3]).all():
	            # nanmean of all-NaN is NaN, which would silently select the
	            # "low" tier; use the absent-column default instead.
	            ext_mean = 0.5
	        else:
	            ext_mean = np.nanmean([ext1, ext2, ext3])

	        has_realty = self._as_flag(row.get("FLAG_OWN_REALTY", 0))
	        has_car = self._as_flag(row.get("FLAG_OWN_CAR", 0))
	        n_children = count("CNT_CHILDREN", 0)
	        fam_size = count("CNT_FAM_MEMBERS", 2)
	        # +1 keeps the ratio finite when income is zero.
	        credit_income = credit / (income + 1)

	        parts: List[str] = []

	        # ── Core financial summary ──────────────────────────────────────
	        parts.append(
	            f"Applicant is {age:.0f} years old with a declared annual income of "
	            f"{income:,.0f} currency units. "
	            f"Requesting a credit facility of {credit:,.0f} units, "
	            f"representing a credit-to-income ratio of {credit_income:.2f}x."
	        )

	        # ── Financial literacy level ────────────────────────────────────
	        parts.append(f"Client {self._literacy_sentence(ext_mean)}.")

	        # ── Employment ─────────────────────────────────────────────────
	        parts.append(self._employment_sentence(emp_years))

	        # ── Asset ownership ────────────────────────────────────────────
	        parts.append(_ASSET_SENTENCES.get((has_car, has_realty), ""))

	        # ── Family context ─────────────────────────────────────────────
	        if n_children > 0:
	            parts.append(
	                f"Applicant has {n_children} dependent child{'ren' if n_children>1 else ''} "
	                f"in a household of {fam_size}."
	            )
	        else:
	            parts.append(f"No dependents; household size of {fam_size}.")

	        # ── Credit bureau summary ──────────────────────────────────────
	        if self.incl_bureau:
	            bureau_count = count("BUREAU_COUNT", 0)
	            active_count = count("BUREAU_ACTIVE_COUNT", 0)
	            if bureau_count > 0:
	                parts.append(
	                    f"Bureau records show {bureau_count} historical credit lines, "
	                    f"of which {active_count} are currently active."
	                )
	            else:
	                parts.append(
	                    "No external bureau credit history found — applicant is credit-invisible."
	                )

	        # ── Enquiry signals ────────────────────────────────────────────
	        if self.incl_enquiry:
	            enquiries = count("TOTAL_ENQUIRIES", 0)
	            if enquiries > 5:
	                parts.append(
	                    f"High credit enquiry volume ({enquiries} enquiries) may indicate "
	                    f"credit-seeking stress or rate shopping."
	                )
	            elif enquiries > 0:
	                parts.append(f"Moderate enquiry activity ({enquiries} enquiries recorded).")
	            else:
	                parts.append("No recent credit enquiries recorded.")

	        # ── External score summary ─────────────────────────────────────
	        parts.append(
	            f"External creditworthiness assessments: "
	            f"bureau={ext1:.2f}, behavioural={ext2:.2f}, alternative={ext3:.2f} "
	            f"(composite={ext_mean:.2f})."
	        )

	        # Empty fragments are dropped before joining.
	        return " ".join(p for p in parts if p)

	    def build_batch(self, df: pd.DataFrame, verbose: bool = True) -> List[str]:
	        """Build narratives for an entire DataFrame, in row order.

	        With ``verbose`` a progress line is printed every 50,000 rows.
	        """
	        narratives = []
	        n = len(df)
	        for i, (_, row) in enumerate(df.iterrows()):
	            narratives.append(self.build_one(row))
	            if verbose and (i + 1) % 50_000 == 0:
	                print(f" Narratives built: {i+1:,}/{n:,}")
	        return narratives


	# ─── SBERT Embedder ───────────────────────────────────────────────────────────

	class SBERTEmbedder:
	    """
	    Encodes text narratives with Sentence-BERT and optionally reduces
	    dimensionality with PCA.

	    By default the SBERT model is loaded lazily for each encode call and
	    dropped again straight afterwards (also when encoding fails).

	    Parameters
	    ----------
	    model_name : HuggingFace SBERT model name
	    n_components : PCA output dimension (None = no PCA)
	    batch_size : encoding batch size
	    normalize : L2-normalise embeddings before PCA
	    device : "cpu" | "cuda" | "mps" (auto if None)
	    show_progress_bar : display the encoder progress bar (default True)
	    keep_model : keep the loaded model between calls instead of releasing
	        it after every encode (default False); avoids a reload
	        on each call, e.g. for repeated single-row inference
	    """

	    def __init__(
	        self,
	        model_name: str = "all-MiniLM-L6-v2",
	        n_components: Optional[int] = 32,
	        batch_size: int = 512,
	        normalize: bool = True,
	        device: Optional[str] = None,
	        show_progress_bar: bool = True,
	        keep_model: bool = False,
	    ):
	        self.model_name = model_name
	        self.n_components = n_components
	        self.batch_size = batch_size
	        self.normalize = normalize
	        self.device = device
	        self.show_progress_bar = show_progress_bar
	        self.keep_model = keep_model
	        self.pca: "Optional[PCA]" = None
	        self._model: "Optional[SentenceTransformer]" = None

	    def _load_model(self):
	        """Instantiate the SBERT model on first use."""
	        if self._model is None:
	            # Only pass ``device`` when one was requested.
	            kwargs = {"device": self.device} if self.device else {}
	            self._model = SentenceTransformer(self.model_name, **kwargs)
	            print(f"✅ SBERT loaded: {self.model_name} "
	                  f"(dim={self._model.get_sentence_embedding_dimension()})")

	    def _release_model(self):
	        """Drop the model reference and collect, unless ``keep_model`` is set."""
	        if self.keep_model:
	            return
	        self._model = None
	        gc.collect()

	    def _encode(self, texts: List[str]) -> np.ndarray:
	        """Encode *texts* to a numpy array, releasing the model afterwards."""
	        self._load_model()
	        try:
	            return self._model.encode(
	                texts,
	                batch_size=self.batch_size,
	                show_progress_bar=self.show_progress_bar,
	                normalize_embeddings=self.normalize,
	                convert_to_numpy=True,
	            )
	        finally:
	            # Runs on failure too, so an encoding error does not leave the
	            # model resident.
	            self._release_model()

	    def fit_transform(self, texts: List[str]) -> np.ndarray:
	        """Encode + fit PCA on train texts.

	        The number of PCA components is capped by the number of texts and
	        by the embedding width, so the result can have fewer columns than
	        ``n_components``.

	        Raises
	        ------
	        ValueError : PCA is requested but *texts* is empty.
	        """
	        if self.n_components and len(texts) == 0:
	            raise ValueError("fit_transform needs at least one text to fit PCA.")

	        print(f"🤖 Encoding {len(texts):,} texts with SBERT...")
	        emb = self._encode(texts)
	        print(f" Raw embedding shape: {emb.shape}")

	        if self.n_components:
	            n = min(self.n_components, emb.shape[0], emb.shape[1])
	            self.pca = PCA(n_components=n, random_state=42)
	            emb = self.pca.fit_transform(emb)
	            print(f" After PCA({n}): {emb.shape} | "
	                  f"Explained variance: {self.pca.explained_variance_ratio_.sum():.3f}")

	        return emb

	    def transform(self, texts: List[str]) -> np.ndarray:
	        """Encode + apply fitted PCA on new texts.

	        When no PCA has been fitted or loaded, the raw embeddings are
	        returned unchanged.
	        """
	        print(f"🤖 Encoding {len(texts):,} texts (transform)...")
	        emb = self._encode(texts)
	        if self.pca is not None:
	            emb = self.pca.transform(emb)
	        return emb

	    def save(self, path: str):
	        """Persist PCA object; does nothing when no PCA has been fitted."""
	        if self.pca is None:
	            return
	        parent = os.path.dirname(path)
	        if parent:
	            # joblib.dump does not create missing directories.
	            os.makedirs(parent, exist_ok=True)
	        joblib.dump(self.pca, path)
	        print(f"✅ PCA saved → {path}")

	    def load_pca(self, path: str):
	        """Load a previously saved PCA object."""
	        # SECURITY: joblib.load unpickles the file and can execute arbitrary
	        # code — only load artefacts from a trusted location.
	        self.pca = joblib.load(path)
	        print(f"✅ PCA loaded ← {path}")


	# ─── End-to-end NLP pipeline ─────────────────────────────────────────────────

	class NLPFeaturePipeline:
	    """
	    Orchestrates FinancialNarrativeBuilder + SBERTEmbedder.

	    The fitted PCA is persisted as ``pca.pkl`` under ``cfg.MODEL_DIR`` by
	    ``fit_transform`` and is reloaded from there on demand by ``transform``
	    and ``build_single_row``.

	    Parameters
	    ----------
	    cfg : project config dataclass (must expose ``MODEL_DIR``)
	    model_name : SBERT model name
	    n_components : PCA components (None = no PCA, raw embeddings)
	    batch_size : SBERT batch size
	    """

	    def __init__(
	        self,
	        cfg,
	        model_name: str = "all-MiniLM-L6-v2",
	        n_components: Optional[int] = 32,
	        batch_size: int = 512,
	    ):
	        self.cfg = cfg
	        self.n_components = n_components
	        self.narrator = FinancialNarrativeBuilder()
	        self.embedder = SBERTEmbedder(
	            model_name=model_name,
	            n_components=n_components,
	            batch_size=batch_size,
	        )
	        self._emb_col_names = [f"NLP_EMB_{i}" for i in range(n_components or 0)]

	    def _pca_path(self) -> str:
	        """Location of the persisted PCA artefact."""
	        return os.path.join(self.cfg.MODEL_DIR, "pca.pkl")

	    def _ensure_pca(self) -> None:
	        """Load the persisted PCA when one is expected but not yet in memory.

	        Raises
	        ------
	        FileNotFoundError : no fitted PCA in memory and none on disk.
	        """
	        if self.embedder.pca is not None or not self.embedder.n_components:
	            return
	        pca_path = self._pca_path()
	        if not os.path.exists(pca_path):
	            raise FileNotFoundError(
	                f"PCA not found at {pca_path}. Run fit_transform first."
	            )
	        self.embedder.load_pca(pca_path)

	    def _to_frame(self, emb, index=None) -> pd.DataFrame:
	        """Wrap an embedding matrix in a DataFrame with NLP_EMB_* columns.

	        Column names follow the actual matrix width: the embedder caps the
	        PCA size by the sample count, so it can be narrower than
	        ``n_components``.
	        """
	        width = emb.shape[1]
	        if width == len(self._emb_col_names):
	            columns = self._emb_col_names
	        else:
	            columns = [f"NLP_EMB_{i}" for i in range(width)]
	        return pd.DataFrame(emb, columns=columns, index=index)

	    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
	        """
	        Build NLP features for training data.
	        Fits PCA internally. Returns DataFrame with NLP_EMB_* columns.
	        """
	        texts = self.narrator.build_batch(df)
	        emb = self.embedder.fit_transform(texts)
	        model_dir = self.cfg.MODEL_DIR
	        if self.embedder.pca is not None and model_dir:
	            # Make sure the target directory exists before persisting.
	            os.makedirs(model_dir, exist_ok=True)
	        self.embedder.save(self._pca_path())
	        return self._to_frame(emb, index=df.index)

	    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
	        """
	        Build NLP features for new / test data.
	        Uses already-fitted PCA. Returns DataFrame with NLP_EMB_* columns.

	        Raises FileNotFoundError when no fitted PCA is available.
	        """
	        self._ensure_pca()
	        texts = self.narrator.build_batch(df, verbose=False)
	        emb = self.embedder.transform(texts)
	        return self._to_frame(emb, index=df.index)

	    def build_single_row(self, feature_dict: dict) -> pd.DataFrame:
	        """
	        Build NLP features for a single applicant (inference).

	        Parameters
	        ----------
	        feature_dict : raw applicant features as a dict

	        Returns
	        -------
	        DataFrame with NLP_EMB_* columns (1 row)

	        Raises
	        ------
	        FileNotFoundError : no fitted PCA is available.
	        """
	        # Same guard as transform(): without it a fresh process would get
	        # raw embeddings whose width does not match the column names.
	        self._ensure_pca()
	        row = pd.Series(feature_dict)
	        text = self.narrator.build_one(row)
	        emb = self.embedder.transform([text])
	        return self._to_frame(emb)