Spaces:
Configuration error
Configuration error
"""
src/nlp_features.py
──────────────────────────────────────────────────────────────────────────────
NLP / Sentence-BERT embedding pipeline for alternative credit signals.

Provides:
    • FinancialNarrativeBuilder — synthesises a text description per applicant
    • SBERTEmbedder — encodes texts → embeddings → PCA reduction
    • NLPFeaturePipeline — end-to-end fit/transform orchestrator

In production, FinancialNarrativeBuilder would be replaced by real user
survey or app-usage text. Here we synthesise from tabular signals to
demonstrate the pipeline architecture.

Usage:
    from src.nlp_features import NLPFeaturePipeline
    nlp = NLPFeaturePipeline(cfg)
    train_nlp_df = nlp.fit_transform(train_df)
    test_nlp_df = nlp.transform(test_df)
──────────────────────────────────────────────────────────────────────────────
"""
| from __future__ import annotations | |
| import os | |
| import gc | |
| import warnings | |
| from typing import List, Optional | |
| import numpy as np | |
| import pandas as pd | |
| import joblib | |
| from sklearn.decomposition import PCA | |
| from sentence_transformers import SentenceTransformer | |
| warnings.filterwarnings("ignore") | |
# ─── Narrative templates ─────────────────────────────────────────────────────
# Sentence fragments keyed by literacy tier. One fragment is sampled per
# applicant and embedded as "Client <fragment>.".
_LITERACY_LEVELS = {
    "high": [
        "demonstrates strong financial planning habits and consistently pays obligations on time",
        "shows excellent budgeting discipline and proactively manages debt obligations",
        "has a clear savings strategy and maintains low revolving credit utilisation",
    ],
    "medium": [
        "shows moderate financial awareness with occasional delayed payments",
        "manages debt adequately but has limited long-term financial planning",
        "meets minimum payment requirements but rarely pays ahead of schedule",
    ],
    "low": [
        "has limited formal financial experience and irregular payment patterns",
        "relies heavily on informal credit channels and lacks credit history",
        "demonstrates financial stress indicators with frequent payment shortfalls",
    ],
}

# Employment sentences keyed by tenure bucket; all but "none" take `years`.
_EMPLOYMENT_TEMPLATES = {
    "long": "Has stable employment of {years:.1f} years with the current employer.",
    "medium": "Currently employed for {years:.1f} years; career trajectory appears stable.",
    "short": "Recently started employment ({years:.1f} years); income may not be fully stabilised.",
    "none": "No current formal employment; income source requires verification.",
}

# Asset sentences keyed by (owns_car, owns_realty).
_ASSET_SENTENCES = {
    (True, True): "Applicant owns both a vehicle and residential property, indicating established assets.",
    (True, False): "Applicant owns a vehicle but rents accommodation.",
    (False, True): "Applicant owns residential property, a strong collateral signal.",
    (False, False): "No registered asset ownership; relies solely on income for repayment.",
}

# String spellings accepted as "true" for ownership flags (compared lowercase).
_TRUTHY_FLAG_STRINGS = frozenset({"y", "yes", "true", "t", "1"})


class FinancialNarrativeBuilder:
    """
    Constructs a structured financial narrative text per applicant row.

    Parameters
    ----------
    ext_high_threshold : EXT_SOURCE mean at or above which → high financial literacy
    ext_low_threshold : EXT_SOURCE mean below which → low financial literacy
    include_enquiry : whether to include credit enquiry paragraph
    include_bureau : whether to include bureau summary paragraph
    random_template_seed: reproducibility for template sampling

    Notes
    -----
    Missing values are handled as follows:
      * count fields (children, family size, bureau counts, enquiries) that
        are absent, ``None`` or NaN fall back to their defaults instead of
        raising ``ValueError`` from ``int(nan)``;
      * ownership flags that are absent, ``None`` or NaN count as "not
        owned"; string flags such as ``"Y"`` / ``"N"`` are interpreted
        rather than being treated as truthy non-empty strings;
      * float fields that are ``None`` fall back to their defaults; NaN
        floats are passed through unchanged (EXT_SOURCE NaNs are skipped by
        ``np.nanmean``).
    """

    def __init__(
        self,
        ext_high_threshold: float = 0.60,
        ext_low_threshold: float = 0.40,
        include_enquiry: bool = True,
        include_bureau: bool = True,
        random_template_seed: int = 42,
    ):
        self.ext_high = ext_high_threshold
        self.ext_low = ext_low_threshold
        self.incl_enquiry = include_enquiry
        self.incl_bureau = include_bureau
        self._rng = np.random.RandomState(random_template_seed)

    # ── Field coercion helpers ────────────────────────────────────────────
    @staticmethod
    def _float_field(row, key: str, default: float) -> float:
        """Return ``row[key]`` as float; absent or ``None`` → ``default``."""
        raw = row.get(key, default)
        return float(default if raw is None else raw)

    @staticmethod
    def _int_field(row, key: str, default: int) -> int:
        """Return ``row[key]`` as int; absent, ``None`` or NaN → ``default``."""
        raw = row.get(key, default)
        if raw is None or pd.isna(raw):
            return default
        return int(raw)

    @staticmethod
    def _flag_field(row, key: str) -> bool:
        """Return an ownership flag as bool; absent/``None``/NaN → ``False``."""
        raw = row.get(key, 0)
        if raw is None:
            return False
        if isinstance(raw, str):
            return raw.strip().lower() in _TRUTHY_FLAG_STRINGS
        if pd.isna(raw):
            # bool(nan) is True, which would wrongly claim ownership.
            return False
        return bool(raw)

    # ── Sentence builders ─────────────────────────────────────────────────
    def _literacy_sentence(self, ext_mean: float) -> str:
        """Sample one literacy fragment from the tier selected by ``ext_mean``."""
        if ext_mean >= self.ext_high:
            tier = "high"
        elif ext_mean >= self.ext_low:
            tier = "medium"
        else:
            # Also reached when ext_mean is NaN (all comparisons are False).
            tier = "low"
        return self._rng.choice(_LITERACY_LEVELS[tier])

    def _employment_sentence(self, emp_years: float) -> str:
        """Return the employment sentence for a tenure expressed in years."""
        if emp_years > 10:
            bucket = "long"
        elif emp_years > 2:
            bucket = "medium"
        elif emp_years > 0:
            bucket = "short"
        else:
            return _EMPLOYMENT_TEMPLATES["none"]
        return _EMPLOYMENT_TEMPLATES[bucket].format(years=emp_years)

    def build_one(self, row: pd.Series) -> str:
        """
        Build a single narrative string from one applicant row.

        ``row`` only needs a ``.get(key, default)`` method, so a plain dict
        works as well as a ``pd.Series``. Exactly one random draw is made per
        call (the literacy fragment), so output depends on call order.
        """
        income = self._float_field(row, "AMT_INCOME_TOTAL", 150_000)
        credit = self._float_field(row, "AMT_CREDIT", 300_000)
        # DAYS_* fields are read as negative day offsets.
        age = abs(self._float_field(row, "DAYS_BIRTH", -35 * 365)) / 365
        emp_years = max(0, -self._float_field(row, "DAYS_EMPLOYED", -3 * 365)) / 365
        ext1 = self._float_field(row, "EXT_SOURCE_1", 0.5)
        ext2 = self._float_field(row, "EXT_SOURCE_2", 0.5)
        ext3 = self._float_field(row, "EXT_SOURCE_3", 0.5)
        ext_mean = np.nanmean([ext1, ext2, ext3])
        has_realty = self._flag_field(row, "FLAG_OWN_REALTY")
        has_car = self._flag_field(row, "FLAG_OWN_CAR")
        n_children = self._int_field(row, "CNT_CHILDREN", 0)
        fam_size = self._int_field(row, "CNT_FAM_MEMBERS", 2)
        # +1 guards against division by zero for zero declared income.
        credit_income = credit / (income + 1)

        parts: List[str] = []

        # ── Core financial summary ──────────────────────────────────────
        parts.append(
            f"Applicant is {age:.0f} years old with a declared annual income of "
            f"{income:,.0f} currency units. "
            f"Requesting a credit facility of {credit:,.0f} units, "
            f"representing a credit-to-income ratio of {credit_income:.2f}x."
        )

        # ── Financial literacy level ────────────────────────────────────
        parts.append(f"Client {self._literacy_sentence(ext_mean)}.")

        # ── Employment ──────────────────────────────────────────────────
        parts.append(self._employment_sentence(emp_years))

        # ── Asset ownership ─────────────────────────────────────────────
        parts.append(_ASSET_SENTENCES.get((has_car, has_realty), ""))

        # ── Family context ──────────────────────────────────────────────
        if n_children > 0:
            parts.append(
                f"Applicant has {n_children} dependent child{'ren' if n_children>1 else ''} "
                f"in a household of {fam_size}."
            )
        else:
            parts.append(f"No dependents; household size of {fam_size}.")

        # ── Credit bureau summary ───────────────────────────────────────
        if self.incl_bureau:
            bureau_count = self._int_field(row, "BUREAU_COUNT", 0)
            active_count = self._int_field(row, "BUREAU_ACTIVE_COUNT", 0)
            if bureau_count > 0:
                parts.append(
                    f"Bureau records show {bureau_count} historical credit lines, "
                    f"of which {active_count} are currently active."
                )
            else:
                parts.append(
                    "No external bureau credit history found — applicant is credit-invisible."
                )

        # ── Enquiry signals ─────────────────────────────────────────────
        if self.incl_enquiry:
            enquiries = self._int_field(row, "TOTAL_ENQUIRIES", 0)
            if enquiries > 5:
                parts.append(
                    f"High credit enquiry volume ({enquiries} enquiries) may indicate "
                    f"credit-seeking stress or rate shopping."
                )
            elif enquiries > 0:
                parts.append(f"Moderate enquiry activity ({enquiries} enquiries recorded).")
            else:
                parts.append("No recent credit enquiries recorded.")

        # ── External score summary ──────────────────────────────────────
        parts.append(
            f"External creditworthiness assessments: "
            f"bureau={ext1:.2f}, behavioural={ext2:.2f}, alternative={ext3:.2f} "
            f"(composite={ext_mean:.2f})."
        )

        # Drop empty fragments so the join never produces double spaces.
        return " ".join(p for p in parts if p)

    def build_batch(self, df: pd.DataFrame, verbose: bool = True) -> List[str]:
        """
        Build narratives for an entire DataFrame, in row order.

        When ``verbose`` is true a progress line is printed every 50,000 rows.
        """
        total = len(df)
        narratives: List[str] = []
        for done, (_, row) in enumerate(df.iterrows(), start=1):
            narratives.append(self.build_one(row))
            if verbose and done % 50_000 == 0:
                print(f" Narratives built: {done:,}/{total:,}")
        return narratives
# ─── SBERT Embedder ──────────────────────────────────────────────────────────
class SBERTEmbedder:
    """
    Encodes text narratives with Sentence-BERT and optionally reduces
    dimensionality with PCA.

    Parameters
    ----------
    model_name : HuggingFace SBERT model name
    n_components : PCA output dimension (None = no PCA)
    batch_size : encoding batch size
    normalize : L2-normalise embeddings before PCA
    device : "cpu" | "cuda" | "mps" (auto if None)
    release_model_after_use : drop the loaded model (and run ``gc.collect()``)
        after every ``fit_transform`` / ``transform`` call. The default
        ``True`` keeps the original memory-saving behaviour; pass ``False``
        to keep the model resident, e.g. for repeated single-row inference
        where reloading on every call would dominate latency.
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        n_components: Optional[int] = 32,
        batch_size: int = 512,
        normalize: bool = True,
        device: Optional[str] = None,
        release_model_after_use: bool = True,
    ):
        self.model_name = model_name
        self.n_components = n_components
        self.batch_size = batch_size
        self.normalize = normalize
        self.device = device
        self.release_model_after_use = release_model_after_use
        self.pca: Optional[PCA] = None
        self._model: Optional[SentenceTransformer] = None

    def _load_model(self):
        """Lazily instantiate the SentenceTransformer on first use."""
        if self._model is None:
            # Only pass `device` when set so the library can auto-select.
            kwargs = {"device": self.device} if self.device else {}
            self._model = SentenceTransformer(self.model_name, **kwargs)
            print(f"✓ SBERT loaded: {self.model_name} "
                  f"(dim={self._model.get_sentence_embedding_dimension()})")

    def _release_model(self):
        """Drop the model reference and collect garbage, if configured to."""
        # getattr keeps instances created before this flag existed working.
        if getattr(self, "release_model_after_use", True):
            self._model = None
            gc.collect()

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` into a 2-D numpy array of sentence embeddings."""
        self._load_model()
        return self._model.encode(
            texts,
            batch_size=self.batch_size,
            show_progress_bar=True,
            normalize_embeddings=self.normalize,
            convert_to_numpy=True,
        )

    def fit_transform(self, texts: List[str]) -> np.ndarray:
        """
        Encode + fit PCA on train texts.

        The number of PCA components is clamped to
        ``min(n_components, n_samples, embedding_dim)``, so the returned
        array may have fewer than ``n_components`` columns on small inputs.
        """
        print(f"🤖 Encoding {len(texts):,} texts with SBERT...")
        try:
            emb = self._encode(texts)
            print(f" Raw embedding shape: {emb.shape}")
            if self.n_components:
                n = min(self.n_components, emb.shape[0], emb.shape[1])
                self.pca = PCA(n_components=n, random_state=42)
                emb = self.pca.fit_transform(emb)
                print(f" After PCA({n}): {emb.shape} | "
                      f"Explained variance: {self.pca.explained_variance_ratio_.sum():.3f}")
        finally:
            # Release even when encoding / PCA fails, so an error does not
            # leave the model pinned in memory.
            self._release_model()
        return emb

    def transform(self, texts: List[str]) -> np.ndarray:
        """
        Encode + apply fitted PCA on new texts.

        If no PCA has been fitted or loaded, the raw embeddings are returned.
        """
        print(f"🤖 Encoding {len(texts):,} texts (transform)...")
        try:
            emb = self._encode(texts)
            if self.pca is not None:
                emb = self.pca.transform(emb)
        finally:
            self._release_model()
        return emb

    def save(self, path: str):
        """Persist PCA object (no-op when no PCA has been fitted)."""
        if self.pca is not None:
            joblib.dump(self.pca, path)
            print(f"✓ PCA saved → {path}")

    def load_pca(self, path: str):
        """Load a previously saved PCA object."""
        # SECURITY: joblib.load unpickles arbitrary objects; only load files
        # written by a trusted `save()` call.
        self.pca = joblib.load(path)
        print(f"✓ PCA loaded ← {path}")
# ─── End-to-end NLP pipeline ─────────────────────────────────────────────────
class NLPFeaturePipeline:
    """
    Orchestrates FinancialNarrativeBuilder + SBERTEmbedder.

    Parameters
    ----------
    cfg : project config object; only ``cfg.MODEL_DIR`` is read here
    model_name : SBERT model name
    n_components : PCA components
    batch_size : SBERT batch size

    Notes
    -----
    Output columns are named ``NLP_EMB_0 … NLP_EMB_{k-1}`` where ``k`` is the
    actual width of the embedding matrix. ``k`` can be smaller than
    ``n_components`` when the embedder clamps PCA on a small training set,
    or differ when a PCA of another width is loaded from disk.
    """

    def __init__(
        self,
        cfg,
        model_name: str = "all-MiniLM-L6-v2",
        n_components: int = 32,
        batch_size: int = 512,
    ):
        self.cfg = cfg
        self.n_components = n_components
        self.narrator = FinancialNarrativeBuilder()
        self.embedder = SBERTEmbedder(
            model_name=model_name,
            n_components=n_components,
            batch_size=batch_size,
        )
        # Nominal names; refreshed to the real width whenever a frame is built.
        self._emb_col_names = [f"NLP_EMB_{i}" for i in range(n_components)]

    def _pca_path(self) -> str:
        """Return the on-disk location of the persisted PCA."""
        return os.path.join(self.cfg.MODEL_DIR, "pca.pkl")

    def _ensure_pca_loaded(self) -> None:
        """Load the persisted PCA if the embedder has none in memory.

        Raises
        ------
        FileNotFoundError
            If no PCA is in memory and none exists at the expected path.
        """
        if self.embedder.pca is not None:
            return
        pca_path = self._pca_path()
        if not os.path.exists(pca_path):
            raise FileNotFoundError(
                f"PCA not found at {pca_path}. Run fit_transform first."
            )
        self.embedder.load_pca(pca_path)

    def _to_frame(self, emb, index=None) -> pd.DataFrame:
        """Wrap an embedding matrix in a DataFrame with NLP_EMB_* columns."""
        # Name columns from the real matrix width, not the requested
        # n_components, so a clamped or reloaded PCA cannot cause a mismatch.
        self._emb_col_names = [f"NLP_EMB_{i}" for i in range(emb.shape[1])]
        return pd.DataFrame(emb, columns=self._emb_col_names, index=index)

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build NLP features for training data.

        Fits PCA internally and persists it to ``<MODEL_DIR>/pca.pkl``
        (creating ``MODEL_DIR`` if needed). Returns a DataFrame with
        NLP_EMB_* columns, indexed like ``df``.
        """
        texts = self.narrator.build_batch(df)
        emb = self.embedder.fit_transform(texts)
        os.makedirs(self.cfg.MODEL_DIR, exist_ok=True)
        self.embedder.save(self._pca_path())
        return self._to_frame(emb, index=df.index)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Build NLP features for new / test data.

        Uses the already-fitted PCA, loading it from disk if necessary.
        Returns a DataFrame with NLP_EMB_* columns, indexed like ``df``.

        Raises
        ------
        FileNotFoundError
            If no fitted PCA is available in memory or on disk.
        """
        self._ensure_pca_loaded()
        texts = self.narrator.build_batch(df, verbose=False)
        emb = self.embedder.transform(texts)
        return self._to_frame(emb, index=df.index)

    def build_single_row(self, feature_dict: dict) -> pd.DataFrame:
        """
        Build NLP features for a single applicant (inference).

        Parameters
        ----------
        feature_dict : raw applicant features as a dict

        Returns
        -------
        DataFrame with NLP_EMB_* columns (1 row)

        Raises
        ------
        FileNotFoundError
            If no fitted PCA is available in memory or on disk.
        """
        # Same guard as transform(): a fresh inference process has no PCA in
        # memory and would otherwise get raw, unreduced embeddings.
        self._ensure_pca_loaded()
        row = pd.Series(feature_dict)
        text = self.narrator.build_one(row)
        emb = self.embedder.transform([text])
        return self._to_frame(emb)