| """Feature builder: human-readable inputs -> 133-dim feature vector. |
| |
| Design principle: we don't fake the speed-dating-specific features (like |
| partner attractiveness rating, ambition rating from the opposite gender). |
| These default to population means (0.5 on a 0-1 scale for normalized |
| features, 5 on a 1-10 scale for raw). The Gottman and survival features — |
| which dominate SHAP — are computed precisely from the user's answers. |
| |
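| If the loaded feature_columns list contains names we don't know how to |
| populate, each one gets a heuristic default chosen from its column name |
| (5.0 for 1-10 rating-style features, 16.67 for importance/preference |
| weights, 0.0 for binary flags, 0.5 otherwise), and we log at DEBUG level |
| how many were defaulted along with the first few names. This keeps |
| predictions directionally correct without pretending to have data we |
| don't have. |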
| """ |
| from __future__ import annotations |
|
|
| import logging |
| from dataclasses import dataclass |
| from typing import Any |
|
|
| import numpy as np |
|
|
| from src import gottman_scorer, survival_scorer |
|
|
| log = logging.getLogger(__name__) |
|
|
|
|
@dataclass
class UserInputs:
    """Answers collected by the Quick Check form.

    The first 15 fields are required (the form's 15 questions). The last
    five carry defaults and may be omitted by the caller.

    The integer answer fields are forwarded unchanged to the Gottman
    scorer; their valid range is not enforced here (presumably a small
    Likert-style scale -- confirm against the form definition).
    """

    # --- About the respondent ---
    age_you: int
    gender_you: str
    education_you: str
    career_you: str

    # --- About the partner ---
    age_partner: int
    gender_partner: str
    education_partner: str
    career_partner: str

    # --- Relationship dynamics (consumed by the Gottman scorer) ---
    shared_interests: int
    shared_goals: int
    repair_after_conflict: int
    criticism_frequency: int
    stonewalling: int

    # --- Relationship status (consumed by the survival scorer) ---
    relationship_type: str
    marriage_number: int

    # --- Optional answers; defaults are used when not supplied ---
    years_together: float = 2.0
    contempt_frequency: int = 2
    defensiveness: int = 2
    know_inner_world: int = 4
    partner_knows_me: int = 4
|
|
|
|
| |
# Education label -> ordinal code. Looked up with a lower-cased label; the
# caller falls back to 3 for labels that are not listed.
# NOTE(review): "associates", "bachelors" and "other" all share code 3 --
# confirm that collapsing them is intentional.
EDU_MAP: dict[str, int] = {
    "high_school": 2,
    "associates": 3,
    "bachelors": 3,
    "masters": 4,
    "phd": 5,
    "doctorate": 5,  # synonym for "phd"
    "other": 3,
}
|
|
| |
| |
# Gender label -> numeric encoding. The two binary labels sit at the ends
# of the 0-1 range; every other option uses the midpoint 0.5, which is also
# the fallback the caller applies to unlisted labels.
GENDER_MAP: dict[str, float] = {
    "female": 0,
    "male": 1,
    "nonbinary": 0.5,
    "prefer_not": 0.5,
}
|
|
| |
| |
# Career label -> integer score. `_career_score` divides the score by 10,
# and uses 5 (the same value as "other") for labels that are not listed.
CAREER_MAP: dict[str, int] = {
    "tech": 8,
    "finance": 7,
    "medicine": 9,
    "academia": 6,
    "law": 7,
    "arts": 4,
    "education": 5,
    "business": 6,
    "other": 5,
}
|
|
|
|
def _career_score(c: str) -> float:
    """Return the CAREER_MAP score for career label *c*, divided by 10.

    The label is lower-cased, stripped of surrounding whitespace, and has
    its spaces replaced by underscores before the lookup. Labels missing
    from CAREER_MAP score 5, i.e. the function returns 0.5 for them.
    """
    normalized = c.lower().strip().replace(" ", "_")
    raw_score = CAREER_MAP.get(normalized, 5)
    return raw_score / 10.0
|
|
|
|
def _normalize_key(raw: str) -> str:
    """Canonicalise a free-text choice into lookup-table key form.

    Lower-cases, strips surrounding whitespace and turns spaces into
    underscores, so "High School" and " high_school " both become
    "high_school".
    """
    return raw.lower().strip().replace(" ", "_")


def _direct_features(u: UserInputs) -> dict[str, float]:
    """Return the features that can be set directly from user input.

    Gender and education labels are normalised the same way career labels
    are (case, surrounding whitespace, spaces -> underscores) before being
    looked up. Previously only the case was folded, so a label such as
    "High School" or "Prefer not" silently fell back to the default.

    Args:
        u: The respondent's form answers.

    Returns:
        A mapping from feature-column name to value. Columns the form does
        not ask about (race, goal) are filled with fixed placeholders.
    """
    gender_key = _normalize_key(u.gender_you)
    education_key = _normalize_key(u.education_you)

    mean_age = (u.age_you + u.age_partner) / 2.0
    age_gap = abs(u.age_you - u.age_partner)

    return {
        # Ages: own, partner's, absolute gap, and mean of the two.
        "age": u.age_you,
        "age_o": u.age_partner,
        "d_age": age_gap,
        "age_avg": mean_age,

        # Gender of the respondent; unlisted labels use the 0.5 midpoint.
        # Race is not asked by the form, so fixed placeholders are used.
        "gender": GENDER_MAP.get(gender_key, 0.5),
        "samerace": 1.0,
        "race": 0.0,
        "race_o": 0.0,

        # "goal" is a fixed placeholder; "field_cd" is filled with the
        # respondent's education code (3 when the label is unlisted).
        "goal": 3.0,
        "field_cd": EDU_MAP.get(education_key, 3),

        # Income proxy: the 0-1 career score scaled to 0-100000.
        "income": _career_score(u.career_you) * 100000,

        # Relationship history, passed through unchanged.
        "marriage_number": u.marriage_number,
        "years_together": u.years_together,
    }
|
|
|
|
def build(u: UserInputs, feature_columns: list[str]) -> np.ndarray:
    """Assemble the model input vector in the order given by *feature_columns*.

    Values come from three sources, merged so that later sources win on a
    name clash: direct features, then Gottman scores, then survival
    features. Any column that none of them provides is filled by
    `_default_for`, and a DEBUG message reports how many were defaulted.

    Args:
        u: The respondent's form answers.
        feature_columns: Column names in the order the model expects.

    Returns:
        A float32 array of shape (n_features,).
    """
    # Score the Gottman questions first, then the survival inputs, then
    # the directly-mapped features.
    gottman_answers = gottman_scorer.GottmanAnswers(
        shared_interests=u.shared_interests,
        shared_goals=u.shared_goals,
        know_inner_world=u.know_inner_world,
        partner_knows_me=u.partner_knows_me,
        repair_after_conflict=u.repair_after_conflict,
        criticism_frequency=u.criticism_frequency,
        contempt_frequency=u.contempt_frequency,
        defensiveness=u.defensiveness,
        stonewalling=u.stonewalling,
    )
    gottman = gottman_scorer.score(gottman_answers)

    survival_inputs = survival_scorer.SurvivalInputs(
        age_you=u.age_you,
        age_partner=u.age_partner,
        marriage_number=u.marriage_number,
        relationship_type=u.relationship_type,
        years_together=u.years_together,
    )
    survival = survival_scorer.compute(survival_inputs)

    direct = _direct_features(u)

    # Merge in precedence order: each later source overrides earlier ones.
    known: dict[str, float] = {}
    for source in (direct, gottman, survival):
        known.update(source)

    # Fill the vector column by column, remembering what had to be guessed.
    vec = np.zeros(len(feature_columns), dtype=np.float32)
    defaulted: list[str] = []
    for idx, name in enumerate(feature_columns):
        if name not in known:
            vec[idx] = _default_for(name)
            defaulted.append(name)
            continue
        vec[idx] = float(known[name])

    if defaulted:
        log.debug("Defaulted %d/%d features to population means: %s...",
                  len(defaulted), len(feature_columns), defaulted[:5])

    return vec
|
|
|
|
| def _default_for(col: str) -> float: |
| """Heuristic default for a feature we can't derive from user input. |
| |
| Speed-dating features are typically 1-10 scales (rating attributes) or |
| 0-1 normalized scores. We use 5.0 for rating-style, 0.5 for normalized, |
| and 0.0 for binary. |
| """ |
| name = col.lower() |
| |
| if any(name.startswith(p) for p in ("attr", "sinc", "intel", "fun", "amb", "shar")): |
| return 5.0 |
| |
| if "imp" in name or "pref" in name: |
| return 16.67 |
| |
| if any(k in name for k in ("sports", "tvsports", "exercise", "dining", |
| "museums", "art", "hiking", "gaming", |
| "clubbing", "reading", "tv", "theater", |
| "movies", "concerts", "music", "shopping", |
| "yoga")): |
| return 5.0 |
| |
| if name.startswith(("is_", "has_", "same", "dec_")): |
| return 0.0 |
| |
| return 0.5 |
|
|