| """ |
| Theme Sampler v2 — Data-driven gap analysis using REAL BRAIN fields. |
| Picks under-explored themes from the canonical field registry. |
| Now uses actual field IDs, per-field alpha counts (AC), and dataset tiers. |
| """ |
| import math |
| import random |
| from collections import Counter |
| from typing import Optional |
| from ..schemas import AnomalyTag |
| from ..data.brain_fields import ( |
| ALL_FIELDS, GOLDMINE_FIELDS, TIER1_MODEL77_FIELDS, |
| TIER3_ANALYST_FIELDS, TIER2_NEWS_FIELDS, TIER3_OPTION_FIELDS, |
| TIER3_SUPPLY_CHAIN_FIELDS, TIER3_SOCIAL_FIELDS, TIER2_MODEL16_FIELDS, |
| BrainField, DatasetTier, pick_field, get_sign_multiplier, |
| FIELD_INDEX, |
| ) |
|
|
|
|
# Theme registry: theme name -> field IDs that belong to the theme.
# Insertion order is significant: compute_gap_scores iterates this dict and
# its stable sort keeps this order for themes with equal scores.
THEME_FIELDS: dict[str, list[str]] = {
    "earnings_surprise_momentum": [
        "standardized_unexpected_earnings_2",
        "quarterly_earnings_surprise_stddev",
        "quarterly_eps_surprise_change",
        "six_month_eps_revision_fy2",
        "mdl77_ooearningsmomemtummodel_fc_fqsurstd",
    ],
    "earnings_quality_signaling": [
        "mdl77_2valuemomemtummodel_earningsqualitymodule",
        "mdl77_2valuemomemtummodel_managementsignalingmodule",
        "mdl77_valueanalystmodelqva_mgtsignaling",
        "mdl77_valueanalystmodelqva_yoychgdebt",
        "mdl77_valueanalystmodelqva_chginv",
    ],
    "asset_growth_anomaly": [
        "mdl77_ohistoricalgrowthfactor_pctchgqtrast",
        "three_year_change_gross_profit_margin_2",
        "yearly_percentage_change_roe",
    ],
    "forward_value_composite": [
        "time_weighted_cash_flow_to_price",
        "time_weighted_ebitda_to_enterprise_value_2",
        "ttm_sales_to_enterprise_value",
        "fundamental_growth_module_score",
    ],
    "liquidity_risk_premium": [
        "mdl77_2liquidityriskfactor_milliq",
        "mdl177_2_globaldevnorthamerica_v502_liqcoeff",
    ],
    "multi_factor_momentum": [
        "multi_factor_static_score_derivative",
        "relative_valuation_rank_derivative",
        "growth_potential_rank_derivative",
        "earnings_certainty_rank_derivative",
    ],
    "news_reaction_drift": ["news_short_interest", "news_pct_5_min", "news_vol_stddev"],
    "analyst_guidance_revision": [
        "dividend_estimate_average",
        "max_ebitda_guidance",
        "cash_flow_operations_min_guidance",
        "pretax_income_reported",
    ],
    "options_sentiment_pcr": ["pcr_vol_90", "pcr_vol_20", "forward_price_120"],
    "supply_chain_network": [
        "pv13_customergraphrank_auth_rank",
        "pv13_customergraphrank_page_rank",
        "rel_ret_all",
        "rel_ret_comp",
        "pv13_custretsig_retsig",
    ],
    "social_contrarian": ["snt_buzz_ret_fast_d1", "scl12_sentiment_fast_d1"],
    "geographic_exposure": [
        "north_america_sales_exposure",
        "mdl177_2_globaldevnorthamerica_v502_chgalpha12m",
    ],
}
|
|
# Theme name -> archetype identifier. Several themes share one archetype;
# get_theme_archetype falls back to "novel" for themes not listed here.
THEME_TO_ARCHETYPE: dict[str, str] = {
    # earnings / fundamentals
    "earnings_surprise_momentum": "pead_revisions",
    "earnings_quality_signaling": "value_quality_blend",
    "asset_growth_anomaly": "value_quality_blend",
    "forward_value_composite": "fundamental_yield_composite",
    # risk / multi-factor
    "liquidity_risk_premium": "vol_scaled_shock",
    "multi_factor_momentum": "multi_horizon_mr",
    # news / analyst / options
    "news_reaction_drift": "intraday_mr_decay",
    "analyst_guidance_revision": "pead_revisions",
    "options_sentiment_pcr": "vol_scaled_shock",
    # alternative data
    "supply_chain_network": "multi_horizon_mr",
    "social_contrarian": "intraday_mr_decay",
    "geographic_exposure": "value_quality_blend",
}
|
|
# Theme name -> AnomalyTag member. compute_gap_scores counts existing alphas
# per tag through the member's ``.value``; get_theme_tag falls back to
# AnomalyTag.OTHER for themes not listed here.
THEME_TO_TAG: dict[str, AnomalyTag] = {
    # earnings / fundamentals
    "earnings_surprise_momentum": AnomalyTag.PEAD,
    "earnings_quality_signaling": AnomalyTag.QUALITY,
    "asset_growth_anomaly": AnomalyTag.FUNDAMENTAL,
    "forward_value_composite": AnomalyTag.VALUE,
    # risk / multi-factor
    "liquidity_risk_premium": AnomalyTag.LIQUIDITY,
    "multi_factor_momentum": AnomalyTag.MOMENTUM,
    # news / analyst / options
    "news_reaction_drift": AnomalyTag.EVENT,
    "analyst_guidance_revision": AnomalyTag.ANALYST,
    "options_sentiment_pcr": AnomalyTag.OPTION_SURFACE,
    # alternative data
    "supply_chain_network": AnomalyTag.TECHNICAL,
    "social_contrarian": AnomalyTag.SOCIAL,
    "geographic_exposure": AnomalyTag.OTHER,
}
|
|
# Distinct archetypes referenced by at least one theme.
# Sorted so the list order is identical in every process: iterating a bare
# set of strings depends on per-process hash randomization, which would make
# any seeded sampling or index-based use of this list non-reproducible.
PROVEN_ARCHETYPES: list[str] = sorted(set(THEME_TO_ARCHETYPE.values()))
|
|
# Mean ``alpha_count`` over each theme's fields that are present in
# FIELD_INDEX, computed once at import time. Field IDs missing from
# FIELD_INDEX are ignored; a theme with no known fields gets the sentinel 999.
THEME_AVG_AC: dict[str, float] = {}
for _theme, _field_ids in THEME_FIELDS.items():
    _acs = [
        FIELD_INDEX[fid].alpha_count
        for fid in _field_ids
        if fid in FIELD_INDEX
    ]
    if _acs:
        THEME_AVG_AC[_theme] = sum(_acs) / len(_acs)
    else:
        THEME_AVG_AC[_theme] = 999
|
|
|
|
def compute_gap_scores(
    existing_themes: list[str],
    existing_anomaly_tags: list[str],
    dead_themes: Optional[list[str]] = None,
) -> list[tuple[str, float]]:
    """Score every non-dead theme by how under-explored it is.

    A theme's score starts at ``log(n_fields + 1) - 2 * log(1 + n_existing)``,
    where ``n_existing`` is how many times the theme appears in
    ``existing_themes``. Additive bonuses follow:

    * ``+2.0`` if any of the theme's fields present in ``FIELD_INDEX`` has
      ``alpha_count == 0``;
    * ``+1.0`` if the theme's ``THEME_AVG_AC`` is at most 5, otherwise
      ``+0.5`` if it is at most 50;
    * ``+0.5`` if the theme's anomaly tag value occurs fewer than twice in
      ``existing_anomaly_tags``.

    Args:
        existing_themes: Theme name of each existing alpha (repeats count).
        existing_anomaly_tags: Anomaly tag of each existing alpha; compared
            against ``AnomalyTag.<member>.value``.
        dead_themes: Themes to leave out of the ranking entirely.

    Returns:
        ``(theme, score)`` pairs ordered by descending score. Themes with
        equal scores keep their ``THEME_FIELDS`` order.
    """
    alphas_per_theme = Counter(existing_themes)
    alphas_per_tag = Counter(existing_anomaly_tags)
    excluded = frozenset(dead_themes or ())

    ranked: list[tuple[str, float]] = []
    for name, field_ids in THEME_FIELDS.items():
        if name in excluded:
            continue

        # Base gap: more fields raise it, existing alphas lower it (weighted x2).
        n_existing = alphas_per_theme[name]
        score = math.log(len(field_ids) + 1) - 2 * math.log(1 + n_existing)

        # Bonus for containing at least one field no alpha has used yet.
        for fid in field_ids:
            if fid in FIELD_INDEX and FIELD_INDEX[fid].alpha_count == 0:
                score += 2.0
                break

        # Bonus tiers for a low average alpha count across the theme.
        mean_ac = THEME_AVG_AC.get(name, 100)
        if mean_ac <= 5:
            score += 1.0
        elif mean_ac <= 50:
            score += 0.5

        # Bonus when the theme's anomaly tag is still rare among existing alphas.
        theme_tag = THEME_TO_TAG.get(name, AnomalyTag.OTHER)
        if alphas_per_tag[theme_tag.value] < 2:
            score += 0.5

        ranked.append((name, score))

    # Stable descending sort: equal scores retain insertion order.
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)
|
|
|
|
def pick_theme(
    existing_themes: list[str],
    existing_anomaly_tags: list[str],
    dead_themes: Optional[list[str]] = None,
    top_k: int = 3,
) -> str:
    """Pick the next theme to explore at random from the best-scoring themes.

    Args:
        existing_themes: Theme name of each existing alpha.
        existing_anomaly_tags: Anomaly tag of each existing alpha.
        dead_themes: Themes excluded from the ranking.
        top_k: Size of the shortlist sampled from. Values below 1 are treated
            as 1, so the single best theme is returned.

    Returns:
        A theme name drawn uniformly from the ``top_k`` highest gap scores.
        If no theme is rankable (every theme is dead), a uniformly random
        theme from ``THEME_FIELDS`` is returned as a last resort.
    """
    scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)

    # Clamp the shortlist size: ``scores[:0]`` would be empty and fall through
    # to the fallback below (which ignores both the ranking and dead_themes),
    # and a negative slice bound would silently drop themes from the tail.
    shortlist = scores[: max(1, top_k)]
    if not shortlist:
        return random.choice(list(THEME_FIELDS))

    theme, _score = random.choice(shortlist)
    return theme
|
|
|
|
def get_theme_fields(theme: str) -> list[str]:
    """Return the field IDs registered for ``theme``.

    Args:
        theme: Theme name to look up in ``THEME_FIELDS``.

    Returns:
        A new list of the theme's field IDs, or an empty list when the theme
        is unknown.
    """
    # Always hand back a copy: returning the registry's own list would let a
    # caller's in-place edit (append/sort/remove) silently alter THEME_FIELDS.
    return list(THEME_FIELDS.get(theme, ()))
|
|
|
|
def get_theme_archetype(theme: str) -> str:
    """Return the archetype mapped to ``theme``, or ``"novel"`` if unmapped."""
    if theme in THEME_TO_ARCHETYPE:
        return THEME_TO_ARCHETYPE[theme]
    return "novel"
|
|
|
|
def get_theme_tag(theme: str) -> AnomalyTag:
    """Return the anomaly tag mapped to ``theme``, or ``AnomalyTag.OTHER`` if unmapped."""
    if theme in THEME_TO_TAG:
        return THEME_TO_TAG[theme]
    return AnomalyTag.OTHER
|
|