feat: theme_sampler v2 — real BRAIN fields from model77/analyst4/news12/option9/pv13, data-driven gap scoring

Browse files

Files changed (1) hide show

alpha_factory/deterministic/theme_sampler.py +169 -55

alpha_factory/deterministic/theme_sampler.py CHANGED Viewed

@@ -1,40 +1,141 @@
 """
-Theme Sampler — deterministic gap analysis.
-Picks under-explored themes from the factor store.
 """
 import math
 from collections import Counter
 from typing import Optional
 from ..schemas import AnomalyTag
-# Theme definitions mapped to BRAIN field families
-THEME_FIELDS = {
-    "value_quality": ["book_to_price", "earnings_yield", "roe", "roa", "debt_to_equity", "current_ratio"],
-    "momentum": ["close", "returns", "volume", "ts_returns", "high", "low"],
-    "reversal": ["close", "returns", "volume", "bid_ask_spread"],
-    "volatility": ["volatility", "ivol", "beta", "hv", "atr"],
-    "analyst": ["analyst_rating", "estimate_revision", "target_price", "recommendation"],
-    "sentiment_social": ["sentiment", "social_volume", "social_score", "news_sentiment"],
-    "option_surface": ["iv30", "iv60", "iv90", "iv180", "pcr", "skew", "term_structure"],
-    "earnings_event": ["earnings_surprise", "post_earnings_drift", "guidance"],
-    "liquidity_micro": ["bid_ask_spread", "volume", "turnover", "amihud_illiquidity"],
-    "growth": ["revenue_growth", "earnings_growth", "asset_growth", "sales_growth"],
-    "intraday": ["open", "high", "low", "close", "vwap", "intraday_range"],
-    "fundamental_yield": ["dividend_yield", "buyback_yield", "shareholder_yield", "fcf_yield"],
 }
-# Known archetypes from the existing 18-alpha library
-PROVEN_ARCHETYPES = [
-    "value_quality_blend",
-    "intraday_mr_decay",
-    "vol_scaled_shock",
-    "pead_revisions",
-    "skew_term",
-    "social_momentum",
-    "multi_horizon_mr",
-    "fundamental_yield_composite",
-]
 def compute_gap_scores(
@@ -44,9 +145,13 @@ def compute_gap_scores(
 ) -> list[tuple[str, float]]:
     """
     Rank themes by how under-explored they are.
-    gap_score = log(field_count) - 2 * log(1 + alphas_in_theme) - dead_penalty
     Higher score = bigger opportunity.
     """
     theme_counts = Counter(existing_themes)
@@ -56,22 +161,37 @@ def compute_gap_scores(
     scores = []
     for theme, fields in THEME_FIELDS.items():
         if theme in dead_set:
-            continue  # Skip dead themes (§11.5)
         field_count = len(fields)
         alpha_count = theme_counts.get(theme, 0)
         gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
-        # Bonus if the anomaly tag is under-represented
-        tag = _theme_to_tag(theme)
-        tag_count = anomaly_counts.get(tag, 0)
         if tag_count < 2:
-            gap += 0.5  # novelty bonus
         scores.append((theme, gap))
-    # Sort descending
     scores.sort(key=lambda x: -x[1])
     return scores
@@ -83,29 +203,23 @@ def pick_theme(
     top_k: int = 3,
 ) -> str:
     """Pick the best theme to explore next (highest gap score)."""
-    import random
     scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
-    # Sample from top-k for diversity
     top = scores[:top_k]
     if not top:
         return random.choice(list(THEME_FIELDS.keys()))
     return random.choice(top)[0]
-def _theme_to_tag(theme: str) -> str:
-    """Map theme name to AnomalyTag value."""
-    mapping = {
-        "value_quality": "value",
-        "momentum": "momentum",
-        "reversal": "reversal",
-        "volatility": "low_vol",
-        "analyst": "analyst",
-        "sentiment_social": "sentiment",
-        "option_surface": "option_surface",
-        "earnings_event": "pead",
-        "liquidity_micro": "liquidity",
-        "growth": "fundamental",
-        "intraday": "technical",
-        "fundamental_yield": "fundamental",
-    }
-    return mapping.get(theme, "other")

 """
+Theme Sampler v2 — Data-driven gap analysis using REAL BRAIN fields.
+Picks under-explored themes from the canonical field registry.
+Now uses actual field IDs, AC counts, and dataset tiers.
 """
 import math
+import random
 from collections import Counter
 from typing import Optional
 from ..schemas import AnomalyTag
+from ..data.brain_fields import (
+    ALL_FIELDS, GOLDMINE_FIELDS, TIER1_MODEL77_FIELDS,
+    TIER3_ANALYST_FIELDS, TIER2_NEWS_FIELDS, TIER3_OPTION_FIELDS,
+    TIER3_SUPPLY_CHAIN_FIELDS, TIER3_SOCIAL_FIELDS, TIER2_MODEL16_FIELDS,
+    BrainField, DatasetTier, pick_field, get_sign_multiplier,
+)
+# ═══════════════════════════════════════════════════════════════════════════
+# THEME DEFINITIONS — mapped to REAL BRAIN field IDs
+# Each theme contains actual implementable fields from the platform
+# ═══════════════════════════════════════════════════════════════════════════
+# Keys are theme names; each value lists the field IDs assigned to that theme.
+# THEME_TO_ARCHETYPE and THEME_TO_TAG are keyed by the same theme names.
+# The trailing "AC=" notes are hand-written snapshots (presumably alpha counts —
+# TODO confirm); THEME_AVG_AC is computed from FIELD_INDEX[...].alpha_count,
+# not from these comments.
+THEME_FIELDS: dict[str, list[str]] = {
+    # TIER 1 — model77 goldmine (density 24 α/field)
+    "earnings_surprise_momentum": [
+        "standardized_unexpected_earnings_2",           # AC=0 GOLDMINE
+        "quarterly_earnings_surprise_stddev",           # AC=1
+        "quarterly_eps_surprise_change",                # AC=1
+        "six_month_eps_revision_fy2",                   # AC=1
+        "mdl77_ooearningsmomemtummodel_fc_fqsurstd",   # AC=1
+    ],
+    "earnings_quality_signaling": [
+        "mdl77_2valuemomemtummodel_earningsqualitymodule",  # AC=1
+        "mdl77_2valuemomemtummodel_managementsignalingmodule",  # AC=1
+        "mdl77_valueanalystmodelqva_mgtsignaling",      # AC=1
+        "mdl77_valueanalystmodelqva_yoychgdebt",        # AC=1
+        "mdl77_valueanalystmodelqva_chginv",            # AC=1
+    ],
+    "asset_growth_anomaly": [
+        "mdl77_ohistoricalgrowthfactor_pctchgqtrast",   # AC=1 (Cooper et al. 2008)
+        "three_year_change_gross_profit_margin_2",       # AC=1
+        "yearly_percentage_change_roe",                  # AC=1
+    ],
+    "forward_value_composite": [
+        "time_weighted_cash_flow_to_price",             # AC=0 GOLDMINE
+        "time_weighted_ebitda_to_enterprise_value_2",   # AC=1
+        "ttm_sales_to_enterprise_value",                # AC=1
+        "fundamental_growth_module_score",               # AC=1
+    ],
+    "liquidity_risk_premium": [
+        "mdl77_2liquidityriskfactor_milliq",            # AC=1 (Amihud illiquidity)
+        # NOTE(review): the "mdl177_" prefix differs from the "mdl77_" prefix used
+        # by the other model fields — confirm against the field registry. An ID
+        # missing from FIELD_INDEX is silently left out of THEME_AVG_AC.
+        "mdl177_2_globaldevnorthamerica_v502_liqcoeff", # AC=0 GOLDMINE
+    ],
+    # TIER 2 — model16 score derivatives + news
+    "multi_factor_momentum": [
+        "multi_factor_static_score_derivative",         # AC=98
+        "relative_valuation_rank_derivative",           # AC=119
+        "growth_potential_rank_derivative",             # AC=152
+        "earnings_certainty_rank_derivative",           # AC=175
+    ],
+    "news_reaction_drift": [
+        "news_short_interest",                          # AC=535
+        "news_pct_5_min",                               # AC=353
+        "news_vol_stddev",                              # AC=902
+    ],
+    # TIER 3 — analyst, options, supply chain, social
+    "analyst_guidance_revision": [
+        "dividend_estimate_average",                    # AC=5
+        "max_ebitda_guidance",                          # AC=16
+        "cash_flow_operations_min_guidance",            # AC=17
+        "pretax_income_reported",                       # AC=15
+    ],
+    "options_sentiment_pcr": [
+        "pcr_vol_90",                                   # AC=184
+        "pcr_vol_20",                                   # AC=233
+        "forward_price_120",                            # AC=359
+    ],
+    "supply_chain_network": [
+        # NOTE(review): "ustomergraphrank" may be a truncated "customergraphrank"
+        # — confirm the exact ID against the field registry before relying on it.
+        "pv13_ustomergraphrank_auth_rank",             # AC=595 (Cohen & Frazzini proxy)
+        "pv13_ustomergraphrank_page_rank",             # AC=921
+        "rel_ret_all",                                  # AC=2280
+        "rel_ret_comp",                                 # AC=3078
+        "pv13_custretsig_retsig",                      # AC=2718
+    ],
+    "social_contrarian": [
+        "snt_buzz_ret_fast_d1",                        # AC=56
+        "scl12_sentiment_fast_d1",                     # AC=134
+    ],
+    "geographic_exposure": [
+        "north_america_sales_exposure",                 # AC=0 GOLDMINE
+        "mdl177_2_globaldevnorthamerica_v502_chgalpha12m",  # AC=0 GOLDMINE
+    ],
+}
+# Theme → expected archetype mapping for the expression compiler
+# Keyed by the theme names used in THEME_FIELDS; several themes share one
+# archetype. Read through get_theme_archetype(), which returns "novel" for a
+# theme that has no entry here.
+THEME_TO_ARCHETYPE: dict[str, str] = {
+    "earnings_surprise_momentum": "pead_revisions",
+    "earnings_quality_signaling": "value_quality_blend",
+    "asset_growth_anomaly": "value_quality_blend",
+    "forward_value_composite": "fundamental_yield_composite",
+    "liquidity_risk_premium": "vol_scaled_shock",
+    "multi_factor_momentum": "multi_horizon_mr",
+    "news_reaction_drift": "intraday_mr_decay",
+    "analyst_guidance_revision": "pead_revisions",
+    "options_sentiment_pcr": "vol_scaled_shock",
+    "supply_chain_network": "multi_horizon_mr",
+    "social_contrarian": "intraday_mr_decay",
+    "geographic_exposure": "value_quality_blend",
+}
+# Theme → anomaly tag
+# Keyed by the theme names used in THEME_FIELDS. compute_gap_scores() uses the
+# tag's .value to look up how often that anomaly already appears, and both it
+# and get_theme_tag() fall back to AnomalyTag.OTHER for a theme missing here.
+THEME_TO_TAG: dict[str, AnomalyTag] = {
+    "earnings_surprise_momentum": AnomalyTag.PEAD,
+    "earnings_quality_signaling": AnomalyTag.QUALITY,
+    "asset_growth_anomaly": AnomalyTag.FUNDAMENTAL,
+    "forward_value_composite": AnomalyTag.VALUE,
+    "liquidity_risk_premium": AnomalyTag.LIQUIDITY,
+    "multi_factor_momentum": AnomalyTag.MOMENTUM,
+    "news_reaction_drift": AnomalyTag.EVENT,
+    "analyst_guidance_revision": AnomalyTag.ANALYST,
+    "options_sentiment_pcr": AnomalyTag.OPTION_SURFACE,
+    "supply_chain_network": AnomalyTag.TECHNICAL,
+    "social_contrarian": AnomalyTag.SOCIAL,
+    "geographic_exposure": AnomalyTag.OTHER,
 }
+# Theme → average AC (lower = higher EV)
+# FIELD_INDEX is imported once, ahead of the loop, instead of being re-imported
+# for every field inside it. It must stay bound as a module-level name because
+# compute_gap_scores() also looks fields up in it.
+from ..data.brain_fields import FIELD_INDEX
+THEME_AVG_AC: dict[str, float] = {}
+for theme, field_ids in THEME_FIELDS.items():
+    # Only field IDs present in FIELD_INDEX contribute to the average;
+    # unknown IDs are ignored rather than raising.
+    acs = [FIELD_INDEX[fid].alpha_count for fid in field_ids if fid in FIELD_INDEX]
+    # 999 marks a theme with no resolvable fields, which keeps it out of the
+    # low-AC bonus tiers (avg AC <= 5 / <= 50) in compute_gap_scores().
+    THEME_AVG_AC[theme] = sum(acs) / len(acs) if acs else 999
 def compute_gap_scores(
 ) -> list[tuple[str, float]]:
     """
     Rank themes by how under-explored they are.
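+    gap_score = log(field_count + 1)
+                - 2 * log(1 + alphas_in_theme)
+                + goldmine_bonus (2.0 if any field in the theme has AC == 0)
+                + low_ac_bonus (1.0 if average AC <= 5, else 0.5 if average AC <= 50)
+                + diversity_bonus (0.5 if the theme's anomaly tag appears fewer than 2 times)
+    Dead themes are skipped entirely rather than penalised.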
     Higher score = bigger opportunity.
     """
     theme_counts = Counter(existing_themes)
     scores = []
     for theme, fields in THEME_FIELDS.items():
         if theme in dead_set:
+            continue
         field_count = len(fields)
         alpha_count = theme_counts.get(theme, 0)
+        avg_ac = THEME_AVG_AC.get(theme, 100)
+        # Base gap score
         gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
+        # Goldmine bonus: themes containing AC=0 fields get massive boost
+        has_goldmine = any(
+            FIELD_INDEX.get(fid, None) and FIELD_INDEX[fid].alpha_count == 0
+            for fid in fields
+        )
+        if has_goldmine:
+            gap += 2.0  # Huge bonus for untouched fields
+        # Low-AC bonus: average AC of theme's fields
+        if avg_ac <= 5:
+            gap += 1.0
+        elif avg_ac <= 50:
+            gap += 0.5
+        # Anomaly diversity bonus
+        tag = THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
+        tag_count = anomaly_counts.get(tag.value, 0)
         if tag_count < 2:
+            gap += 0.5
         scores.append((theme, gap))
     scores.sort(key=lambda x: -x[1])
     return scores
     top_k: int = 3,
 ) -> str:
     """Pick the best theme to explore next (highest gap score)."""
     scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
     top = scores[:top_k]
     if not top:
         return random.choice(list(THEME_FIELDS.keys()))
     return random.choice(top)[0]
+def get_theme_fields(theme: str) -> list[str]:
+    """Return the BRAIN field IDs registered for ``theme``.
+
+    Args:
+        theme: A theme name, i.e. a key of THEME_FIELDS.
+
+    Returns:
+        A new list holding the theme's field IDs in registry order, or an
+        empty list when the theme is unknown. The list is a copy, so callers
+        may mutate it without altering THEME_FIELDS.
+    """
+    # Copy instead of handing out the registry's own list: an in-place edit by
+    # a caller would otherwise change THEME_FIELDS for every later lookup.
+    return list(THEME_FIELDS.get(theme, ()))
+def get_theme_archetype(theme: str) -> str:
+    """Look up the archetype that THEME_TO_ARCHETYPE assigns to ``theme``.
+
+    A theme without an entry yields the string "novel".
+    """
+    try:
+        return THEME_TO_ARCHETYPE[theme]
+    except KeyError:
+        return "novel"
+def get_theme_tag(theme: str) -> AnomalyTag:
+    """Look up the AnomalyTag that THEME_TO_TAG assigns to ``theme``.
+
+    A theme without an entry yields AnomalyTag.OTHER.
+    """
+    if theme in THEME_TO_TAG:
+        return THEME_TO_TAG[theme]
+    return AnomalyTag.OTHER