fix: add PROVEN_ARCHETYPES constant back to theme_sampler.py for direct imports

Browse files

Files changed (1) hide show

alpha_factory/deterministic/theme_sampler.py +71 -78

alpha_factory/deterministic/theme_sampler.py CHANGED Viewed

@@ -13,89 +13,84 @@ from ..data.brain_fields import (
     TIER3_ANALYST_FIELDS, TIER2_NEWS_FIELDS, TIER3_OPTION_FIELDS,
     TIER3_SUPPLY_CHAIN_FIELDS, TIER3_SOCIAL_FIELDS, TIER2_MODEL16_FIELDS,
     BrainField, DatasetTier, pick_field, get_sign_multiplier,
 )
 # ═══════════════════════════════════════════════════════════════════════════
 # THEME DEFINITIONS — mapped to REAL BRAIN field IDs
-# Each theme contains actual implementable fields from the platform
 # ═══════════════════════════════════════════════════════════════════════════
 THEME_FIELDS: dict[str, list[str]] = {
-    # TIER 1 — model77 goldmine (density 24 α/field)
     "earnings_surprise_momentum": [
-        "standardized_unexpected_earnings_2",           # AC=0 GOLDMINE
-        "quarterly_earnings_surprise_stddev",           # AC=1
-        "quarterly_eps_surprise_change",                # AC=1
-        "six_month_eps_revision_fy2",                   # AC=1
-        "mdl77_ooearningsmomemtummodel_fc_fqsurstd",   # AC=1
     ],
     "earnings_quality_signaling": [
-        "mdl77_2valuemomemtummodel_earningsqualitymodule",  # AC=1
-        "mdl77_2valuemomemtummodel_managementsignalingmodule",  # AC=1
-        "mdl77_valueanalystmodelqva_mgtsignaling",      # AC=1
-        "mdl77_valueanalystmodelqva_yoychgdebt",        # AC=1
-        "mdl77_valueanalystmodelqva_chginv",            # AC=1
     ],
     "asset_growth_anomaly": [
-        "mdl77_ohistoricalgrowthfactor_pctchgqtrast",   # AC=1 (Cooper et al. 2008)
-        "three_year_change_gross_profit_margin_2",       # AC=1
-        "yearly_percentage_change_roe",                  # AC=1
     ],
     "forward_value_composite": [
-        "time_weighted_cash_flow_to_price",             # AC=0 GOLDMINE
-        "time_weighted_ebitda_to_enterprise_value_2",   # AC=1
-        "ttm_sales_to_enterprise_value",                # AC=1
-        "fundamental_growth_module_score",               # AC=1
     ],
     "liquidity_risk_premium": [
-        "mdl77_2liquidityriskfactor_milliq",            # AC=1 (Amihud illiquidity)
-        "mdl177_2_globaldevnorthamerica_v502_liqcoeff", # AC=0 GOLDMINE
     ],
-    # TIER 2 — model16 score derivatives + news
     "multi_factor_momentum": [
-        "multi_factor_static_score_derivative",         # AC=98
-        "relative_valuation_rank_derivative",           # AC=119
-        "growth_potential_rank_derivative",             # AC=152
-        "earnings_certainty_rank_derivative",           # AC=175
     ],
     "news_reaction_drift": [
-        "news_short_interest",                          # AC=535
-        "news_pct_5_min",                               # AC=353
-        "news_vol_stddev",                              # AC=902
     ],
-    # TIER 3 — analyst, options, supply chain, social
     "analyst_guidance_revision": [
-        "dividend_estimate_average",                    # AC=5
-        "max_ebitda_guidance",                          # AC=16
-        "cash_flow_operations_min_guidance",            # AC=17
-        "pretax_income_reported",                       # AC=15
     ],
     "options_sentiment_pcr": [
-        "pcr_vol_90",                                   # AC=184
-        "pcr_vol_20",                                   # AC=233
-        "forward_price_120",                            # AC=359
     ],
     "supply_chain_network": [
-        "pv13_ustomergraphrank_auth_rank",             # AC=595 (Cohen & Frazzini proxy)
-        "pv13_ustomergraphrank_page_rank",             # AC=921
-        "rel_ret_all",                                  # AC=2280
-        "rel_ret_comp",                                 # AC=3078
-        "pv13_custretsig_retsig",                      # AC=2718
     ],
     "social_contrarian": [
-        "snt_buzz_ret_fast_d1",                        # AC=56
-        "scl12_sentiment_fast_d1",                     # AC=134
     ],
     "geographic_exposure": [
-        "north_america_sales_exposure",                 # AC=0 GOLDMINE
-        "mdl177_2_globaldevnorthamerica_v502_chgalpha12m",  # AC=0 GOLDMINE
     ],
 }
-# Theme → expected archetype mapping for the expression compiler
 THEME_TO_ARCHETYPE: dict[str, str] = {
     "earnings_surprise_momentum": "pead_revisions",
     "earnings_quality_signaling": "value_quality_blend",
@@ -127,15 +122,29 @@ THEME_TO_TAG: dict[str, AnomalyTag] = {
     "geographic_exposure": AnomalyTag.OTHER,
 }
-# Theme → average AC (lower = higher EV)
 THEME_AVG_AC: dict[str, float] = {}
-for theme, field_ids in THEME_FIELDS.items():
-    acs = []
-    for fid in field_ids:
-        from ..data.brain_fields import FIELD_INDEX
-        if fid in FIELD_INDEX:
-            acs.append(FIELD_INDEX[fid].alpha_count)
-    THEME_AVG_AC[theme] = sum(acs) / len(acs) if acs else 999
 def compute_gap_scores(
@@ -143,17 +152,7 @@ def compute_gap_scores(
     existing_anomaly_tags: list[str],
     dead_themes: Optional[list[str]] = None,
 ) -> list[tuple[str, float]]:
-    """
-    Rank themes by how under-explored they are.
-    gap_score = log(field_count + 1)
-                - 2 * log(1 + alphas_in_theme)
-                + novelty_bonus (if AC < 2)
-                + goldmine_bonus (if any AC=0 field)
-                - dead_penalty
-    Higher score = bigger opportunity.
-    """
     theme_counts = Counter(existing_themes)
     anomaly_counts = Counter(existing_anomaly_tags)
     dead_set = set(dead_themes or [])
@@ -167,24 +166,21 @@ def compute_gap_scores(
         alpha_count = theme_counts.get(theme, 0)
         avg_ac = THEME_AVG_AC.get(theme, 100)
-        # Base gap score
         gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
-        # Goldmine bonus: themes containing AC=0 fields get massive boost
         has_goldmine = any(
-            FIELD_INDEX.get(fid, None) and FIELD_INDEX[fid].alpha_count == 0
             for fid in fields
         )
         if has_goldmine:
-            gap += 2.0  # Huge bonus for untouched fields
-        # Low-AC bonus: average AC of theme's fields
         if avg_ac <= 5:
             gap += 1.0
         elif avg_ac <= 50:
             gap += 0.5
-        # Anomaly diversity bonus
         tag = THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
         tag_count = anomaly_counts.get(tag.value, 0)
         if tag_count < 2:
@@ -202,7 +198,7 @@ def pick_theme(
     dead_themes: Optional[list[str]] = None,
     top_k: int = 3,
 ) -> str:
-    """Pick the best theme to explore next (highest gap score)."""
     scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
     top = scores[:top_k]
     if not top:
@@ -211,15 +207,12 @@ def pick_theme(
 def get_theme_fields(theme: str) -> list[str]:
-    """Get the BRAIN field IDs for a theme."""
     return THEME_FIELDS.get(theme, [])
 def get_theme_archetype(theme: str) -> str:
-    """Get the recommended archetype for a theme."""
     return THEME_TO_ARCHETYPE.get(theme, "novel")
 def get_theme_tag(theme: str) -> AnomalyTag:
-    """Get the anomaly tag for a theme."""
     return THEME_TO_TAG.get(theme, AnomalyTag.OTHER)

     TIER3_ANALYST_FIELDS, TIER2_NEWS_FIELDS, TIER3_OPTION_FIELDS,
     TIER3_SUPPLY_CHAIN_FIELDS, TIER3_SOCIAL_FIELDS, TIER2_MODEL16_FIELDS,
     BrainField, DatasetTier, pick_field, get_sign_multiplier,
+    FIELD_INDEX,
 )
 # ═══════════════════════════════════════════════════════════════════════════
 # THEME DEFINITIONS — mapped to REAL BRAIN field IDs
 # ═══════════════════════════════════════════════════════════════════════════
 THEME_FIELDS: dict[str, list[str]] = {
     "earnings_surprise_momentum": [
+        "standardized_unexpected_earnings_2",
+        "quarterly_earnings_surprise_stddev",
+        "quarterly_eps_surprise_change",
+        "six_month_eps_revision_fy2",
+        "mdl77_ooearningsmomemtummodel_fc_fqsurstd",
     ],
     "earnings_quality_signaling": [
+        "mdl77_2valuemomemtummodel_earningsqualitymodule",
+        "mdl77_2valuemomemtummodel_managementsignalingmodule",
+        "mdl77_valueanalystmodelqva_mgtsignaling",
+        "mdl77_valueanalystmodelqva_yoychgdebt",
+        "mdl77_valueanalystmodelqva_chginv",
     ],
     "asset_growth_anomaly": [
+        "mdl77_ohistoricalgrowthfactor_pctchgqtrast",
+        "three_year_change_gross_profit_margin_2",
+        "yearly_percentage_change_roe",
     ],
     "forward_value_composite": [
+        "time_weighted_cash_flow_to_price",
+        "time_weighted_ebitda_to_enterprise_value_2",
+        "ttm_sales_to_enterprise_value",
+        "fundamental_growth_module_score",
     ],
     "liquidity_risk_premium": [
+        "mdl77_2liquidityriskfactor_milliq",
+        "mdl177_2_globaldevnorthamerica_v502_liqcoeff",
     ],
     "multi_factor_momentum": [
+        "multi_factor_static_score_derivative",
+        "relative_valuation_rank_derivative",
+        "growth_potential_rank_derivative",
+        "earnings_certainty_rank_derivative",
     ],
     "news_reaction_drift": [
+        "news_short_interest",
+        "news_pct_5_min",
+        "news_vol_stddev",
     ],
     "analyst_guidance_revision": [
+        "dividend_estimate_average",
+        "max_ebitda_guidance",
+        "cash_flow_operations_min_guidance",
+        "pretax_income_reported",
     ],
     "options_sentiment_pcr": [
+        "pcr_vol_90",
+        "pcr_vol_20",
+        "forward_price_120",
     ],
     "supply_chain_network": [
+        "pv13_ustomergraphrank_auth_rank",
+        "pv13_ustomergraphrank_page_rank",
+        "rel_ret_all",
+        "rel_ret_comp",
+        "pv13_custretsig_retsig",
     ],
     "social_contrarian": [
+        "snt_buzz_ret_fast_d1",
+        "scl12_sentiment_fast_d1",
     ],
     "geographic_exposure": [
+        "north_america_sales_exposure",
+        "mdl177_2_globaldevnorthamerica_v502_chgalpha12m",
     ],
 }
+# Theme → archetype name; themes missing here fall back to "novel" in get_theme_archetype()
 THEME_TO_ARCHETYPE: dict[str, str] = {
     "earnings_surprise_momentum": "pead_revisions",
     "earnings_quality_signaling": "value_quality_blend",
     "geographic_exposure": AnomalyTag.OTHER,
 }
+# ═══════════════════════════════════════════════════════════════════════════
+# BACKWARD-COMPAT: PROVEN_ARCHETYPES (used by hypothesis_hunter.py)
+# ═══════════════════════════════════════════════════════════════════════════
+PROVEN_ARCHETYPES = [
+    "value_quality_blend",
+    "multi_horizon_mr",
+    "vol_scaled_shock",
+    "intraday_mr_decay",
+    "pead_revisions",
+    "fundamental_yield_composite",
+    "sue_drift",
+    "supply_chain_lead_lag",
+    "analyst_guidance_yield",
+    "pcr_contrarian",
+    "model_score_momentum",
+    "alpha15_hybrid",
+]
+# Theme → mean alpha_count (AC) over the theme's fields present in FIELD_INDEX; 999 when none are indexed
 THEME_AVG_AC: dict[str, float] = {}
+for _theme, _field_ids in THEME_FIELDS.items():
+    _acs = [FIELD_INDEX[fid].alpha_count for fid in _field_ids if fid in FIELD_INDEX]
+    THEME_AVG_AC[_theme] = sum(_acs) / len(_acs) if _acs else 999
 def compute_gap_scores(
     existing_anomaly_tags: list[str],
     dead_themes: Optional[list[str]] = None,
 ) -> list[tuple[str, float]]:
+    """Rank themes by opportunity (higher = bigger gap)."""
     theme_counts = Counter(existing_themes)
     anomaly_counts = Counter(existing_anomaly_tags)
     dead_set = set(dead_themes or [])
         alpha_count = theme_counts.get(theme, 0)
         avg_ac = THEME_AVG_AC.get(theme, 100)
         gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
+        # Goldmine bonus for AC=0 fields
         has_goldmine = any(
+            fid in FIELD_INDEX and FIELD_INDEX[fid].alpha_count == 0
             for fid in fields
         )
         if has_goldmine:
+            gap += 2.0
         if avg_ac <= 5:
             gap += 1.0
         elif avg_ac <= 50:
             gap += 0.5
         tag = THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
         tag_count = anomaly_counts.get(tag.value, 0)
         if tag_count < 2:
     dead_themes: Optional[list[str]] = None,
     top_k: int = 3,
 ) -> str:
+    """Pick the best theme to explore next."""
     scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
     top = scores[:top_k]
     if not top:
 def get_theme_fields(theme: str) -> list[str]:
     return THEME_FIELDS.get(theme, [])
 def get_theme_archetype(theme: str) -> str:
     return THEME_TO_ARCHETYPE.get(theme, "novel")
 def get_theme_tag(theme: str) -> AnomalyTag:
     return THEME_TO_TAG.get(theme, AnomalyTag.OTHER)