gaurv007 committed on
Commit
d120435
·
verified ·
1 Parent(s): d8511eb

fix: add PROVEN_ARCHETYPES constant back to theme_sampler.py for direct imports

Browse files
alpha_factory/deterministic/theme_sampler.py CHANGED
@@ -13,89 +13,84 @@ from ..data.brain_fields import (
13
  TIER3_ANALYST_FIELDS, TIER2_NEWS_FIELDS, TIER3_OPTION_FIELDS,
14
  TIER3_SUPPLY_CHAIN_FIELDS, TIER3_SOCIAL_FIELDS, TIER2_MODEL16_FIELDS,
15
  BrainField, DatasetTier, pick_field, get_sign_multiplier,
 
16
  )
17
 
18
 
19
  # ═══════════════════════════════════════════════════════════════════════════
20
  # THEME DEFINITIONS — mapped to REAL BRAIN field IDs
21
- # Each theme contains actual implementable fields from the platform
22
  # ═══════════════════════════════════════════════════════════════════════════
23
 
24
  THEME_FIELDS: dict[str, list[str]] = {
25
- # TIER 1 — model77 goldmine (density 24 α/field)
26
  "earnings_surprise_momentum": [
27
- "standardized_unexpected_earnings_2", # AC=0 GOLDMINE
28
- "quarterly_earnings_surprise_stddev", # AC=1
29
- "quarterly_eps_surprise_change", # AC=1
30
- "six_month_eps_revision_fy2", # AC=1
31
- "mdl77_ooearningsmomemtummodel_fc_fqsurstd", # AC=1
32
  ],
33
  "earnings_quality_signaling": [
34
- "mdl77_2valuemomemtummodel_earningsqualitymodule", # AC=1
35
- "mdl77_2valuemomemtummodel_managementsignalingmodule", # AC=1
36
- "mdl77_valueanalystmodelqva_mgtsignaling", # AC=1
37
- "mdl77_valueanalystmodelqva_yoychgdebt", # AC=1
38
- "mdl77_valueanalystmodelqva_chginv", # AC=1
39
  ],
40
  "asset_growth_anomaly": [
41
- "mdl77_ohistoricalgrowthfactor_pctchgqtrast", # AC=1 (Cooper et al. 2008)
42
- "three_year_change_gross_profit_margin_2", # AC=1
43
- "yearly_percentage_change_roe", # AC=1
44
  ],
45
  "forward_value_composite": [
46
- "time_weighted_cash_flow_to_price", # AC=0 GOLDMINE
47
- "time_weighted_ebitda_to_enterprise_value_2", # AC=1
48
- "ttm_sales_to_enterprise_value", # AC=1
49
- "fundamental_growth_module_score", # AC=1
50
  ],
51
  "liquidity_risk_premium": [
52
- "mdl77_2liquidityriskfactor_milliq", # AC=1 (Amihud illiquidity)
53
- "mdl177_2_globaldevnorthamerica_v502_liqcoeff", # AC=0 GOLDMINE
54
  ],
55
-
56
- # TIER 2 — model16 score derivatives + news
57
  "multi_factor_momentum": [
58
- "multi_factor_static_score_derivative", # AC=98
59
- "relative_valuation_rank_derivative", # AC=119
60
- "growth_potential_rank_derivative", # AC=152
61
- "earnings_certainty_rank_derivative", # AC=175
62
  ],
63
  "news_reaction_drift": [
64
- "news_short_interest", # AC=535
65
- "news_pct_5_min", # AC=353
66
- "news_vol_stddev", # AC=902
67
  ],
68
-
69
- # TIER 3 — analyst, options, supply chain, social
70
  "analyst_guidance_revision": [
71
- "dividend_estimate_average", # AC=5
72
- "max_ebitda_guidance", # AC=16
73
- "cash_flow_operations_min_guidance", # AC=17
74
- "pretax_income_reported", # AC=15
75
  ],
76
  "options_sentiment_pcr": [
77
- "pcr_vol_90", # AC=184
78
- "pcr_vol_20", # AC=233
79
- "forward_price_120", # AC=359
80
  ],
81
  "supply_chain_network": [
82
- "pv13_ustomergraphrank_auth_rank", # AC=595 (Cohen & Frazzini proxy)
83
- "pv13_ustomergraphrank_page_rank", # AC=921
84
- "rel_ret_all", # AC=2280
85
- "rel_ret_comp", # AC=3078
86
- "pv13_custretsig_retsig", # AC=2718
87
  ],
88
  "social_contrarian": [
89
- "snt_buzz_ret_fast_d1", # AC=56
90
- "scl12_sentiment_fast_d1", # AC=134
91
  ],
92
  "geographic_exposure": [
93
- "north_america_sales_exposure", # AC=0 GOLDMINE
94
- "mdl177_2_globaldevnorthamerica_v502_chgalpha12m", # AC=0 GOLDMINE
95
  ],
96
  }
97
 
98
- # Theme → expected archetype mapping for the expression compiler
99
  THEME_TO_ARCHETYPE: dict[str, str] = {
100
  "earnings_surprise_momentum": "pead_revisions",
101
  "earnings_quality_signaling": "value_quality_blend",
@@ -127,15 +122,29 @@ THEME_TO_TAG: dict[str, AnomalyTag] = {
127
  "geographic_exposure": AnomalyTag.OTHER,
128
  }
129
 
130
- # Theme → average AC (lower = higher EV)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  THEME_AVG_AC: dict[str, float] = {}
132
- for theme, field_ids in THEME_FIELDS.items():
133
- acs = []
134
- for fid in field_ids:
135
- from ..data.brain_fields import FIELD_INDEX
136
- if fid in FIELD_INDEX:
137
- acs.append(FIELD_INDEX[fid].alpha_count)
138
- THEME_AVG_AC[theme] = sum(acs) / len(acs) if acs else 999
139
 
140
 
141
  def compute_gap_scores(
@@ -143,17 +152,7 @@ def compute_gap_scores(
143
  existing_anomaly_tags: list[str],
144
  dead_themes: Optional[list[str]] = None,
145
  ) -> list[tuple[str, float]]:
146
- """
147
- Rank themes by how under-explored they are.
148
-
149
- gap_score = log(field_count + 1)
150
- - 2 * log(1 + alphas_in_theme)
151
- + novelty_bonus (if AC < 2)
152
- + goldmine_bonus (if any AC=0 field)
153
- - dead_penalty
154
-
155
- Higher score = bigger opportunity.
156
- """
157
  theme_counts = Counter(existing_themes)
158
  anomaly_counts = Counter(existing_anomaly_tags)
159
  dead_set = set(dead_themes or [])
@@ -167,24 +166,21 @@ def compute_gap_scores(
167
  alpha_count = theme_counts.get(theme, 0)
168
  avg_ac = THEME_AVG_AC.get(theme, 100)
169
 
170
- # Base gap score
171
  gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
172
 
173
- # Goldmine bonus: themes containing AC=0 fields get massive boost
174
  has_goldmine = any(
175
- FIELD_INDEX.get(fid, None) and FIELD_INDEX[fid].alpha_count == 0
176
  for fid in fields
177
  )
178
  if has_goldmine:
179
- gap += 2.0 # Huge bonus for untouched fields
180
 
181
- # Low-AC bonus: average AC of theme's fields
182
  if avg_ac <= 5:
183
  gap += 1.0
184
  elif avg_ac <= 50:
185
  gap += 0.5
186
 
187
- # Anomaly diversity bonus
188
  tag = THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
189
  tag_count = anomaly_counts.get(tag.value, 0)
190
  if tag_count < 2:
@@ -202,7 +198,7 @@ def pick_theme(
202
  dead_themes: Optional[list[str]] = None,
203
  top_k: int = 3,
204
  ) -> str:
205
- """Pick the best theme to explore next (highest gap score)."""
206
  scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
207
  top = scores[:top_k]
208
  if not top:
@@ -211,15 +207,12 @@ def pick_theme(
211
 
212
 
213
  def get_theme_fields(theme: str) -> list[str]:
214
- """Get the BRAIN field IDs for a theme."""
215
  return THEME_FIELDS.get(theme, [])
216
 
217
 
218
  def get_theme_archetype(theme: str) -> str:
219
- """Get the recommended archetype for a theme."""
220
  return THEME_TO_ARCHETYPE.get(theme, "novel")
221
 
222
 
223
  def get_theme_tag(theme: str) -> AnomalyTag:
224
- """Get the anomaly tag for a theme."""
225
  return THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
 
13
  TIER3_ANALYST_FIELDS, TIER2_NEWS_FIELDS, TIER3_OPTION_FIELDS,
14
  TIER3_SUPPLY_CHAIN_FIELDS, TIER3_SOCIAL_FIELDS, TIER2_MODEL16_FIELDS,
15
  BrainField, DatasetTier, pick_field, get_sign_multiplier,
16
+ FIELD_INDEX,
17
  )
18
 
19
 
20
  # ═══════════════════════════════════════════════════════════════════════════
21
  # THEME DEFINITIONS — mapped to REAL BRAIN field IDs
 
22
  # ═══════════════════════════════════════════════════════════════════════════
23
 
24
  THEME_FIELDS: dict[str, list[str]] = {
 
25
  "earnings_surprise_momentum": [
26
+ "standardized_unexpected_earnings_2",
27
+ "quarterly_earnings_surprise_stddev",
28
+ "quarterly_eps_surprise_change",
29
+ "six_month_eps_revision_fy2",
30
+ "mdl77_ooearningsmomemtummodel_fc_fqsurstd",
31
  ],
32
  "earnings_quality_signaling": [
33
+ "mdl77_2valuemomemtummodel_earningsqualitymodule",
34
+ "mdl77_2valuemomemtummodel_managementsignalingmodule",
35
+ "mdl77_valueanalystmodelqva_mgtsignaling",
36
+ "mdl77_valueanalystmodelqva_yoychgdebt",
37
+ "mdl77_valueanalystmodelqva_chginv",
38
  ],
39
  "asset_growth_anomaly": [
40
+ "mdl77_ohistoricalgrowthfactor_pctchgqtrast",
41
+ "three_year_change_gross_profit_margin_2",
42
+ "yearly_percentage_change_roe",
43
  ],
44
  "forward_value_composite": [
45
+ "time_weighted_cash_flow_to_price",
46
+ "time_weighted_ebitda_to_enterprise_value_2",
47
+ "ttm_sales_to_enterprise_value",
48
+ "fundamental_growth_module_score",
49
  ],
50
  "liquidity_risk_premium": [
51
+ "mdl77_2liquidityriskfactor_milliq",
52
+ "mdl177_2_globaldevnorthamerica_v502_liqcoeff",
53
  ],
 
 
54
  "multi_factor_momentum": [
55
+ "multi_factor_static_score_derivative",
56
+ "relative_valuation_rank_derivative",
57
+ "growth_potential_rank_derivative",
58
+ "earnings_certainty_rank_derivative",
59
  ],
60
  "news_reaction_drift": [
61
+ "news_short_interest",
62
+ "news_pct_5_min",
63
+ "news_vol_stddev",
64
  ],
 
 
65
  "analyst_guidance_revision": [
66
+ "dividend_estimate_average",
67
+ "max_ebitda_guidance",
68
+ "cash_flow_operations_min_guidance",
69
+ "pretax_income_reported",
70
  ],
71
  "options_sentiment_pcr": [
72
+ "pcr_vol_90",
73
+ "pcr_vol_20",
74
+ "forward_price_120",
75
  ],
76
  "supply_chain_network": [
77
+ "pv13_ustomergraphrank_auth_rank",
78
+ "pv13_ustomergraphrank_page_rank",
79
+ "rel_ret_all",
80
+ "rel_ret_comp",
81
+ "pv13_custretsig_retsig",
82
  ],
83
  "social_contrarian": [
84
+ "snt_buzz_ret_fast_d1",
85
+ "scl12_sentiment_fast_d1",
86
  ],
87
  "geographic_exposure": [
88
+ "north_america_sales_exposure",
89
+ "mdl177_2_globaldevnorthamerica_v502_chgalpha12m",
90
  ],
91
  }
92
 
93
+ # Theme → archetype
94
  THEME_TO_ARCHETYPE: dict[str, str] = {
95
  "earnings_surprise_momentum": "pead_revisions",
96
  "earnings_quality_signaling": "value_quality_blend",
 
122
  "geographic_exposure": AnomalyTag.OTHER,
123
  }
124
 
125
+ # ═══════════════════════════════════════════════════════════════════════════
126
+ # BACKWARD-COMPAT: PROVEN_ARCHETYPES (used by hypothesis_hunter.py)
127
+ # ═══════════════════════════════════════════════════════════════════════════
128
+ PROVEN_ARCHETYPES = [
129
+ "value_quality_blend",
130
+ "multi_horizon_mr",
131
+ "vol_scaled_shock",
132
+ "intraday_mr_decay",
133
+ "pead_revisions",
134
+ "fundamental_yield_composite",
135
+ "sue_drift",
136
+ "supply_chain_lead_lag",
137
+ "analyst_guidance_yield",
138
+ "pcr_contrarian",
139
+ "model_score_momentum",
140
+ "alpha15_hybrid",
141
+ ]
142
+
143
+ # Theme → average AC
144
  THEME_AVG_AC: dict[str, float] = {}
145
+ for _theme, _field_ids in THEME_FIELDS.items():
146
+ _acs = [FIELD_INDEX[fid].alpha_count for fid in _field_ids if fid in FIELD_INDEX]
147
+ THEME_AVG_AC[_theme] = sum(_acs) / len(_acs) if _acs else 999
 
 
 
 
148
 
149
 
150
  def compute_gap_scores(
 
152
  existing_anomaly_tags: list[str],
153
  dead_themes: Optional[list[str]] = None,
154
  ) -> list[tuple[str, float]]:
155
+ """Rank themes by opportunity (higher = bigger gap)."""
 
 
 
 
 
 
 
 
 
 
156
  theme_counts = Counter(existing_themes)
157
  anomaly_counts = Counter(existing_anomaly_tags)
158
  dead_set = set(dead_themes or [])
 
166
  alpha_count = theme_counts.get(theme, 0)
167
  avg_ac = THEME_AVG_AC.get(theme, 100)
168
 
 
169
  gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
170
 
171
+ # Goldmine bonus for AC=0 fields
172
  has_goldmine = any(
173
+ fid in FIELD_INDEX and FIELD_INDEX[fid].alpha_count == 0
174
  for fid in fields
175
  )
176
  if has_goldmine:
177
+ gap += 2.0
178
 
 
179
  if avg_ac <= 5:
180
  gap += 1.0
181
  elif avg_ac <= 50:
182
  gap += 0.5
183
 
 
184
  tag = THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
185
  tag_count = anomaly_counts.get(tag.value, 0)
186
  if tag_count < 2:
 
198
  dead_themes: Optional[list[str]] = None,
199
  top_k: int = 3,
200
  ) -> str:
201
+ """Pick the best theme to explore next."""
202
  scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
203
  top = scores[:top_k]
204
  if not top:
 
207
 
208
 
209
  def get_theme_fields(theme: str) -> list[str]:
 
210
  return THEME_FIELDS.get(theme, [])
211
 
212
 
213
  def get_theme_archetype(theme: str) -> str:
 
214
  return THEME_TO_ARCHETYPE.get(theme, "novel")
215
 
216
 
217
  def get_theme_tag(theme: str) -> AnomalyTag:
 
218
  return THEME_TO_TAG.get(theme, AnomalyTag.OTHER)