gaurv007 committed on
Commit
88e6e07
·
verified ·
1 Parent(s): 23dc829

feat: theme_sampler v2 — real BRAIN fields from model77/analyst4/news12/option9/pv13, data-driven gap scoring

Browse files
alpha_factory/deterministic/theme_sampler.py CHANGED
@@ -1,40 +1,141 @@
1
  """
2
- Theme Sampler — deterministic gap analysis.
3
- Picks under-explored themes from the factor store.
 
4
  """
5
  import math
 
6
  from collections import Counter
7
  from typing import Optional
8
  from ..schemas import AnomalyTag
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- # Theme definitions mapped to BRAIN field families
12
- THEME_FIELDS = {
13
- "value_quality": ["book_to_price", "earnings_yield", "roe", "roa", "debt_to_equity", "current_ratio"],
14
- "momentum": ["close", "returns", "volume", "ts_returns", "high", "low"],
15
- "reversal": ["close", "returns", "volume", "bid_ask_spread"],
16
- "volatility": ["volatility", "ivol", "beta", "hv", "atr"],
17
- "analyst": ["analyst_rating", "estimate_revision", "target_price", "recommendation"],
18
- "sentiment_social": ["sentiment", "social_volume", "social_score", "news_sentiment"],
19
- "option_surface": ["iv30", "iv60", "iv90", "iv180", "pcr", "skew", "term_structure"],
20
- "earnings_event": ["earnings_surprise", "post_earnings_drift", "guidance"],
21
- "liquidity_micro": ["bid_ask_spread", "volume", "turnover", "amihud_illiquidity"],
22
- "growth": ["revenue_growth", "earnings_growth", "asset_growth", "sales_growth"],
23
- "intraday": ["open", "high", "low", "close", "vwap", "intraday_range"],
24
- "fundamental_yield": ["dividend_yield", "buyback_yield", "shareholder_yield", "fcf_yield"],
25
  }
26
 
27
- # Known archetypes from the existing 18-alpha library
28
- PROVEN_ARCHETYPES = [
29
- "value_quality_blend",
30
- "intraday_mr_decay",
31
- "vol_scaled_shock",
32
- "pead_revisions",
33
- "skew_term",
34
- "social_momentum",
35
- "multi_horizon_mr",
36
- "fundamental_yield_composite",
37
- ]
38
 
39
 
40
  def compute_gap_scores(
@@ -44,9 +145,13 @@ def compute_gap_scores(
44
  ) -> list[tuple[str, float]]:
45
  """
46
  Rank themes by how under-explored they are.
47
-
48
- gap_score = log(field_count) - 2 * log(1 + alphas_in_theme) - dead_penalty
49
-
 
 
 
 
50
  Higher score = bigger opportunity.
51
  """
52
  theme_counts = Counter(existing_themes)
@@ -56,22 +161,37 @@ def compute_gap_scores(
56
  scores = []
57
  for theme, fields in THEME_FIELDS.items():
58
  if theme in dead_set:
59
- continue # Skip dead themes (§11.5)
60
 
61
  field_count = len(fields)
62
  alpha_count = theme_counts.get(theme, 0)
 
63
 
 
64
  gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
65
 
66
- # Bonus if the anomaly tag is under-represented
67
- tag = _theme_to_tag(theme)
68
- tag_count = anomaly_counts.get(tag, 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  if tag_count < 2:
70
- gap += 0.5 # novelty bonus
71
 
72
  scores.append((theme, gap))
73
 
74
- # Sort descending
75
  scores.sort(key=lambda x: -x[1])
76
  return scores
77
 
@@ -83,29 +203,23 @@ def pick_theme(
83
  top_k: int = 3,
84
  ) -> str:
85
  """Pick the best theme to explore next (highest gap score)."""
86
- import random
87
  scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
88
- # Sample from top-k for diversity
89
  top = scores[:top_k]
90
  if not top:
91
  return random.choice(list(THEME_FIELDS.keys()))
92
  return random.choice(top)[0]
93
 
94
 
95
- def _theme_to_tag(theme: str) -> str:
96
- """Map theme name to AnomalyTag value."""
97
- mapping = {
98
- "value_quality": "value",
99
- "momentum": "momentum",
100
- "reversal": "reversal",
101
- "volatility": "low_vol",
102
- "analyst": "analyst",
103
- "sentiment_social": "sentiment",
104
- "option_surface": "option_surface",
105
- "earnings_event": "pead",
106
- "liquidity_micro": "liquidity",
107
- "growth": "fundamental",
108
- "intraday": "technical",
109
- "fundamental_yield": "fundamental",
110
- }
111
- return mapping.get(theme, "other")
 
1
  """
2
+ Theme Sampler v2 — Data-driven gap analysis using REAL BRAIN fields.
3
+ Picks under-explored themes from the canonical field registry.
4
+ Now uses actual field IDs, AC counts, and dataset tiers.
5
  """
6
  import math
7
+ import random
8
  from collections import Counter
9
  from typing import Optional
10
  from ..schemas import AnomalyTag
11
+ from ..data.brain_fields import (
12
+ ALL_FIELDS, GOLDMINE_FIELDS, TIER1_MODEL77_FIELDS,
13
+ TIER3_ANALYST_FIELDS, TIER2_NEWS_FIELDS, TIER3_OPTION_FIELDS,
14
+ TIER3_SUPPLY_CHAIN_FIELDS, TIER3_SOCIAL_FIELDS, TIER2_MODEL16_FIELDS,
15
+ BrainField, DatasetTier, pick_field, get_sign_multiplier,
16
+ )
17
+
18
+
19
+ # ═══════════════════════════════════════════════════════════════════════════
20
+ # THEME DEFINITIONS — mapped to REAL BRAIN field IDs
21
+ # Each theme contains actual implementable fields from the platform
22
+ # ═══════════════════════════════════════════════════════════════════════════
23
+
24
# Registry of themes: theme name -> list of BRAIN field IDs.
# The trailing "AC=" notes are the alpha counts recorded by the author for
# each field; "GOLDMINE" marks the fields recorded with AC=0.
THEME_FIELDS: dict[str, list[str]] = {
    # ----- TIER 1: model77 goldmine (density 24 α/field) -----
    "earnings_surprise_momentum": [
        "standardized_unexpected_earnings_2",  # AC=0 GOLDMINE
        "quarterly_earnings_surprise_stddev",  # AC=1
        "quarterly_eps_surprise_change",  # AC=1
        "six_month_eps_revision_fy2",  # AC=1
        "mdl77_ooearningsmomemtummodel_fc_fqsurstd",  # AC=1
    ],
    "earnings_quality_signaling": [
        "mdl77_2valuemomemtummodel_earningsqualitymodule",  # AC=1
        "mdl77_2valuemomemtummodel_managementsignalingmodule",  # AC=1
        "mdl77_valueanalystmodelqva_mgtsignaling",  # AC=1
        "mdl77_valueanalystmodelqva_yoychgdebt",  # AC=1
        "mdl77_valueanalystmodelqva_chginv",  # AC=1
    ],
    "asset_growth_anomaly": [
        "mdl77_ohistoricalgrowthfactor_pctchgqtrast",  # AC=1 (Cooper et al. 2008)
        "three_year_change_gross_profit_margin_2",  # AC=1
        "yearly_percentage_change_roe",  # AC=1
    ],
    "forward_value_composite": [
        "time_weighted_cash_flow_to_price",  # AC=0 GOLDMINE
        "time_weighted_ebitda_to_enterprise_value_2",  # AC=1
        "ttm_sales_to_enterprise_value",  # AC=1
        "fundamental_growth_module_score",  # AC=1
    ],
    "liquidity_risk_premium": [
        "mdl77_2liquidityriskfactor_milliq",  # AC=1 (Amihud illiquidity)
        "mdl177_2_globaldevnorthamerica_v502_liqcoeff",  # AC=0 GOLDMINE
    ],

    # ----- TIER 2: model16 score derivatives + news -----
    "multi_factor_momentum": [
        "multi_factor_static_score_derivative",  # AC=98
        "relative_valuation_rank_derivative",  # AC=119
        "growth_potential_rank_derivative",  # AC=152
        "earnings_certainty_rank_derivative",  # AC=175
    ],
    "news_reaction_drift": [
        "news_short_interest",  # AC=535
        "news_pct_5_min",  # AC=353
        "news_vol_stddev",  # AC=902
    ],

    # ----- TIER 3: analyst, options, supply chain, social -----
    "analyst_guidance_revision": [
        "dividend_estimate_average",  # AC=5
        "max_ebitda_guidance",  # AC=16
        "cash_flow_operations_min_guidance",  # AC=17
        "pretax_income_reported",  # AC=15
    ],
    "options_sentiment_pcr": [
        "pcr_vol_90",  # AC=184
        "pcr_vol_20",  # AC=233
        "forward_price_120",  # AC=359
    ],
    "supply_chain_network": [
        "pv13_ustomergraphrank_auth_rank",  # AC=595 (Cohen & Frazzini proxy)
        "pv13_ustomergraphrank_page_rank",  # AC=921
        "rel_ret_all",  # AC=2280
        "rel_ret_comp",  # AC=3078
        "pv13_custretsig_retsig",  # AC=2718
    ],
    "social_contrarian": [
        "snt_buzz_ret_fast_d1",  # AC=56
        "scl12_sentiment_fast_d1",  # AC=134
    ],
    "geographic_exposure": [
        "north_america_sales_exposure",  # AC=0 GOLDMINE
        "mdl177_2_globaldevnorthamerica_v502_chgalpha12m",  # AC=0 GOLDMINE
    ],
}
97
 
98
# Archetype name the expression compiler should expect for each theme.
# Keys mirror the theme names used in THEME_FIELDS.
THEME_TO_ARCHETYPE: dict[str, str] = {
    "earnings_surprise_momentum": "pead_revisions",
    "earnings_quality_signaling": "value_quality_blend",
    "asset_growth_anomaly":       "value_quality_blend",
    "forward_value_composite":    "fundamental_yield_composite",
    "liquidity_risk_premium":     "vol_scaled_shock",
    "multi_factor_momentum":      "multi_horizon_mr",
    "news_reaction_drift":        "intraday_mr_decay",
    "analyst_guidance_revision":  "pead_revisions",
    "options_sentiment_pcr":      "vol_scaled_shock",
    "supply_chain_network":       "multi_horizon_mr",
    "social_contrarian":          "intraday_mr_decay",
    "geographic_exposure":        "value_quality_blend",
}
113
 
114
# Theme → anomaly tag. Themes absent from this table are treated as
# AnomalyTag.OTHER by the lookups that read it.
THEME_TO_TAG: dict[str, AnomalyTag] = {
    "earnings_surprise_momentum": AnomalyTag.PEAD,
    "earnings_quality_signaling": AnomalyTag.QUALITY,
    "asset_growth_anomaly":       AnomalyTag.FUNDAMENTAL,
    "forward_value_composite":    AnomalyTag.VALUE,
    "liquidity_risk_premium":     AnomalyTag.LIQUIDITY,
    "multi_factor_momentum":      AnomalyTag.MOMENTUM,
    "news_reaction_drift":        AnomalyTag.EVENT,
    "analyst_guidance_revision":  AnomalyTag.ANALYST,
    "options_sentiment_pcr":      AnomalyTag.OPTION_SURFACE,
    "supply_chain_network":       AnomalyTag.TECHNICAL,
    "social_contrarian":          AnomalyTag.SOCIAL,
    "geographic_exposure":        AnomalyTag.OTHER,
}
129
 
130
# Theme → average alpha count (AC) of its fields (lower = higher EV).
#
# FIELD_INDEX is imported once, at module scope. compute_gap_scores reads it
# as a module global, so its binding must not depend on a loop body having
# executed, and there is no reason to re-run the import per field.
from ..data.brain_fields import FIELD_INDEX

# Average recorded for a theme when none of its field IDs exist in FIELD_INDEX.
_UNKNOWN_THEME_AVG_AC = 999


def _average_alpha_count(field_ids: list[str]) -> float:
    """Return the mean ``alpha_count`` over the fields found in ``FIELD_INDEX``.

    Args:
        field_ids: BRAIN field IDs belonging to one theme.

    Returns:
        The arithmetic mean of ``FIELD_INDEX[fid].alpha_count`` for every ID
        present in ``FIELD_INDEX``. IDs that are not in the index are ignored.
        If no ID is present, ``_UNKNOWN_THEME_AVG_AC`` is returned.
    """
    counts = [FIELD_INDEX[fid].alpha_count for fid in field_ids if fid in FIELD_INDEX]
    if not counts:
        return _UNKNOWN_THEME_AVG_AC
    return sum(counts) / len(counts)


# Built with a comprehension so no scratch loop variables are left behind in
# the module namespace.
THEME_AVG_AC: dict[str, float] = {
    theme_name: _average_alpha_count(ids) for theme_name, ids in THEME_FIELDS.items()
}
 
 
139
 
140
 
141
  def compute_gap_scores(
 
145
  ) -> list[tuple[str, float]]:
146
  """
147
  Rank themes by how under-explored they are.
148
+
149
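+ gap_score = log(field_count + 1) - 2 * log(1 + alphas_in_theme)
150
+ + goldmine_bonus (2.0 if any field in the theme has AC = 0)
151
+ + low_ac_bonus (1.0 if the theme's average AC <= 5, else 0.5 if <= 50)
152
+ + diversity_bonus (0.5 if the theme's anomaly tag is counted fewer than 2 times)
153
+ Dead themes are skipped entirely rather than penalized.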
154
+
155
  Higher score = bigger opportunity.
156
  """
157
  theme_counts = Counter(existing_themes)
 
161
  scores = []
162
  for theme, fields in THEME_FIELDS.items():
163
  if theme in dead_set:
164
+ continue
165
 
166
  field_count = len(fields)
167
  alpha_count = theme_counts.get(theme, 0)
168
+ avg_ac = THEME_AVG_AC.get(theme, 100)
169
 
170
+ # Base gap score
171
  gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
172
 
173
+ # Goldmine bonus: themes containing AC=0 fields get massive boost
174
+ has_goldmine = any(
175
+ FIELD_INDEX.get(fid, None) and FIELD_INDEX[fid].alpha_count == 0
176
+ for fid in fields
177
+ )
178
+ if has_goldmine:
179
+ gap += 2.0 # Huge bonus for untouched fields
180
+
181
+ # Low-AC bonus: average AC of theme's fields
182
+ if avg_ac <= 5:
183
+ gap += 1.0
184
+ elif avg_ac <= 50:
185
+ gap += 0.5
186
+
187
+ # Anomaly diversity bonus
188
+ tag = THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
189
+ tag_count = anomaly_counts.get(tag.value, 0)
190
  if tag_count < 2:
191
+ gap += 0.5
192
 
193
  scores.append((theme, gap))
194
 
 
195
  scores.sort(key=lambda x: -x[1])
196
  return scores
197
 
 
203
  top_k: int = 3,
204
  ) -> str:
205
  """Pick the best theme to explore next (highest gap score)."""
 
206
  scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
 
207
  top = scores[:top_k]
208
  if not top:
209
  return random.choice(list(THEME_FIELDS.keys()))
210
  return random.choice(top)[0]
211
 
212
 
213
def get_theme_fields(theme: str) -> list[str]:
    """Return the BRAIN field IDs registered for ``theme``.

    The lookup is made against ``THEME_FIELDS``. For a registered theme the
    stored list itself is returned (not a copy); for an unregistered theme a
    new empty list is returned.
    """
    if theme in THEME_FIELDS:
        return THEME_FIELDS[theme]
    return []
216
+
217
+
218
def get_theme_archetype(theme: str) -> str:
    """Return the archetype recommended for ``theme``.

    Themes that have no entry in ``THEME_TO_ARCHETYPE`` map to ``"novel"``.
    """
    try:
        return THEME_TO_ARCHETYPE[theme]
    except KeyError:
        return "novel"
221
+
222
+
223
def get_theme_tag(theme: str) -> AnomalyTag:
    """Return the anomaly tag associated with ``theme``.

    Themes that have no entry in ``THEME_TO_TAG`` map to ``AnomalyTag.OTHER``.
    """
    if theme not in THEME_TO_TAG:
        return AnomalyTag.OTHER
    return THEME_TO_TAG[theme]