fix: add PROVEN_ARCHETYPES constant back to theme_sampler.py for direct imports
Browse files
alpha_factory/deterministic/theme_sampler.py
CHANGED
|
@@ -13,89 +13,84 @@ from ..data.brain_fields import (
|
|
| 13 |
TIER3_ANALYST_FIELDS, TIER2_NEWS_FIELDS, TIER3_OPTION_FIELDS,
|
| 14 |
TIER3_SUPPLY_CHAIN_FIELDS, TIER3_SOCIAL_FIELDS, TIER2_MODEL16_FIELDS,
|
| 15 |
BrainField, DatasetTier, pick_field, get_sign_multiplier,
|
|
|
|
| 16 |
)
|
| 17 |
|
| 18 |
|
| 19 |
# ───────────────────────────────────────────────────────────────────────────
|
| 20 |
# THEME DEFINITIONS — mapped to REAL BRAIN field IDs
|
| 21 |
-
# Each theme contains actual implementable fields from the platform
|
| 22 |
# ───────────────────────────────────────────────────────────────────────────
|
| 23 |
|
| 24 |
THEME_FIELDS: dict[str, list[str]] = {
|
| 25 |
-
# TIER 1 — model77 goldmine (density 24 α/field)
|
| 26 |
"earnings_surprise_momentum": [
|
| 27 |
-
"standardized_unexpected_earnings_2",
|
| 28 |
-
"quarterly_earnings_surprise_stddev",
|
| 29 |
-
"quarterly_eps_surprise_change",
|
| 30 |
-
"six_month_eps_revision_fy2",
|
| 31 |
-
"mdl77_ooearningsmomemtummodel_fc_fqsurstd",
|
| 32 |
],
|
| 33 |
"earnings_quality_signaling": [
|
| 34 |
-
"mdl77_2valuemomemtummodel_earningsqualitymodule",
|
| 35 |
-
"mdl77_2valuemomemtummodel_managementsignalingmodule",
|
| 36 |
-
"mdl77_valueanalystmodelqva_mgtsignaling",
|
| 37 |
-
"mdl77_valueanalystmodelqva_yoychgdebt",
|
| 38 |
-
"mdl77_valueanalystmodelqva_chginv",
|
| 39 |
],
|
| 40 |
"asset_growth_anomaly": [
|
| 41 |
-
"mdl77_ohistoricalgrowthfactor_pctchgqtrast",
|
| 42 |
-
"three_year_change_gross_profit_margin_2",
|
| 43 |
-
"yearly_percentage_change_roe",
|
| 44 |
],
|
| 45 |
"forward_value_composite": [
|
| 46 |
-
"time_weighted_cash_flow_to_price",
|
| 47 |
-
"time_weighted_ebitda_to_enterprise_value_2",
|
| 48 |
-
"ttm_sales_to_enterprise_value",
|
| 49 |
-
"fundamental_growth_module_score",
|
| 50 |
],
|
| 51 |
"liquidity_risk_premium": [
|
| 52 |
-
"mdl77_2liquidityriskfactor_milliq",
|
| 53 |
-
"mdl177_2_globaldevnorthamerica_v502_liqcoeff",
|
| 54 |
],
|
| 55 |
-
|
| 56 |
-
# TIER 2 — model16 score derivatives + news
|
| 57 |
"multi_factor_momentum": [
|
| 58 |
-
"multi_factor_static_score_derivative",
|
| 59 |
-
"relative_valuation_rank_derivative",
|
| 60 |
-
"growth_potential_rank_derivative",
|
| 61 |
-
"earnings_certainty_rank_derivative",
|
| 62 |
],
|
| 63 |
"news_reaction_drift": [
|
| 64 |
-
"news_short_interest",
|
| 65 |
-
"news_pct_5_min",
|
| 66 |
-
"news_vol_stddev",
|
| 67 |
],
|
| 68 |
-
|
| 69 |
-
# TIER 3 — analyst, options, supply chain, social
|
| 70 |
"analyst_guidance_revision": [
|
| 71 |
-
"dividend_estimate_average",
|
| 72 |
-
"max_ebitda_guidance",
|
| 73 |
-
"cash_flow_operations_min_guidance",
|
| 74 |
-
"pretax_income_reported",
|
| 75 |
],
|
| 76 |
"options_sentiment_pcr": [
|
| 77 |
-
"pcr_vol_90",
|
| 78 |
-
"pcr_vol_20",
|
| 79 |
-
"forward_price_120",
|
| 80 |
],
|
| 81 |
"supply_chain_network": [
|
| 82 |
-
"pv13_ustomergraphrank_auth_rank",
|
| 83 |
-
"pv13_ustomergraphrank_page_rank",
|
| 84 |
-
"rel_ret_all",
|
| 85 |
-
"rel_ret_comp",
|
| 86 |
-
"pv13_custretsig_retsig",
|
| 87 |
],
|
| 88 |
"social_contrarian": [
|
| 89 |
-
"snt_buzz_ret_fast_d1",
|
| 90 |
-
"scl12_sentiment_fast_d1",
|
| 91 |
],
|
| 92 |
"geographic_exposure": [
|
| 93 |
-
"north_america_sales_exposure",
|
| 94 |
-
"mdl177_2_globaldevnorthamerica_v502_chgalpha12m",
|
| 95 |
],
|
| 96 |
}
|
| 97 |
|
| 98 |
-
# Theme →
|
| 99 |
THEME_TO_ARCHETYPE: dict[str, str] = {
|
| 100 |
"earnings_surprise_momentum": "pead_revisions",
|
| 101 |
"earnings_quality_signaling": "value_quality_blend",
|
|
@@ -127,15 +122,29 @@ THEME_TO_TAG: dict[str, AnomalyTag] = {
|
|
| 127 |
"geographic_exposure": AnomalyTag.OTHER,
|
| 128 |
}
|
| 129 |
|
| 130 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
THEME_AVG_AC: dict[str, float] = {}
|
| 132 |
-
for
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
from ..data.brain_fields import FIELD_INDEX
|
| 136 |
-
if fid in FIELD_INDEX:
|
| 137 |
-
acs.append(FIELD_INDEX[fid].alpha_count)
|
| 138 |
-
THEME_AVG_AC[theme] = sum(acs) / len(acs) if acs else 999
|
| 139 |
|
| 140 |
|
| 141 |
def compute_gap_scores(
|
|
@@ -143,17 +152,7 @@ def compute_gap_scores(
|
|
| 143 |
existing_anomaly_tags: list[str],
|
| 144 |
dead_themes: Optional[list[str]] = None,
|
| 145 |
) -> list[tuple[str, float]]:
|
| 146 |
-
"""
|
| 147 |
-
Rank themes by how under-explored they are.
|
| 148 |
-
|
| 149 |
-
gap_score = log(field_count + 1)
|
| 150 |
-
- 2 * log(1 + alphas_in_theme)
|
| 151 |
-
+ novelty_bonus (if AC < 2)
|
| 152 |
-
+ goldmine_bonus (if any AC=0 field)
|
| 153 |
-
- dead_penalty
|
| 154 |
-
|
| 155 |
-
Higher score = bigger opportunity.
|
| 156 |
-
"""
|
| 157 |
theme_counts = Counter(existing_themes)
|
| 158 |
anomaly_counts = Counter(existing_anomaly_tags)
|
| 159 |
dead_set = set(dead_themes or [])
|
|
@@ -167,24 +166,21 @@ def compute_gap_scores(
|
|
| 167 |
alpha_count = theme_counts.get(theme, 0)
|
| 168 |
avg_ac = THEME_AVG_AC.get(theme, 100)
|
| 169 |
|
| 170 |
-
# Base gap score
|
| 171 |
gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
|
| 172 |
|
| 173 |
-
# Goldmine bonus
|
| 174 |
has_goldmine = any(
|
| 175 |
-
|
| 176 |
for fid in fields
|
| 177 |
)
|
| 178 |
if has_goldmine:
|
| 179 |
-
gap += 2.0
|
| 180 |
|
| 181 |
-
# Low-AC bonus: average AC of theme's fields
|
| 182 |
if avg_ac <= 5:
|
| 183 |
gap += 1.0
|
| 184 |
elif avg_ac <= 50:
|
| 185 |
gap += 0.5
|
| 186 |
|
| 187 |
-
# Anomaly diversity bonus
|
| 188 |
tag = THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
|
| 189 |
tag_count = anomaly_counts.get(tag.value, 0)
|
| 190 |
if tag_count < 2:
|
|
@@ -202,7 +198,7 @@ def pick_theme(
|
|
| 202 |
dead_themes: Optional[list[str]] = None,
|
| 203 |
top_k: int = 3,
|
| 204 |
) -> str:
|
| 205 |
-
"""Pick the best theme to explore next
|
| 206 |
scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
|
| 207 |
top = scores[:top_k]
|
| 208 |
if not top:
|
|
@@ -211,15 +207,12 @@ def pick_theme(
|
|
| 211 |
|
| 212 |
|
| 213 |
def get_theme_fields(theme: str) -> list[str]:
|
| 214 |
-
"""Get the BRAIN field IDs for a theme."""
|
| 215 |
return THEME_FIELDS.get(theme, [])
|
| 216 |
|
| 217 |
|
| 218 |
def get_theme_archetype(theme: str) -> str:
|
| 219 |
-
"""Get the recommended archetype for a theme."""
|
| 220 |
return THEME_TO_ARCHETYPE.get(theme, "novel")
|
| 221 |
|
| 222 |
|
| 223 |
def get_theme_tag(theme: str) -> AnomalyTag:
|
| 224 |
-
"""Get the anomaly tag for a theme."""
|
| 225 |
return THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
|
|
|
|
| 13 |
TIER3_ANALYST_FIELDS, TIER2_NEWS_FIELDS, TIER3_OPTION_FIELDS,
|
| 14 |
TIER3_SUPPLY_CHAIN_FIELDS, TIER3_SOCIAL_FIELDS, TIER2_MODEL16_FIELDS,
|
| 15 |
BrainField, DatasetTier, pick_field, get_sign_multiplier,
|
| 16 |
+
FIELD_INDEX,
|
| 17 |
)
|
| 18 |
|
| 19 |
|
| 20 |
# ───────────────────────────────────────────────────────────────────────────
|
| 21 |
# THEME DEFINITIONS — mapped to REAL BRAIN field IDs
|
|
|
|
| 22 |
# ───────────────────────────────────────────────────────────────────────────
|
| 23 |
|
| 24 |
THEME_FIELDS: dict[str, list[str]] = {
|
|
|
|
| 25 |
"earnings_surprise_momentum": [
|
| 26 |
+
"standardized_unexpected_earnings_2",
|
| 27 |
+
"quarterly_earnings_surprise_stddev",
|
| 28 |
+
"quarterly_eps_surprise_change",
|
| 29 |
+
"six_month_eps_revision_fy2",
|
| 30 |
+
"mdl77_ooearningsmomemtummodel_fc_fqsurstd",
|
| 31 |
],
|
| 32 |
"earnings_quality_signaling": [
|
| 33 |
+
"mdl77_2valuemomemtummodel_earningsqualitymodule",
|
| 34 |
+
"mdl77_2valuemomemtummodel_managementsignalingmodule",
|
| 35 |
+
"mdl77_valueanalystmodelqva_mgtsignaling",
|
| 36 |
+
"mdl77_valueanalystmodelqva_yoychgdebt",
|
| 37 |
+
"mdl77_valueanalystmodelqva_chginv",
|
| 38 |
],
|
| 39 |
"asset_growth_anomaly": [
|
| 40 |
+
"mdl77_ohistoricalgrowthfactor_pctchgqtrast",
|
| 41 |
+
"three_year_change_gross_profit_margin_2",
|
| 42 |
+
"yearly_percentage_change_roe",
|
| 43 |
],
|
| 44 |
"forward_value_composite": [
|
| 45 |
+
"time_weighted_cash_flow_to_price",
|
| 46 |
+
"time_weighted_ebitda_to_enterprise_value_2",
|
| 47 |
+
"ttm_sales_to_enterprise_value",
|
| 48 |
+
"fundamental_growth_module_score",
|
| 49 |
],
|
| 50 |
"liquidity_risk_premium": [
|
| 51 |
+
"mdl77_2liquidityriskfactor_milliq",
|
| 52 |
+
"mdl177_2_globaldevnorthamerica_v502_liqcoeff",
|
| 53 |
],
|
|
|
|
|
|
|
| 54 |
"multi_factor_momentum": [
|
| 55 |
+
"multi_factor_static_score_derivative",
|
| 56 |
+
"relative_valuation_rank_derivative",
|
| 57 |
+
"growth_potential_rank_derivative",
|
| 58 |
+
"earnings_certainty_rank_derivative",
|
| 59 |
],
|
| 60 |
"news_reaction_drift": [
|
| 61 |
+
"news_short_interest",
|
| 62 |
+
"news_pct_5_min",
|
| 63 |
+
"news_vol_stddev",
|
| 64 |
],
|
|
|
|
|
|
|
| 65 |
"analyst_guidance_revision": [
|
| 66 |
+
"dividend_estimate_average",
|
| 67 |
+
"max_ebitda_guidance",
|
| 68 |
+
"cash_flow_operations_min_guidance",
|
| 69 |
+
"pretax_income_reported",
|
| 70 |
],
|
| 71 |
"options_sentiment_pcr": [
|
| 72 |
+
"pcr_vol_90",
|
| 73 |
+
"pcr_vol_20",
|
| 74 |
+
"forward_price_120",
|
| 75 |
],
|
| 76 |
"supply_chain_network": [
|
| 77 |
+
"pv13_ustomergraphrank_auth_rank",
|
| 78 |
+
"pv13_ustomergraphrank_page_rank",
|
| 79 |
+
"rel_ret_all",
|
| 80 |
+
"rel_ret_comp",
|
| 81 |
+
"pv13_custretsig_retsig",
|
| 82 |
],
|
| 83 |
"social_contrarian": [
|
| 84 |
+
"snt_buzz_ret_fast_d1",
|
| 85 |
+
"scl12_sentiment_fast_d1",
|
| 86 |
],
|
| 87 |
"geographic_exposure": [
|
| 88 |
+
"north_america_sales_exposure",
|
| 89 |
+
"mdl177_2_globaldevnorthamerica_v502_chgalpha12m",
|
| 90 |
],
|
| 91 |
}
|
| 92 |
|
| 93 |
+
# Theme → archetype
|
| 94 |
THEME_TO_ARCHETYPE: dict[str, str] = {
|
| 95 |
"earnings_surprise_momentum": "pead_revisions",
|
| 96 |
"earnings_quality_signaling": "value_quality_blend",
|
|
|
|
| 122 |
"geographic_exposure": AnomalyTag.OTHER,
|
| 123 |
}
|
| 124 |
|
| 125 |
+
# ───────────────────────────────────────────────────────────────────────────
|
| 126 |
+
# BACKWARD-COMPAT: PROVEN_ARCHETYPES (used by hypothesis_hunter.py)
|
| 127 |
+
# ───────────────────────────────────────────────────────────────────────────
|
| 128 |
+
PROVEN_ARCHETYPES = [
|
| 129 |
+
"value_quality_blend",
|
| 130 |
+
"multi_horizon_mr",
|
| 131 |
+
"vol_scaled_shock",
|
| 132 |
+
"intraday_mr_decay",
|
| 133 |
+
"pead_revisions",
|
| 134 |
+
"fundamental_yield_composite",
|
| 135 |
+
"sue_drift",
|
| 136 |
+
"supply_chain_lead_lag",
|
| 137 |
+
"analyst_guidance_yield",
|
| 138 |
+
"pcr_contrarian",
|
| 139 |
+
"model_score_momentum",
|
| 140 |
+
"alpha15_hybrid",
|
| 141 |
+
]
|
| 142 |
+
|
| 143 |
+
# Theme → average AC
|
| 144 |
THEME_AVG_AC: dict[str, float] = {}
|
| 145 |
+
for _theme, _field_ids in THEME_FIELDS.items():
|
| 146 |
+
_acs = [FIELD_INDEX[fid].alpha_count for fid in _field_ids if fid in FIELD_INDEX]
|
| 147 |
+
THEME_AVG_AC[_theme] = sum(_acs) / len(_acs) if _acs else 999
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
|
| 150 |
def compute_gap_scores(
|
|
|
|
| 152 |
existing_anomaly_tags: list[str],
|
| 153 |
dead_themes: Optional[list[str]] = None,
|
| 154 |
) -> list[tuple[str, float]]:
|
| 155 |
+
"""Rank themes by opportunity (higher = bigger gap)."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
theme_counts = Counter(existing_themes)
|
| 157 |
anomaly_counts = Counter(existing_anomaly_tags)
|
| 158 |
dead_set = set(dead_themes or [])
|
|
|
|
| 166 |
alpha_count = theme_counts.get(theme, 0)
|
| 167 |
avg_ac = THEME_AVG_AC.get(theme, 100)
|
| 168 |
|
|
|
|
| 169 |
gap = math.log(field_count + 1) - 2 * math.log(1 + alpha_count)
|
| 170 |
|
| 171 |
+
# Goldmine bonus for AC=0 fields
|
| 172 |
has_goldmine = any(
|
| 173 |
+
fid in FIELD_INDEX and FIELD_INDEX[fid].alpha_count == 0
|
| 174 |
for fid in fields
|
| 175 |
)
|
| 176 |
if has_goldmine:
|
| 177 |
+
gap += 2.0
|
| 178 |
|
|
|
|
| 179 |
if avg_ac <= 5:
|
| 180 |
gap += 1.0
|
| 181 |
elif avg_ac <= 50:
|
| 182 |
gap += 0.5
|
| 183 |
|
|
|
|
| 184 |
tag = THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
|
| 185 |
tag_count = anomaly_counts.get(tag.value, 0)
|
| 186 |
if tag_count < 2:
|
|
|
|
| 198 |
dead_themes: Optional[list[str]] = None,
|
| 199 |
top_k: int = 3,
|
| 200 |
) -> str:
|
| 201 |
+
"""Pick the best theme to explore next."""
|
| 202 |
scores = compute_gap_scores(existing_themes, existing_anomaly_tags, dead_themes)
|
| 203 |
top = scores[:top_k]
|
| 204 |
if not top:
|
|
|
|
| 207 |
|
| 208 |
|
| 209 |
def get_theme_fields(theme: str) -> list[str]:
|
|
|
|
| 210 |
return THEME_FIELDS.get(theme, [])
|
| 211 |
|
| 212 |
|
| 213 |
def get_theme_archetype(theme: str) -> str:
|
|
|
|
| 214 |
return THEME_TO_ARCHETYPE.get(theme, "novel")
|
| 215 |
|
| 216 |
|
| 217 |
def get_theme_tag(theme: str) -> AnomalyTag:
|
|
|
|
| 218 |
return THEME_TO_TAG.get(theme, AnomalyTag.OTHER)
|