| """ |
| BRAIN Field Registry — Canonical reference for the Alpha Factory pipeline. |
| Contains the highest-EV fields organized by tier and domain. |
| |
| Source: fields_USA_TOP3000_D1.csv (5,905 total, 3,447 MATRIX candidates) |
| Generated: 2026-05-07 |
| |
| Usage: |
| from alpha_factory.data.brain_fields import GOLDMINE_FIELDS, TIER1_MODEL77_FIELDS, pick_field |
| """ |
| from dataclasses import dataclass |
| from enum import Enum |
| from typing import Optional |
| import random |
|
|
|
|
class SignConvention(str, Enum):
    """Trading direction tag attached to every registry field.

    Members are plain strings (the class mixes in ``str``), so they
    compare equal to their lowercase values and serialise as such.
    """

    # Names suggest "go long when the value is high" / "... is low";
    # NOTE(review): confirm the exact semantics of the last two members
    # with whoever consumes them.
    LONG_HIGH = "long_high"
    LONG_LOW = "long_low"
    CONTRARIAN = "contrarian"
    AMBIGUOUS = "ambiguous"
|
|
|
|
class DatasetTier(str, Enum):
    """Priority bucket a registry field belongs to.

    Members are plain strings (the class mixes in ``str``), so they
    compare equal to their lowercase values.
    """

    # Four buckets, numbered 1 through 4.
    TIER1 = "tier1"
    TIER2 = "tier2"
    TIER3 = "tier3"
    TIER4 = "tier4"
|
|
|
|
@dataclass
class BrainField:
    """One entry of the field registry.

    Instances are created positionally in the registry lists of this
    module; only ``backfill_days`` has a default.
    """

    # Field identifier; used as the key when indexing the registry.
    id: str
    # Name of the dataset the field comes from (e.g. "model77").
    dataset: str
    # Coverage ratio; the registry stores values between 0 and 1.
    coverage: float
    # Usage counter; presumably the number of existing alphas that use
    # the field -- TODO confirm against the source CSV.
    alpha_count: int
    # Human-readable description of the field.
    description: str
    # Domain label such as "Model", "Analyst", "News".
    category: str
    # Trading direction tag.
    sign: SignConvention
    # Priority bucket.
    tier: DatasetTier
    # Explicit backfill window in days; 10 unless the entry overrides it.
    backfill_days: int = 10
|
|
|
|
# model77 fields whose alpha_count is 0.
GOLDMINE_FIELDS = [
    BrainField(
        "time_weighted_cash_flow_to_price", "model77", 1.00, 0,
        "Time-weighted avg cash flows/share for next 2 years divided by price",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "north_america_sales_exposure", "model77", 0.94, 0,
        "Proportion of company sales from North America",
        "Model", SignConvention.AMBIGUOUS, DatasetTier.TIER1,
    ),
    BrainField(
        "standardized_unexpected_earnings_2", "model77", 0.92, 0,
        "Standardized Unexpected Earnings",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "mdl177_2_globaldevnorthamerica_v502_liqcoeff", "model77", 0.59, 0,
        "Slope of regression between monthly turnover and price return",
        "Model", SignConvention.LONG_LOW, DatasetTier.TIER1,
        backfill_days=30,
    ),
    BrainField(
        "mdl177_2_globaldevnorthamerica_v502_chgalpha12m", "model77", 0.58, 0,
        "Six-month nominal change in 12-month alpha",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
        backfill_days=30,
    ),
]
|
|
# Tier-1 model77 fields; every entry has full coverage and alpha_count 1.
TIER1_MODEL77_FIELDS = [
    BrainField(
        "fundamental_growth_module_score", "model77", 1.00, 1,
        "Fundamental Growth submodule of Momentum Analyst II",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "mdl77_2valuemomemtummodel_earningsqualitymodule", "model77", 1.00, 1,
        "Earnings Quality Module",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "quarterly_earnings_surprise_stddev", "model77", 1.00, 1,
        "Most recent quarterly earnings surprise in standardized units",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "quarterly_eps_surprise_change", "model77", 1.00, 1,
        "Change in EPS surprise between recent periods",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "six_month_eps_revision_fy2", "model77", 1.00, 1,
        "Avg of prior six-month changes in consensus FY2 earnings forecasts",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "mdl77_ohistoricalgrowthfactor_pctchgqtrast", "model77", 1.00, 1,
        "1-Yr Change in Total Assets (asset growth anomaly)",
        "Model", SignConvention.LONG_LOW, DatasetTier.TIER1,
    ),
    BrainField(
        "mdl77_valueanalystmodelqva_chginv", "model77", 1.00, 1,
        "1-year change in trailing 4Q inventory scaled by total assets",
        "Model", SignConvention.LONG_LOW, DatasetTier.TIER1,
    ),
    BrainField(
        "mdl77_valueanalystmodelqva_yoychgdebt", "model77", 1.00, 1,
        "Change in Debt Issuance Rank",
        "Model", SignConvention.LONG_LOW, DatasetTier.TIER1,
    ),
    BrainField(
        "three_year_change_gross_profit_margin_2", "model77", 1.00, 1,
        "Three-year change in gross profit margin",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "time_weighted_ebitda_to_enterprise_value_2", "model77", 1.00, 1,
        "Time-weighted EBITDA/EV for next two years",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "ttm_sales_to_enterprise_value", "model77", 1.00, 1,
        "TTM sales divided by enterprise value",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "yearly_percentage_change_roe", "model77", 1.00, 1,
        "Year-over-year percentage change in return on equity",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1,
    ),
    BrainField(
        "mdl77_2liquidityriskfactor_milliq", "model77", 1.00, 1,
        "Stock Illiquidity (Amihud)",
        "Model", SignConvention.LONG_LOW, DatasetTier.TIER1,
    ),
]
|
|
# Tier-3 analyst4 fields (estimates, guidance, reported figures).
TIER3_ANALYST_FIELDS = [
    BrainField(
        "dividend_estimate_average", "analyst4", 0.62, 5,
        "Dividend per share - average of estimations",
        "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3,
        backfill_days=30,
    ),
    BrainField(
        "max_ebitda_guidance", "analyst4", 1.00, 16,
        "Maximum guidance value for EBITDA (annual)",
        "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3,
    ),
    BrainField(
        "cash_flow_operations_min_guidance", "analyst4", 1.00, 17,
        "Minimum guidance for Cash Flow from Operations",
        "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3,
    ),
    BrainField(
        "pretax_income_reported", "analyst4", 0.56, 15,
        "Reported Pretax income for annual period",
        "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3,
        backfill_days=30,
    ),
]
|
|
# Tier-2 news12 fields.
TIER2_NEWS_FIELDS = [
    BrainField(
        "news_short_interest", "news12", 0.87, 535,
        "Ratio of shares sold short to shares outstanding",
        "News", SignConvention.LONG_LOW, DatasetTier.TIER2,
    ),
    BrainField(
        "news_pct_5_min", "news12", 0.77, 353,
        "Price change in first 5 min after news",
        "News", SignConvention.LONG_HIGH, DatasetTier.TIER2,
        backfill_days=30,
    ),
    BrainField(
        "news_vol_stddev", "news12", 0.97, 902,
        "Z-score of current volume vs 30-day average",
        "News", SignConvention.CONTRARIAN, DatasetTier.TIER2,
    ),
]
|
|
# Tier-3 option9 fields; all share 0.70 coverage and a 30-day backfill.
TIER3_OPTION_FIELDS = [
    BrainField(
        "pcr_vol_90", "option9", 0.70, 184,
        "Put/call volume ratio for 90-day options",
        "Option", SignConvention.CONTRARIAN, DatasetTier.TIER3,
        backfill_days=30,
    ),
    BrainField(
        "pcr_vol_20", "option9", 0.70, 233,
        "Put/call volume ratio for 20-day options",
        "Option", SignConvention.CONTRARIAN, DatasetTier.TIER3,
        backfill_days=30,
    ),
    BrainField(
        "forward_price_120", "option9", 0.70, 359,
        "Synthetic forward price at 120 days from ATM options",
        "Option", SignConvention.LONG_HIGH, DatasetTier.TIER3,
        backfill_days=30,
    ),
]
|
|
| |
# Tier-3 pv13 fields (customer graph ranks and related-company returns).
TIER3_SUPPLY_CHAIN_FIELDS = [
    BrainField(
        "pv13_customergraphrank_auth_rank", "pv13", 0.79, 595,
        "HITS authority score of customers",
        "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3,
        backfill_days=30,
    ),
    BrainField(
        "pv13_customergraphrank_page_rank", "pv13", 0.79, 921,
        "PageRank of customers",
        "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3,
        backfill_days=30,
    ),
    BrainField(
        "rel_ret_all", "pv13", 0.96, 2280,
        "Averaged 1-day return of product-overlapping companies",
        "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3,
    ),
    BrainField(
        "rel_ret_comp", "pv13", 0.82, 3078,
        "Averaged 1-day return of competing companies",
        "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3,
        backfill_days=30,
    ),
    BrainField(
        "pv13_custretsig_retsig", "pv13", 0.93, 2718,
        "Sign of customer return",
        "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3,
    ),
]
|
|
# Tier-3 socialmedia12 sentiment fields, both tagged contrarian.
TIER3_SOCIAL_FIELDS = [
    BrainField(
        "snt_buzz_ret_fast_d1", "socialmedia12", 0.98, 56,
        "Negative return of relative sentiment volume",
        "Social Media", SignConvention.CONTRARIAN, DatasetTier.TIER3,
    ),
    BrainField(
        "scl12_sentiment_fast_d1", "socialmedia12", 0.98, 134,
        "Daily sentiment score",
        "Social Media", SignConvention.CONTRARIAN, DatasetTier.TIER3,
    ),
]
|
|
# Tier-2 model16 "derivative" (change-in-score) fields, all full coverage.
TIER2_MODEL16_FIELDS = [
    BrainField(
        "multi_factor_static_score_derivative", "model16", 1.00, 98,
        "Change in static multi-factor score",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2,
    ),
    BrainField(
        "relative_valuation_rank_derivative", "model16", 1.00, 119,
        "Under/overpriced based on valuation multiples (change)",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2,
    ),
    BrainField(
        "growth_potential_rank_derivative", "model16", 1.00, 152,
        "Composite growth score change",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2,
    ),
    BrainField(
        "earnings_certainty_rank_derivative", "model16", 1.00, 175,
        "Earnings quality certainty (change)",
        "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2,
    ),
]
|
|
# Flat view over every registry list, in declaration order.
ALL_FIELDS: list[BrainField] = [
    *GOLDMINE_FIELDS,
    *TIER1_MODEL77_FIELDS,
    *TIER3_ANALYST_FIELDS,
    *TIER2_NEWS_FIELDS,
    *TIER3_OPTION_FIELDS,
    *TIER3_SUPPLY_CHAIN_FIELDS,
    *TIER3_SOCIAL_FIELDS,
    *TIER2_MODEL16_FIELDS,
]
|
|
# Lookup table from field id to its registry entry; if two entries ever
# shared an id, the later one in ALL_FIELDS would win.
FIELD_INDEX: dict[str, BrainField] = {
    entry.id: entry for entry in ALL_FIELDS
}
|
|
|
|
def pick_field(
    tier: Optional["DatasetTier"] = None,
    category: Optional[str] = None,
    max_ac: int = 50,
    min_coverage: float = 0.55,
    exclude_ids=None,
    *,
    fields=None,
) -> Optional["BrainField"]:
    """Randomly pick one field that passes the filters, favouring low usage.

    Args:
        tier: Keep only fields whose ``tier`` equals this value; ``None``
            disables the filter.
        category: Keep only fields whose ``category`` equals this value;
            ``None`` disables the filter.
        max_ac: Upper bound (inclusive) on ``alpha_count``.
        min_coverage: Lower bound (inclusive) on ``coverage``.
        exclude_ids: Field ids to skip. Any iterable of ids is accepted,
            including one-shot iterators/generators; a single ``str`` is
            treated as one id rather than as a sequence of characters.
        fields: Pool to draw from. Defaults to the module-level
            ``ALL_FIELDS`` registry.

    Returns:
        One matching field, or ``None`` when nothing passes the filters.
        Each candidate is drawn with probability proportional to
        ``1 / (alpha_count + 1)``.
    """
    # Materialise the exclusions once: a generator would otherwise be
    # exhausted by the first membership test, and ``x in some_str`` would
    # silently perform a substring match.
    if exclude_ids is None:
        excluded = frozenset()
    elif isinstance(exclude_ids, str):
        excluded = frozenset((exclude_ids,))
    else:
        excluded = frozenset(exclude_ids)

    pool = ALL_FIELDS if fields is None else fields
    candidates = [
        f
        for f in pool
        if f.alpha_count <= max_ac
        and f.coverage >= min_coverage
        and f.id not in excluded
        and (tier is None or f.tier == tier)
        and (category is None or f.category == category)
    ]
    if not candidates:
        return None

    # random.choices takes relative weights, so no normalisation is needed.
    weights = [1.0 / (f.alpha_count + 1) for f in candidates]
    return random.choices(candidates, weights=weights, k=1)[0]
|
|
|
|
def get_backfill_days(field: "BrainField") -> int:
    """Return the backfill window, in days, to use for ``field``.

    Coverage sets a minimum window: 30 days below 0.70 coverage, 20 days
    below 0.85. The field's own ``backfill_days`` is honoured whenever it
    is larger than that minimum, so an explicit per-field override is
    never shortened by the coverage rule. At 0.85 coverage and above the
    field's own value is returned unchanged.

    Args:
        field: Registry entry; only ``coverage`` and ``backfill_days``
            are read.

    Returns:
        The number of backfill days.
    """
    if field.coverage < 0.70:
        coverage_floor = 30
    elif field.coverage < 0.85:
        coverage_floor = 20
    else:
        return field.backfill_days
    # The coverage rule is a floor, not a replacement for the explicit value.
    return max(coverage_floor, field.backfill_days)
|
|
|
|
def get_sign_multiplier(field: BrainField) -> int:
    """Return the +1/-1 factor implied by the field's sign convention.

    Only ``SignConvention.LONG_LOW`` yields ``-1``; ``LONG_HIGH`` and
    every other convention yield ``1``.
    """
    return -1 if field.sign == SignConvention.LONG_LOW else 1
|
|