alpha-factory / alpha_factory /data /brain_fields.py
gaurv007's picture
Upload alpha_factory/data/brain_fields.py
0af1dfb verified
"""
BRAIN Field Registry — Canonical reference for the Alpha Factory pipeline.
Contains the highest-EV fields organized by tier and domain.
Source: fields_USA_TOP3000_D1.csv (5,905 total, 3,447 MATRIX candidates)
Generated: 2026-05-07
Usage:
from alpha_factory.data.brain_fields import GOLDMINE_FIELDS, TIER1_MODEL77_FIELDS, pick_field
"""
import random
from dataclasses import dataclass
from enum import Enum
from typing import Iterable, Optional
class SignConvention(str, Enum):
    """Sign convention tag carried by each registry field.

    The ``str`` mix-in makes every member compare equal to its plain string
    value, e.g. ``SignConvention.LONG_LOW == "long_low"``.
    """

    # get_sign_multiplier maps this member to +1.
    LONG_HIGH = "long_high"
    # get_sign_multiplier maps this member to -1.
    LONG_LOW = "long_low"
    # The two members below fall through to +1 in get_sign_multiplier.
    CONTRARIAN = "contrarian"
    AMBIGUOUS = "ambiguous"
class DatasetTier(str, Enum):
    """Tier label attached to each registry field.

    Members are also ``str`` instances, so a tier compares equal to its
    lowercase string value (``DatasetTier.TIER1 == "tier1"``).
    """

    TIER1 = "tier1"
    TIER2 = "tier2"
    TIER3 = "tier3"
    # No entry in the lists of this module uses TIER4.
    TIER4 = "tier4"
@dataclass
class BrainField:
    """A single entry of the field registry.

    Entries in this module are built positionally, so the attribute order
    below is part of the interface and must not be rearranged.
    """

    # Field identifier; FIELD_INDEX is keyed by this value.
    id: str
    # Name of the dataset the field comes from (e.g. "model77", "news12").
    dataset: str
    # Coverage expressed as a fraction; pick_field compares it to min_coverage.
    coverage: float
    # Usage count; pick_field filters on it and weights by 1 / (alpha_count + 1).
    alpha_count: int
    # Free-text description of the field.
    description: str
    # Category label such as "Model", "Analyst", "News" or "Option".
    category: str
    # Sign convention consumed by get_sign_multiplier.
    sign: SignConvention
    # Tier label; pick_field can filter on it.
    tier: DatasetTier
    # Per-field backfill window (presumably in days, going by the name).
    backfill_days: int = 10
# Fields whose alpha_count is 0. Every entry is dataset "model77", TIER1.
# Positional order: id, dataset, coverage, alpha_count, description, category,
# sign, tier, [backfill_days].
# NOTE(review): the last two ids start with "mdl177_" while their dataset is
# "model77" and other ids in this module use "mdl77_" — confirm against the
# source CSV.
GOLDMINE_FIELDS = [
    BrainField("time_weighted_cash_flow_to_price", "model77", 1.00, 0, "Time-weighted avg cash flows/share for next 2 years divided by price", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("north_america_sales_exposure", "model77", 0.94, 0, "Proportion of company sales from North America", "Model", SignConvention.AMBIGUOUS, DatasetTier.TIER1),
    BrainField("standardized_unexpected_earnings_2", "model77", 0.92, 0, "Standardized Unexpected Earnings", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("mdl177_2_globaldevnorthamerica_v502_liqcoeff", "model77", 0.59, 0, "Slope of regression between monthly turnover and price return", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1, 30),
    BrainField("mdl177_2_globaldevnorthamerica_v502_chgalpha12m", "model77", 0.58, 0, "Six-month nominal change in 12-month alpha", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1, 30),
]
# model77 fields tagged TIER1. Every entry has coverage 1.00, alpha_count 1 and
# the default backfill_days.
# NOTE(review): "momemtum" in the second id is spelled that way in the
# registry; presumably it mirrors the upstream field id — verify before
# "correcting" it.
TIER1_MODEL77_FIELDS = [
    BrainField("fundamental_growth_module_score", "model77", 1.00, 1, "Fundamental Growth submodule of Momentum Analyst II", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("mdl77_2valuemomemtummodel_earningsqualitymodule", "model77", 1.00, 1, "Earnings Quality Module", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("quarterly_earnings_surprise_stddev", "model77", 1.00, 1, "Most recent quarterly earnings surprise in standardized units", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("quarterly_eps_surprise_change", "model77", 1.00, 1, "Change in EPS surprise between recent periods", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("six_month_eps_revision_fy2", "model77", 1.00, 1, "Avg of prior six-month changes in consensus FY2 earnings forecasts", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("mdl77_ohistoricalgrowthfactor_pctchgqtrast", "model77", 1.00, 1, "1-Yr Change in Total Assets (asset growth anomaly)", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1),
    BrainField("mdl77_valueanalystmodelqva_chginv", "model77", 1.00, 1, "1-year change in trailing 4Q inventory scaled by total assets", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1),
    BrainField("mdl77_valueanalystmodelqva_yoychgdebt", "model77", 1.00, 1, "Change in Debt Issuance Rank", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1),
    BrainField("three_year_change_gross_profit_margin_2", "model77", 1.00, 1, "Three-year change in gross profit margin", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("time_weighted_ebitda_to_enterprise_value_2", "model77", 1.00, 1, "Time-weighted EBITDA/EV for next two years", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("ttm_sales_to_enterprise_value", "model77", 1.00, 1, "TTM sales divided by enterprise value", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("yearly_percentage_change_roe", "model77", 1.00, 1, "Year-over-year percentage change in return on equity", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("mdl77_2liquidityriskfactor_milliq", "model77", 1.00, 1, "Stock Illiquidity (Amihud)", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1),
]
# analyst4 fields tagged TIER3, all LONG_HIGH. The two entries with coverage
# below 0.70 carry an explicit backfill_days of 30; the others use the default.
TIER3_ANALYST_FIELDS = [
    BrainField("dividend_estimate_average", "analyst4", 0.62, 5, "Dividend per share - average of estimations", "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
    BrainField("max_ebitda_guidance", "analyst4", 1.00, 16, "Maximum guidance value for EBITDA (annual)", "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3),
    BrainField("cash_flow_operations_min_guidance", "analyst4", 1.00, 17, "Minimum guidance for Cash Flow from Operations", "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3),
    BrainField("pretax_income_reported", "analyst4", 0.56, 15, "Reported Pretax income for annual period", "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
]
# news12 fields tagged TIER2.
# Every alpha_count here exceeds pick_field's default max_ac of 50, so these
# entries are only eligible when the caller raises max_ac.
TIER2_NEWS_FIELDS = [
    BrainField("news_short_interest", "news12", 0.87, 535, "Ratio of shares sold short to shares outstanding", "News", SignConvention.LONG_LOW, DatasetTier.TIER2),
    BrainField("news_pct_5_min", "news12", 0.77, 353, "Price change in first 5 min after news", "News", SignConvention.LONG_HIGH, DatasetTier.TIER2, 30),
    BrainField("news_vol_stddev", "news12", 0.97, 902, "Z-score of current volume vs 30-day average", "News", SignConvention.CONTRARIAN, DatasetTier.TIER2),
]
# option9 fields tagged TIER3. Every entry has coverage 0.70 and an explicit
# backfill_days of 30.
# Every alpha_count here exceeds pick_field's default max_ac of 50.
TIER3_OPTION_FIELDS = [
    BrainField("pcr_vol_90", "option9", 0.70, 184, "Put/call volume ratio for 90-day options", "Option", SignConvention.CONTRARIAN, DatasetTier.TIER3, 30),
    BrainField("pcr_vol_20", "option9", 0.70, 233, "Put/call volume ratio for 20-day options", "Option", SignConvention.CONTRARIAN, DatasetTier.TIER3, 30),
    BrainField("forward_price_120", "option9", 0.70, 359, "Synthetic forward price at 120 days from ATM options", "Option", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
]
# pv13 fields tagged TIER3, category "Price Volume", all LONG_HIGH.
# FIXED: the id prefix is "pv13_customergraphrank" (an earlier revision had the
# typo "pv13_ustomergraphrank").
# Every alpha_count here exceeds pick_field's default max_ac of 50.
TIER3_SUPPLY_CHAIN_FIELDS = [
    BrainField("pv13_customergraphrank_auth_rank", "pv13", 0.79, 595, "HITS authority score of customers", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
    BrainField("pv13_customergraphrank_page_rank", "pv13", 0.79, 921, "PageRank of customers", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
    BrainField("rel_ret_all", "pv13", 0.96, 2280, "Averaged 1-day return of product-overlapping companies", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3),
    BrainField("rel_ret_comp", "pv13", 0.82, 3078, "Averaged 1-day return of competing companies", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
    BrainField("pv13_custretsig_retsig", "pv13", 0.93, 2718, "Sign of customer return", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3),
]
# socialmedia12 fields tagged TIER3; both entries are marked CONTRARIAN.
# Both alpha_count values exceed pick_field's default max_ac of 50.
TIER3_SOCIAL_FIELDS = [
    BrainField("snt_buzz_ret_fast_d1", "socialmedia12", 0.98, 56, "Negative return of relative sentiment volume", "Social Media", SignConvention.CONTRARIAN, DatasetTier.TIER3),
    BrainField("scl12_sentiment_fast_d1", "socialmedia12", 0.98, 134, "Daily sentiment score", "Social Media", SignConvention.CONTRARIAN, DatasetTier.TIER3),
]
# model16 fields tagged TIER2. Every entry has coverage 1.00, is LONG_HIGH and
# uses the default backfill_days.
# Every alpha_count here exceeds pick_field's default max_ac of 50.
TIER2_MODEL16_FIELDS = [
    BrainField("multi_factor_static_score_derivative", "model16", 1.00, 98, "Change in static multi-factor score", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2),
    BrainField("relative_valuation_rank_derivative", "model16", 1.00, 119, "Under/overpriced based on valuation multiples (change)", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2),
    BrainField("growth_potential_rank_derivative", "model16", 1.00, 152, "Composite growth score change", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2),
    BrainField("earnings_certainty_rank_derivative", "model16", 1.00, 175, "Earnings quality certainty (change)", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2),
]
# Flat registry: a new list holding every per-domain list's entries, in the
# order the lists are named here.
ALL_FIELDS: list[BrainField] = [
    *GOLDMINE_FIELDS,
    *TIER1_MODEL77_FIELDS,
    *TIER3_ANALYST_FIELDS,
    *TIER2_NEWS_FIELDS,
    *TIER3_OPTION_FIELDS,
    *TIER3_SUPPLY_CHAIN_FIELDS,
    *TIER3_SOCIAL_FIELDS,
    *TIER2_MODEL16_FIELDS,
]

# Lookup table from field id to its registry entry. If two entries ever shared
# an id, the later one in ALL_FIELDS would win.
FIELD_INDEX: dict[str, BrainField] = {entry.id: entry for entry in ALL_FIELDS}
def pick_field(
    tier: "Optional[DatasetTier]" = None,
    category: "Optional[str]" = None,
    max_ac: int = 50,
    min_coverage: float = 0.55,
    exclude_ids: "Optional[Iterable[str]]" = None,
) -> "Optional[BrainField]":
    """Randomly pick one field from ALL_FIELDS, favouring low alpha counts.

    Args:
        tier: Keep only fields whose ``tier`` equals this value; ``None``
            disables the filter.
        category: Keep only fields whose ``category`` equals this value;
            ``None`` disables the filter.
        max_ac: Upper bound (inclusive) on ``alpha_count``.
        min_coverage: Lower bound (inclusive) on ``coverage``.
        exclude_ids: Field ids to skip. Any iterable of ids is accepted,
            including one-shot iterables; a single string is treated as one id.

    Returns:
        One matching field, drawn with probability proportional to
        ``1 / (alpha_count + 1)``, or ``None`` when nothing matches.
    """
    # Materialise the exclusions once: a generator would otherwise be consumed
    # by the first membership test, and a bare string would match substrings.
    if exclude_ids is None:
        excluded = frozenset()
    elif isinstance(exclude_ids, str):
        excluded = frozenset((exclude_ids,))
    else:
        excluded = frozenset(exclude_ids)

    candidates = [
        f
        for f in ALL_FIELDS
        if f.alpha_count <= max_ac
        and f.coverage >= min_coverage
        and f.id not in excluded
        and (tier is None or f.tier == tier)
        and (category is None or f.category == category)
    ]
    if not candidates:
        return None

    # random.choices treats weights as relative, so no normalisation is needed.
    weights = [1.0 / (f.alpha_count + 1) for f in candidates]
    return random.choices(candidates, weights=weights, k=1)[0]
def get_backfill_days(field: "BrainField") -> int:
    """Return the backfill window to use for *field*.

    Coverage sets a minimum window: 30 when coverage is below 0.70, 20 when it
    is below 0.85. At or above 0.85 the field's own ``backfill_days`` is
    returned unchanged. Below 0.85 the result is the larger of the coverage
    minimum and ``field.backfill_days``, so an explicit per-field window is
    never shortened by the heuristic.
    """
    if field.coverage < 0.70:
        coverage_minimum = 30
    elif field.coverage < 0.85:
        coverage_minimum = 20
    else:
        return field.backfill_days
    # Honour a larger explicit setting instead of silently discarding it.
    return max(coverage_minimum, field.backfill_days)
def get_sign_multiplier(field: BrainField) -> int:
    """Return -1 for a LONG_LOW field and +1 for every other sign convention."""
    # CONTRARIAN and AMBIGUOUS are not inverted here; they share +1 with
    # LONG_HIGH.
    return -1 if field.sign == SignConvention.LONG_LOW else 1