File size: 9,156 Bytes
c3afe55 0af1dfb c3afe55 0af1dfb c3afe55 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | """
BRAIN Field Registry — Canonical reference for the Alpha Factory pipeline.
Contains the highest-EV fields organized by tier and domain.
Source: fields_USA_TOP3000_D1.csv (5,905 total, 3,447 MATRIX candidates)
Generated: 2026-05-07
Usage:
from alpha_factory.data.brain_fields import GOLDMINE_FIELDS, TIER1_MODEL77_FIELDS, pick_field
"""
from dataclasses import dataclass
from enum import Enum
from typing import Optional
import random
class SignConvention(str, Enum):
    """Sign convention label stored on each ``BrainField`` (its ``sign``).

    Members subclass ``str``, so each one compares equal to its raw string
    value (``SignConvention.LONG_LOW == "long_low"``).
    """

    # get_sign_multiplier maps this member to +1.
    LONG_HIGH = "long_high"
    # get_sign_multiplier maps this member to -1.
    LONG_LOW = "long_low"
    # No dedicated branch in get_sign_multiplier: falls through to +1.
    CONTRARIAN = "contrarian"
    # No dedicated branch in get_sign_multiplier: falls through to +1.
    AMBIGUOUS = "ambiguous"
class DatasetTier(str, Enum):
    """Tier label stored on each ``BrainField`` (its ``tier``).

    ``pick_field`` can restrict its candidates to a single tier. Members
    subclass ``str``, so each one compares equal to its raw string value.
    """

    TIER1 = "tier1"
    TIER2 = "tier2"
    TIER3 = "tier3"
    TIER4 = "tier4"
@dataclass
class BrainField:
    """One entry of the BRAIN field registry.

    Instances are built positionally throughout this module, so the
    attribute order below is part of the contract.

    Attributes:
        id: Field identifier; used as the key of ``FIELD_INDEX``.
        dataset: Name of the dataset the field is listed under.
        coverage: Coverage ratio. ``pick_field`` compares it with
            ``min_coverage`` and ``get_backfill_days`` with 0.70 / 0.85.
            Registry values lie in [0, 1]; presumably the share of the
            universe with data (confirm against the source CSV).
        alpha_count: ``pick_field`` keeps fields with
            ``alpha_count <= max_ac`` and weights them by
            ``1 / (alpha_count + 1)``. Presumably the number of existing
            alphas that already use the field (confirm).
        description: Human-readable description of the field.
        category: Category label that ``pick_field`` can filter on.
        sign: Sign convention consumed by ``get_sign_multiplier``.
        tier: Tier label that ``pick_field`` can filter on.
        backfill_days: Per-field backfill setting read by
            ``get_backfill_days``; defaults to 10.
    """

    id: str
    dataset: str
    coverage: float
    alpha_count: int
    description: str
    category: str
    sign: SignConvention
    tier: DatasetTier
    backfill_days: int = 10
# model77 entries recorded with alpha_count == 0, all tagged TIER1.
# Positional arguments follow the BrainField attribute order: id, dataset,
# coverage, alpha_count, description, category, sign, tier and, optionally,
# backfill_days (defaults to 10 when omitted).
GOLDMINE_FIELDS = [
    BrainField("time_weighted_cash_flow_to_price", "model77", 1.00, 0, "Time-weighted avg cash flows/share for next 2 years divided by price", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("north_america_sales_exposure", "model77", 0.94, 0, "Proportion of company sales from North America", "Model", SignConvention.AMBIGUOUS, DatasetTier.TIER1),
    BrainField("standardized_unexpected_earnings_2", "model77", 0.92, 0, "Standardized Unexpected Earnings", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("mdl177_2_globaldevnorthamerica_v502_liqcoeff", "model77", 0.59, 0, "Slope of regression between monthly turnover and price return", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1, 30),
    BrainField("mdl177_2_globaldevnorthamerica_v502_chgalpha12m", "model77", 0.58, 0, "Six-month nominal change in 12-month alpha", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1, 30),
]
# model77 entries tagged TIER1; every one is recorded with coverage 1.00 and
# alpha_count == 1. Arguments are positional in BrainField attribute order
# (id, dataset, coverage, alpha_count, description, category, sign, tier).
TIER1_MODEL77_FIELDS = [
    BrainField("fundamental_growth_module_score", "model77", 1.00, 1, "Fundamental Growth submodule of Momentum Analyst II", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("mdl77_2valuemomemtummodel_earningsqualitymodule", "model77", 1.00, 1, "Earnings Quality Module", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("quarterly_earnings_surprise_stddev", "model77", 1.00, 1, "Most recent quarterly earnings surprise in standardized units", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("quarterly_eps_surprise_change", "model77", 1.00, 1, "Change in EPS surprise between recent periods", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("six_month_eps_revision_fy2", "model77", 1.00, 1, "Avg of prior six-month changes in consensus FY2 earnings forecasts", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("mdl77_ohistoricalgrowthfactor_pctchgqtrast", "model77", 1.00, 1, "1-Yr Change in Total Assets (asset growth anomaly)", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1),
    BrainField("mdl77_valueanalystmodelqva_chginv", "model77", 1.00, 1, "1-year change in trailing 4Q inventory scaled by total assets", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1),
    BrainField("mdl77_valueanalystmodelqva_yoychgdebt", "model77", 1.00, 1, "Change in Debt Issuance Rank", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1),
    BrainField("three_year_change_gross_profit_margin_2", "model77", 1.00, 1, "Three-year change in gross profit margin", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("time_weighted_ebitda_to_enterprise_value_2", "model77", 1.00, 1, "Time-weighted EBITDA/EV for next two years", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("ttm_sales_to_enterprise_value", "model77", 1.00, 1, "TTM sales divided by enterprise value", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("yearly_percentage_change_roe", "model77", 1.00, 1, "Year-over-year percentage change in return on equity", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER1),
    BrainField("mdl77_2liquidityriskfactor_milliq", "model77", 1.00, 1, "Stock Illiquidity (Amihud)", "Model", SignConvention.LONG_LOW, DatasetTier.TIER1),
]
# analyst4 entries tagged TIER3 (category "Analyst"). Arguments are
# positional in BrainField attribute order; the trailing 30 on two entries
# is an explicit backfill_days value.
TIER3_ANALYST_FIELDS = [
    BrainField("dividend_estimate_average", "analyst4", 0.62, 5, "Dividend per share - average of estimations", "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
    BrainField("max_ebitda_guidance", "analyst4", 1.00, 16, "Maximum guidance value for EBITDA (annual)", "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3),
    BrainField("cash_flow_operations_min_guidance", "analyst4", 1.00, 17, "Minimum guidance for Cash Flow from Operations", "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3),
    BrainField("pretax_income_reported", "analyst4", 0.56, 15, "Reported Pretax income for annual period", "Analyst", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
]
# news12 entries tagged TIER2 (category "News"). Their alpha_count values
# (353-902) are all above pick_field's default max_ac of 50, so they are only
# selectable when the caller raises max_ac.
TIER2_NEWS_FIELDS = [
    BrainField("news_short_interest", "news12", 0.87, 535, "Ratio of shares sold short to shares outstanding", "News", SignConvention.LONG_LOW, DatasetTier.TIER2),
    BrainField("news_pct_5_min", "news12", 0.77, 353, "Price change in first 5 min after news", "News", SignConvention.LONG_HIGH, DatasetTier.TIER2, 30),
    BrainField("news_vol_stddev", "news12", 0.97, 902, "Z-score of current volume vs 30-day average", "News", SignConvention.CONTRARIAN, DatasetTier.TIER2),
]
# option9 entries tagged TIER3 (category "Option"). All three are recorded
# with coverage 0.70 and pass backfill_days=30 explicitly as the last argument.
TIER3_OPTION_FIELDS = [
    BrainField("pcr_vol_90", "option9", 0.70, 184, "Put/call volume ratio for 90-day options", "Option", SignConvention.CONTRARIAN, DatasetTier.TIER3, 30),
    BrainField("pcr_vol_20", "option9", 0.70, 233, "Put/call volume ratio for 20-day options", "Option", SignConvention.CONTRARIAN, DatasetTier.TIER3, 30),
    BrainField("forward_price_120", "option9", 0.70, 359, "Synthetic forward price at 120 days from ATM options", "Option", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
]
# FIXED: pv13_customergraphrank (was "pv13_ustomergraphrank")
# pv13 entries tagged TIER3; note their category string is "Price Volume".
# Arguments are positional in BrainField attribute order; a trailing 30 is an
# explicit backfill_days value.
TIER3_SUPPLY_CHAIN_FIELDS = [
    BrainField("pv13_customergraphrank_auth_rank", "pv13", 0.79, 595, "HITS authority score of customers", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
    BrainField("pv13_customergraphrank_page_rank", "pv13", 0.79, 921, "PageRank of customers", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
    BrainField("rel_ret_all", "pv13", 0.96, 2280, "Averaged 1-day return of product-overlapping companies", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3),
    BrainField("rel_ret_comp", "pv13", 0.82, 3078, "Averaged 1-day return of competing companies", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3, 30),
    BrainField("pv13_custretsig_retsig", "pv13", 0.93, 2718, "Sign of customer return", "Price Volume", SignConvention.LONG_HIGH, DatasetTier.TIER3),
]
# socialmedia12 entries tagged TIER3 (category "Social Media"); both are
# marked CONTRARIAN, which get_sign_multiplier currently maps to +1.
TIER3_SOCIAL_FIELDS = [
    BrainField("snt_buzz_ret_fast_d1", "socialmedia12", 0.98, 56, "Negative return of relative sentiment volume", "Social Media", SignConvention.CONTRARIAN, DatasetTier.TIER3),
    BrainField("scl12_sentiment_fast_d1", "socialmedia12", 0.98, 134, "Daily sentiment score", "Social Media", SignConvention.CONTRARIAN, DatasetTier.TIER3),
]
# model16 entries tagged TIER2 (category "Model"), all recorded with
# coverage 1.00 and the default backfill_days.
TIER2_MODEL16_FIELDS = [
    BrainField("multi_factor_static_score_derivative", "model16", 1.00, 98, "Change in static multi-factor score", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2),
    BrainField("relative_valuation_rank_derivative", "model16", 1.00, 119, "Under/overpriced based on valuation multiples (change)", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2),
    BrainField("growth_potential_rank_derivative", "model16", 1.00, 152, "Composite growth score change", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2),
    BrainField("earnings_certainty_rank_derivative", "model16", 1.00, 175, "Earnings quality certainty (change)", "Model", SignConvention.LONG_HIGH, DatasetTier.TIER2),
]
# Every registry entry in one flat list, concatenated group by group in the
# order written here. pick_field draws its candidates from this list.
ALL_FIELDS: list[BrainField] = (
    GOLDMINE_FIELDS + TIER1_MODEL77_FIELDS + TIER3_ANALYST_FIELDS
    + TIER2_NEWS_FIELDS + TIER3_OPTION_FIELDS + TIER3_SUPPLY_CHAIN_FIELDS
    + TIER3_SOCIAL_FIELDS + TIER2_MODEL16_FIELDS
)
# Lookup table from field id to its BrainField. Built with a dict
# comprehension, so if two entries ever shared an id the later one in
# ALL_FIELDS would silently replace the earlier one.
FIELD_INDEX: dict[str, BrainField] = {f.id: f for f in ALL_FIELDS}
def pick_field(tier=None, category=None, max_ac=50, min_coverage=0.55, exclude_ids=None):
    """Randomly draw one field from ``ALL_FIELDS`` that passes every filter.

    Args:
        tier: When not ``None``, keep only fields whose ``tier`` equals it.
        category: When not ``None``, keep only fields whose ``category``
            equals it.
        max_ac: Upper bound (inclusive) on ``alpha_count``.
        min_coverage: Lower bound (inclusive) on ``coverage``.
        exclude_ids: Container of field ids to leave out; a falsy value
            (``None`` or empty) excludes nothing.

    Returns:
        One matching ``BrainField``, drawn with probability proportional
        to ``1 / (alpha_count + 1)``, or ``None`` when nothing matches.
    """
    skipped = exclude_ids if exclude_ids else set()

    def _eligible(entry):
        # Checks run in this order and short-circuit on the first failure.
        return (
            entry.alpha_count <= max_ac
            and entry.coverage >= min_coverage
            and entry.id not in skipped
            and (tier is None or entry.tier == tier)
            and (category is None or entry.category == category)
        )

    pool = list(filter(_eligible, ALL_FIELDS))
    if len(pool) == 0:
        return None

    # Less-used fields get a larger share; +1 keeps alpha_count == 0 finite.
    raw = [1.0 / (entry.alpha_count + 1) for entry in pool]
    norm = sum(raw)
    (chosen,) = random.choices(pool, weights=[w / norm for w in raw], k=1)
    return chosen
def get_backfill_days(field: "BrainField") -> int:
    """Return the backfill window (in days) to use for *field*.

    Coverage sets a minimum window: 30 days below 0.70 coverage and
    20 days below 0.85. The field's own ``backfill_days`` is honoured
    whenever it is larger than that minimum, so an explicit per-field
    setting (e.g. ``backfill_days=30`` on a field with 0.77 coverage) is
    no longer silently reduced to the coverage default.

    Args:
        field: Any object exposing ``coverage`` and ``backfill_days``.

    Returns:
        ``max(coverage minimum, field.backfill_days)`` for coverage below
        0.85, otherwise ``field.backfill_days`` unchanged.
    """
    if field.coverage < 0.70:
        return max(30, field.backfill_days)
    if field.coverage < 0.85:
        return max(20, field.backfill_days)
    # Well-covered fields have no coverage floor; use the setting verbatim.
    return field.backfill_days
def get_sign_multiplier(field: BrainField) -> int:
    """Return the sign multiplier for *field*.

    Returns:
        ``-1`` when ``field.sign`` equals ``SignConvention.LONG_LOW``;
        ``1`` for every other value (``LONG_HIGH``, ``CONTRARIAN``,
        ``AMBIGUOUS`` and anything unrecognised).
    """
    # LONG_LOW is the only convention that flips the sign.
    return -1 if field.sign == SignConvention.LONG_LOW else 1
|