feat: expression_compiler v2 — new archetypes for model77 SUE/PEAD, supply chain, analyst guidance, PCR contrarian + auto field-aware backfill/sign
Browse files
alpha_factory/personas/expression_compiler.py
CHANGED
|
@@ -1,20 +1,29 @@
|
|
| 1 |
"""
|
| 2 |
-
Expression Compiler — Persona 2 (Hybrid: Jinja + Tinyfish LLM)
|
| 3 |
Converts Blueprint JSON → valid BRAIN expression string.
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
POST-COMPILE RULES (mandatory, applied to ALL expressions):
|
| 7 |
1. Always wrap in ts_decay_linear, with decay floored at 5 (reduces turnover)
|
| 8 |
2. Negate if short-horizon returns theme (cross-sectional reversal)
|
| 9 |
-
3.
|
| 10 |
"""
|
| 11 |
from jinja2 import Environment, BaseLoader
|
| 12 |
from ..infra.llm_client import LLMClient
|
| 13 |
from ..schemas import Blueprint, Expression
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
-
# ─── Jinja Templates
|
| 17 |
TEMPLATES = {
|
|
|
|
| 18 |
"value_quality_blend": """
|
| 19 |
{%- set comps = [] -%}
|
| 20 |
{%- for c in bp.components -%}
|
|
@@ -45,79 +54,130 @@ group_zscore(ts_delta({{ c.fields[0] }}, {{ c.horizon_days }}), {{ bp.neutraliza
|
|
| 45 |
{%- set _ = comps.append(c.weight|string ~ " * zscore(rank(" ~ c.fields[0] ~ "))") -%}
|
| 46 |
{%- endfor -%}
|
| 47 |
{{ comps | join(" + ") }}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
""",
|
| 49 |
}
|
| 50 |
|
| 51 |
_env = Environment(loader=BaseLoader())
|
| 52 |
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
COMPILER_SYSTEM_PROMPT = """You are a BRAIN expression compiler. Your ONLY job is to convert
|
| 55 |
a factor blueprint into a valid WorldQuant BRAIN expression.
|
| 56 |
|
| 57 |
RULES:
|
| 58 |
-
1. Use ONLY operators from the BRAIN operator catalogue.
|
| 59 |
2. Every additive operand MUST be wrapped in zscore(), rank(), or group_zscore().
|
| 60 |
3. Output ONLY the expression string — no explanation, no markdown.
|
| 61 |
4. The expression must be syntactically valid (balanced parentheses, correct arity).
|
| 62 |
5. Do NOT include ts_decay_linear — that will be added automatically post-compilation.
|
| 63 |
6. For short-horizon returns (<=20 days), use NEGATIVE sign (cross-sectional reversal).
|
| 64 |
-
7. Use
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
"""
|
| 71 |
|
| 72 |
|
| 73 |
-
# ─── Post-compilation rules (mandatory) ─────────────────────────────────
|
| 74 |
-
|
| 75 |
def _apply_post_compile_rules(expression: str, blueprint: Blueprint) -> str:
|
| 76 |
-
"""
|
| 77 |
-
Mandatory post-compilation transformations:
|
| 78 |
-
1. Apply ts_decay_linear to reduce turnover (ALWAYS, min decay=5)
|
| 79 |
-
2. Apply sign flip for short-horizon reversal themes
|
| 80 |
-
3. Ensure the expression won't exceed 70% turnover
|
| 81 |
-
"""
|
| 82 |
expr = expression.strip()
|
| 83 |
|
| 84 |
-
# Rule 1:
|
| 85 |
-
# In cross-section, short-term returns (<= 20 days) are mean-reverting
|
| 86 |
needs_flip = False
|
| 87 |
for c in blueprint.components:
|
| 88 |
if any(f in ["returns", "close", "ts_returns"] for f in c.fields):
|
| 89 |
if c.horizon_days <= 20 and c.sign_direction == "long_high":
|
| 90 |
needs_flip = True
|
| 91 |
break
|
| 92 |
-
|
| 93 |
if needs_flip:
|
| 94 |
expr = f"-({expr})"
|
| 95 |
|
| 96 |
-
# Rule 2:
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
decay = max(blueprint.decay, 5)
|
| 99 |
-
|
| 100 |
-
# Don't double-wrap if already has ts_decay_linear
|
| 101 |
if not expr.startswith("ts_decay_linear("):
|
| 102 |
expr = f"ts_decay_linear({expr}, {decay})"
|
| 103 |
|
| 104 |
return expr
|
| 105 |
|
| 106 |
|
| 107 |
-
async def compile_expression(
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
"""
|
| 112 |
-
Convert a Blueprint to a BRAIN expression.
|
| 113 |
-
Uses Jinja template if archetype is known; LLM fallback for novel structures.
|
| 114 |
-
ALWAYS applies post-compile rules (decay, sign correction).
|
| 115 |
-
"""
|
| 116 |
-
# Try template first (95% of cases)
|
| 117 |
if blueprint.archetype in TEMPLATES:
|
| 118 |
template_str = TEMPLATES[blueprint.archetype]
|
| 119 |
template = _env.from_string(template_str)
|
| 120 |
-
expr_text = template.render(bp=blueprint).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
fields_used = []
|
| 123 |
ops_used = []
|
|
@@ -125,7 +185,6 @@ async def compile_expression(
|
|
| 125 |
fields_used.extend(c.fields)
|
| 126 |
ops_used.extend(c.operators)
|
| 127 |
|
| 128 |
-
# Apply mandatory post-compile rules
|
| 129 |
expr_text = _apply_post_compile_rules(expr_text, blueprint)
|
| 130 |
|
| 131 |
return Expression(
|
|
@@ -145,10 +204,13 @@ Blueprint:
|
|
| 145 |
- Neutralization: {blueprint.neutralization.value}
|
| 146 |
- Target: cross-sectional rank → long/short
|
| 147 |
|
| 148 |
-
IMPORTANT:
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
-
Output
|
| 152 |
|
| 153 |
result = await llm.generate_json(
|
| 154 |
prompt=user_prompt,
|
|
@@ -158,7 +220,6 @@ Output a valid BRAIN expression. Wrap all additive operands in zscore() or rank(
|
|
| 158 |
system_prompt=COMPILER_SYSTEM_PROMPT,
|
| 159 |
)
|
| 160 |
|
| 161 |
-
# Apply mandatory post-compile rules to LLM output too
|
| 162 |
result.expression = _apply_post_compile_rules(result.expression, blueprint)
|
| 163 |
if "ts_decay_linear" not in result.operators_used:
|
| 164 |
result.operators_used.append("ts_decay_linear")
|
|
|
|
| 1 |
"""
|
| 2 |
+
Expression Compiler v2 — Persona 2 (Hybrid: Jinja + Tinyfish LLM)
|
| 3 |
Converts Blueprint JSON → valid BRAIN expression string.
|
| 4 |
+
|
| 5 |
+
NEW ARCHETYPES (orthogonal to existing 18 alphas):
|
| 6 |
+
- sue_drift: Standardized Unexpected Earnings momentum
|
| 7 |
+
- supply_chain_lead_lag: Customer/competitor return propagation
|
| 8 |
+
- analyst_guidance_yield: Management guidance as yield signal
|
| 9 |
+
- pcr_contrarian: Put-call ratio contrarian reversal
|
| 10 |
+
- model_score_momentum: Multi-factor score derivative momentum
|
| 11 |
|
| 12 |
POST-COMPILE RULES (mandatory, applied to ALL expressions):
|
| 13 |
1. Always wrap in ts_decay_linear, with decay floored at 5 (reduces turnover)
|
| 14 |
2. Negate if short-horizon returns theme (cross-sectional reversal)
|
| 15 |
+
3. Apply correct sign based on field metadata
|
| 16 |
"""
|
| 17 |
from jinja2 import Environment, BaseLoader
|
| 18 |
from ..infra.llm_client import LLMClient
|
| 19 |
from ..schemas import Blueprint, Expression
|
| 20 |
+
from ..data.brain_fields import FIELD_INDEX, get_backfill_days, get_sign_multiplier
|
| 21 |
+
from ..data.brain_groups import get_group_for_expression
|
| 22 |
|
| 23 |
|
| 24 |
+
# ─── Jinja Templates ─────────────────────────────────────────────────────
|
| 25 |
TEMPLATES = {
|
| 26 |
+
# === EXISTING PROVEN ARCHETYPES ===
|
| 27 |
"value_quality_blend": """
|
| 28 |
{%- set comps = [] -%}
|
| 29 |
{%- for c in bp.components -%}
|
|
|
|
| 54 |
{%- set _ = comps.append(c.weight|string ~ " * zscore(rank(" ~ c.fields[0] ~ "))") -%}
|
| 55 |
{%- endfor -%}
|
| 56 |
{{ comps | join(" + ") }}
|
| 57 |
+
""",
|
| 58 |
+
|
| 59 |
+
# === NEW ORTHOGONAL ARCHETYPES (model77, analyst, supply chain, options) ===
|
| 60 |
+
|
| 61 |
+
"sue_drift": """
|
| 62 |
+
{%- set c = bp.components[0] -%}
|
| 63 |
+
{%- set bf = 10 if c.fields[0] in high_cov else 30 -%}
|
| 64 |
+
group_neutralize(rank(ts_backfill({{ c.fields[0] }}, {{ bf }})), {{ bp.neutralization.value }})
|
| 65 |
+
""",
|
| 66 |
+
|
| 67 |
+
"supply_chain_lead_lag": """
|
| 68 |
+
{%- set c = bp.components[0] -%}
|
| 69 |
+
group_neutralize(rank(ts_mean(ts_backfill({{ c.fields[0] }}, 30), {{ c.horizon_days }})), {{ bp.neutralization.value }})
|
| 70 |
+
""",
|
| 71 |
+
|
| 72 |
+
"analyst_guidance_yield": """
|
| 73 |
+
{%- set c = bp.components[0] -%}
|
| 74 |
+
{%- set bf = 10 if c.fields[0] in high_cov else 30 -%}
|
| 75 |
+
group_neutralize(zscore(ts_rank(ts_backfill({{ c.fields[0] }}, {{ bf }}), 252)), {{ bp.neutralization.value }})
|
| 76 |
+
""",
|
| 77 |
+
|
| 78 |
+
"pcr_contrarian": """
|
| 79 |
+
{%- set c = bp.components[0] -%}
|
| 80 |
+
group_neutralize(rank(-ts_delta(ts_backfill({{ c.fields[0] }}, 30), {{ c.horizon_days }})), {{ bp.neutralization.value }})
|
| 81 |
+
""",
|
| 82 |
+
|
| 83 |
+
"model_score_momentum": """
|
| 84 |
+
{%- set c = bp.components[0] -%}
|
| 85 |
+
group_neutralize(zscore(ts_delta({{ c.fields[0] }}, {{ c.horizon_days }})), {{ bp.neutralization.value }})
|
| 86 |
+
""",
|
| 87 |
+
|
| 88 |
+
# Alpha 15 archetype (the benchmark) — plug any field into the value leg
|
| 89 |
+
"alpha15_hybrid": """
|
| 90 |
+
{%- set c = bp.components[0] -%}
|
| 91 |
+
{%- set bf = 10 if c.fields[0] in high_cov else 30 -%}
|
| 92 |
+
{%- set sign_mult = "+" if c.sign_direction == "long_high" else "-" -%}
|
| 93 |
+
intraday_mr = (high + low) / 2 - close;
|
| 94 |
+
leg_raw = ts_backfill({{ c.fields[0] }}, {{ bf }});
|
| 95 |
+
mr_z = zscore(ts_rank(intraday_mr, 252));
|
| 96 |
+
leg_z = {{ sign_mult }}1 * zscore(ts_rank(leg_raw, 252));
|
| 97 |
+
score = 0.60 * mr_z + 0.40 * leg_z;
|
| 98 |
+
group_neutralize(rank(score), {{ bp.neutralization.value }})
|
| 99 |
""",
|
| 100 |
}
|
| 101 |
|
| 102 |
_env = Environment(loader=BaseLoader())
|
| 103 |
|
| 104 |
+
# High-coverage fields (don't need long backfill)
|
| 105 |
+
HIGH_COV_FIELDS = {f.id for f in FIELD_INDEX.values() if f.coverage >= 0.85} if FIELD_INDEX else set()
|
| 106 |
+
|
| 107 |
|
| 108 |
COMPILER_SYSTEM_PROMPT = """You are a BRAIN expression compiler. Your ONLY job is to convert
|
| 109 |
a factor blueprint into a valid WorldQuant BRAIN expression.
|
| 110 |
|
| 111 |
RULES:
|
| 112 |
+
1. Use ONLY operators from the BRAIN 71-operator catalogue.
|
| 113 |
2. Every additive operand MUST be wrapped in zscore(), rank(), or group_zscore().
|
| 114 |
3. Output ONLY the expression string — no explanation, no markdown.
|
| 115 |
4. The expression must be syntactically valid (balanced parentheses, correct arity).
|
| 116 |
5. Do NOT include ts_decay_linear — that will be added automatically post-compilation.
|
| 117 |
6. For short-horizon returns (<=20 days), use NEGATIVE sign (cross-sectional reversal).
|
| 118 |
+
7. Use group_neutralize for neutralization. Prefer novel group keys over subindustry.
|
| 119 |
+
8. Always ts_backfill fields with coverage < 0.70 using window of 30 days.
|
| 120 |
+
|
| 121 |
+
Available operators (71 total): rank, zscore, group_zscore, group_rank, group_neutralize,
|
| 122 |
+
indneutralize, ts_mean, ts_std, ts_sum, ts_delta, ts_decay_linear, ts_rank, ts_argmax,
|
| 123 |
+
ts_argmin, ts_correlation, ts_covariance, ts_regression, ts_backfill, ts_delay,
|
| 124 |
+
ts_zscore, ts_skewness, ts_kurtosis, ts_entropy, ts_av_diff, ts_hump, ts_scale,
|
| 125 |
+
ts_decay_exp_window, winsorize, abs, log, sign, power, sqrt, max, min,
|
| 126 |
+
if_else, less, greater, equal, filter, trade_when, mask, vec_avg, vec_sum,
|
| 127 |
+
quantile, bucket, sigmoid, tanh, relu, pasteurize, truncate
|
| 128 |
"""
|
| 129 |
|
| 130 |
|
|
|
|
|
|
|
| 131 |
def _apply_post_compile_rules(expression: str, blueprint: Blueprint) -> str:
|
| 132 |
+
"""Mandatory post-compilation transformations."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
expr = expression.strip()
|
| 134 |
|
| 135 |
+
# Rule 1: Sign flip for short-horizon reversal
|
|
|
|
| 136 |
needs_flip = False
|
| 137 |
for c in blueprint.components:
|
| 138 |
if any(f in ["returns", "close", "ts_returns"] for f in c.fields):
|
| 139 |
if c.horizon_days <= 20 and c.sign_direction == "long_high":
|
| 140 |
needs_flip = True
|
| 141 |
break
|
|
|
|
| 142 |
if needs_flip:
|
| 143 |
expr = f"-({expr})"
|
| 144 |
|
| 145 |
+
# Rule 2: Apply field-level sign from registry
|
| 146 |
+
for c in blueprint.components:
|
| 147 |
+
for fid in c.fields:
|
| 148 |
+
if fid in FIELD_INDEX:
|
| 149 |
+
sign = get_sign_multiplier(FIELD_INDEX[fid])
|
| 150 |
+
if sign == -1 and c.sign_direction != "long_low":
|
| 151 |
+
# Field is inverted but blueprint doesn't know — fix it
|
| 152 |
+
if not expr.startswith("-"):
|
| 153 |
+
expr = f"-({expr})"
|
| 154 |
+
break
|
| 155 |
+
|
| 156 |
+
# Rule 3: ALWAYS wrap in ts_decay_linear
|
| 157 |
decay = max(blueprint.decay, 5)
|
|
|
|
|
|
|
| 158 |
if not expr.startswith("ts_decay_linear("):
|
| 159 |
expr = f"ts_decay_linear({expr}, {decay})"
|
| 160 |
|
| 161 |
return expr
|
| 162 |
|
| 163 |
|
| 164 |
+
async def compile_expression(blueprint: Blueprint, llm: LLMClient) -> Expression:
|
| 165 |
+
"""Convert a Blueprint to a BRAIN expression."""
|
| 166 |
+
|
| 167 |
+
# Try template first
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
if blueprint.archetype in TEMPLATES:
|
| 169 |
template_str = TEMPLATES[blueprint.archetype]
|
| 170 |
template = _env.from_string(template_str)
|
| 171 |
+
expr_text = template.render(bp=blueprint, high_cov=HIGH_COV_FIELDS).strip()
|
| 172 |
+
|
| 173 |
+
# Clean up multi-line expressions (Alpha15 style)
|
| 174 |
+
if ";" in expr_text:
|
| 175 |
+
lines = [l.strip() for l in expr_text.split(";") if l.strip()]
|
| 176 |
+
expr_text = lines[-1] # Last line is the final expression
|
| 177 |
+
# Inline each variable assignment into the final expression (name -> parenthesised definition)
|
| 178 |
+
for line in lines[:-1]:
|
| 179 |
+
var, val = line.split("=", 1)
|
| 180 |
+
expr_text = expr_text.replace(var.strip(), f"({val.strip()})")
|
| 181 |
|
| 182 |
fields_used = []
|
| 183 |
ops_used = []
|
|
|
|
| 185 |
fields_used.extend(c.fields)
|
| 186 |
ops_used.extend(c.operators)
|
| 187 |
|
|
|
|
| 188 |
expr_text = _apply_post_compile_rules(expr_text, blueprint)
|
| 189 |
|
| 190 |
return Expression(
|
|
|
|
| 204 |
- Neutralization: {blueprint.neutralization.value}
|
| 205 |
- Target: cross-sectional rank → long/short
|
| 206 |
|
| 207 |
+
IMPORTANT:
|
| 208 |
+
- Do NOT include ts_decay_linear (added automatically).
|
| 209 |
+
- For fields with coverage < 0.70, wrap in ts_backfill(field, 30).
|
| 210 |
+
- For short-horizon returns (<=20 days), use NEGATIVE sign.
|
| 211 |
+
- Use group_neutralize with the specified neutralization.
|
| 212 |
|
| 213 |
+
Output ONLY the expression string."""
|
| 214 |
|
| 215 |
result = await llm.generate_json(
|
| 216 |
prompt=user_prompt,
|
|
|
|
| 220 |
system_prompt=COMPILER_SYSTEM_PROMPT,
|
| 221 |
)
|
| 222 |
|
|
|
|
| 223 |
result.expression = _apply_post_compile_rules(result.expression, blueprint)
|
| 224 |
if "ts_decay_linear" not in result.operators_used:
|
| 225 |
result.operators_used.append("ts_decay_linear")
|