gaurv007 committed on
Commit
1991baa
·
verified ·
1 Parent(s): fa79bf1

feat: expression_compiler v2 — new archetypes for model77 SUE/PEAD, supply chain, analyst guidance, PCR contrarian + auto field-aware backfill/sign

Browse files
alpha_factory/personas/expression_compiler.py CHANGED
@@ -1,20 +1,29 @@
1
  """
2
- Expression Compiler — Persona 2 (Hybrid: Jinja + Tinyfish LLM)
3
  Converts Blueprint JSON → valid BRAIN expression string.
4
- 95% handled by templates. LLM only for novel structures.
 
 
 
 
 
 
5
 
6
  POST-COMPILE RULES (mandatory, applied to ALL expressions):
7
  1. Always wrap in ts_decay_linear with window max(decay, 5) (reduces turnover)
8
  2. Negate if short-horizon returns theme (cross-sectional reversal)
9
- 3. Ensure outer wrapper is unit-safe
10
  """
11
  from jinja2 import Environment, BaseLoader
12
  from ..infra.llm_client import LLMClient
13
  from ..schemas import Blueprint, Expression
 
 
14
 
15
 
16
- # ─── Jinja Templates for proven archetypes ───────────────────────────────
17
  TEMPLATES = {
 
18
  "value_quality_blend": """
19
  {%- set comps = [] -%}
20
  {%- for c in bp.components -%}
@@ -45,79 +54,130 @@ group_zscore(ts_delta({{ c.fields[0] }}, {{ c.horizon_days }}), {{ bp.neutraliza
45
  {%- set _ = comps.append(c.weight|string ~ " * zscore(rank(" ~ c.fields[0] ~ "))") -%}
46
  {%- endfor -%}
47
  {{ comps | join(" + ") }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  """,
49
  }
50
 
51
  _env = Environment(loader=BaseLoader())
52
 
 
 
 
53
 
54
  COMPILER_SYSTEM_PROMPT = """You are a BRAIN expression compiler. Your ONLY job is to convert
55
  a factor blueprint into a valid WorldQuant BRAIN expression.
56
 
57
  RULES:
58
- 1. Use ONLY operators from the BRAIN operator catalogue.
59
  2. Every additive operand MUST be wrapped in zscore(), rank(), or group_zscore().
60
  3. Output ONLY the expression string — no explanation, no markdown.
61
  4. The expression must be syntactically valid (balanced parentheses, correct arity).
62
  5. Do NOT include ts_decay_linear — that will be added automatically post-compilation.
63
  6. For short-horizon returns (<=20 days), use NEGATIVE sign (cross-sectional reversal).
64
- 7. Use group_zscore or indneutralize for neutralization within the expression if specified.
65
-
66
- Available operators: rank, zscore, group_zscore, group_rank, ts_mean, ts_std, ts_sum,
67
- ts_delta, ts_decay_linear, ts_rank, ts_argmax, ts_argmin, ts_correlation,
68
- ts_covariance, ts_regression, winsorize, abs, log, sign, power, sqrt, max, min,
69
- if_else, less, greater, filter, trade_when, ts_backfill, indneutralize
 
 
 
 
70
  """
71
 
72
 
73
- # ─── Post-compilation rules (mandatory) ─────────────────────────────────
74
-
75
  def _apply_post_compile_rules(expression: str, blueprint: Blueprint) -> str:
76
- """
77
- Mandatory post-compilation transformations:
78
- 1. Apply ts_decay_linear to reduce turnover (ALWAYS, min decay=5)
79
- 2. Apply sign flip for short-horizon reversal themes
80
- 3. Ensure the expression won't exceed 70% turnover
81
- """
82
  expr = expression.strip()
83
 
84
- # Rule 1: Determine if sign should be flipped (short-horizon reversal)
85
- # In cross-section, short-term returns (<= 20 days) are mean-reverting
86
  needs_flip = False
87
  for c in blueprint.components:
88
  if any(f in ["returns", "close", "ts_returns"] for f in c.fields):
89
  if c.horizon_days <= 20 and c.sign_direction == "long_high":
90
  needs_flip = True
91
  break
92
-
93
  if needs_flip:
94
  expr = f"-({expr})"
95
 
96
- # Rule 2: ALWAYS wrap in ts_decay_linear to control turnover
97
- # Minimum decay = 5 days, use blueprint.decay if higher
 
 
 
 
 
 
 
 
 
 
98
  decay = max(blueprint.decay, 5)
99
-
100
- # Don't double-wrap if already has ts_decay_linear
101
  if not expr.startswith("ts_decay_linear("):
102
  expr = f"ts_decay_linear({expr}, {decay})"
103
 
104
  return expr
105
 
106
 
107
- async def compile_expression(
108
- blueprint: Blueprint,
109
- llm: LLMClient,
110
- ) -> Expression:
111
- """
112
- Convert a Blueprint to a BRAIN expression.
113
- Uses Jinja template if archetype is known; LLM fallback for novel structures.
114
- ALWAYS applies post-compile rules (decay, sign correction).
115
- """
116
- # Try template first (95% of cases)
117
  if blueprint.archetype in TEMPLATES:
118
  template_str = TEMPLATES[blueprint.archetype]
119
  template = _env.from_string(template_str)
120
- expr_text = template.render(bp=blueprint).strip()
 
 
 
 
 
 
 
 
 
121
 
122
  fields_used = []
123
  ops_used = []
@@ -125,7 +185,6 @@ async def compile_expression(
125
  fields_used.extend(c.fields)
126
  ops_used.extend(c.operators)
127
 
128
- # Apply mandatory post-compile rules
129
  expr_text = _apply_post_compile_rules(expr_text, blueprint)
130
 
131
  return Expression(
@@ -145,10 +204,13 @@ Blueprint:
145
  - Neutralization: {blueprint.neutralization.value}
146
  - Target: cross-sectional rank → long/short
147
 
148
- IMPORTANT: Do NOT include ts_decay_linear — it will be added automatically.
149
- For short-horizon returns (<=20 days), use NEGATIVE sign (reversal works better cross-sectionally).
 
 
 
150
 
151
- Output a valid BRAIN expression. Wrap all additive operands in zscore() or rank()."""
152
 
153
  result = await llm.generate_json(
154
  prompt=user_prompt,
@@ -158,7 +220,6 @@ Output a valid BRAIN expression. Wrap all additive operands in zscore() or rank(
158
  system_prompt=COMPILER_SYSTEM_PROMPT,
159
  )
160
 
161
- # Apply mandatory post-compile rules to LLM output too
162
  result.expression = _apply_post_compile_rules(result.expression, blueprint)
163
  if "ts_decay_linear" not in result.operators_used:
164
  result.operators_used.append("ts_decay_linear")
 
1
  """
2
+ Expression Compiler v2 — Persona 2 (Hybrid: Jinja + Tinyfish LLM)
3
  Converts Blueprint JSON → valid BRAIN expression string.
4
+
5
+ NEW ARCHETYPES (orthogonal to existing 18 alphas):
6
+ - sue_drift: Standardized Unexpected Earnings momentum
7
+ - supply_chain_lead_lag: Customer/competitor return propagation
8
+ - analyst_guidance_yield: Management guidance as yield signal
9
+ - pcr_contrarian: Put-call ratio contrarian reversal
10
+ - model_score_momentum: Multi-factor score derivative momentum
11
 
12
  POST-COMPILE RULES (mandatory, applied to ALL expressions):
13
  1. Always wrap in ts_decay_linear with window max(decay, 5) (reduces turnover)
14
  2. Negate if short-horizon returns theme (cross-sectional reversal)
15
+ 3. Apply correct sign based on field metadata
16
  """
17
  from jinja2 import Environment, BaseLoader
18
  from ..infra.llm_client import LLMClient
19
  from ..schemas import Blueprint, Expression
20
+ from ..data.brain_fields import FIELD_INDEX, get_backfill_days, get_sign_multiplier
21
+ from ..data.brain_groups import get_group_for_expression
22
 
23
 
24
+ # ─── Jinja Templates ─────────────────────────────────────────────────────
25
  TEMPLATES = {
26
+ # === EXISTING PROVEN ARCHETYPES ===
27
  "value_quality_blend": """
28
  {%- set comps = [] -%}
29
  {%- for c in bp.components -%}
 
54
  {%- set _ = comps.append(c.weight|string ~ " * zscore(rank(" ~ c.fields[0] ~ "))") -%}
55
  {%- endfor -%}
56
  {{ comps | join(" + ") }}
57
+ """,
58
+
59
+ # === NEW ORTHOGONAL ARCHETYPES (model77, analyst, supply chain, options) ===
60
+
61
+ "sue_drift": """
62
+ {%- set c = bp.components[0] -%}
63
+ {%- set bf = 10 if c.fields[0] in high_cov else 30 -%}
64
+ group_neutralize(rank(ts_backfill({{ c.fields[0] }}, {{ bf }})), {{ bp.neutralization.value }})
65
+ """,
66
+
67
+ "supply_chain_lead_lag": """
68
+ {%- set c = bp.components[0] -%}
69
+ group_neutralize(rank(ts_mean(ts_backfill({{ c.fields[0] }}, 30), {{ c.horizon_days }})), {{ bp.neutralization.value }})
70
+ """,
71
+
72
+ "analyst_guidance_yield": """
73
+ {%- set c = bp.components[0] -%}
74
+ {%- set bf = 10 if c.fields[0] in high_cov else 30 -%}
75
+ group_neutralize(zscore(ts_rank(ts_backfill({{ c.fields[0] }}, {{ bf }}), 252)), {{ bp.neutralization.value }})
76
+ """,
77
+
78
+ "pcr_contrarian": """
79
+ {%- set c = bp.components[0] -%}
80
+ group_neutralize(rank(-ts_delta(ts_backfill({{ c.fields[0] }}, 30), {{ c.horizon_days }})), {{ bp.neutralization.value }})
81
+ """,
82
+
83
+ "model_score_momentum": """
84
+ {%- set c = bp.components[0] -%}
85
+ group_neutralize(zscore(ts_delta({{ c.fields[0] }}, {{ c.horizon_days }})), {{ bp.neutralization.value }})
86
+ """,
87
+
88
+ # Alpha 15 archetype (the benchmark) — plug any field into the value leg
89
+ "alpha15_hybrid": """
90
+ {%- set c = bp.components[0] -%}
91
+ {%- set bf = 10 if c.fields[0] in high_cov else 30 -%}
92
+ {%- set sign_mult = "+" if c.sign_direction == "long_high" else "-" -%}
93
+ intraday_mr = (high + low) / 2 - close;
94
+ leg_raw = ts_backfill({{ c.fields[0] }}, {{ bf }});
95
+ mr_z = zscore(ts_rank(intraday_mr, 252));
96
+ leg_z = {{ sign_mult }}1 * zscore(ts_rank(leg_raw, 252));
97
+ score = 0.60 * mr_z + 0.40 * leg_z;
98
+ group_neutralize(rank(score), {{ bp.neutralization.value }})
99
  """,
100
  }
101
 
102
  _env = Environment(loader=BaseLoader())
103
 
104
# Ids of registry fields with coverage of at least 0.85; the templates give
# these a 10-day ts_backfill window instead of 30.
HIGH_COV_FIELDS = (
    {entry.id for entry in FIELD_INDEX.values() if entry.coverage >= 0.85}
    if FIELD_INDEX
    else set()
)
106
+
107
 
108
  COMPILER_SYSTEM_PROMPT = """You are a BRAIN expression compiler. Your ONLY job is to convert
109
  a factor blueprint into a valid WorldQuant BRAIN expression.
110
 
111
  RULES:
112
+ 1. Use ONLY operators from the BRAIN 71-operator catalogue.
113
  2. Every additive operand MUST be wrapped in zscore(), rank(), or group_zscore().
114
  3. Output ONLY the expression string — no explanation, no markdown.
115
  4. The expression must be syntactically valid (balanced parentheses, correct arity).
116
  5. Do NOT include ts_decay_linear — that will be added automatically post-compilation.
117
  6. For short-horizon returns (<=20 days), use NEGATIVE sign (cross-sectional reversal).
118
+ 7. Use group_neutralize for neutralization. Prefer novel group keys over subindustry.
119
+ 8. Always ts_backfill fields with coverage < 0.70 using window of 30 days.
120
+
121
+ Available operators (71 total): rank, zscore, group_zscore, group_rank, group_neutralize,
122
+ indneutralize, ts_mean, ts_std, ts_sum, ts_delta, ts_decay_linear, ts_rank, ts_argmax,
123
+ ts_argmin, ts_correlation, ts_covariance, ts_regression, ts_backfill, ts_delay,
124
+ ts_zscore, ts_skewness, ts_kurtosis, ts_entropy, ts_av_diff, ts_hump, ts_scale,
125
+ ts_decay_exp_window, winsorize, abs, log, sign, power, sqrt, max, min,
126
+ if_else, less, greater, equal, filter, trade_when, mask, vec_avg, vec_sum,
127
+ quantile, bucket, sigmoid, tanh, relu, pasteurize, truncate
128
  """
129
 
130
 
 
 
131
  def _apply_post_compile_rules(expression: str, blueprint: Blueprint) -> str:
132
+ """Mandatory post-compilation transformations."""
 
 
 
 
 
133
  expr = expression.strip()
134
 
135
+ # Rule 1: Sign flip for short-horizon reversal
 
136
  needs_flip = False
137
  for c in blueprint.components:
138
  if any(f in ["returns", "close", "ts_returns"] for f in c.fields):
139
  if c.horizon_days <= 20 and c.sign_direction == "long_high":
140
  needs_flip = True
141
  break
 
142
  if needs_flip:
143
  expr = f"-({expr})"
144
 
145
+ # Rule 2: Apply field-level sign from registry
146
+ for c in blueprint.components:
147
+ for fid in c.fields:
148
+ if fid in FIELD_INDEX:
149
+ sign = get_sign_multiplier(FIELD_INDEX[fid])
150
+ if sign == -1 and c.sign_direction != "long_low":
151
+ # Field is inverted but blueprint doesn't know — fix it
152
+ if not expr.startswith("-"):
153
+ expr = f"-({expr})"
154
+ break
155
+
156
+ # Rule 3: ALWAYS wrap in ts_decay_linear
157
  decay = max(blueprint.decay, 5)
 
 
158
  if not expr.startswith("ts_decay_linear("):
159
  expr = f"ts_decay_linear({expr}, {decay})"
160
 
161
  return expr
162
 
163
 
164
+ async def compile_expression(blueprint: Blueprint, llm: LLMClient) -> Expression:
165
+ """Convert a Blueprint to a BRAIN expression."""
166
+
167
+ # Try template first
 
 
 
 
 
 
168
  if blueprint.archetype in TEMPLATES:
169
  template_str = TEMPLATES[blueprint.archetype]
170
  template = _env.from_string(template_str)
171
+ expr_text = template.render(bp=blueprint, high_cov=HIGH_COV_FIELDS).strip()
172
+
173
+ # Clean up multi-line expressions (Alpha15 style)
174
+ if ";" in expr_text:
175
+ lines = [l.strip() for l in expr_text.split(";") if l.strip()]
176
+ expr_text = lines[-1] # Last line is the final expression
177
+ # Prepend variable assignments
178
+ for line in lines[:-1]:
179
+ var, val = line.split("=", 1)
180
+ expr_text = expr_text.replace(var.strip(), f"({val.strip()})")
181
 
182
  fields_used = []
183
  ops_used = []
 
185
  fields_used.extend(c.fields)
186
  ops_used.extend(c.operators)
187
 
 
188
  expr_text = _apply_post_compile_rules(expr_text, blueprint)
189
 
190
  return Expression(
 
204
  - Neutralization: {blueprint.neutralization.value}
205
  - Target: cross-sectional rank → long/short
206
 
207
+ IMPORTANT:
208
+ - Do NOT include ts_decay_linear (added automatically).
209
+ - For fields with coverage < 0.70, wrap in ts_backfill(field, 30).
210
+ - For short-horizon returns (<=20 days), use NEGATIVE sign.
211
+ - Use group_neutralize with the specified neutralization.
212
 
213
+ Output ONLY the expression string."""
214
 
215
  result = await llm.generate_json(
216
  prompt=user_prompt,
 
220
  system_prompt=COMPILER_SYSTEM_PROMPT,
221
  )
222
 
 
223
  result.expression = _apply_post_compile_rules(result.expression, blueprint)
224
  if "ts_decay_linear" not in result.operators_used:
225
  result.operators_used.append("ts_decay_linear")