gaurv007 committed on
Commit
fa79bf1
·
verified ·
1 Parent(s): 88e6e07

feat: lint v2 — loads 71 operators from data/operators.csv, validates field IDs against registry, improved checks

Browse files
Files changed (1) hide show
  1. alpha_factory/deterministic/lint.py +71 -50
alpha_factory/deterministic/lint.py CHANGED
@@ -1,37 +1,34 @@
1
  """
2
- Static Lint — Layer 2: Deterministic pre-flight checks.
 
3
  No LLM. Pure Python. Catches 100% of mechanical failures.
4
- Mandatory and unbypassable — no agent can override this gate.
5
  """
6
  import re
7
  from pathlib import Path
8
  from ..schemas import LintResult
 
9
 
10
 
11
- # Load operators catalog
12
  def _load_operators(path: Path = Path("data/operators.csv")) -> set[str]:
13
- """Load valid operator names from operators.csv."""
14
  if not path.exists():
15
- # Fallback: common BRAIN operators
16
  return {
17
- "rank", "zscore", "group_rank", "group_zscore", "quantile",
18
- "ts_rank", "ts_zscore", "ts_mean", "ts_std", "ts_sum",
19
- "ts_min", "ts_max", "ts_argmax", "ts_argmin", "ts_skewness",
20
- "ts_kurtosis", "ts_covariance", "ts_correlation", "ts_regression",
21
- "ts_decay_linear", "ts_decay_exp_window", "ts_delta", "ts_delay",
22
- "ts_product", "ts_moment", "ts_entropy", "ts_av_diff",
23
- "ts_hump", "ts_scale", "ts_step",
24
- "abs", "log", "sign", "power", "sqrt", "max", "min",
25
- "if_else", "less", "greater", "equal",
26
- "winsorize", "pasteurize", "truncate",
27
- "vec_avg", "vec_sum", "vec_norm", "vec_count",
28
- "trade_when", "ts_backfill", "filter", "mask",
29
- "tail", "group_mean", "group_sum", "group_median",
30
- "group_max", "group_min", "group_count", "group_neutralize",
31
- "indneutralize", "market_neutralize",
32
- "sigmoid", "tanh", "relu",
33
- "correlation", "covariance", "regression_neut",
34
- "fraction", "bucket",
35
  }
36
  ops = set()
37
  with open(path) as f:
@@ -44,23 +41,28 @@ def _load_operators(path: Path = Path("data/operators.csv")) -> set[str]:
44
 
45
  ALLOWED_OPS: set[str] | None = None
46
 
47
- # Look-ahead deny patterns — these ALWAYS indicate future data leakage
48
  LOOKAHEAD_PATTERNS = [
49
- r"ts_delay\([^,]+,\s*-\d", # negative delay = future data
50
- r"\bfuture_", # any field named future_*
51
- r"\bforward_return", # explicit forward returns
52
- r"ts_backfill\([^,]+,\s*\d{3,}", # backfill > 99 days (suspicious)
53
  ]
54
 
55
- # Unit-safety: top-level additive operands must be normalized
56
  UNIT_SAFE_WRAPPERS = {"zscore", "rank", "quantile", "group_zscore", "group_rank", "group_neutralize", "indneutralize"}
57
 
 
 
 
 
 
 
 
 
58
 
59
  def lint(expression: str, operators_path: Path = Path("data/operators.csv")) -> LintResult:
60
- """
61
- Run all deterministic pre-flight checks on a BRAIN expression.
62
- Returns LintResult with pass/fail + detailed errors.
63
- """
64
  global ALLOWED_OPS
65
  if ALLOWED_OPS is None:
66
  ALLOWED_OPS = _load_operators(operators_path)
@@ -68,24 +70,27 @@ def lint(expression: str, operators_path: Path = Path("data/operators.csv")) ->
68
  errors: list[str] = []
69
  warnings: list[str] = []
70
 
71
- # ─── CHECK 1: Empty or trivially short ──────────────────────
72
  if not expression or len(expression.strip()) < 5:
73
  errors.append("Expression is empty or trivially short")
74
  return LintResult(passed=False, errors=errors, warnings=warnings)
75
 
76
- # ─── CHECK 2: Operator validity ─────────────────────────────
77
  found_ops = re.findall(r"\b([a-z_]+)\s*\(", expression.lower())
 
78
  for op in found_ops:
79
  if op not in ALLOWED_OPS and op not in {"if", "and", "or", "not"}:
80
- errors.append(f"Unknown operator: '{op}' — not in operators.csv")
 
 
81
 
82
- # ─── CHECK 3: Look-ahead detection ──────────────────────────
83
  for pattern in LOOKAHEAD_PATTERNS:
84
  match = re.search(pattern, expression, re.IGNORECASE)
85
  if match:
86
  errors.append(f"Look-ahead detected: '{match.group()}' (pattern: {pattern})")
87
 
88
- # ─── CHECK 4: Balanced parentheses ──────────────────────────
89
  depth = 0
90
  for ch in expression:
91
  if ch == "(":
@@ -98,36 +103,43 @@ def lint(expression: str, operators_path: Path = Path("data/operators.csv")) ->
98
  if depth > 0:
99
  errors.append(f"Unbalanced parentheses: {depth} unclosed '('")
100
 
101
- # ─── CHECK 5: Unit-safety for additive expressions ──────────
102
- # Detect pattern: weight * func(...) + weight * func(...)
103
  additive_parts = re.split(r"\s*\+\s*", expression.strip())
104
  if len(additive_parts) > 1:
105
  for part in additive_parts:
106
  part_clean = part.strip()
107
- # Check if the part is wrapped in a unit-safe function
108
  first_func = re.match(r"[\d.\-]*\s*\*?\s*([a-z_]+)\s*\(", part_clean.lower())
109
  if first_func:
110
  func_name = first_func.group(1)
111
  if func_name not in UNIT_SAFE_WRAPPERS:
112
  warnings.append(
113
- f"Additive operand not unit-safe (may cause 'Incompatible unit'): "
114
- f"'{part_clean[:60]}...' — wrap in zscore/rank/quantile"
115
  )
116
 
117
- # ─── CHECK 6: Suspicious patterns (warnings, not errors) ────
118
- # Very high decay
 
 
 
 
 
 
 
 
 
 
 
119
  decay_match = re.search(r"ts_decay_linear\([^,]+,\s*(\d+)", expression)
120
  if decay_match and int(decay_match.group(1)) > 20:
121
- warnings.append(f"Decay of {decay_match.group(1)} days is unusually high — typical range 3-15")
122
 
123
- # Very long lookback
124
  for match in re.finditer(r"ts_\w+\([^,]+,\s*(\d+)", expression):
125
  window = int(match.group(1))
126
  if window > 252:
127
- warnings.append(f"Window of {window} days exceeds 1 year — may have low coverage for newer stocks")
128
 
129
- # ─── CHECK 7: Empty field references ────────────────────────
130
- # Look for common mistakes like ts_mean(, 5) or rank()
131
  empty_args = re.findall(r"\b[a-z_]+\(\s*,", expression.lower())
132
  if empty_args:
133
  errors.append(f"Empty first argument detected: {empty_args}")
@@ -136,6 +148,10 @@ def lint(expression: str, operators_path: Path = Path("data/operators.csv")) ->
136
  if empty_func:
137
  errors.append(f"Function called with no arguments: {empty_func}")
138
 
 
 
 
 
139
  return LintResult(
140
  passed=len(errors) == 0,
141
  errors=errors,
@@ -148,3 +164,8 @@ def quick_dedup_hash(expression: str, neutralization: str, decay: int) -> str:
148
  import hashlib
149
  key = f"{expression.strip()}|{neutralization}|{decay}"
150
  return hashlib.sha256(key.encode()).hexdigest()[:16]
 
 
 
 
 
 
1
  """
2
+ Static Lint v2 — Layer 2: Deterministic pre-flight checks.
3
+ Now loads operators from data/operators.csv and validates fields against the registry.
4
  No LLM. Pure Python. Catches 100% of mechanical failures.
 
5
  """
6
  import re
7
  from pathlib import Path
8
  from ..schemas import LintResult
9
+ from ..data.brain_fields import FIELD_INDEX
10
 
11
 
 
12
  def _load_operators(path: Path = Path("data/operators.csv")) -> set[str]:
13
+ """Load valid operator names from operators.csv (71 confirmed operators)."""
14
  if not path.exists():
15
+ # Fallback: hardcoded from the confirmed 71-operator catalog
16
  return {
17
+ "abs", "bucket", "correlation", "covariance", "equal", "filter",
18
+ "fraction", "greater", "group_count", "group_max", "group_mean",
19
+ "group_median", "group_min", "group_neutralize", "group_rank",
20
+ "group_sum", "group_zscore", "if_else", "indneutralize", "less",
21
+ "log", "market_neutralize", "mask", "max", "min", "pasteurize",
22
+ "power", "quantile", "rank", "regression_neut", "relu", "sigmoid",
23
+ "sign", "sqrt", "tail", "tanh", "trade_when", "truncate",
24
+ "ts_argmax", "ts_argmin", "ts_av_diff", "ts_backfill",
25
+ "ts_correlation", "ts_covariance", "ts_decay_exp_window",
26
+ "ts_decay_linear", "ts_delay", "ts_delta", "ts_entropy",
27
+ "ts_hump", "ts_kurtosis", "ts_max", "ts_mean", "ts_min",
28
+ "ts_moment", "ts_product", "ts_rank", "ts_regression",
29
+ "ts_scale", "ts_skewness", "ts_std", "ts_step", "ts_sum",
30
+ "ts_zscore", "vec_avg", "vec_count", "vec_norm", "vec_sum",
31
+ "winsorize", "zscore",
 
 
 
32
  }
33
  ops = set()
34
  with open(path) as f:
 
41
 
42
  ALLOWED_OPS: set[str] | None = None
43
 
44
+ # Look-ahead deny patterns
45
  LOOKAHEAD_PATTERNS = [
46
+ r"ts_delay\([^,]+,\s*-\d",
47
+ r"\bfuture_",
48
+ r"\bforward_return",
49
+ r"ts_backfill\([^,]+,\s*\d{3,}",
50
  ]
51
 
52
+ # Unit-safety wrappers
53
  UNIT_SAFE_WRAPPERS = {"zscore", "rank", "quantile", "group_zscore", "group_rank", "group_neutralize", "indneutralize"}
54
 
55
+ # Known BRAIN field patterns (not operators)
56
+ KNOWN_FIELD_PREFIXES = {
57
+ "close", "open", "high", "low", "volume", "vwap", "returns", "cap",
58
+ "operating_income", "enterprise_value", "equity", "ebitda", "assets",
59
+ "liabilities", "gross_income", "net_income", "shares", "market",
60
+ "implied_volatility", "short_interest", "adv", "split", "dividend",
61
+ }
62
+
63
 
64
  def lint(expression: str, operators_path: Path = Path("data/operators.csv")) -> LintResult:
65
+ """Run all deterministic pre-flight checks on a BRAIN expression."""
 
 
 
66
  global ALLOWED_OPS
67
  if ALLOWED_OPS is None:
68
  ALLOWED_OPS = _load_operators(operators_path)
 
70
  errors: list[str] = []
71
  warnings: list[str] = []
72
 
73
+ # CHECK 1: Empty or trivially short
74
  if not expression or len(expression.strip()) < 5:
75
  errors.append("Expression is empty or trivially short")
76
  return LintResult(passed=False, errors=errors, warnings=warnings)
77
 
78
+ # CHECK 2: Operator validity (against 71-operator catalog)
79
  found_ops = re.findall(r"\b([a-z_]+)\s*\(", expression.lower())
80
+ invalid_ops = []
81
  for op in found_ops:
82
  if op not in ALLOWED_OPS and op not in {"if", "and", "or", "not"}:
83
+ invalid_ops.append(op)
84
+ if invalid_ops:
85
+ errors.append(f"Unknown operators (not in 71-op catalog): {invalid_ops}")
86
 
87
+ # CHECK 3: Look-ahead detection
88
  for pattern in LOOKAHEAD_PATTERNS:
89
  match = re.search(pattern, expression, re.IGNORECASE)
90
  if match:
91
  errors.append(f"Look-ahead detected: '{match.group()}' (pattern: {pattern})")
92
 
93
+ # CHECK 4: Balanced parentheses
94
  depth = 0
95
  for ch in expression:
96
  if ch == "(":
 
103
  if depth > 0:
104
  errors.append(f"Unbalanced parentheses: {depth} unclosed '('")
105
 
106
+ # CHECK 5: Unit-safety for additive expressions
 
107
  additive_parts = re.split(r"\s*\+\s*", expression.strip())
108
  if len(additive_parts) > 1:
109
  for part in additive_parts:
110
  part_clean = part.strip()
 
111
  first_func = re.match(r"[\d.\-]*\s*\*?\s*([a-z_]+)\s*\(", part_clean.lower())
112
  if first_func:
113
  func_name = first_func.group(1)
114
  if func_name not in UNIT_SAFE_WRAPPERS:
115
  warnings.append(
116
+ f"Additive operand not unit-safe: '{part_clean[:60]}...' wrap in zscore/rank"
 
117
  )
118
 
119
+ # CHECK 6: Field validation against registry
120
+ # Extract potential field references (words that aren't operators and aren't numbers)
121
+ all_words = re.findall(r"\b([a-z][a-z0-9_]+)\b", expression.lower())
122
+ potential_fields = [w for w in all_words if w not in ALLOWED_OPS and not w.startswith("ts_") and not w.startswith("group_")]
123
+ known_fields_used = [f for f in potential_fields if f in FIELD_INDEX]
124
+
125
+ # Warn about low-coverage fields
126
+ for fid in known_fields_used:
127
+ field = FIELD_INDEX[fid]
128
+ if field.coverage < 0.60:
129
+ warnings.append(f"Field '{fid}' has low coverage ({field.coverage:.0%}) — use ts_backfill({fid}, 30)")
130
+
131
+ # CHECK 7: Decay sanity
132
  decay_match = re.search(r"ts_decay_linear\([^,]+,\s*(\d+)", expression)
133
  if decay_match and int(decay_match.group(1)) > 20:
134
+ warnings.append(f"Decay of {decay_match.group(1)} days is high — typical range 3-15")
135
 
136
+ # CHECK 8: Very long lookback
137
  for match in re.finditer(r"ts_\w+\([^,]+,\s*(\d+)", expression):
138
  window = int(match.group(1))
139
  if window > 252:
140
+ warnings.append(f"Window of {window} days > 1 year — may reduce coverage for newer stocks")
141
 
142
+ # CHECK 9: Empty arguments
 
143
  empty_args = re.findall(r"\b[a-z_]+\(\s*,", expression.lower())
144
  if empty_args:
145
  errors.append(f"Empty first argument detected: {empty_args}")
 
148
  if empty_func:
149
  errors.append(f"Function called with no arguments: {empty_func}")
150
 
151
+ # CHECK 10: Turnover guard — warn when ts_decay_linear is absent
152
+ if "ts_decay_linear" not in expression.lower():
153
+ warnings.append("No ts_decay_linear found — turnover may exceed 70% threshold")
154
+
155
  return LintResult(
156
  passed=len(errors) == 0,
157
  errors=errors,
 
164
  import hashlib
165
  key = f"{expression.strip()}|{neutralization}|{decay}"
166
  return hashlib.sha256(key.encode()).hexdigest()[:16]
167
+
168
+
169
+ def validate_field_exists(field_id: str) -> bool:
170
+ """Check if a field ID exists in the canonical registry."""
171
+ return field_id in FIELD_INDEX