fix: add operator arity validation to lint + enrich LLM prompt with exact arities to prevent 'Invalid number of inputs' errors

Browse files

Files changed (1) hide show

alpha_factory/deterministic/lint.py +84 -61

alpha_factory/deterministic/lint.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """
-Static Lint v2 — Layer 2: Deterministic pre-flight checks.
-Now loads operators from data/operators.csv and validates fields against the registry.
-No LLM. Pure Python. Catches 100% of mechanical failures.
 """
 import re
 from pathlib import Path
@@ -9,27 +8,40 @@ from ..schemas import LintResult
 from ..data.brain_fields import FIELD_INDEX
 def _load_operators(path: Path = Path("data/operators.csv")) -> set[str]:
-    """Load valid operator names from operators.csv (71 confirmed operators)."""
     if not path.exists():
-        # Fallback: hardcoded from the confirmed 71-operator catalog
-        return {
-            "abs", "bucket", "correlation", "covariance", "equal", "filter",
-            "fraction", "greater", "group_count", "group_max", "group_mean",
-            "group_median", "group_min", "group_neutralize", "group_rank",
-            "group_sum", "group_zscore", "if_else", "indneutralize", "less",
-            "log", "market_neutralize", "mask", "max", "min", "pasteurize",
-            "power", "quantile", "rank", "regression_neut", "relu", "sigmoid",
-            "sign", "sqrt", "tail", "tanh", "trade_when", "truncate",
-            "ts_argmax", "ts_argmin", "ts_av_diff", "ts_backfill",
-            "ts_correlation", "ts_covariance", "ts_decay_exp_window",
-            "ts_decay_linear", "ts_delay", "ts_delta", "ts_entropy",
-            "ts_hump", "ts_kurtosis", "ts_max", "ts_mean", "ts_min",
-            "ts_moment", "ts_product", "ts_rank", "ts_regression",
-            "ts_scale", "ts_skewness", "ts_std", "ts_step", "ts_sum",
-            "ts_zscore", "vec_avg", "vec_count", "vec_norm", "vec_sum",
-            "winsorize", "zscore",
-        }
     ops = set()
     with open(path) as f:
         for line in f:
@@ -41,7 +53,6 @@ def _load_operators(path: Path = Path("data/operators.csv")) -> set[str]:
 ALLOWED_OPS: set[str] | None = None
-# Look-ahead deny patterns
 LOOKAHEAD_PATTERNS = [
     r"ts_delay\([^,]+,\s*-\d",
     r"\bfuture_",
@@ -49,16 +60,29 @@ LOOKAHEAD_PATTERNS = [
     r"ts_backfill\([^,]+,\s*\d{3,}",
 ]
-# Unit-safety wrappers
 UNIT_SAFE_WRAPPERS = {"zscore", "rank", "quantile", "group_zscore", "group_rank", "group_neutralize", "indneutralize"}
-# Known BRAIN field patterns (not operators)
-KNOWN_FIELD_PREFIXES = {
-    "close", "open", "high", "low", "volume", "vwap", "returns", "cap",
-    "operating_income", "enterprise_value", "equity", "ebitda", "assets",
-    "liabilities", "gross_income", "net_income", "shares", "market",
-    "implied_volatility", "short_interest", "adv", "split", "dividend",
-}
 def lint(expression: str, operators_path: Path = Path("data/operators.csv")) -> LintResult:
@@ -75,20 +99,17 @@ def lint(expression: str, operators_path: Path = Path("data/operators.csv")) ->
         errors.append("Expression is empty or trivially short")
         return LintResult(passed=False, errors=errors, warnings=warnings)
-    # CHECK 2: Operator validity (against 71-operator catalog)
     found_ops = re.findall(r"\b([a-z_]+)\s*\(", expression.lower())
-    invalid_ops = []
     for op in found_ops:
         if op not in ALLOWED_OPS and op not in {"if", "and", "or", "not"}:
-            invalid_ops.append(op)
-    if invalid_ops:
-        errors.append(f"Unknown operators (not in 71-op catalog): {invalid_ops}")
     # CHECK 3: Look-ahead detection
     for pattern in LOOKAHEAD_PATTERNS:
         match = re.search(pattern, expression, re.IGNORECASE)
         if match:
-            errors.append(f"Look-ahead detected: '{match.group()}' (pattern: {pattern})")
     # CHECK 4: Balanced parentheses
     depth = 0
@@ -103,7 +124,20 @@ def lint(expression: str, operators_path: Path = Path("data/operators.csv")) ->
     if depth > 0:
         errors.append(f"Unbalanced parentheses: {depth} unclosed '('")
-    # CHECK 5: Unit-safety for additive expressions
     additive_parts = re.split(r"\s*\+\s*", expression.strip())
     if len(additive_parts) > 1:
         for part in additive_parts:
@@ -112,45 +146,36 @@ def lint(expression: str, operators_path: Path = Path("data/operators.csv")) ->
             if first_func:
                 func_name = first_func.group(1)
                 if func_name not in UNIT_SAFE_WRAPPERS:
-                    warnings.append(
-                        f"Additive operand not unit-safe: '{part_clean[:60]}...' — wrap in zscore/rank"
-                    )
-    # CHECK 6: Field validation against registry
-    # Extract potential field references (words that aren't operators and aren't numbers)
     all_words = re.findall(r"\b([a-z][a-z0-9_]+)\b", expression.lower())
-    potential_fields = [w for w in all_words if w not in ALLOWED_OPS and not w.startswith("ts_") and not w.startswith("group_")]
-    known_fields_used = [f for f in potential_fields if f in FIELD_INDEX]
-    # Warn about low-coverage fields
-    for fid in known_fields_used:
-        field = FIELD_INDEX[fid]
-        if field.coverage < 0.60:
-            warnings.append(f"Field '{fid}' has low coverage ({field.coverage:.0%}) — use ts_backfill({fid}, 30)")
-    # CHECK 7: Decay sanity
     decay_match = re.search(r"ts_decay_linear\([^,]+,\s*(\d+)", expression)
     if decay_match and int(decay_match.group(1)) > 20:
         warnings.append(f"Decay of {decay_match.group(1)} days is high — typical range 3-15")
-    # CHECK 8: Very long lookback
     for match in re.finditer(r"ts_\w+\([^,]+,\s*(\d+)", expression):
         window = int(match.group(1))
         if window > 252:
-            warnings.append(f"Window of {window} days > 1 year — may reduce coverage for newer stocks")
-    # CHECK 9: Empty arguments
     empty_args = re.findall(r"\b[a-z_]+\(\s*,", expression.lower())
     if empty_args:
         errors.append(f"Empty first argument detected: {empty_args}")
     empty_func = re.findall(r"\b[a-z_]+\(\s*\)", expression.lower())
     if empty_func:
         errors.append(f"Function called with no arguments: {empty_func}")
-    # CHECK 10: Turnover guard — must have ts_decay_linear
     if "ts_decay_linear" not in expression.lower():
-        warnings.append("No ts_decay_linear found — turnover may exceed 70% threshold")
     return LintResult(
         passed=len(errors) == 0,
@@ -160,12 +185,10 @@ def lint(expression: str, operators_path: Path = Path("data/operators.csv")) ->
 def quick_dedup_hash(expression: str, neutralization: str, decay: int) -> str:
-    """Generate a deterministic hash for dedup against factor store."""
     import hashlib
     key = f"{expression.strip()}|{neutralization}|{decay}"
     return hashlib.sha256(key.encode()).hexdigest()[:16]
 def validate_field_exists(field_id: str) -> bool:
-    """Check if a field ID exists in the canonical registry."""
     return field_id in FIELD_INDEX

 """
+Static Lint v3 — Layer 2: Deterministic pre-flight checks.
+Now validates operator ARITY (argument count) to catch 'Invalid number of inputs' errors.
 """
 import re
 from pathlib import Path
 from ..data.brain_fields import FIELD_INDEX
# Operator → MINIMUM number of positional arguments.
# The arity check only rejects calls with FEWER arguments than listed here, so
# an operator whose trailing parameters are optional must be listed with the
# count of its mandatory arguments only; otherwise valid calls are rejected.
OPERATOR_ARITY: dict[str, int] = {
    # 1-argument operators
    "rank": 1, "zscore": 1, "quantile": 1, "abs": 1, "log": 1,
    "sign": 1, "sqrt": 1, "sigmoid": 1, "tanh": 1, "relu": 1,
    "pasteurize": 1, "truncate": 1, "fraction": 1,
    "vec_avg": 1, "vec_sum": 1, "vec_norm": 1, "vec_count": 1,
    # Only the first argument is mandatory for these two.
    # NOTE(review): per the BRAIN operator reference, ts_step is called as
    # ts_step(1) and winsorize is winsorize(x, std=4) with std optional —
    # confirm against data/operators.csv before relying on this.
    "ts_step": 1, "winsorize": 1,
    # 2-argument operators
    "ts_mean": 2, "ts_std": 2, "ts_sum": 2, "ts_min": 2, "ts_max": 2,
    "ts_rank": 2, "ts_zscore": 2, "ts_delta": 2, "ts_delay": 2,
    "ts_decay_linear": 2, "ts_argmax": 2, "ts_argmin": 2,
    "ts_skewness": 2, "ts_kurtosis": 2, "ts_entropy": 2,
    "ts_product": 2, "ts_moment": 2, "ts_av_diff": 2,
    "ts_hump": 2, "ts_scale": 2,
    "ts_decay_exp_window": 2, "ts_backfill": 2,
    "group_neutralize": 2, "group_rank": 2, "group_zscore": 2,
    "group_mean": 2, "group_sum": 2, "group_median": 2,
    "group_max": 2, "group_min": 2, "group_count": 2,
    "indneutralize": 2, "market_neutralize": 2,
    "power": 2, "max": 2, "min": 2,
    "less": 2, "greater": 2, "equal": 2,
    "bucket": 2, "tail": 2, "mask": 2, "filter": 2,
    # 3-argument operators
    "ts_correlation": 3, "ts_covariance": 3,
    "if_else": 3, "trade_when": 3,
    # Variable (3+)
    "ts_regression": 3,
}
 def _load_operators(path: Path = Path("data/operators.csv")) -> set[str]:
+    """Load valid operator names from operators.csv."""
     if not path.exists():
+        return set(OPERATOR_ARITY.keys())
     ops = set()
     with open(path) as f:
         for line in f:
 ALLOWED_OPS: set[str] | None = None
 LOOKAHEAD_PATTERNS = [
     r"ts_delay\([^,]+,\s*-\d",
     r"\bfuture_",
     r"ts_backfill\([^,]+,\s*\d{3,}",
 ]
 UNIT_SAFE_WRAPPERS = {"zscore", "rank", "quantile", "group_zscore", "group_rank", "group_neutralize", "indneutralize"}
+def _count_args(expression: str, start_pos: int) -> int:
+    """Count arguments of a function call starting at the opening paren."""
+    depth = 0
+    arg_count = 1  # At least 1 if there's any content
+    i = start_pos
+    has_content = False
+    while i < len(expression):
+        ch = expression[i]
+        if ch == '(':
+            depth += 1
+        elif ch == ')':
+            depth -= 1
+            if depth == 0:
+                return arg_count if has_content else 0
+        elif ch == ',' and depth == 1:
+            arg_count += 1
+        elif ch not in ' \t\n' and depth == 1:
+            has_content = True
+        i += 1
+    return arg_count if has_content else 0
 def lint(expression: str, operators_path: Path = Path("data/operators.csv")) -> LintResult:
         errors.append("Expression is empty or trivially short")
         return LintResult(passed=False, errors=errors, warnings=warnings)
+    # CHECK 2: Operator validity
     found_ops = re.findall(r"\b([a-z_]+)\s*\(", expression.lower())
     for op in found_ops:
         if op not in ALLOWED_OPS and op not in {"if", "and", "or", "not"}:
+            errors.append(f"Unknown operator: '{op}' — not in 71-op catalog")
     # CHECK 3: Look-ahead detection
     for pattern in LOOKAHEAD_PATTERNS:
         match = re.search(pattern, expression, re.IGNORECASE)
         if match:
+            errors.append(f"Look-ahead detected: '{match.group()}'")
     # CHECK 4: Balanced parentheses
     depth = 0
     if depth > 0:
         errors.append(f"Unbalanced parentheses: {depth} unclosed '('")
+    # CHECK 5: Operator ARITY validation (catches "Invalid number of inputs")
+    for match in re.finditer(r"\b([a-z_]+)\s*\(", expression.lower()):
+        op_name = match.group(1)
+        if op_name in OPERATOR_ARITY:
+            expected_min = OPERATOR_ARITY[op_name]
+            paren_start = match.end() - 1  # position of '('
+            actual_args = _count_args(expression.lower(), paren_start)
+            if actual_args < expected_min:
+                errors.append(
+                    f"Arity error: '{op_name}' requires {expected_min} args, got {actual_args}. "
+                    f"Example: {op_name}(field, days)" if expected_min == 2 else f"Example: {op_name}(x, y, z)"
+                )
+    # CHECK 6: Unit-safety for additive expressions
     additive_parts = re.split(r"\s*\+\s*", expression.strip())
     if len(additive_parts) > 1:
         for part in additive_parts:
             if first_func:
                 func_name = first_func.group(1)
                 if func_name not in UNIT_SAFE_WRAPPERS:
+                    warnings.append(f"Additive operand not unit-safe: '{part_clean[:60]}...' — wrap in zscore/rank")
+    # CHECK 7: Field coverage validation
     all_words = re.findall(r"\b([a-z][a-z0-9_]+)\b", expression.lower())
+    for fid in all_words:
+        if fid in FIELD_INDEX and FIELD_INDEX[fid].coverage < 0.60:
+            warnings.append(f"Field '{fid}' has low coverage ({FIELD_INDEX[fid].coverage:.0%}) — use ts_backfill({fid}, 30)")
+    # CHECK 8: Decay sanity
     decay_match = re.search(r"ts_decay_linear\([^,]+,\s*(\d+)", expression)
     if decay_match and int(decay_match.group(1)) > 20:
         warnings.append(f"Decay of {decay_match.group(1)} days is high — typical range 3-15")
+    # CHECK 9: Very long lookback
     for match in re.finditer(r"ts_\w+\([^,]+,\s*(\d+)", expression):
         window = int(match.group(1))
         if window > 252:
+            warnings.append(f"Window of {window} days > 1 year — may reduce coverage")
+    # CHECK 10: Empty arguments
     empty_args = re.findall(r"\b[a-z_]+\(\s*,", expression.lower())
     if empty_args:
         errors.append(f"Empty first argument detected: {empty_args}")
     empty_func = re.findall(r"\b[a-z_]+\(\s*\)", expression.lower())
     if empty_func:
         errors.append(f"Function called with no arguments: {empty_func}")
+    # CHECK 11: Turnover guard
     if "ts_decay_linear" not in expression.lower():
+        warnings.append("No ts_decay_linear found — turnover may exceed 70%")
     return LintResult(
         passed=len(errors) == 0,
 def quick_dedup_hash(expression: str, neutralization: str, decay: int) -> str:
     """Return a short, deterministic fingerprint for an alpha configuration.

     The expression (with surrounding whitespace stripped), the neutralization
     setting and the decay value are joined with '|' and hashed with SHA-256;
     the first 16 hex characters of the digest are returned.
     """
     import hashlib

     key_bytes = f"{expression.strip()}|{neutralization}|{decay}".encode()
     digest = hashlib.sha256(key_bytes).hexdigest()
     return digest[:16]
 def validate_field_exists(field_id: str) -> bool:
     """Return True when ``field_id`` is a key of the FIELD_INDEX registry."""
     is_registered = field_id in FIELD_INDEX
     return is_registered