Upload alpha_factory/cleanup.py with huggingface_hub
Browse files — alpha_factory/cleanup.py (+11 −3)
alpha_factory/cleanup.py (CHANGED)
@@ -107,11 +107,19 @@ def cleanup_orphans():

Removed (old lines 107–117; "-" marks deleted lines; the content of removed
lines 110 and 112 was lost in the page extraction and is noted, not guessed):

    107          if not expression:
    108              continue
    109          # Extract all word-like tokens that could be field names
    110 -        <token-extraction line — content lost in page extraction>
    111          # Filter out operators and known keywords
    112 -        skip = {  <original set contents lost in page extraction>
    113          for t in tokens:
    114 -            if t.startswith("ts_") or t.startswith("group_") or t.startswith("vec_")
    115                  continue
    116              if t in skip:
    117                  continue

Added (new lines 107–125; "+" marks inserted lines):

    107          if not expression:
    108              continue
    109          # Extract all word-like tokens that could be field names
    110 +        # Require at least 10 chars to avoid matching common words like "backfill"
    111 +        tokens = re.findall(r"\b([a-z][a-z0-9_]{10,})\b", expression.lower())
    112          # Filter out operators and known keywords
    113 +        skip = {
    114 +            "subindustry", "industry", "sector", "market",
    115 +            "close", "high", "low", "open", "volume", "vwap",
    116 +            # Common English words that might match length filter
    117 +            "backfill", "neutralize", "expression",
    118 +        }
    119          for t in tokens:
    120 +            if t.startswith("ts_") or t.startswith("group_") or t.startswith("vec_"):
    121 +                continue
    122 +            if t.startswith("pv13_") or t.startswith("mdl") or t.startswith("snt") or t.startswith("scl"):
    123                  continue
    124              if t in skip:
    125                  continue