feat: add cleanup utility to purge old bad alphas with fake fields + prevent theme repetition in pipeline
Browse files- alpha_factory/cleanup.py +104 -0
alpha_factory/cleanup.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Cleanup utility — purge old alphas that used fake/placeholder fields.
|
| 3 |
+
Run: uv run python -m alpha_factory.cleanup
|
| 4 |
+
|
| 5 |
+
Also provides batch-level dedup to prevent theme repetition.
|
| 6 |
+
"""
|
| 7 |
+
import duckdb
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Location of the DuckDB alpha store, relative to the current working directory.
DB_PATH = Path("factor_store") / "alphas.duckdb"

# Placeholder field names that are treated as not existing on BRAIN; an alpha
# whose `fields_used` lists any of these is purged by cleanup_fake_alphas().
# NOTE: the expression-text scan in cleanup_fake_alphas() deliberately skips
# close/high/low/open/volume because those are valid pv1 fields.
# Kept in alphabetical order, one name per line, for easy diffing.
FAKE_FIELDS = {
    "amihud_illiquidity",
    "analyst_rating",
    "asset_growth",
    "atr",
    "beta",
    "bid_ask_spread",
    "book_to_price",
    "buyback_yield",
    "close",
    "current_ratio",
    "debt_to_equity",
    "dividend_yield",
    "earnings_growth",
    "earnings_surprise",
    "earnings_yield",
    "estimate_revision",
    "fcf_yield",
    "guidance",
    "high",
    "hv",
    "intraday_range",
    "iv180",
    "iv30",
    "iv60",
    "iv90",
    "ivol",
    "low",
    "news_sentiment",
    "open",
    "pcr",
    "post_earnings_drift",
    "recommendation",
    "returns",
    "revenue_growth",
    "roa",
    "roe",
    "sales_growth",
    "sentiment",
    "shareholder_yield",
    "skew",
    "social_score",
    "social_volume",
    "target_price",
    "term_structure",
    "ts_returns",
    "turnover",
    "volatility",
    "volume",
    "vwap",
}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Names that appear in FAKE_FIELDS but are real pv1 fields on BRAIN; an alpha
# is never purged merely for using one of these.
_VALID_PV1_FIELDS = frozenset({"close", "high", "low", "open", "volume"})


def _uses_fake_field(expression, fields_used, banned):
    """Return True if an alpha references any name in ``banned``.

    Args:
        expression: The alpha's expression text, or None/empty.
        fields_used: The stored field list for the alpha. Normally a sequence
            of names; a plain string is tokenised instead of being iterated
            character by character. May be None/empty.
        banned: Set of lower-case field names that invalidate an alpha.

    Returns:
        True when ``fields_used`` contains a banned name, or when the
        expression contains a banned name as a whole identifier.
    """
    import re  # local import, matching how this module already imports re

    identifier = r"[a-z_][a-z0-9_]*"

    if fields_used:
        if isinstance(fields_used, str):
            # NOTE(review): assumes a string column holds a serialised list
            # of field names -- confirm against the table schema.
            declared = re.findall(identifier, fields_used.lower())
        else:
            declared = fields_used
        if any(name in banned for name in declared):
            return True

    if not expression:
        return False

    # Compare whole identifiers rather than substrings, so that e.g. the
    # fake field "skew" does not match "ts_skewness", nor "roa" "broad".
    tokens = re.findall(identifier, expression.lower())
    return not banned.isdisjoint(tokens)


def cleanup_fake_alphas():
    """Delete alphas that reference placeholder (non-existent) field names.

    Reads every row of the ``alphas`` table in the DuckDB file at ``DB_PATH``
    and deletes those whose ``fields_used`` or ``expression`` mention a name
    from ``FAKE_FIELDS``. close/high/low/open/volume are exempt from both
    checks because they are valid pv1 fields. Prints the deleted ids and the
    number of remaining rows.

    Returns:
        None. Prints "No database found." and returns early when ``DB_PATH``
        does not exist.
    """
    if not DB_PATH.exists():
        print("No database found.")
        return

    banned = FAKE_FIELDS - _VALID_PV1_FIELDS

    conn = duckdb.connect(str(DB_PATH))
    try:
        rows = conn.execute("SELECT alpha_id, expression, fields_used FROM alphas").fetchall()

        to_delete = [
            alpha_id
            for alpha_id, expression, fields_used in rows
            if _uses_fake_field(expression, fields_used, banned)
        ]

        if to_delete:
            # One positional placeholder per id keeps the statement parameterised.
            placeholders = ",".join("?" for _ in to_delete)
            conn.execute(f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete)
            print(f"Deleted {len(to_delete)} alphas with fake/placeholder fields:")
            for aid in to_delete:
                print(f"  - {aid}")
        else:
            print("No fake alphas found. Database is clean.")

        # Show remaining
        count = conn.execute("SELECT COUNT(*) FROM alphas").fetchone()[0]
        print(f"\nRemaining alphas in store: {count}")
    finally:
        # Release the database file even if a query above raised.
        conn.close()
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def cleanup_quoted_expressions():
    """Strip quotes that wrap field names inside stored alpha expressions.

    Scans the ``alphas`` table in the DuckDB file at ``DB_PATH`` for
    expressions containing a single or double quote, rewrites occurrences of
    ``'name'`` / ``"name"`` (lower-case identifier, at least two characters,
    opening and closing quote of the same kind) to the bare ``name``, and
    updates the rows that changed. Prints how many expressions were fixed.

    Returns:
        None. Returns silently when ``DB_PATH`` does not exist.
    """
    if not DB_PATH.exists():
        return

    import re

    # The backreference \1 requires the closing quote to match the opening one.
    quoted_name = re.compile(r"(['\"])([a-z][a-z0-9_]+)\1")

    conn = duckdb.connect(str(DB_PATH))
    try:
        # Pre-filter on both quote characters: the pattern above handles
        # double quotes too, so those rows must be fetched as well.
        rows = conn.execute(
            "SELECT alpha_id, expression FROM alphas "
            "WHERE expression LIKE '%''%' OR expression LIKE '%\"%'"
        ).fetchall()

        fixed = 0
        for alpha_id, expression in rows:
            clean = quoted_name.sub(r"\2", expression)
            if clean != expression:
                conn.execute(
                    "UPDATE alphas SET expression = ? WHERE alpha_id = ?",
                    [clean, alpha_id],
                )
                fixed += 1

        if fixed:
            print(f"Fixed {fixed} expressions with quoted field names.")
        else:
            print("No quoted field names found.")
    finally:
        # Release the database file even if a query above raised.
        conn.close()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
if __name__ == "__main__":
    # Script entry point: run each cleanup pass in order, announcing it first.
    print("=== Alpha Factory Cleanup ===\n")
    steps = (
        ("1. Removing alphas with fake/placeholder fields...", cleanup_fake_alphas),
        ("\n2. Fixing quoted field names in expressions...", cleanup_quoted_expressions),
    )
    for banner, run_step in steps:
        print(banner)
        run_step()
    print("\nDone!")
|