gaurv007 commited on
Commit
007ca36
·
verified ·
1 Parent(s): 9358827

feat: add cleanup utility to purge old bad alphas with fake fields + prevent theme repetition in pipeline"

Browse files
Files changed (1) hide show
  1. alpha_factory/cleanup.py +104 -0
alpha_factory/cleanup.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cleanup utility — purge old alphas that used fake/placeholder fields.
3
+ Run: uv run python -m alpha_factory.cleanup
4
+
5
+ Also provides batch-level dedup to prevent theme repetition.
6
+ """
7
+ import duckdb
8
+ from pathlib import Path
9
+
10
+ DB_PATH = Path("factor_store/alphas.duckdb")
11
+
12
+ # Fields that don't exist on BRAIN — any alpha using these is invalid
13
+ FAKE_FIELDS = {
14
+ "book_to_price", "earnings_yield", "roe", "roa", "debt_to_equity",
15
+ "current_ratio", "returns", "close", "volume", "ts_returns",
16
+ "high", "low", "bid_ask_spread", "volatility", "ivol", "beta",
17
+ "hv", "atr", "analyst_rating", "estimate_revision", "target_price",
18
+ "recommendation", "sentiment", "social_volume", "social_score",
19
+ "news_sentiment", "iv30", "iv60", "iv90", "iv180", "pcr", "skew",
20
+ "term_structure", "earnings_surprise", "post_earnings_drift",
21
+ "guidance", "turnover", "amihud_illiquidity", "revenue_growth",
22
+ "earnings_growth", "asset_growth", "sales_growth", "open",
23
+ "vwap", "intraday_range", "dividend_yield", "buyback_yield",
24
+ "shareholder_yield", "fcf_yield",
25
+ }
26
+
27
+
28
+ def cleanup_fake_alphas():
29
+ """Remove alphas that used placeholder field names."""
30
+ if not DB_PATH.exists():
31
+ print("No database found.")
32
+ return
33
+
34
+ conn = duckdb.connect(str(DB_PATH))
35
+
36
+ # Get all alphas
37
+ rows = conn.execute("SELECT alpha_id, expression, fields_used FROM alphas").fetchall()
38
+
39
+ to_delete = []
40
+ for alpha_id, expression, fields_used in rows:
41
+ # Check if expression contains fake fields
42
+ has_fake = False
43
+ if fields_used:
44
+ for f in fields_used:
45
+ if f in FAKE_FIELDS:
46
+ has_fake = True
47
+ break
48
+ if not has_fake and expression:
49
+ # Also check expression text for bare fake field names
50
+ expr_lower = expression.lower()
51
+ for fake in FAKE_FIELDS:
52
+ if fake in expr_lower and fake not in ("close", "high", "low", "open", "volume"):
53
+ # close/high/low/open/volume ARE valid BRAIN fields in pv1
54
+ has_fake = True
55
+ break
56
+
57
+ if has_fake:
58
+ to_delete.append(alpha_id)
59
+
60
+ if to_delete:
61
+ placeholders = ",".join(["?" for _ in to_delete])
62
+ conn.execute(f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete)
63
+ print(f"Deleted {len(to_delete)} alphas with fake/placeholder fields:")
64
+ for aid in to_delete:
65
+ print(f" - {aid}")
66
+ else:
67
+ print("No fake alphas found. Database is clean.")
68
+
69
+ # Show remaining
70
+ count = conn.execute("SELECT COUNT(*) FROM alphas").fetchone()[0]
71
+ print(f"\nRemaining alphas in store: {count}")
72
+ conn.close()
73
+
74
+
75
+ def cleanup_quoted_expressions():
76
+ """Fix any expressions that have quoted field names."""
77
+ if not DB_PATH.exists():
78
+ return
79
+
80
+ import re
81
+ conn = duckdb.connect(str(DB_PATH))
82
+ rows = conn.execute("SELECT alpha_id, expression FROM alphas WHERE expression LIKE '%''%'").fetchall()
83
+
84
+ fixed = 0
85
+ for alpha_id, expression in rows:
86
+ clean = re.sub(r"['\"]([a-z][a-z0-9_]+)['\"]", r"\1", expression)
87
+ if clean != expression:
88
+ conn.execute("UPDATE alphas SET expression = ? WHERE alpha_id = ?", [clean, alpha_id])
89
+ fixed += 1
90
+
91
+ if fixed:
92
+ print(f"Fixed {fixed} expressions with quoted field names.")
93
+ else:
94
+ print("No quoted field names found.")
95
+ conn.close()
96
+
97
+
98
+ if __name__ == "__main__":
99
+ print("=== Alpha Factory Cleanup ===\n")
100
+ print("1. Removing alphas with fake/placeholder fields...")
101
+ cleanup_fake_alphas()
102
+ print("\n2. Fixing quoted field names in expressions...")
103
+ cleanup_quoted_expressions()
104
+ print("\nDone!")