""" Cleanup utility — purge old alphas that used fake/placeholder fields. Run: uv run python -m alpha_factory.cleanup Also provides batch-level dedup to prevent theme repetition. """ import duckdb from pathlib import Path DB_PATH = Path("factor_store/alphas.duckdb") # Fields that genuinely do NOT exist on BRAIN — any alpha using these is invalid. # NOTE: close, high, low, open, volume, vwap ARE valid BRAIN pv1 fields. # Removed from blacklist: book_to_price, earnings_yield, returns, vwap, volatility, # bid_ask_spread, analyst_rating, etc. These are real field names on various datasets. FAKE_FIELDS = { # Truly fake / placeholder names that don't map to any BRAIN dataset "fake_field", "placeholder", "dummy_field", "test_field", "my_field", "sample_field", } def cleanup_fake_alphas(): """Remove alphas that used placeholder field names.""" if not DB_PATH.exists(): print("No database found.") return conn = duckdb.connect(str(DB_PATH)) # Get all alphas rows = conn.execute("SELECT alpha_id, expression, fields_used FROM alphas").fetchall() to_delete = [] for alpha_id, expression, fields_used in rows: has_fake = False if fields_used: for f in fields_used: if f in FAKE_FIELDS: has_fake = True break if not has_fake and expression: # Also check expression text for bare fake field names expr_lower = expression.lower() for fake in FAKE_FIELDS: if fake in expr_lower: has_fake = True break if has_fake: to_delete.append(alpha_id) if to_delete: placeholders = ",".join(["?" for _ in to_delete]) conn.execute(f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete) print(f"Deleted {len(to_delete)} alphas with fake/placeholder fields:") for aid in to_delete: print(f" - {aid}") else: print("No fake alphas found. Database is clean.") # Show remaining count = conn.execute("SELECT COUNT(*) FROM alphas").fetchone()[0] print(f"\nRemaining alphas in store: {count}") conn.close() def cleanup_quoted_expressions(): """Fix any expressions that have quoted field names.""" if not DB_PATH.exists(): return import re conn = duckdb.connect(str(DB_PATH)) rows = conn.execute("SELECT alpha_id, expression FROM alphas WHERE expression LIKE '%''%'").fetchall() fixed = 0 for alpha_id, expression in rows: clean = re.sub(r"['\"]([a-z][a-z0-9_]+)['\"]", r"\1", expression) if clean != expression: conn.execute("UPDATE alphas SET expression = ? WHERE alpha_id = ?", [clean, alpha_id]) fixed += 1 if fixed: print(f"Fixed {fixed} expressions with quoted field names.") else: print("No quoted field names found.") conn.close() def cleanup_orphans(): """Delete alphas that reference fields not in FIELD_INDEX.""" if not DB_PATH.exists(): return from alpha_factory.data.brain_fields import FIELD_INDEX import re conn = duckdb.connect(str(DB_PATH)) rows = conn.execute("SELECT alpha_id, expression FROM alphas").fetchall() to_delete = [] for alpha_id, expression in rows: if not expression: continue # Extract all word-like tokens that could be field names # Require at least 10 chars to avoid matching common words like "backfill" tokens = re.findall(r"\b([a-z][a-z0-9_]{10,})\b", expression.lower()) # Filter out operators and known keywords skip = { "subindustry", "industry", "sector", "market", "close", "high", "low", "open", "volume", "vwap", # Common English words that might match length filter "backfill", "neutralize", "expression", } for t in tokens: if t.startswith("ts_") or t.startswith("group_") or t.startswith("vec_"): continue if t.startswith("pv13_") or t.startswith("mdl") or t.startswith("snt") or t.startswith("scl"): continue if t in skip: continue if t not in FIELD_INDEX: to_delete.append(alpha_id) break if to_delete: placeholders = ",".join(["?" for _ in to_delete]) conn.execute(f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete) print(f"Deleted {len(to_delete)} alphas with unknown field references.") conn.close() if __name__ == "__main__": print("=== Alpha Factory Cleanup v0.2.0 ===\n") print("1. Removing alphas with fake/placeholder fields...") cleanup_fake_alphas() print("\n2. Fixing quoted field names in expressions...") cleanup_quoted_expressions() print("\n3. Removing alphas with unknown field references...") cleanup_orphans() print("\nDone!")