"""
Cleanup utility — purge old alphas that used fake/placeholder fields.
Run: uv run python -m alpha_factory.cleanup
Also provides batch-level dedup to prevent theme repetition.
"""
import duckdb
import logging
from pathlib import Path
# Module-level logger; configured by the __main__ entry point when run as a script.
logger = logging.getLogger(__name__)
# Location of the DuckDB alpha store, relative to the working directory.
DB_PATH = Path("factor_store/alphas.duckdb")
# Fields that genuinely do NOT exist on BRAIN — any alpha using these is invalid.
# NOTE: close, high, low, open, volume, vwap ARE valid BRAIN pv1 fields.
# Removed from blacklist: book_to_price, earnings_yield, returns, vwap, volatility,
# bid_ask_spread, analyst_rating, etc. These are real field names on various datasets.
# cleanup_fake_alphas() checks these both against the fields_used column
# (exact match) and against the expression text (substring match).
FAKE_FIELDS = {
    # Truly fake / placeholder names that don't map to any BRAIN dataset
    "fake_field",
    "placeholder",
    "dummy_field",
    "test_field",
    "sample_field",
    "my_field",
}
def cleanup_fake_alphas():
    """Remove alphas whose fields or expression use placeholder names.

    A row is deleted when either its ``fields_used`` list contains a name
    from FAKE_FIELDS (exact match) or its expression text contains one as
    a substring. Logs each deleted alpha_id and the remaining row count.
    """
    if not DB_PATH.exists():
        logger.info("No database found.")
        return
    conn = duckdb.connect(str(DB_PATH))
    try:
        rows = conn.execute(
            "SELECT alpha_id, expression, fields_used FROM alphas"
        ).fetchall()
        to_delete = []
        for alpha_id, expression, fields_used in rows:
            # Fast path: exact match against the declared field list.
            has_fake = bool(fields_used) and any(f in FAKE_FIELDS for f in fields_used)
            if not has_fake and expression:
                # Fallback: substring scan of the expression text. Substring
                # matching may over-match (a placeholder name embedded in a
                # longer identifier) — acceptable for a cleanup pass.
                expr_lower = expression.lower()
                has_fake = any(fake in expr_lower for fake in FAKE_FIELDS)
            if has_fake:
                to_delete.append(alpha_id)
        if to_delete:
            placeholders = ",".join("?" for _ in to_delete)
            conn.execute(
                f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete
            )
            logger.info("Deleted %d alphas with fake/placeholder fields", len(to_delete))
            for aid in to_delete:
                logger.info("  - %s", aid)
        else:
            logger.info("No fake alphas found. Database is clean.")
        # Show remaining
        count = conn.execute("SELECT COUNT(*) FROM alphas").fetchone()[0]
        logger.info("Remaining alphas in store: %d", count)
    finally:
        # Close even when a query raises, so the DB file isn't left locked.
        conn.close()
def cleanup_quoted_expressions():
    """Strip quotes around field names in stored expressions.

    Quoted lowercase identifiers ('close' or "close") are rewritten to bare
    names. Only matching quote pairs are stripped; mismatched pairs are
    left untouched.
    """
    if not DB_PATH.exists():
        return
    import re
    conn = duckdb.connect(str(DB_PATH))
    try:
        # BUG FIX: the previous filter matched only single quotes, so
        # double-quoted field names were never fetched even though the
        # substitution below handles both quote styles.
        rows = conn.execute(
            "SELECT alpha_id, expression FROM alphas "
            "WHERE expression LIKE '%''%' OR expression LIKE '%\"%'"
        ).fetchall()
        # Backreference (\1) ensures opening and closing quotes match.
        pattern = re.compile(r"(['\"])([a-z][a-z0-9_]+)\1")
        fixed = 0
        for alpha_id, expression in rows:
            clean = pattern.sub(r"\2", expression)
            if clean != expression:
                conn.execute(
                    "UPDATE alphas SET expression = ? WHERE alpha_id = ?",
                    [clean, alpha_id],
                )
                fixed += 1
        if fixed:
            logger.info("Fixed %d expressions with quoted field names.", fixed)
        else:
            logger.info("No quoted field names found.")
    finally:
        # Always release the DB handle, even if a query fails.
        conn.close()
def cleanup_orphans():
    """Delete alphas that reference fields not in FIELD_INDEX.

    Extracts long identifier-like tokens from each expression; an alpha is
    deleted on the first token that is neither a known operator/keyword nor
    present in FIELD_INDEX.
    """
    if not DB_PATH.exists():
        return
    from alpha_factory.data.brain_fields import FIELD_INDEX
    import re
    conn = duckdb.connect(str(DB_PATH))
    try:
        rows = conn.execute("SELECT alpha_id, expression FROM alphas").fetchall()
        # Tokens must be 11+ chars total ([a-z] plus {10,} more) so short
        # common words ("backfill", "close", ...) cannot match at all.
        token_re = re.compile(r"\b([a-z][a-z0-9_]{10,})\b")
        # Operators and known keywords to ignore — hoisted out of the loop
        # (it was rebuilt per row). Entries shorter than 11 chars are kept
        # for safety in case the length threshold is ever lowered.
        skip = {
            "subindustry", "industry", "sector", "market",
            "close", "high", "low", "open", "volume", "vwap",
            # Common English words that might match length filter
            "backfill", "neutralize", "expression",
        }
        # Operator / dataset-family prefixes that are not FIELD_INDEX entries.
        op_prefixes = ("ts_", "group_", "vec_", "pv13_", "mdl", "snt", "scl")
        to_delete = []
        for alpha_id, expression in rows:
            if not expression:
                continue
            for token in token_re.findall(expression.lower()):
                if token.startswith(op_prefixes) or token in skip:
                    continue
                if token not in FIELD_INDEX:
                    to_delete.append(alpha_id)
                    break  # one unknown field is enough to condemn the alpha
        if to_delete:
            placeholders = ",".join("?" for _ in to_delete)
            conn.execute(
                f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete
            )
            logger.info("Deleted %d alphas with unknown field references.", len(to_delete))
    finally:
        # Always release the DB handle, even if a query fails.
        conn.close()
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    logger.info("=== Alpha Factory Cleanup v0.2.0 ===")
    # Run the three cleanup passes in order, announcing each one.
    steps = (
        ("1. Removing alphas with fake/placeholder fields...", cleanup_fake_alphas),
        ("2. Fixing quoted field names in expressions...", cleanup_quoted_expressions),
        ("3. Removing alphas with unknown field references...", cleanup_orphans),
    )
    for banner, step in steps:
        logger.info(banner)
        step()
    logger.info("Done!")