gaurv007's picture
Upload alpha_factory/cleanup.py with huggingface_hub
ee996e3 verified
"""
Cleanup utility — purge old alphas that used fake/placeholder fields.
Run: uv run python -m alpha_factory.cleanup
Also provides batch-level dedup to prevent theme repetition.
"""
import duckdb
from pathlib import Path
# DuckDB file holding the alpha store; resolved relative to the working directory.
DB_PATH = Path("factor_store/alphas.duckdb")

# Blacklist of field names that exist on no BRAIN dataset. Any alpha that
# references one of them is treated as invalid by the cleanup passes.
#
# Real fields must never be listed here: close, high, low, open, volume and
# vwap are valid pv1 fields, and names such as book_to_price, earnings_yield,
# returns, volatility, bid_ask_spread and analyst_rating were removed from
# this set because they are genuine field names on various datasets.
FAKE_FIELDS = {
    "dummy_field",
    "fake_field",
    "my_field",
    "placeholder",
    "sample_field",
    "test_field",
}
def cleanup_fake_alphas():
    """Remove alphas that used placeholder field names.

    An alpha is deleted when any entry of its ``fields_used`` value is a
    member of ``FAKE_FIELDS``, or when its lower-cased expression text
    contains one of those names as a whole identifier. The ids of deleted
    alphas and the number of alphas remaining are printed.

    If the database file does not exist, a notice is printed and nothing
    else happens.
    """
    import re

    if not DB_PATH.exists():
        print("No database found.")
        return

    # Match fake names as whole identifiers only. "\b" treats letters, digits
    # and "_" as word characters, so "latest_field" is not mistaken for
    # "test_field" the way a plain substring test would be.
    # An empty blacklist must match nothing: an empty alternation would match
    # at every word boundary and flag every alpha for deletion.
    fake_name_re = None
    if FAKE_FIELDS:
        alternatives = "|".join(re.escape(name) for name in sorted(FAKE_FIELDS))
        fake_name_re = re.compile(rf"\b(?:{alternatives})\b")

    conn = duckdb.connect(str(DB_PATH))
    try:
        rows = conn.execute(
            "SELECT alpha_id, expression, fields_used FROM alphas"
        ).fetchall()

        to_delete = []
        for alpha_id, expression, fields_used in rows:
            # NOTE(review): assumes fields_used comes back as an iterable of
            # field-name strings (e.g. a DuckDB LIST column) — confirm schema.
            has_fake = bool(fields_used) and any(
                field in FAKE_FIELDS for field in fields_used
            )
            if not has_fake and expression and fake_name_re is not None:
                # Fall back to scanning the expression text itself.
                has_fake = fake_name_re.search(expression.lower()) is not None
            if has_fake:
                to_delete.append(alpha_id)

        if to_delete:
            placeholders = ",".join("?" for _ in to_delete)
            conn.execute(
                f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})",
                to_delete,
            )
            print(f"Deleted {len(to_delete)} alphas with fake/placeholder fields:")
            for aid in to_delete:
                print(f" - {aid}")
        else:
            print("No fake alphas found. Database is clean.")

        # Show remaining
        count = conn.execute("SELECT COUNT(*) FROM alphas").fetchone()[0]
        print(f"\nRemaining alphas in store: {count}")
    finally:
        # Release the database handle even when a query fails.
        conn.close()
def cleanup_quoted_expressions():
    """Fix any expressions that have quoted field names.

    Rewrites stored expressions so that a lower-case identifier wrapped in a
    matching pair of single or double quotes (``'close'`` or ``"close"``)
    becomes the bare identifier. Prints how many expressions were updated.

    Returns silently when the database file does not exist.
    """
    import re

    if not DB_PATH.exists():
        return

    # The backreference forces the closing quote to be the same character as
    # the opening one, so a stray mismatched pair such as 'name" is left alone.
    quoted_name = re.compile(r"""(['"])([a-z][a-z0-9_]+)\1""")

    conn = duckdb.connect(str(DB_PATH))
    try:
        # Pre-filter on BOTH quote characters; filtering on single quotes only
        # would mean double-quoted names never reach the regex below.
        rows = conn.execute(
            "SELECT alpha_id, expression FROM alphas "
            "WHERE expression LIKE '%''%' OR expression LIKE '%\"%'"
        ).fetchall()

        fixed = 0
        for alpha_id, expression in rows:
            clean = quoted_name.sub(r"\2", expression)
            if clean != expression:
                conn.execute(
                    "UPDATE alphas SET expression = ? WHERE alpha_id = ?",
                    [clean, alpha_id],
                )
                fixed += 1

        if fixed:
            print(f"Fixed {fixed} expressions with quoted field names.")
        else:
            print("No quoted field names found.")
    finally:
        # Release the database handle even when a query fails.
        conn.close()
def cleanup_orphans():
    """Delete alphas that reference fields not in FIELD_INDEX.

    Each expression is lower-cased and scanned for long identifier-like
    tokens. Tokens with a known operator/dataset prefix, or listed in the
    skip set, are ignored; an alpha is deleted as soon as one remaining token
    is missing from ``FIELD_INDEX``. Prints the number of deleted alphas when
    there are any.

    Returns silently when the database file does not exist.
    """
    if not DB_PATH.exists():
        return
    from alpha_factory.data.brain_fields import FIELD_INDEX
    import re

    # Candidate field names: one leading letter plus at least 10 more
    # characters, i.e. tokens of 11+ characters. Shorter words are never
    # examined, which keeps common short words out of the check.
    candidate_re = re.compile(r"\b([a-z][a-z0-9_]{10,})\b")

    # Prefixes of tokens that are never treated as field references.
    ignored_prefixes = ("ts_", "group_", "vec_", "pv13_", "mdl", "snt", "scl")

    # Known keywords that are not fields.
    # NOTE(review): with the 11-character minimum above, only "subindustry"
    # is long enough to ever be compared against this set.
    skip = {
        "subindustry", "industry", "sector", "market",
        "close", "high", "low", "open", "volume", "vwap",
        # Common English words that might match length filter
        "backfill", "neutralize", "expression",
    }

    conn = duckdb.connect(str(DB_PATH))
    try:
        rows = conn.execute("SELECT alpha_id, expression FROM alphas").fetchall()

        to_delete = []
        for alpha_id, expression in rows:
            if not expression:
                continue
            for token in candidate_re.findall(expression.lower()):
                if token.startswith(ignored_prefixes) or token in skip:
                    continue
                if token not in FIELD_INDEX:
                    # One unknown reference is enough to condemn the alpha.
                    to_delete.append(alpha_id)
                    break

        if to_delete:
            placeholders = ",".join("?" for _ in to_delete)
            conn.execute(
                f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})",
                to_delete,
            )
            print(f"Deleted {len(to_delete)} alphas with unknown field references.")
    finally:
        # Release the database handle even when a query fails.
        conn.close()
if __name__ == "__main__":
    # Script entry point: print the banner, then run each cleanup pass in
    # order, announcing every pass with its numbered header.
    print("=== Alpha Factory Cleanup v0.2.0 ===\n")
    steps = (
        ("1. Removing alphas with fake/placeholder fields...", cleanup_fake_alphas),
        ("\n2. Fixing quoted field names in expressions...", cleanup_quoted_expressions),
        ("\n3. Removing alphas with unknown field references...", cleanup_orphans),
    )
    for header, run_step in steps:
        print(header)
        run_step()
    print("\nDone!")