# File size: 5,295 Bytes

"""
Cleanup utility — purge old alphas that used fake/placeholder fields.
Run: uv run python -m alpha_factory.cleanup

Also provides batch-level dedup to prevent theme repetition.
"""
import duckdb
import logging
from pathlib import Path

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Path to the DuckDB alpha store. It is a relative path, so it is resolved
# against the current working directory of the process running the cleanup.
DB_PATH = Path("factor_store/alphas.duckdb")

# Field names that genuinely do NOT exist on BRAIN; any alpha using one of
# these is treated as invalid and purged by cleanup_fake_alphas().
# NOTE: close, high, low, open, volume, vwap ARE valid BRAIN pv1 fields.
# The following were removed from this blacklist because they are real field
# names on various datasets: book_to_price, earnings_yield, returns, vwap,
# volatility, bid_ask_spread, analyst_rating, etc.
# Matching is exact against entries of `fields_used` and by substring against
# the lower-cased expression text, so every entry here must be lower-case.
FAKE_FIELDS = {
    # Truly fake / placeholder names that don't map to any BRAIN dataset
    "fake_field",
    "placeholder",
    "dummy_field",
    "test_field",
    "sample_field",
    "my_field",
}


def cleanup_fake_alphas():
    """Remove alphas that used placeholder field names.

    An alpha is purged when any entry of its ``fields_used`` value is in
    ``FAKE_FIELDS`` or, failing that, when its lower-cased ``expression``
    contains one of those names as a substring. The ids that were deleted
    and the number of rows left in the ``alphas`` table are logged.

    If ``DB_PATH`` does not exist, this only logs a message and returns.

    Returns:
        None.
    """
    if not DB_PATH.exists():
        logger.info("No database found.")
        return

    conn = duckdb.connect(str(DB_PATH))
    # try/finally so the connection is released even when a query raises.
    try:
        rows = conn.execute(
            "SELECT alpha_id, expression, fields_used FROM alphas"
        ).fetchall()

        to_delete = []
        for alpha_id, expression, fields_used in rows:
            # NOTE(review): assumes fields_used is returned as a sequence of
            # field-name strings (or NULL/empty) -- confirm against the schema.
            has_fake = bool(fields_used) and any(
                name in FAKE_FIELDS for name in fields_used
            )
            if not has_fake and expression:
                # Fall back to scanning the expression text itself; this is a
                # plain substring test, not a whole-token match.
                expr_lower = expression.lower()
                has_fake = any(fake in expr_lower for fake in FAKE_FIELDS)

            if has_fake:
                to_delete.append(alpha_id)

        if to_delete:
            # One "?" per id keeps the DELETE fully parameterized.
            placeholders = ",".join("?" for _ in to_delete)
            conn.execute(
                f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})",
                to_delete,
            )
            logger.info(
                "Deleted %d alphas with fake/placeholder fields", len(to_delete)
            )
            for aid in to_delete:
                logger.info("  - %s", aid)
        else:
            logger.info("No fake alphas found. Database is clean.")

        # Show remaining
        count = conn.execute("SELECT COUNT(*) FROM alphas").fetchone()[0]
        logger.info("Remaining alphas in store: %s", count)
    finally:
        conn.close()


def cleanup_quoted_expressions():
    """Fix any expressions that have quoted field names.

    Rows whose ``expression`` contains a single or double quote are loaded,
    and every quoted lower-case identifier (a letter followed by one or more
    of ``a-z``, ``0-9`` or ``_``) has its surrounding quotes stripped. Rows
    that change are written back; a summary is logged.

    Returns silently when ``DB_PATH`` does not exist.

    Returns:
        None.
    """
    if not DB_PATH.exists():
        return

    import re

    # Compiled once; applied to every candidate row below.
    quoted_name = re.compile(r"['\"]([a-z][a-z0-9_]+)['\"]")

    conn = duckdb.connect(str(DB_PATH))
    # try/finally so the connection is released even when a query raises.
    try:
        # The pre-filter must cover both quote characters the regex strips;
        # filtering on single quotes alone would skip double-quoted names.
        rows = conn.execute(
            "SELECT alpha_id, expression FROM alphas "
            "WHERE expression LIKE '%''%' OR expression LIKE '%\"%'"
        ).fetchall()

        fixed = 0
        for alpha_id, expression in rows:
            clean = quoted_name.sub(r"\1", expression)
            if clean != expression:
                conn.execute(
                    "UPDATE alphas SET expression = ? WHERE alpha_id = ?",
                    [clean, alpha_id],
                )
                fixed += 1

        if fixed:
            logger.info("Fixed %d expressions with quoted field names.", fixed)
        else:
            logger.info("No quoted field names found.")
    finally:
        conn.close()


def cleanup_orphans():
    """Delete alphas that reference fields not in FIELD_INDEX.

    Each non-empty expression is lower-cased and scanned for identifier-like
    tokens of 11 or more characters. A token is ignored when it starts with
    one of the skipped prefixes or is in the skip set; any remaining token
    that is not in ``FIELD_INDEX`` causes the whole alpha to be deleted.

    Returns silently when ``DB_PATH`` does not exist.

    Returns:
        None.
    """
    if not DB_PATH.exists():
        return

    from alpha_factory.data.brain_fields import FIELD_INDEX
    import re

    # Extract word-like tokens that could be field names. The pattern is one
    # leading letter plus at least 10 more characters, i.e. 11+ characters in
    # total, which keeps short common words from being considered at all.
    token_re = re.compile(r"\b([a-z][a-z0-9_]{10,})\b")

    # Tokens with these prefixes are never treated as unknown fields.
    skip_prefixes = ("ts_", "group_", "vec_", "pv13_", "mdl", "snt", "scl")

    # Operators and known keywords to ignore. Entries shorter than 11
    # characters cannot pass the length filter above today; they are kept so
    # the set stays correct if that threshold is ever lowered.
    skip = {
        "subindustry", "industry", "sector", "market",
        "close", "high", "low", "open", "volume", "vwap",
        # Common English words that might match length filter
        "backfill", "neutralize", "expression",
    }

    conn = duckdb.connect(str(DB_PATH))
    # try/finally so the connection is released even when a query raises.
    try:
        rows = conn.execute("SELECT alpha_id, expression FROM alphas").fetchall()

        to_delete = []
        for alpha_id, expression in rows:
            if not expression:
                continue
            tokens = token_re.findall(expression.lower())
            # NOTE(review): long function/operator names without a skipped
            # prefix are checked against FIELD_INDEX like any other token --
            # confirm that is intended before running on valuable data.
            has_unknown = any(
                not t.startswith(skip_prefixes)
                and t not in skip
                and t not in FIELD_INDEX
                for t in tokens
            )
            if has_unknown:
                to_delete.append(alpha_id)

        if to_delete:
            # One "?" per id keeps the DELETE fully parameterized.
            placeholders = ",".join("?" for _ in to_delete)
            conn.execute(
                f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})",
                to_delete,
            )
            logger.info(
                "Deleted %d alphas with unknown field references.", len(to_delete)
            )
        else:
            logger.info("No alphas with unknown field references found.")
    finally:
        conn.close()


if __name__ == "__main__":
    # Script entry point: set up console logging, then run each cleanup pass
    # in a fixed order, announcing every step before it starts.
    logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
    logger.info("=== Alpha Factory Cleanup v0.2.0 ===")

    steps = (
        ("1. Removing alphas with fake/placeholder fields...", cleanup_fake_alphas),
        ("2. Fixing quoted field names in expressions...", cleanup_quoted_expressions),
        ("3. Removing alphas with unknown field references...", cleanup_orphans),
    )
    for banner, run_step in steps:
        logger.info(banner)
        run_step()

    logger.info("Done!")