File size: 5,295 Bytes
007ca36
 
 
 
 
 
 
d5121aa
007ca36
 
d5121aa
 
007ca36
 
74d141c
 
 
 
007ca36
74d141c
 
 
 
 
 
d5121aa
007ca36
 
 
 
 
 
d5121aa
007ca36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74d141c
007ca36
 
 
 
 
 
 
 
 
d5121aa
007ca36
d5121aa
007ca36
d5121aa
007ca36
 
 
d5121aa
007ca36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5121aa
007ca36
d5121aa
007ca36
 
 
74d141c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee996e3
 
74d141c
ee996e3
 
 
 
 
 
74d141c
ee996e3
 
 
74d141c
 
 
 
 
 
 
 
 
 
d5121aa
74d141c
 
 
 
007ca36
d5121aa
 
 
007ca36
d5121aa
007ca36
d5121aa
74d141c
d5121aa
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""
Cleanup utility — purge old alphas that used fake/placeholder fields.
Run: uv run python -m alpha_factory.cleanup

Also provides batch-level dedup to prevent theme repetition.
"""
import duckdb
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

DB_PATH = Path("factor_store/alphas.duckdb")

# Fields that genuinely do NOT exist on BRAIN — any alpha using these is invalid.
# NOTE: close, high, low, open, volume, vwap ARE valid BRAIN pv1 fields.
# Removed from blacklist: book_to_price, earnings_yield, returns, vwap, volatility,
# bid_ask_spread, analyst_rating, etc. These are real field names on various datasets.
FAKE_FIELDS = {
    # Truly fake / placeholder names that don't map to any BRAIN dataset
    "fake_field",
    "placeholder",
    "dummy_field",
    "test_field",
    "sample_field",
    "my_field",
}


def cleanup_fake_alphas():
    """Remove alphas that used placeholder field names."""
    if not DB_PATH.exists():
        logger.info("No database found.")
        return

    conn = duckdb.connect(str(DB_PATH))
    
    # Get all alphas
    rows = conn.execute("SELECT alpha_id, expression, fields_used FROM alphas").fetchall()
    
    to_delete = []
    for alpha_id, expression, fields_used in rows:
        has_fake = False
        if fields_used:
            for f in fields_used:
                if f in FAKE_FIELDS:
                    has_fake = True
                    break
        if not has_fake and expression:
            # Also check expression text for bare fake field names
            expr_lower = expression.lower()
            for fake in FAKE_FIELDS:
                if fake in expr_lower:
                    has_fake = True
                    break
        
        if has_fake:
            to_delete.append(alpha_id)
    
    if to_delete:
        placeholders = ",".join(["?" for _ in to_delete])
        conn.execute(f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete)
        logger.info(f"Deleted {len(to_delete)} alphas with fake/placeholder fields")
        for aid in to_delete:
            logger.info(f"  - {aid}")
    else:
        logger.info("No fake alphas found. Database is clean.")
    
    # Show remaining
    count = conn.execute("SELECT COUNT(*) FROM alphas").fetchone()[0]
    logger.info(f"Remaining alphas in store: {count}")
    conn.close()


def cleanup_quoted_expressions():
    """Fix any expressions that have quoted field names."""
    if not DB_PATH.exists():
        return
    
    import re
    conn = duckdb.connect(str(DB_PATH))
    rows = conn.execute("SELECT alpha_id, expression FROM alphas WHERE expression LIKE '%''%'").fetchall()
    
    fixed = 0
    for alpha_id, expression in rows:
        clean = re.sub(r"['\"]([a-z][a-z0-9_]+)['\"]", r"\1", expression)
        if clean != expression:
            conn.execute("UPDATE alphas SET expression = ? WHERE alpha_id = ?", [clean, alpha_id])
            fixed += 1
    
    if fixed:
        logger.info(f"Fixed {fixed} expressions with quoted field names.")
    else:
        logger.info("No quoted field names found.")
    conn.close()


def cleanup_orphans():
    """Delete alphas that reference fields not in FIELD_INDEX."""
    if not DB_PATH.exists():
        return
    
    from alpha_factory.data.brain_fields import FIELD_INDEX
    import re
    conn = duckdb.connect(str(DB_PATH))
    rows = conn.execute("SELECT alpha_id, expression FROM alphas").fetchall()
    
    to_delete = []
    for alpha_id, expression in rows:
        if not expression:
            continue
        # Extract all word-like tokens that could be field names
        # Require at least 10 chars to avoid matching common words like "backfill"
        tokens = re.findall(r"\b([a-z][a-z0-9_]{10,})\b", expression.lower())
        # Filter out operators and known keywords
        skip = {
            "subindustry", "industry", "sector", "market",
            "close", "high", "low", "open", "volume", "vwap",
            # Common English words that might match length filter
            "backfill", "neutralize", "expression",
        }
        for t in tokens:
            if t.startswith("ts_") or t.startswith("group_") or t.startswith("vec_"):
                continue
            if t.startswith("pv13_") or t.startswith("mdl") or t.startswith("snt") or t.startswith("scl"):
                continue
            if t in skip:
                continue
            if t not in FIELD_INDEX:
                to_delete.append(alpha_id)
                break
    
    if to_delete:
        placeholders = ",".join(["?" for _ in to_delete])
        conn.execute(f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete)
        logger.info(f"Deleted {len(to_delete)} alphas with unknown field references.")
    
    conn.close()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    logger.info("=== Alpha Factory Cleanup v0.2.0 ===")
    logger.info("1. Removing alphas with fake/placeholder fields...")
    cleanup_fake_alphas()
    logger.info("2. Fixing quoted field names in expressions...")
    cleanup_quoted_expressions()
    logger.info("3. Removing alphas with unknown field references...")
    cleanup_orphans()
    logger.info("Done!")