| """ |
| Cleanup utility — purge old alphas that used fake/placeholder fields. |
| Run: uv run python -m alpha_factory.cleanup |
| |
| Also provides batch-level dedup to prevent theme repetition. |
| """ |
| import duckdb |
| from pathlib import Path |
|
|
| DB_PATH = Path("factor_store/alphas.duckdb") |
|
|
| |
| |
| |
| |
| FAKE_FIELDS = { |
| |
| "fake_field", |
| "placeholder", |
| "dummy_field", |
| "test_field", |
| "my_field", |
| "sample_field", |
| } |
|
|
|
|
| def cleanup_fake_alphas(): |
| """Remove alphas that used placeholder field names.""" |
| if not DB_PATH.exists(): |
| print("No database found.") |
| return |
|
|
| conn = duckdb.connect(str(DB_PATH)) |
| |
| |
| rows = conn.execute("SELECT alpha_id, expression, fields_used FROM alphas").fetchall() |
| |
| to_delete = [] |
| for alpha_id, expression, fields_used in rows: |
| has_fake = False |
| if fields_used: |
| for f in fields_used: |
| if f in FAKE_FIELDS: |
| has_fake = True |
| break |
| if not has_fake and expression: |
| |
| expr_lower = expression.lower() |
| for fake in FAKE_FIELDS: |
| if fake in expr_lower: |
| has_fake = True |
| break |
| |
| if has_fake: |
| to_delete.append(alpha_id) |
| |
| if to_delete: |
| placeholders = ",".join(["?" for _ in to_delete]) |
| conn.execute(f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete) |
| print(f"Deleted {len(to_delete)} alphas with fake/placeholder fields:") |
| for aid in to_delete: |
| print(f" - {aid}") |
| else: |
| print("No fake alphas found. Database is clean.") |
| |
| |
| count = conn.execute("SELECT COUNT(*) FROM alphas").fetchone()[0] |
| print(f"\nRemaining alphas in store: {count}") |
| conn.close() |
|
|
|
|
| def cleanup_quoted_expressions(): |
| """Fix any expressions that have quoted field names.""" |
| if not DB_PATH.exists(): |
| return |
| |
| import re |
| conn = duckdb.connect(str(DB_PATH)) |
| rows = conn.execute("SELECT alpha_id, expression FROM alphas WHERE expression LIKE '%''%'").fetchall() |
| |
| fixed = 0 |
| for alpha_id, expression in rows: |
| clean = re.sub(r"['\"]([a-z][a-z0-9_]+)['\"]", r"\1", expression) |
| if clean != expression: |
| conn.execute("UPDATE alphas SET expression = ? WHERE alpha_id = ?", [clean, alpha_id]) |
| fixed += 1 |
| |
| if fixed: |
| print(f"Fixed {fixed} expressions with quoted field names.") |
| else: |
| print("No quoted field names found.") |
| conn.close() |
|
|
|
|
| def cleanup_orphans(): |
| """Delete alphas that reference fields not in FIELD_INDEX.""" |
| if not DB_PATH.exists(): |
| return |
| |
| from alpha_factory.data.brain_fields import FIELD_INDEX |
| import re |
| conn = duckdb.connect(str(DB_PATH)) |
| rows = conn.execute("SELECT alpha_id, expression FROM alphas").fetchall() |
| |
| to_delete = [] |
| for alpha_id, expression in rows: |
| if not expression: |
| continue |
| |
| |
| tokens = re.findall(r"\b([a-z][a-z0-9_]{10,})\b", expression.lower()) |
| |
| skip = { |
| "subindustry", "industry", "sector", "market", |
| "close", "high", "low", "open", "volume", "vwap", |
| |
| "backfill", "neutralize", "expression", |
| } |
| for t in tokens: |
| if t.startswith("ts_") or t.startswith("group_") or t.startswith("vec_"): |
| continue |
| if t.startswith("pv13_") or t.startswith("mdl") or t.startswith("snt") or t.startswith("scl"): |
| continue |
| if t in skip: |
| continue |
| if t not in FIELD_INDEX: |
| to_delete.append(alpha_id) |
| break |
| |
| if to_delete: |
| placeholders = ",".join(["?" for _ in to_delete]) |
| conn.execute(f"DELETE FROM alphas WHERE alpha_id IN ({placeholders})", to_delete) |
| print(f"Deleted {len(to_delete)} alphas with unknown field references.") |
| |
| conn.close() |
|
|
|
|
| if __name__ == "__main__": |
| print("=== Alpha Factory Cleanup v0.2.0 ===\n") |
| print("1. Removing alphas with fake/placeholder fields...") |
| cleanup_fake_alphas() |
| print("\n2. Fixing quoted field names in expressions...") |
| cleanup_quoted_expressions() |
| print("\n3. Removing alphas with unknown field references...") |
| cleanup_orphans() |
| print("\nDone!") |
|
|