# NOTE: removed a non-source artifact ("Spaces: Sleeping / Sleeping") that was
# captured from the Hugging Face Spaces page header when this file was exported.
#!/usr/bin/env python3
"""
-------------------------------------------------------------------------------
WorldQuant Alpha Swarm - Gradio UI
Supports: Hugging Face Inference API + Ollama (local)
Features:
  * LLM-driven alpha generation with structured JSON prompting
  * Dropdown selectors for all WQ data fields & operators
  * Real-time backtest evaluation on synthetic data
  * Orthogonality check vs existing library
  * Multi-domain swarm mode
-------------------------------------------------------------------------------
"""
import json
import math
import os
import random
import re
import sys
import traceback
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple

import gradio as gr
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
# -----------------------------------------------------------------------------
# CONFIG: Model Lists
# -----------------------------------------------------------------------------

# Hosted chat models reachable through the Hugging Face Inference API.
HF_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen2.5-7B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "microsoft/Phi-3-mini-4k-instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]

# Local model tags expected to be available via `ollama pull <tag>`.
OLLAMA_MODELS = [
    "llama3.2",
    "deepseek-r1:8b",
    "qwen2.5:7b",
    "mistral",
    "codellama",
    "phi3",
]
# -----------------------------------------------------------------------------
# CONFIG: WorldQuant Data Fields & Operators
# -----------------------------------------------------------------------------

# Field names the LLM may reference in generated expressions.  Only a subset
# is actually materialized by get_synthetic_data(); unknown names simply fail
# evaluation with a NameError that evaluate_alpha() reports back.
WQ_DATA_FIELDS = {
    # Price / Volume
    "open", "high", "low", "close", "volume", "vwap",
    "returns", "returns_open", "intraday_return", "overnight_return",
    "open_close_return", "high_low_range", "close_open_gap",
    "num_trades", "turnover", "turnover_ratio",
    "bid", "ask", "bid_size", "ask_size", "adv20", "adv60",
    # Fundamentals
    "market_cap", "pe_ratio", "pb_ratio", "ps_ratio",
    "ev_ebitda", "ev_sales", "debt_equity", "current_ratio",
    "roe", "roa", "roic", "gross_profit_margin",
    "ebitda", "operating_income", "net_income", "sales", "revenue",
    "total_assets", "total_debt", "cash", "book_value", "equity",
    "liabilities", "assets",
    "eps", "dps", "dividend_yield",
    "revenue_growth", "earnings_growth", "enterprise_value", "cap",
    "gross_income", "gross_income_reported_value",
    # Analyst / Estimates
    "est_eps", "est_revenue", "recommendation_mean",
    "num_analysts", "eps_surprise", "eps_surprise_pct",
    # Options
    "implied_volatility_call_180", "implied_volatility_put_180",
    "iv30", "iv60", "iv90", "put_call_ratio", "option_volume", "open_interest",
    # Alternative
    "realized_vol", "volatility", "skewness", "kurtosis",
}

# Operator names the LLM may use.  Note: `and`/`or`/`not` are Python keywords,
# so call-style uses of them cannot be evaluated by the Python-based backtester
# (they fail with a SyntaxError that is caught and reported as invalid).
WQ_OPERATORS = {
    # Cross-section
    "rank", "zscore", "scale", "normalize", "sign", "abs",
    "max", "min", "greater", "less", "if_else", "cond",
    "and", "or", "not",
    "group_neutralize", "group_rank", "group_zscore", "group_normalize",
    # Time-series
    "ts_mean", "ts_std_dev", "ts_variance", "ts_zscore", "ts_rank",
    "ts_min", "ts_max", "ts_delta", "ts_delay", "ts_return",
    "ts_corr", "ts_cov", "ts_sum", "ts_prod", "ts_skew", "ts_kurt",
    "ts_decay_linear", "ts_decay_exp", "ts_argmax", "ts_argmin",
    "ts_ir", "ts_backfill", "ts_sumif", "ts_count",
    # Special
    "trade_when",
}

# Neutralization granularity, finest first.
NEUTRALIZATION_LEVELS = ["subindustry", "industry", "sector", "market", "none"]
# -----------------------------------------------------------------------------
# SYNTHETIC DATA GENERATOR (Embedded Anomalies)
# -----------------------------------------------------------------------------

# Memo cache keyed by (n_stocks, n_days, seed).  Unbounded, but the UI only
# ever uses the default key so growth is not a concern in practice.
_DATA_CACHE = {}


def get_synthetic_data(n_stocks: int = 300, n_days: int = 252, seed: int = 2026):
    """Build a synthetic cross-sectional equity panel with planted anomalies.

    Returns ``(data, fwd)`` where ``data`` maps WQ field names to
    (n_days x n_stocks) DataFrames indexed by business dates, and ``fwd`` is
    next-day returns (``returns`` shifted by -1, so the final row is NaN).
    Results are memoized in ``_DATA_CACHE``.

    Four anomalies are embedded so that sensible alphas score positively:
    Amihud illiquidity reversal, post-earnings announcement drift, a small
    value premium, and a VWAP-pressure reversal.

    NOTE(review): requires ``n_days >= 31`` so earnings dates can be drawn
    from ``range(20, n_days - 10)`` — confirm callers respect this.
    """
    key = (n_stocks, n_days, seed)
    if key in _DATA_CACHE:
        return _DATA_CACHE[key]
    np.random.seed(seed)  # global seeding keeps the panel reproducible per key
    dates = pd.date_range("2020-01-02", periods=n_days, freq="B")
    stocks = [f"STK_{i:04d}" for i in range(n_stocks)]

    # Persistent per-stock characteristics.
    liquidity_sens = np.random.beta(2, 5, n_stocks)
    value_score = -np.log(np.random.lognormal(0, 0.4, n_stocks))
    earn_vol = np.random.gamma(2, 0.03, n_stocks)

    # One-factor market model: beta * market + idiosyncratic noise.
    market_ret = np.random.normal(0.0003, 0.012, n_days)
    idio_vol = np.random.uniform(0.015, 0.035, n_stocks)
    beta = np.random.uniform(0.5, 1.5, n_stocks)
    returns = np.random.normal(0, idio_vol, (n_days, n_stocks))
    for t in range(n_days):
        returns[t] += beta * market_ret[t]

    market_cap = np.random.lognormal(22, 1.2, (n_days, n_stocks))
    market_cap = np.maximum(market_cap, 1e6)
    volume = np.exp(np.random.normal(15, 0.5, (n_days, n_stocks)))

    # ANOMALY 1: Amihud reversal — most illiquid movers mean-revert next day.
    for t in range(5, n_days - 1):
        amihud = np.abs(returns[t]) / (market_cap[t] * 1e-6 + 1000)
        amihud_rank = np.argsort(np.argsort(amihud)) / (n_stocks - 1)
        returns[t + 1, amihud_rank > 0.80] -= 0.008 * liquidity_sens[amihud_rank > 0.80]
        returns[t + 1, amihud_rank < 0.20] += 0.003 * (1 - liquidity_sens[amihud_rank < 0.20])

    # ANOMALY 2: PEAD — 3 earnings events per stock; drift decays 15%/day
    # over the following 5 days.
    eps_surprise = np.zeros((n_days, n_stocks))
    for s in range(n_stocks):
        earn_dates = np.random.choice(range(20, n_days - 10), size=3, replace=False)
        for ed in earn_dates:
            surprise = np.random.normal(0, earn_vol[s])
            eps_surprise[ed, s] = surprise
            drift = 0.5 * surprise / (earn_vol[s] + 0.001) * 0.004
            for d in range(1, 6):
                if ed + d < n_days:
                    returns[ed + d, s] += drift * (1 - 0.15 * d)

    # ANOMALY 3: small persistent value premium.
    for t in range(n_days):
        returns[t] += 0.00008 * value_score

    # ANOMALY 4: VWAP pressure reversal — needs a provisional price path first.
    close = np.zeros((n_days, n_stocks))
    close[0] = 100.0
    for t in range(1, n_days):
        close[t] = close[t - 1] * (1 + returns[t])
    vol_ma20 = pd.DataFrame(volume).rolling(20, min_periods=1).mean().values
    rel_vol = volume / (vol_ma20 + 1)
    vwap = close * (1 + 0.001 * (rel_vol - 1) * np.random.normal(0, 1, (n_days, n_stocks)))
    for t in range(1, n_days - 1):
        vwap_gap = np.abs(vwap[t] - close[t]) / close[t]
        pressure = vwap_gap * rel_vol[t]
        p_rank = np.argsort(np.argsort(pressure)) / (n_stocks - 1)
        returns[t + 1, p_rank > 0.90] -= 0.006 * liquidity_sens[p_rank > 0.90]

    # Rebuild the price path now that every anomaly is baked into `returns`.
    # NOTE(review): vwap keeps the pre-anomaly path — presumably intentional
    # (it was only needed to plant anomaly 4); confirm.
    close = np.zeros((n_days, n_stocks))
    close[0] = 100.0
    for t in range(1, n_days):
        close[t] = close[t - 1] * (1 + returns[t])
    high = close * (1 + np.abs(np.random.normal(0, 0.008, close.shape)))
    low = close * (1 - np.abs(np.random.normal(0, 0.008, close.shape)))
    open_p = close * (1 + np.random.normal(0, 0.003, close.shape))

    # Fundamentals scaled off market cap so magnitudes stay plausible.
    operating_income = market_cap * np.random.lognormal(-3.0, 0.6, (n_days, n_stocks))
    ebitda = operating_income * np.random.lognormal(0.3, 0.15, (n_days, n_stocks))
    total_debt = market_cap * np.random.lognormal(-1.8, 0.9, (n_days, n_stocks))
    total_assets = market_cap * np.random.lognormal(0.1, 0.4, (n_days, n_stocks))
    cash = total_assets * np.random.uniform(0.03, 0.18, (n_days, n_stocks))
    equity = total_assets * np.random.uniform(0.35, 0.75, (n_days, n_stocks))
    liabilities = total_assets - equity
    enterprise_value = market_cap * np.random.uniform(1.0, 1.6, (n_days, n_stocks))
    sales = market_cap * np.random.lognormal(-1.4, 0.35, (n_days, n_stocks))
    eps = operating_income / (market_cap / 100) * np.random.uniform(0.3, 0.8, (n_days, n_stocks))
    est_eps = eps * (1 + np.random.normal(0, 0.1, (n_days, n_stocks)))
    eps_surprise_pct = eps_surprise / (np.abs(est_eps) + 0.01)
    num_analysts = np.random.poisson(8, (n_days, n_stocks)).astype(float)

    # Options surface.
    iv_call = np.random.uniform(0.18, 0.48, (n_days, n_stocks))
    iv_put = iv_call + np.random.normal(0, 0.025, (n_days, n_stocks))
    put_call_ratio = np.random.lognormal(0, 0.35, (n_days, n_stocks))
    option_volume = volume * np.random.uniform(0.002, 0.04, (n_days, n_stocks))
    realized_vol = pd.DataFrame(returns).rolling(20, min_periods=1).std().values
    realized_vol = np.nan_to_num(realized_vol, nan=0.02)

    def mkdf(arr):
        # Wrap a raw (n_days, n_stocks) array as a date x ticker DataFrame.
        return pd.DataFrame(arr, index=dates, columns=stocks)

    data = {
        "returns": mkdf(returns),
        "close": mkdf(close),
        "high": mkdf(high),
        "low": mkdf(low),
        "open": mkdf(open_p),
        "volume": mkdf(volume),
        "vwap": mkdf(vwap),
        "market_cap": mkdf(market_cap),
        "cap": mkdf(market_cap),
        "operating_income": mkdf(operating_income),
        "ebitda": mkdf(ebitda),
        "total_debt": mkdf(total_debt),
        "total_assets": mkdf(total_assets),
        "cash": mkdf(cash),
        "equity": mkdf(equity),
        "book_value": mkdf(equity),
        "liabilities": mkdf(liabilities),
        "assets": mkdf(total_assets),
        "enterprise_value": mkdf(enterprise_value),
        "sales": mkdf(sales),
        "revenue": mkdf(sales),
        "eps": mkdf(eps),
        "est_eps": mkdf(est_eps),
        "eps_surprise": mkdf(eps_surprise),
        "eps_surprise_pct": mkdf(eps_surprise_pct),
        "num_analysts": mkdf(num_analysts),
        "implied_volatility_call_180": mkdf(iv_call),
        "implied_volatility_put_180": mkdf(iv_put),
        "put_call_ratio": mkdf(put_call_ratio),
        "option_volume": mkdf(option_volume),
        "realized_vol": mkdf(realized_vol),
        "adv20": mkdf(vol_ma20),  # same rolling mean computed above; reuse it
        "turnover": mkdf(volume / (market_cap + 1)),
        "turnover_ratio": mkdf(volume / (market_cap + 1)),
        "volatility": mkdf(realized_vol),
        "debt_equity": mkdf(total_debt / (equity + 1)),
        "current_ratio": mkdf(np.random.uniform(0.8, 2.5, (n_days, n_stocks))),
        "roe": mkdf(operating_income / (equity + 1)),
        "roa": mkdf(operating_income / (total_assets + 1)),
        "gross_profit_margin": mkdf(np.random.uniform(0.2, 0.6, (n_days, n_stocks))),
        "pe_ratio": mkdf(np.random.lognormal(2.5, 0.5, (n_days, n_stocks))),
        "pb_ratio": mkdf(close / (equity / (market_cap / 100) + 0.01)),
        "ev_ebitda": mkdf(enterprise_value / (ebitda + 1)),
        "net_income": mkdf(operating_income * np.random.uniform(0.5, 0.9, (n_days, n_stocks))),
        "dividend_yield": mkdf(np.random.uniform(0, 0.05, (n_days, n_stocks))),
        "earnings_growth": mkdf(np.random.normal(0.05, 0.15, (n_days, n_stocks))),
        "revenue_growth": mkdf(np.random.normal(0.05, 0.15, (n_days, n_stocks))),
        "gross_income": mkdf(operating_income * np.random.uniform(1.2, 1.5, (n_days, n_stocks))),
        "gross_income_reported_value": mkdf(operating_income * np.random.uniform(1.2, 1.5, (n_days, n_stocks))),
        "iv30": mkdf(np.random.uniform(0.18, 0.48, (n_days, n_stocks))),
        "iv60": mkdf(np.random.uniform(0.18, 0.48, (n_days, n_stocks))),
        "iv90": mkdf(np.random.uniform(0.18, 0.48, (n_days, n_stocks))),
        "open_interest": mkdf(option_volume * np.random.uniform(5, 20, (n_days, n_stocks))),
        "bid": mkdf(close * (1 - np.random.uniform(0, 0.001, (n_days, n_stocks)))),
        "ask": mkdf(close * (1 + np.random.uniform(0, 0.001, (n_days, n_stocks)))),
        "bid_size": mkdf(np.random.poisson(1000, (n_days, n_stocks))),
        "ask_size": mkdf(np.random.poisson(1000, (n_days, n_stocks))),
        "returns_open": mkdf(np.random.normal(0.0002, 0.02, (n_days, n_stocks))),
        "intraday_return": mkdf(returns - np.random.normal(0.0001, 0.01, (n_days, n_stocks))),
        "overnight_return": mkdf(np.random.normal(0.0001, 0.01, (n_days, n_stocks))),
        "high_low_range": mkdf((high - low) / close),
        "close_open_gap": mkdf((close - open_p) / open_p),
        "est_revenue": mkdf(sales * (1 + np.random.normal(0, 0.05, (n_days, n_stocks)))),
        "recommendation_mean": mkdf(np.random.uniform(1.5, 4.5, (n_days, n_stocks))),
        "roic": mkdf(operating_income / (total_assets + 1)),
        "ev_sales": mkdf(enterprise_value / (sales + 1)),
        "num_trades": mkdf(np.random.poisson(5000, (n_days, n_stocks))),
        "skewness": mkdf(pd.DataFrame(returns).rolling(20, min_periods=1).skew().values),
        "kurtosis": mkdf(pd.DataFrame(returns).rolling(20, min_periods=1).kurt().values),
    }
    fwd = data["returns"].shift(-1)  # next-day returns; last row is NaN
    result = (data, fwd)
    _DATA_CACHE[key] = result
    return result
# -----------------------------------------------------------------------------
# ALPHA EVALUATOR
# -----------------------------------------------------------------------------
def evaluate_alpha(expr: str, data: dict, fwd: pd.DataFrame, min_days: int = 50):
    """Evaluate a WQ expression against the panel and return backtest metrics.

    Parameters
    ----------
    expr : WorldQuant-style expression; evaluated as Python with the data
        fields and a subset of WQ operators bound into the namespace.
    data : mapping of field name -> (days x stocks) DataFrame.
    fwd : next-day returns aligned with the data panel.
    min_days : warm-up rows skipped before IC sampling (then every 5th day).

    Returns a dict: ``{"valid": False, "error": ...}`` on failure, otherwise
    ``valid/ic/rank_ic/sharpe/turnover/max_dd``.

    SECURITY: `eval` is used with empty builtins, which blocks the obvious
    builtins but is NOT a real sandbox — only feed it LLM/user expressions
    you are willing to execute in-process.
    """
    ns = dict(data)
    # Cross-sectional operators.
    ns["rank"] = lambda df: df.rank(axis=1, pct=True)
    ns["zscore"] = lambda df: (df - df.mean(axis=1).values[:, None]) / (df.std(axis=1).values[:, None] + 0.0001)
    ns["sign"] = np.sign
    ns["abs"] = np.abs
    # Time-series operators (min_periods=1 so warm-up rows are defined).
    ns["ts_mean"] = lambda df, w: df.rolling(window=int(w), min_periods=1).mean()
    ns["ts_std_dev"] = lambda df, w: df.rolling(window=int(w), min_periods=1).std()
    ns["ts_rank"] = lambda df, w: df.rolling(window=int(w), min_periods=1).apply(
        lambda x: np.argsort(np.argsort(x))[-1] / max(len(x) - 1, 1) if len(x) > 1 else 0.5, raw=True
    )
    ns["ts_min"] = lambda df, w: df.rolling(window=int(w), min_periods=1).min()
    ns["ts_max"] = lambda df, w: df.rolling(window=int(w), min_periods=1).max()
    ns["ts_delta"] = lambda df, w: df - df.shift(int(w))
    ns["ts_delay"] = lambda df, w: df.shift(int(w))
    ns["ts_return"] = lambda df, w: df / df.shift(int(w)) - 1
    ns["ts_sum"] = lambda df, w: df.rolling(window=int(w), min_periods=1).sum()
    ns["ts_backfill"] = lambda df, w: df.rolling(window=int(w), min_periods=1).apply(
        lambda x: pd.Series(x).ffill().iloc[-1], raw=True
    )
    ns["ts_decay_linear"] = lambda df, w: _ts_decay_fast(df, int(w))
    # Group operators: group argument is ignored — the synthetic universe has
    # no group labels, so "group" neutralization degrades to market-wide.
    ns["group_neutralize"] = lambda df, _: df - df.mean(axis=1).values[:, None]
    ns["group_rank"] = lambda df, _: df.rank(axis=1, pct=True)
    # Logic / comparison operators.  NOTE: the "and"/"or"/"not" entries are
    # unreachable from eval'd expressions (Python keywords); kept for parity.
    ns["greater"] = lambda a, b: (a > b).astype(float)
    ns["less"] = lambda a, b: (a < b).astype(float)
    ns["if_else"] = lambda c, a, b: np.where(c, a, b)
    ns["and"] = lambda a, b: ((a > 0) & (b > 0)).astype(float)
    ns["or"] = lambda a, b: ((a > 0) | (b > 0)).astype(float)
    ns["not"] = lambda a: (a <= 0).astype(float)
    ns["max"] = np.maximum
    ns["min"] = np.minimum
    ns["trade_when"] = lambda c, a, b: np.where(c > 0, a, b)

    try:
        result = eval(expr, {"__builtins__": {}}, ns)
        if not isinstance(result, pd.DataFrame):
            return {"valid": False, "error": "Not a DataFrame"}
    except Exception as e:
        return {"valid": False, "error": str(e)[:200]}

    # Sample every 5th date after warm-up; need >=30 overlapping names per date.
    valid_idx = result.index[min_days::5]
    ic_vals = []
    rank_ic_vals = []
    for date in valid_idx:
        a = result.loc[date].dropna()
        f = fwd.loc[date].dropna()
        common = a.index.intersection(f.index)
        if len(common) < 30:
            continue
        a, f = a[common], f[common]
        if a.std() > 0 and f.std() > 0:
            ic_vals.append(np.corrcoef(a, f)[0, 1])
            if len(set(a)) > 1 and len(set(f)) > 1:
                r, _ = spearmanr(a, f)
                if not np.isnan(r):
                    rank_ic_vals.append(r)
    ic = np.nanmean(ic_vals) if ic_vals else 0
    rank_ic = np.nanmean(rank_ic_vals) if rank_ic_vals else 0
    ic_std = np.nanstd(ic_vals) if ic_vals else 0.001
    icir = ic / (ic_std + 0.0001)
    # Annualize the IC information ratio and cap at 5 to keep scores sane.
    sharpe = min(icir * math.sqrt(252) / 3, 5.0)

    # Turnover proxy: 1 - average day-over-day rank autocorrelation.
    rnk = result.rank(axis=1)
    corr_vals = []
    for i in range(1, min(len(rnk), 100)):
        a1 = rnk.iloc[i - 1].dropna()
        a2 = rnk.iloc[i].dropna()
        common = a1.index.intersection(a2.index)
        if len(common) > 20:
            c = np.corrcoef(a1[common], a2[common])[0, 1]
            if not np.isnan(c):
                corr_vals.append(c)
    avg_corr = np.mean(corr_vals) if corr_vals else 0.8
    turnover = max(0, (1 - avg_corr) * 100)
    max_dd = max(2.0, turnover * 0.15)  # crude heuristic: churny alphas draw down more

    return {
        "valid": True,
        "ic": round(ic, 4),
        "rank_ic": round(rank_ic, 4),
        "sharpe": round(sharpe, 3),
        "turnover": round(turnover, 1),
        "max_dd": round(max_dd, 2),
    }
| def _ts_decay_fast(df, window): | |
| w = window | |
| weights = np.arange(1, w + 1) | |
| weights = weights / weights.sum() | |
| return df.rolling(window=w, min_periods=1).apply( | |
| lambda x: np.dot(x[-len(weights):], weights[-len(x):]), raw=True | |
| ) | |
# -----------------------------------------------------------------------------
# LLM PROMPT ENGINE
# -----------------------------------------------------------------------------
def build_prompt(fields: List[str], operators: List[str], domain: str, existing_alphas: str, num_alphas: int) -> str:
    """Assemble the alpha-generation prompt.

    Embeds the allowed fields/operators, the focus domain, and (truncated to
    2000 chars) the existing alpha library so the model avoids duplicates.
    The model is instructed to return a raw JSON array of ``num_alphas``
    objects with name/description/expression/domain/neutralization keys.
    """
    fields_str = ", ".join(fields)
    ops_str = ", ".join(operators)
    prompt = f"""You are a senior quantitative researcher at Renaissance Technologies. Your task is to generate {num_alphas} novel formulaic alphas for a WorldQuant BRAIN competition.
AVAILABLE DATA FIELDS:
{fields_str}
AVAILABLE OPERATORS:
{ops_str}
DOMAIN TO FOCUS ON: {domain}
EXISTING ALPHA LIBRARY (DO NOT REPLICATE):
{existing_alphas[:2000] if existing_alphas else "None β this is the first generation."}
REQUIREMENTS FOR EACH ALPHA:
1. Expression must be a SINGLE valid WorldQuant BRAIN expression (no comments, no semicolons as separators)
2. Use only the listed operators and data fields
3. All division must include + 0.000001 guard to prevent division by zero
4. Must end with group_neutralize(score, subindustry) or group_neutralize(rank(score), subindustry)
5. Must be dimensionless (no units)
6. At least 2 distinct operations (not just rank(close))
7. Max 5 named parameters per expression
8. Should exploit cross-sectional predictability, not time-series momentum alone
OUTPUT FORMAT β Return ONLY a JSON array with exactly {num_alphas} objects. Each object must have:
{{
"name": "short descriptive name",
"description": "one-sentence economic rationale",
"expression": "the full WQ expression as a single string",
"domain": "which domain this belongs to",
"neutralization": "subindustry"
}}
Do not include markdown code fences. Return raw JSON only."""
    return prompt
def call_hf_model(model_name: str, prompt: str, temperature: float = 0.7, max_tokens: int = 2048):
    """Call a Hugging Face Inference API chat model and return its reply text.

    Reads the token from the ``HF_TOKEN`` environment variable (anonymous if
    unset).  Never raises: any failure (missing huggingface_hub, network,
    auth, model error) is returned as a string prefixed with ``"ERROR: "`` so
    callers can branch on it.
    """
    try:
        # Imported lazily so the app still loads when huggingface_hub is absent.
        from huggingface_hub import InferenceClient
        token = os.environ.get("HF_TOKEN", "")
        client = InferenceClient(token=token if token else None)
        response = client.chat_completion(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"ERROR: {str(e)}"
def call_ollama_model(model_name: str, prompt: str, temperature: float = 0.7):
    """Call a local Ollama model in JSON mode and return its reply text.

    Never raises: any failure (missing ollama package, daemon not running,
    unknown model) is returned as a string prefixed with ``"ERROR: "``.
    """
    try:
        # Imported lazily so the app still loads when ollama is absent.
        import ollama
        response = ollama.generate(
            model=model_name,
            prompt=prompt,
            format="json",  # ask the daemon to constrain output to JSON
            options={"temperature": temperature, "num_predict": 2048},
        )
        return response["response"]
    except Exception as e:
        return f"ERROR: {str(e)}"
def parse_alpha_json(raw_text: str) -> List[Dict]:
    """Best-effort extraction of a JSON array of alpha dicts from LLM output.

    Tries, in order: direct parse (after stripping markdown code fences),
    a regex grab of the outermost ``[...]`` span, and a repair of output
    truncated mid-array (drop the trailing partial object and close the
    array).  A bare top-level JSON object is wrapped in a one-element list,
    since callers iterate the result and call ``.get`` on each element.
    Returns ``[]`` when nothing parseable is found.
    """
    text = raw_text.strip()
    # Strip a leading/trailing markdown code fence if present.
    if text.startswith("```"):
        parts = text.split("\n", 1)
        text = parts[1] if len(parts) > 1 else ""  # fence-only input -> empty
    if text.endswith("```"):
        text = text.rsplit("\n", 1)[0]
    text = text.strip()

    def _as_list(obj):
        # Normalize to the List[Dict] contract callers rely on.
        if isinstance(obj, list):
            return obj
        if isinstance(obj, dict):
            return [obj]
        return []

    try:
        return _as_list(json.loads(text))
    except json.JSONDecodeError:
        pass
    # Fall back: grab the outermost [...] span from surrounding prose.
    match = re.search(r'\[.*\]', text, re.DOTALL)
    if match:
        try:
            return _as_list(json.loads(match.group()))
        except json.JSONDecodeError:
            pass
    # Fall back: repair truncation by dropping the partial trailing object.
    if "}" in text and not text.endswith("]"):
        repaired = text.rsplit("}", 1)[0] + "}]"
        try:
            return _as_list(json.loads(repaired))
        except json.JSONDecodeError:
            pass
    return []
# -----------------------------------------------------------------------------
# SWARM GENERATION LOGIC
# -----------------------------------------------------------------------------

# Research domains the swarm rotates through; each is a prompt focus string.
DOMAINS = [
    "Liquidity Shock Reversal (Amihud, volume acceleration, VWAP pressure)",
    "Post-Earnings Announcement Drift (eps_surprise, SUE, analyst revisions)",
    "Capital Structure / Distress Quality (debt coverage, interest coverage, cash ratios)",
    "Options Market Flow & Skew (put_call_ratio, IV term structure, option volume)",
    "Nonlinear Factor Interactions (multiplicative combinations of orthogonal signals)",
    "Cross-Sectional Dispersion / Beta Timing (idiosyncratic vol, comovement deviation)",
    "Seasonality & Calendar Effects (intra-month, day-of-week, turn-of-month)",
    "News Sentiment / Text Signals (earnings tone, headline sentiment)",
    "Short Interest / Borrow Cost (utilization, short interest changes)",
    "Institutional Flow (13F ownership changes)",
]

# Seed library shown to the LLM as "existing" alphas it must not replicate.
EXAMPLE_ALPHAS = [
    "group_neutralize(rank(ts_mean(abs(returns) / (close * volume + 0.000001), 5) / (ts_mean(abs(returns) / (close * volume + 0.000001), 63) + 0.000001)), subindustry)",
    "group_neutralize(rank(eps_surprise / (abs(est_eps) + 0.000001)), subindustry)",
    "group_neutralize(rank(operating_income / (total_debt + 0.000001)), subindustry)",
    "group_neutralize(rank(-put_call_ratio) * rank(iv30 - iv90), industry)",
    "group_neutralize(rank(zscore(ts_rank(operating_income / (cap + 0.000001), 252))) * rank(zscore(ts_rank(-returns, 20))), subindustry)",
]
def generate_alphas(
    backend: str,
    model_name: str,
    fields: List[str],
    operators: List[str],
    domain: str,
    num_alphas: int,
    temperature: float,
    existing_alphas_text: str,
    progress=gr.Progress(),  # Gradio convention: default instance enables progress tracking
):
    """Generate alphas with the selected LLM backend, then backtest each one.

    Returns ``(results, report_markdown, raw_llm_response)`` matching the
    three Gradio outputs.  ``results`` is sorted by composite score
    descending; invalid expressions receive composite -999 so they sink to
    the bottom.  LLM/parse failures return an empty list plus an error
    message for the report pane.
    """
    progress(0.1, desc="Building prompt...")
    prompt = build_prompt(fields, operators, domain, existing_alphas_text, num_alphas)
    progress(0.2, desc=f"Calling {backend} model: {model_name}...")
    if backend == "Hugging Face":
        raw_response = call_hf_model(model_name, prompt, temperature)
    else:
        raw_response = call_ollama_model(model_name, prompt, temperature)
    # Both backends signal failure via an "ERROR:"-prefixed string.
    if raw_response.startswith("ERROR:"):
        return [], f"β {raw_response}", ""
    progress(0.5, desc="Parsing response...")
    alphas = parse_alpha_json(raw_response)
    if not alphas:
        return [], f"β Could not parse LLM response. Raw output:\n\n{raw_response[:1000]}", ""
    progress(0.6, desc="Preparing evaluation data...")
    data, fwd = get_synthetic_data()

    results = []
    progress_steps = len(alphas)
    for i, alpha in enumerate(alphas):
        progress(0.6 + 0.35 * (i / progress_steps), desc=f"Evaluating alpha {i+1}/{len(alphas)}...")
        expr = alpha.get("expression", "")
        if not expr:
            continue  # skip malformed entries with no expression
        score = evaluate_alpha(expr, data, fwd)
        alpha.update(score)
        # Composite blends risk-adjusted return (sharpe, ICs) against
        # penalties for churn and drawdown; weights sum to 1 on the plus side.
        alpha["composite"] = (
            0.35 * score.get("sharpe", 0) +
            0.25 * score.get("ic", 0) * 10 +
            0.20 * score.get("rank_ic", 0) * 10 -
            0.10 * (score.get("turnover", 0) / 100) -
            0.10 * (score.get("max_dd", 0) / 100)
        ) if score.get("valid") else -999
        results.append(alpha)
    progress(1.0, desc="Done!")
    results.sort(key=lambda x: x.get("composite", -999), reverse=True)

    # Render the markdown report, best alpha first.
    report_lines = ["# Generated Alpha Report\n"]
    for i, r in enumerate(results, 1):
        status = "β VALID" if r.get("valid") else "β INVALID"
        report_lines.append(f"\n## Alpha {i}: {r.get('name', 'Unnamed')} {status}")
        report_lines.append(f"**Domain:** {r.get('domain', 'Unknown')}")
        report_lines.append(f"**Description:** {r.get('description', 'N/A')}")
        report_lines.append(f"```\n{r.get('expression', 'N/A')}\n```")
        if r.get("valid"):
            report_lines.append(f"| Metric | Value |")
            report_lines.append(f"|--------|-------|")
            report_lines.append(f"| Sharpe | {r.get('sharpe', 'N/A')} |")
            report_lines.append(f"| IC | {r.get('ic', 'N/A')} |")
            report_lines.append(f"| Rank IC | {r.get('rank_ic', 'N/A')} |")
            report_lines.append(f"| Turnover | {r.get('turnover', 'N/A')}% |")
            report_lines.append(f"| Max DD | {r.get('max_dd', 'N/A')}% |")
            report_lines.append(f"| Composite | {round(r.get('composite', 0), 3)} |")
        else:
            report_lines.append(f"**Error:** {r.get('error', 'Unknown')}")
    return results, "\n".join(report_lines), raw_response
# -----------------------------------------------------------------------------
# GRADIO UI
# -----------------------------------------------------------------------------
with gr.Blocks(title="WorldQuant Alpha Swarmβ’", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# π MicroFish Swarmβ’ β WorldQuant Alpha Discovery
### LLM-Powered Formulaic Alpha Generation with Real-Time Backtesting
""")

    # ── Tab 1: generation + evaluation pipeline ──
    with gr.Tab("π― Generate Alphas"):
        with gr.Row():
            with gr.Column(scale=1):
                backend = gr.Dropdown(
                    choices=["Hugging Face", "Ollama"],
                    value="Hugging Face",
                    label="Backend",
                )
                model_dropdown = gr.Dropdown(
                    choices=HF_MODELS,
                    value=HF_MODELS[0],
                    label="Model",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
                num_alphas = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=3,
                    step=1,
                    label="Number of Alphas to Generate",
                )
                domain_focus = gr.Dropdown(
                    choices=DOMAINS,
                    value=DOMAINS[0],
                    label="Domain Focus",
                )
            with gr.Column(scale=2):
                fields_select = gr.Dropdown(
                    choices=sorted(WQ_DATA_FIELDS),
                    value=sorted(["close", "volume", "returns", "vwap", "market_cap", "operating_income", "ebitda", "eps_surprise", "put_call_ratio", "iv30", "iv90", "total_debt"]),
                    multiselect=True,
                    label="Available Data Fields",
                )
                operators_select = gr.Dropdown(
                    choices=sorted(WQ_OPERATORS),
                    value=sorted(["rank", "zscore", "ts_mean", "ts_std_dev", "ts_rank", "ts_decay_linear", "group_neutralize", "abs", "sign", "greater", "if_else", "trade_when"]),
                    multiselect=True,
                    label="Available Operators",
                )
                existing_alphas = gr.Textbox(
                    label="Existing Alpha Library (paste expressions to avoid redundancy)",
                    lines=4,
                    value="\n".join(EXAMPLE_ALPHAS),
                )

        def update_models(backend_choice):
            # Swap the model list when the backend selector changes.
            return gr.Dropdown(choices=HF_MODELS if backend_choice == "Hugging Face" else OLLAMA_MODELS)

        backend.change(update_models, inputs=backend, outputs=model_dropdown)

        generate_btn = gr.Button("π Generate & Evaluate Alphas", variant="primary", size="lg")
        with gr.Row():
            with gr.Column(scale=1):
                results_json = gr.JSON(label="Structured Results", visible=True)
            with gr.Column(scale=2):
                report_md = gr.Markdown(label="Evaluation Report")
        with gr.Row():
            raw_output = gr.Textbox(label="Raw LLM Response (for debugging)", lines=6)
        generate_btn.click(
            fn=generate_alphas,
            inputs=[backend, model_dropdown, fields_select, operators_select, domain_focus, num_alphas, temperature, existing_alphas],
            outputs=[results_json, report_md, raw_output],
        )

    # ── Tab 2: ad-hoc expression backtesting ──
    with gr.Tab("π Evaluate Custom Expression"):
        with gr.Row():
            with gr.Column(scale=2):
                custom_expr = gr.Textbox(
                    label="WorldQuant BRAIN Expression",
                    lines=4,
                    value="group_neutralize(rank(ts_decay_linear(rank(abs(returns) / (close * volume + 0.000001)), 3)), subindustry)",
                )
                eval_btn = gr.Button("π Evaluate", variant="primary")
            with gr.Column(scale=1):
                eval_result = gr.JSON(label="Metrics")

        def evaluate_custom(expr):
            # Run one expression against the cached synthetic panel.
            data, fwd = get_synthetic_data()
            return evaluate_alpha(expr, data, fwd)

        eval_btn.click(fn=evaluate_custom, inputs=custom_expr, outputs=eval_result)

    # ── Tab 3: static operator/field reference ──
    with gr.Tab("π Reference"):
        gr.Markdown("""
## WorldQuant BRAIN Operator Reference
### Cross-Section Operators
| Operator | Description |
|----------|-------------|
| `rank(x)` | Percentile rank (0-1) across stocks |
| `zscore(x)` | Demean and scale to std=1 |
| `scale(x)` | Normalize to unit sum |
| `sign(x)` | Sign function |
| `abs(x)` | Absolute value |
| `max(x,y)` / `min(x,y)` | Element-wise max/min |
| `greater(x,y)` | 1 if x>y else 0 |
| `less(x,y)` | 1 if x<y else 0 |
| `if_else(c,x,y)` | x if c else y |
| `and(x,y)` / `or(x,y)` / `not(x)` | Boolean logic |
| `group_neutralize(x, level)` | Demean within group |
| `group_rank(x, level)` | Rank within group |
### Time-Series Operators
| Operator | Description |
|----------|-------------|
| `ts_mean(x, d)` | d-day rolling mean |
| `ts_std_dev(x, d)` | d-day rolling std |
| `ts_rank(x, d)` | Rolling rank within history |
| `ts_min(x, d)` / `ts_max(x, d)` | Rolling min/max |
| `ts_delta(x, d)` | x[t] - x[t-d] |
| `ts_delay(x, d)` | x[t-d] |
| `ts_return(x, d)` | x[t]/x[t-d] - 1 |
| `ts_corr(x, y, d)` | Rolling correlation |
| `ts_sum(x, d)` | Rolling sum |
| `ts_decay_linear(x, d)` | Linear decay-weighted average |
| `ts_decay_exp(x, d)` | Exponential decay-weighted |
| `ts_backfill(x, d)` | Forward fill within window |
| `trade_when(cond, x, y)` | x if cond else y |
### Key Data Fields
| Category | Fields |
|----------|--------|
| Price/Volume | `open`, `high`, `low`, `close`, `volume`, `vwap`, `returns`, `adv20`, `adv60` |
| Fundamentals | `market_cap`, `operating_income`, `ebitda`, `total_debt`, `total_assets`, `cash`, `equity`, `enterprise_value`, `sales`, `revenue`, `eps` |
| Analyst | `est_eps`, `eps_surprise`, `eps_surprise_pct`, `num_analysts`, `recommendation_mean` |
| Options | `implied_volatility_call_180`, `implied_volatility_put_180`, `iv30`, `iv60`, `iv90`, `put_call_ratio`, `option_volume` |
| Alternative | `realized_vol`, `volatility`, `skewness`, `kurtosis` |
## Tips for Strong Alphas
1. **Dimensionless** β rank or zscore before combining different metrics
2. **Guard divisions** β always add `+ 0.000001` to denominators
3. **Neutralize** β end with `group_neutralize(..., subindustry)`
4. **Decay smooth** β use `ts_decay_linear(expr, 3-10)` for noisy signals
5. **Multiplicative intersections** β `rank(a) * rank(b)` > `a + b` for orthogonal signals
6. **Cross-sectional** β the signal must differentiate stocks, not predict time
""")

    # ── Tab 4: setup / deployment notes ──
    with gr.Tab("π§ Settings"):
        gr.Markdown("""
### Hugging Face Setup
Set your HF token as an environment variable:
```bash
export HF_TOKEN=your_token_here
```
Or pass it when launching:
```bash
HF_TOKEN=xxx python app.py
```
### Ollama Setup
1. Install Ollama: https://ollama.com
2. Pull a model: `ollama pull deepseek-r1:8b`
3. Ensure Ollama is running locally (default: http://localhost:11434)
### Deployment to Hugging Face Spaces
```bash
# Create a Space with Gradio SDK
# Push app.py + requirements.txt
# requirements.txt contents:
gradio>=4.0
numpy
pandas
scipy
huggingface_hub
ollama
```
""")
if __name__ == "__main__":
    # share=True requests a public tunnel when running locally; on HF Spaces
    # it is ignored (Spaces already exposes the app) and only logs a warning.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)