# Hugging Face upload-page residue (commented out so the module parses):
# anky2002's picture
# Upload app.py
# b9ffefd verified
#!/usr/bin/env python3
"""
═══════════════════════════════════════════════════════════════════════════════
WorldQuant Alpha Swarm β€” Gradio UI
Supports: Hugging Face Inference API + Ollama (local)
Features:
β€’ LLM-driven alpha generation with structured JSON prompting
β€’ Dropdown selectors for all WQ data fields & operators
β€’ Real-time backtest evaluation on synthetic data
β€’ Orthogonality check vs existing library
β€’ Multi-domain swarm mode
═══════════════════════════════════════════════════════════════════════════════
"""
import json
import math
import os
import random
import re
import sys
import traceback
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple
import gradio as gr
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
# ─────────────────────────────────────────────────────────────────────────────
# CONFIG: Model Lists
# ─────────────────────────────────────────────────────────────────────────────
# Chat-capable instruct models reachable through the Hugging Face Inference API.
HF_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen2.5-7B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "microsoft/Phi-3-mini-4k-instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]
# Model tags expected to be available in a local Ollama daemon (`ollama pull <tag>`).
OLLAMA_MODELS = [
    "llama3.2",
    "deepseek-r1:8b",
    "qwen2.5:7b",
    "mistral",
    "codellama",
    "phi3",
]
# ─────────────────────────────────────────────────────────────────────────────
# CONFIG: WorldQuant Data Fields & Operators
# ─────────────────────────────────────────────────────────────────────────────
# Data fields offered in the UI dropdown; the synthetic panel in
# get_synthetic_data() serves a DataFrame for (most of) these names.
WQ_DATA_FIELDS = {
    # Price / Volume
    "open", "high", "low", "close", "volume", "vwap",
    "returns", "returns_open", "intraday_return", "overnight_return",
    "open_close_return", "high_low_range", "close_open_gap",
    "num_trades", "turnover", "turnover_ratio",
    "bid", "ask", "bid_size", "ask_size", "adv20", "adv60",
    # Fundamentals
    "market_cap", "pe_ratio", "pb_ratio", "ps_ratio",
    "ev_ebitda", "ev_sales", "debt_equity", "current_ratio",
    "roe", "roa", "roic", "gross_profit_margin",
    "ebitda", "operating_income", "net_income", "sales", "revenue",
    "total_assets", "total_debt", "cash", "book_value", "equity",
    "liabilities", "assets",
    "eps", "dps", "dividend_yield",
    "revenue_growth", "earnings_growth", "enterprise_value", "cap",
    "gross_income", "gross_income_reported_value",
    # Analyst / Estimates
    "est_eps", "est_revenue", "recommendation_mean",
    "num_analysts", "eps_surprise", "eps_surprise_pct",
    # Options
    "implied_volatility_call_180", "implied_volatility_put_180",
    "iv30", "iv60", "iv90", "put_call_ratio", "option_volume", "open_interest",
    # Alternative
    "realized_vol", "volatility", "skewness", "kurtosis",
}
# Operators advertised to the LLM in the prompt. NOTE(review): evaluate_alpha()
# implements only a subset of these; unimplemented ones (scale, ts_corr, ...)
# raise NameError at eval time and mark the expression invalid.
WQ_OPERATORS = {
    # Cross-section
    "rank", "zscore", "scale", "normalize", "sign", "abs",
    "max", "min", "greater", "less", "if_else", "cond",
    "and", "or", "not",
    "group_neutralize", "group_rank", "group_zscore", "group_normalize",
    # Time-series
    "ts_mean", "ts_std_dev", "ts_variance", "ts_zscore", "ts_rank",
    "ts_min", "ts_max", "ts_delta", "ts_delay", "ts_return",
    "ts_corr", "ts_cov", "ts_sum", "ts_prod", "ts_skew", "ts_kurt",
    "ts_decay_linear", "ts_decay_exp", "ts_argmax", "ts_argmin",
    "ts_ir", "ts_backfill", "ts_sumif", "ts_count",
    # Special
    "trade_when",
}
# Grouping levels accepted by group_* operators / neutralization settings.
NEUTRALIZATION_LEVELS = ["subindustry", "industry", "sector", "market", "none"]
# ─────────────────────────────────────────────────────────────────────────────
# SYNTHETIC DATA GENERATOR (Embedded Anomalies)
# ─────────────────────────────────────────────────────────────────────────────
# Cache keyed by (n_stocks, n_days, seed) -> (data dict, forward returns) so
# repeated UI calls reuse the same panel. NOTE(review): not thread-safe.
_DATA_CACHE = {}


def get_synthetic_data(n_stocks: int = 300, n_days: int = 252, seed: int = 2026):
    """Build (and cache) a synthetic equity panel with planted anomalies.

    Returns ``(data, fwd)`` where ``data`` maps each supported field name to a
    (dates x stocks) DataFrame and ``fwd`` is next-day returns
    (``returns.shift(-1)``), the prediction target used by evaluate_alpha().

    Four predictable effects are deliberately embedded in the return series so
    that sensible alpha expressions can score above noise:
      1. Amihud illiquidity reversal
      2. Post-earnings-announcement drift (PEAD) after EPS surprises
      3. A small persistent value premium
      4. VWAP-pressure reversal on high relative volume
    """
    key = (n_stocks, n_days, seed)
    if key in _DATA_CACHE:
        return _DATA_CACHE[key]
    np.random.seed(seed)  # global seeding: panel is deterministic per cache key
    dates = pd.date_range("2020-01-02", periods=n_days, freq="B")
    stocks = [f"STK_{i:04d}" for i in range(n_stocks)]
    # Persistent per-stock characteristics
    liquidity_sens = np.random.beta(2, 5, n_stocks)
    value_score = -np.log(np.random.lognormal(0, 0.4, n_stocks))
    earn_vol = np.random.gamma(2, 0.03, n_stocks)
    # Market factor: common return stream scaled by beta plus idiosyncratic noise
    market_ret = np.random.normal(0.0003, 0.012, n_days)
    idio_vol = np.random.uniform(0.015, 0.035, n_stocks)
    beta = np.random.uniform(0.5, 1.5, n_stocks)
    returns = np.random.normal(0, idio_vol, (n_days, n_stocks))
    for t in range(n_days):
        returns[t] += beta * market_ret[t]
    # Embed anomalies
    market_cap = np.random.lognormal(22, 1.2, (n_days, n_stocks))
    market_cap = np.maximum(market_cap, 1e6)
    volume = np.exp(np.random.normal(15, 0.5, (n_days, n_stocks)))
    # ANOMALY 1: Amihud reversal — the most illiquid quintile today mean-reverts
    # tomorrow; the most liquid quintile drifts slightly upward.
    for t in range(5, n_days - 1):
        amihud = np.abs(returns[t]) / (market_cap[t] * 1e-6 + 1000)
        amihud_rank = np.argsort(np.argsort(amihud)) / (n_stocks - 1)
        returns[t+1, amihud_rank > 0.80] -= 0.008 * liquidity_sens[amihud_rank > 0.80]
        returns[t+1, amihud_rank < 0.20] += 0.003 * (1 - liquidity_sens[amihud_rank < 0.20])
    # ANOMALY 2: PEAD — each stock gets 3 earnings dates; the surprise bleeds
    # into returns over the next 5 days with linearly decaying intensity.
    eps_surprise = np.zeros((n_days, n_stocks))
    for s in range(n_stocks):
        earn_dates = np.random.choice(range(20, n_days - 10), size=3, replace=False)
        for ed in earn_dates:
            surprise = np.random.normal(0, earn_vol[s])
            eps_surprise[ed, s] = surprise
            drift = 0.5 * surprise / (earn_vol[s] + 0.001) * 0.004
            for d in range(1, 6):
                if ed + d < n_days:
                    returns[ed + d, s] += drift * (1 - 0.15 * d)
    # ANOMALY 3: Value premium — constant small tilt toward high value_score
    for t in range(n_days):
        returns[t] += 0.00008 * value_score
    # ANOMALY 4: VWAP pressure reversal — build a provisional close path first,
    # derive vwap from it, then punish high vwap-gap/high-volume names next day.
    close = np.zeros((n_days, n_stocks))
    close[0] = 100.0
    for t in range(1, n_days):
        close[t] = close[t-1] * (1 + returns[t])
    vol_ma20 = pd.DataFrame(volume).rolling(20, min_periods=1).mean().values
    rel_vol = volume / (vol_ma20 + 1)
    vwap = close * (1 + 0.001 * (rel_vol - 1) * np.random.normal(0, 1, (n_days, n_stocks)))
    for t in range(1, n_days - 1):
        vwap_gap = np.abs(vwap[t] - close[t]) / close[t]
        pressure = vwap_gap * rel_vol[t]
        p_rank = np.argsort(np.argsort(pressure)) / (n_stocks - 1)
        returns[t+1, p_rank > 0.90] -= 0.006 * liquidity_sens[p_rank > 0.90]
    # Recalculate close with all anomalies baked into returns
    close = np.zeros((n_days, n_stocks))
    close[0] = 100.0
    for t in range(1, n_days):
        close[t] = close[t-1] * (1 + returns[t])
    high = close * (1 + np.abs(np.random.normal(0, 0.008, close.shape)))
    low = close * (1 - np.abs(np.random.normal(0, 0.008, close.shape)))
    open_p = close * (1 + np.random.normal(0, 0.003, close.shape))
    # Fundamentals: loosely scaled off market cap so the derived ratios are plausible
    operating_income = market_cap * np.random.lognormal(-3.0, 0.6, (n_days, n_stocks))
    ebitda = operating_income * np.random.lognormal(0.3, 0.15, (n_days, n_stocks))
    total_debt = market_cap * np.random.lognormal(-1.8, 0.9, (n_days, n_stocks))
    total_assets = market_cap * np.random.lognormal(0.1, 0.4, (n_days, n_stocks))
    cash = total_assets * np.random.uniform(0.03, 0.18, (n_days, n_stocks))
    equity = total_assets * np.random.uniform(0.35, 0.75, (n_days, n_stocks))
    liabilities = total_assets - equity
    enterprise_value = market_cap * np.random.uniform(1.0, 1.6, (n_days, n_stocks))
    sales = market_cap * np.random.lognormal(-1.4, 0.35, (n_days, n_stocks))
    eps = operating_income / (market_cap / 100) * np.random.uniform(0.3, 0.8, (n_days, n_stocks))
    est_eps = eps * (1 + np.random.normal(0, 0.1, (n_days, n_stocks)))
    eps_surprise_pct = eps_surprise / (np.abs(est_eps) + 0.01)
    num_analysts = np.random.poisson(8, (n_days, n_stocks)).astype(float)
    # Options
    iv_call = np.random.uniform(0.18, 0.48, (n_days, n_stocks))
    iv_put = iv_call + np.random.normal(0, 0.025, (n_days, n_stocks))
    put_call_ratio = np.random.lognormal(0, 0.35, (n_days, n_stocks))
    option_volume = volume * np.random.uniform(0.002, 0.04, (n_days, n_stocks))
    realized_vol = pd.DataFrame(returns).rolling(20, min_periods=1).std().values
    realized_vol = np.nan_to_num(realized_vol, nan=0.02)

    def mkdf(arr):
        # Wrap a raw (n_days, n_stocks) array as a date-by-ticker DataFrame.
        return pd.DataFrame(arr, index=dates, columns=stocks)

    # Field name -> DataFrame. Several names alias the same array on purpose
    # (cap == market_cap, revenue == sales, assets == total_assets, ...).
    data = {
        "returns": mkdf(returns),
        "close": mkdf(close),
        "high": mkdf(high),
        "low": mkdf(low),
        "open": mkdf(open_p),
        "volume": mkdf(volume),
        "vwap": mkdf(vwap),
        "market_cap": mkdf(market_cap),
        "cap": mkdf(market_cap),
        "operating_income": mkdf(operating_income),
        "ebitda": mkdf(ebitda),
        "total_debt": mkdf(total_debt),
        "total_assets": mkdf(total_assets),
        "cash": mkdf(cash),
        "equity": mkdf(equity),
        "book_value": mkdf(equity),
        "liabilities": mkdf(liabilities),
        "assets": mkdf(total_assets),
        "enterprise_value": mkdf(enterprise_value),
        "sales": mkdf(sales),
        "revenue": mkdf(sales),
        "eps": mkdf(eps),
        "est_eps": mkdf(est_eps),
        "eps_surprise": mkdf(eps_surprise),
        "eps_surprise_pct": mkdf(eps_surprise_pct),
        "num_analysts": mkdf(num_analysts),
        "implied_volatility_call_180": mkdf(iv_call),
        "implied_volatility_put_180": mkdf(iv_put),
        "put_call_ratio": mkdf(put_call_ratio),
        "option_volume": mkdf(option_volume),
        "realized_vol": mkdf(realized_vol),
        "adv20": mkdf(pd.DataFrame(volume).rolling(20, min_periods=1).mean().values),
        "turnover": mkdf(volume / (market_cap + 1)),
        "turnover_ratio": mkdf(volume / (market_cap + 1)),
        "volatility": mkdf(realized_vol),
        "debt_equity": mkdf(total_debt / (equity + 1)),
        "current_ratio": mkdf(np.random.uniform(0.8, 2.5, (n_days, n_stocks))),
        "roe": mkdf(operating_income / (equity + 1)),
        "roa": mkdf(operating_income / (total_assets + 1)),
        "gross_profit_margin": mkdf(np.random.uniform(0.2, 0.6, (n_days, n_stocks))),
        "pe_ratio": mkdf(np.random.lognormal(2.5, 0.5, (n_days, n_stocks))),
        "pb_ratio": mkdf(close / (equity / (market_cap / 100) + 0.01)),
        "ev_ebitda": mkdf(enterprise_value / (ebitda + 1)),
        "net_income": mkdf(operating_income * np.random.uniform(0.5, 0.9, (n_days, n_stocks))),
        "dividend_yield": mkdf(np.random.uniform(0, 0.05, (n_days, n_stocks))),
        "earnings_growth": mkdf(np.random.normal(0.05, 0.15, (n_days, n_stocks))),
        "revenue_growth": mkdf(np.random.normal(0.05, 0.15, (n_days, n_stocks))),
        "gross_income": mkdf(operating_income * np.random.uniform(1.2, 1.5, (n_days, n_stocks))),
        "gross_income_reported_value": mkdf(operating_income * np.random.uniform(1.2, 1.5, (n_days, n_stocks))),
        "iv30": mkdf(np.random.uniform(0.18, 0.48, (n_days, n_stocks))),
        "iv60": mkdf(np.random.uniform(0.18, 0.48, (n_days, n_stocks))),
        "iv90": mkdf(np.random.uniform(0.18, 0.48, (n_days, n_stocks))),
        "open_interest": mkdf(option_volume * np.random.uniform(5, 20, (n_days, n_stocks))),
        "bid": mkdf(close * (1 - np.random.uniform(0, 0.001, (n_days, n_stocks)))),
        "ask": mkdf(close * (1 + np.random.uniform(0, 0.001, (n_days, n_stocks)))),
        "bid_size": mkdf(np.random.poisson(1000, (n_days, n_stocks))),
        "ask_size": mkdf(np.random.poisson(1000, (n_days, n_stocks))),
        "returns_open": mkdf(np.random.normal(0.0002, 0.02, (n_days, n_stocks))),
        "intraday_return": mkdf(returns - np.random.normal(0.0001, 0.01, (n_days, n_stocks))),
        "overnight_return": mkdf(np.random.normal(0.0001, 0.01, (n_days, n_stocks))),
        "high_low_range": mkdf((high - low) / close),
        "close_open_gap": mkdf((close - open_p) / open_p),
        "est_revenue": mkdf(sales * (1 + np.random.normal(0, 0.05, (n_days, n_stocks)))),
        "recommendation_mean": mkdf(np.random.uniform(1.5, 4.5, (n_days, n_stocks))),
        "roic": mkdf(operating_income / (total_assets + 1)),
        "ev_sales": mkdf(enterprise_value / (sales + 1)),
        "num_trades": mkdf(np.random.poisson(5000, (n_days, n_stocks))),
        "skewness": mkdf(pd.DataFrame(returns).rolling(20, min_periods=1).skew().values),
        "kurtosis": mkdf(pd.DataFrame(returns).rolling(20, min_periods=1).kurt().values),
    }
    # Target: next-day return (last row is NaN and gets skipped by the evaluator).
    fwd = data["returns"].shift(-1)
    result = (data, fwd)
    _DATA_CACHE[key] = result
    return result
# ─────────────────────────────────────────────────────────────────────────────
# ALPHA EVALUATOR
# ─────────────────────────────────────────────────────────────────────────────
def evaluate_alpha(expr: str, data: dict, fwd: pd.DataFrame, min_days: int = 50):
    """Evaluate a WQ-style expression and return backtest-style metrics.

    Parameters:
        expr: WorldQuant-style expression, e.g. "rank(ts_delta(close, 5))".
        data: mapping of field name -> (dates x stocks) DataFrame.
        fwd: next-day forward returns aligned with ``data`` (the IC target).
        min_days: warm-up rows skipped before sampling evaluation dates.

    Returns:
        On success: {"valid": True, "ic", "rank_ic", "sharpe", "turnover",
        "max_dd"}.  On any failure (syntax error, unknown operator/field,
        non-DataFrame result): {"valid": False, "error": "..."}.
    """
    def _first_frame(*candidates):
        # First DataFrame among the arguments, used as index/columns template.
        for cand in candidates:
            if isinstance(cand, pd.DataFrame):
                return cand
        return None

    def _reframe(template, values):
        # np.where() returns a bare ndarray, which breaks nested pandas
        # operators (e.g. rank(if_else(...)) -> AttributeError, or a top-level
        # "Not a DataFrame" rejection).  Re-wrap using the template's axes.
        if isinstance(values, pd.DataFrame) or template is None:
            return values
        return pd.DataFrame(values, index=template.index, columns=template.columns)

    ns = dict(data)
    # Cross-sectional operators
    ns["rank"] = lambda df: df.rank(axis=1, pct=True)
    ns["zscore"] = lambda df: (df - df.mean(axis=1).values[:, None]) / (df.std(axis=1).values[:, None] + 0.0001)
    ns["sign"] = np.sign
    ns["abs"] = np.abs
    # Time-series operators (rolling with min_periods=1, so warm-up rows exist)
    ns["ts_mean"] = lambda df, w: df.rolling(window=int(w), min_periods=1).mean()
    ns["ts_std_dev"] = lambda df, w: df.rolling(window=int(w), min_periods=1).std()
    ns["ts_rank"] = lambda df, w: df.rolling(window=int(w), min_periods=1).apply(
        lambda x: np.argsort(np.argsort(x))[-1] / max(len(x) - 1, 1) if len(x) > 1 else 0.5, raw=True
    )
    ns["ts_min"] = lambda df, w: df.rolling(window=int(w), min_periods=1).min()
    ns["ts_max"] = lambda df, w: df.rolling(window=int(w), min_periods=1).max()
    ns["ts_delta"] = lambda df, w: df - df.shift(int(w))
    ns["ts_delay"] = lambda df, w: df.shift(int(w))
    ns["ts_return"] = lambda df, w: df / df.shift(int(w)) - 1
    ns["ts_sum"] = lambda df, w: df.rolling(window=int(w), min_periods=1).sum()
    ns["ts_backfill"] = lambda df, w: df.rolling(window=int(w), min_periods=1).apply(
        lambda x: pd.Series(x).ffill().iloc[-1], raw=True
    )
    ns["ts_decay_linear"] = lambda df, w: _ts_decay_fast(df, int(w))
    # Group operators: the synthetic panel has no sector map, so the group
    # argument is ignored and neutralization is vs the whole cross-section.
    ns["group_neutralize"] = lambda df, _: df - df.mean(axis=1).values[:, None]
    ns["group_rank"] = lambda df, _: df.rank(axis=1, pct=True)
    ns["greater"] = lambda a, b: (a > b).astype(float)
    ns["less"] = lambda a, b: (a < b).astype(float)
    # FIX: re-wrap np.where()'s ndarray so conditional operators stay DataFrames.
    ns["if_else"] = lambda c, a, b: _reframe(_first_frame(c, a, b), np.where(c, a, b))
    ns["cond"] = ns["if_else"]
    ns["and"] = lambda a, b: ((a > 0) & (b > 0)).astype(float)
    ns["or"] = lambda a, b: ((a > 0) | (b > 0)).astype(float)
    ns["not"] = lambda a: (a <= 0).astype(float)
    ns["max"] = np.maximum
    ns["min"] = np.minimum
    ns["trade_when"] = lambda c, a, b: _reframe(_first_frame(c, a, b), np.where(c > 0, a, b))
    try:
        # SECURITY NOTE: eval() of LLM-generated text.  Emptying __builtins__
        # is NOT a real sandbox; this is tolerable only because expressions
        # come from the operator's own prompts, never untrusted end users.
        result = eval(expr, {"__builtins__": {}}, ns)
        if not isinstance(result, pd.DataFrame):
            return {"valid": False, "error": "Not a DataFrame"}
    except Exception as e:
        return {"valid": False, "error": str(e)[:200]}
    # Sample every 5th date after the warm-up to reduce autocorrelation.
    valid_idx = result.index[min_days::5]
    ic_vals = []
    rank_ic_vals = []
    for date in valid_idx:
        a = result.loc[date].dropna()
        f = fwd.loc[date].dropna()
        common = a.index.intersection(f.index)
        if len(common) < 30:  # need a reasonable cross-section to correlate
            continue
        a, f = a[common], f[common]
        if a.std() > 0 and f.std() > 0:
            ic_vals.append(np.corrcoef(a, f)[0, 1])
        if len(set(a)) > 1 and len(set(f)) > 1:
            r, _ = spearmanr(a, f)
            if not np.isnan(r):
                rank_ic_vals.append(r)
    ic = np.nanmean(ic_vals) if ic_vals else 0
    rank_ic = np.nanmean(rank_ic_vals) if rank_ic_vals else 0
    ic_std = np.nanstd(ic_vals) if ic_vals else 0.001
    icir = ic / (ic_std + 0.0001)
    # Heuristic: annualize the IC information ratio as a Sharpe proxy, capped.
    sharpe = min(icir * math.sqrt(252) / 3, 5.0)
    # Day-over-day rank autocorrelation -> proxy for portfolio turnover.
    rnk = result.rank(axis=1)
    corr_vals = []
    for i in range(1, min(len(rnk), 100)):
        a1 = rnk.iloc[i-1].dropna()
        a2 = rnk.iloc[i].dropna()
        common = a1.index.intersection(a2.index)
        if len(common) > 20:
            c = np.corrcoef(a1[common], a2[common])[0, 1]
            if not np.isnan(c):
                corr_vals.append(c)
    avg_corr = np.mean(corr_vals) if corr_vals else 0.8
    turnover = max(0, (1 - avg_corr) * 100)
    max_dd = max(2.0, turnover * 0.15)  # crude drawdown proxy tied to turnover
    return {
        "valid": True,
        "ic": round(ic, 4),
        "rank_ic": round(rank_ic, 4),
        "sharpe": round(sharpe, 3),
        "turnover": round(turnover, 1),
        "max_dd": round(max_dd, 2),
    }
def _ts_decay_fast(df, window):
w = window
weights = np.arange(1, w + 1)
weights = weights / weights.sum()
return df.rolling(window=w, min_periods=1).apply(
lambda x: np.dot(x[-len(weights):], weights[-len(x):]), raw=True
)
# ─────────────────────────────────────────────────────────────────────────────
# LLM PROMPT ENGINE
# ─────────────────────────────────────────────────────────────────────────────
def build_prompt(fields: List[str], operators: List[str], domain: str, existing_alphas: str, num_alphas: int) -> str:
    """Assemble the generation prompt sent to the LLM.

    Lists the allowed fields/operators, pins the domain focus, shows (up to
    2000 chars of) the existing library to avoid duplicates, and demands a
    raw JSON array of alpha objects as the only output.
    """
    field_list = ", ".join(fields)
    operator_list = ", ".join(operators)
    library_snippet = existing_alphas[:2000] if existing_alphas else "None β€” this is the first generation."
    return f"""You are a senior quantitative researcher at Renaissance Technologies. Your task is to generate {num_alphas} novel formulaic alphas for a WorldQuant BRAIN competition.
AVAILABLE DATA FIELDS:
{field_list}
AVAILABLE OPERATORS:
{operator_list}
DOMAIN TO FOCUS ON: {domain}
EXISTING ALPHA LIBRARY (DO NOT REPLICATE):
{library_snippet}
REQUIREMENTS FOR EACH ALPHA:
1. Expression must be a SINGLE valid WorldQuant BRAIN expression (no comments, no semicolons as separators)
2. Use only the listed operators and data fields
3. All division must include + 0.000001 guard to prevent division by zero
4. Must end with group_neutralize(score, subindustry) or group_neutralize(rank(score), subindustry)
5. Must be dimensionless (no units)
6. At least 2 distinct operations (not just rank(close))
7. Max 5 named parameters per expression
8. Should exploit cross-sectional predictability, not time-series momentum alone
OUTPUT FORMAT β€” Return ONLY a JSON array with exactly {num_alphas} objects. Each object must have:
{{
"name": "short descriptive name",
"description": "one-sentence economic rationale",
"expression": "the full WQ expression as a single string",
"domain": "which domain this belongs to",
"neutralization": "subindustry"
}}
Do not include markdown code fences. Return raw JSON only."""
def call_hf_model(model_name: str, prompt: str, temperature: float = 0.7, max_tokens: int = 2048):
    """Send a single-turn chat completion to the HF Inference API.

    Reads the token from the HF_TOKEN environment variable (anonymous if
    unset).  Returns the model's reply text, or an "ERROR: ..." sentinel
    string on any failure so callers can branch on it.
    """
    try:
        from huggingface_hub import InferenceClient

        hf_token = os.environ.get("HF_TOKEN", "")
        client = InferenceClient(token=hf_token or None)
        reply = client.chat_completion(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return reply.choices[0].message.content
    except Exception as exc:  # package missing / auth / network -> sentinel
        return f"ERROR: {str(exc)}"
def call_ollama_model(model_name: str, prompt: str, temperature: float = 0.7):
    """Request a JSON-formatted completion from a local Ollama daemon.

    Returns the generated text, or an "ERROR: ..." sentinel string on any
    failure (package missing, daemon not running, unknown model).
    """
    try:
        import ollama

        opts = {"temperature": temperature, "num_predict": 2048}
        reply = ollama.generate(
            model=model_name,
            prompt=prompt,
            format="json",
            options=opts,
        )
        return reply["response"]
    except Exception as exc:  # package missing / daemon down -> sentinel
        return f"ERROR: {str(exc)}"
def parse_alpha_json(raw_text: str) -> List[Dict]:
    """Best-effort extraction of a JSON list of alpha dicts from LLM output.

    Tries, in order: text with markdown code fences stripped, the first
    bracketed [...] span, and finally a truncation repair that closes the
    array after the last complete object.  A single JSON object is accepted
    and wrapped in a list.  Returns [] when nothing parses.

    Fixes vs the original: no IndexError on a lone "```" input; narrow
    exception handling instead of bare except; always returns a list (the
    annotation promised List[Dict] but a bare dict used to leak through);
    the truncation repair only runs when a "}" actually exists.
    """
    text = raw_text.strip()
    # Strip a leading/trailing markdown code fence (``` or ```json).
    if text.startswith("```"):
        parts = text.split("\n", 1)
        text = parts[1] if len(parts) > 1 else ""  # lone fence line -> nothing left
    if text.endswith("```"):
        text = text.rsplit("\n", 1)[0]
    text = text.strip()

    def _as_list(obj):
        # Accept a list as-is, wrap a single alpha object; anything else is a miss.
        if isinstance(obj, list):
            return obj
        if isinstance(obj, dict):
            return [obj]
        return None

    try:
        parsed = _as_list(json.loads(text))
        if parsed is not None:
            return parsed
    except json.JSONDecodeError:
        pass
    # Fall back to the first bracketed span (tolerates chatter around the JSON).
    match = re.search(r'\[.*\]', text, re.DOTALL)
    if match:
        try:
            parsed = _as_list(json.loads(match.group()))
            if parsed is not None:
                return parsed
        except json.JSONDecodeError:
            pass
    # Truncated output: close the array right after the last complete "}".
    if "}" in text and not text.endswith("]"):
        try:
            parsed = _as_list(json.loads(text.rsplit("}", 1)[0] + "}]"))
            if parsed is not None:
                return parsed
        except json.JSONDecodeError:
            pass
    return []
# ─────────────────────────────────────────────────────────────────────────────
# SWARM GENERATION LOGIC
# ─────────────────────────────────────────────────────────────────────────────
# Research themes the swarm cycles through; each generation prompt focuses
# on one of these (selectable in the UI).
DOMAINS = [
    "Liquidity Shock Reversal (Amihud, volume acceleration, VWAP pressure)",
    "Post-Earnings Announcement Drift (eps_surprise, SUE, analyst revisions)",
    "Capital Structure / Distress Quality (debt coverage, interest coverage, cash ratios)",
    "Options Market Flow & Skew (put_call_ratio, IV term structure, option volume)",
    "Nonlinear Factor Interactions (multiplicative combinations of orthogonal signals)",
    "Cross-Sectional Dispersion / Beta Timing (idiosyncratic vol, comovement deviation)",
    "Seasonality & Calendar Effects (intra-month, day-of-week, turn-of-month)",
    "News Sentiment / Text Signals (earnings tone, headline sentiment)",
    "Short Interest / Borrow Cost (utilization, short interest changes)",
    "Institutional Flow (13F ownership changes)",
]
# Seed library pre-filled into the UI and shown to the LLM as "existing
# alphas" so it avoids regenerating known expressions.
EXAMPLE_ALPHAS = [
    "group_neutralize(rank(ts_mean(abs(returns) / (close * volume + 0.000001), 5) / (ts_mean(abs(returns) / (close * volume + 0.000001), 63) + 0.000001)), subindustry)",
    "group_neutralize(rank(eps_surprise / (abs(est_eps) + 0.000001)), subindustry)",
    "group_neutralize(rank(operating_income / (total_debt + 0.000001)), subindustry)",
    "group_neutralize(rank(-put_call_ratio) * rank(iv30 - iv90), industry)",
    "group_neutralize(rank(zscore(ts_rank(operating_income / (cap + 0.000001), 252))) * rank(zscore(ts_rank(-returns, 20))), subindustry)",
]
def generate_alphas(
    backend: str,
    model_name: str,
    fields: List[str],
    operators: List[str],
    domain: str,
    num_alphas: int,
    temperature: float,
    existing_alphas_text: str,
    progress=gr.Progress(),  # Gradio progress tracker (default is the Gradio convention)
):
    """End-to-end swarm step: prompt the LLM, parse its JSON, score each alpha.

    Returns a 3-tuple matching the Gradio outputs:
      (alpha dicts sorted by composite score, markdown report, raw LLM text).
    On LLM or parse failure the list is empty and the report carries the error.
    """
    progress(0.1, desc="Building prompt...")
    prompt = build_prompt(fields, operators, domain, existing_alphas_text, num_alphas)
    progress(0.2, desc=f"Calling {backend} model: {model_name}...")
    if backend == "Hugging Face":
        raw_response = call_hf_model(model_name, prompt, temperature)
    else:
        raw_response = call_ollama_model(model_name, prompt, temperature)
    # Backend helpers signal failure with a sentinel "ERROR: ..." string.
    if raw_response.startswith("ERROR:"):
        return [], f"❌ {raw_response}", ""
    progress(0.5, desc="Parsing response...")
    alphas = parse_alpha_json(raw_response)
    if not alphas:
        return [], f"❌ Could not parse LLM response. Raw output:\n\n{raw_response[:1000]}", ""
    progress(0.6, desc="Preparing evaluation data...")
    data, fwd = get_synthetic_data()
    results = []
    progress_steps = len(alphas)
    for i, alpha in enumerate(alphas):
        progress(0.6 + 0.35 * (i / progress_steps), desc=f"Evaluating alpha {i+1}/{len(alphas)}...")
        expr = alpha.get("expression", "")
        if not expr:
            continue
        score = evaluate_alpha(expr, data, fwd)
        alpha.update(score)
        # Composite = weighted blend of quality metrics minus cost penalties;
        # invalid expressions sink to the bottom with -999.
        alpha["composite"] = (
            0.35 * score.get("sharpe", 0) +
            0.25 * score.get("ic", 0) * 10 +
            0.20 * score.get("rank_ic", 0) * 10 -
            0.10 * (score.get("turnover", 0) / 100) -
            0.10 * (score.get("max_dd", 0) / 100)
        ) if score.get("valid") else -999
        results.append(alpha)
    progress(1.0, desc="Done!")
    results.sort(key=lambda x: x.get("composite", -999), reverse=True)
    # Render a human-readable markdown report, best alphas first.
    report_lines = ["# Generated Alpha Report\n"]
    for i, r in enumerate(results, 1):
        status = "βœ… VALID" if r.get("valid") else "❌ INVALID"
        report_lines.append(f"\n## Alpha {i}: {r.get('name', 'Unnamed')} {status}")
        report_lines.append(f"**Domain:** {r.get('domain', 'Unknown')}")
        report_lines.append(f"**Description:** {r.get('description', 'N/A')}")
        report_lines.append(f"```\n{r.get('expression', 'N/A')}\n```")
        if r.get("valid"):
            report_lines.append(f"| Metric | Value |")
            report_lines.append(f"|--------|-------|")
            report_lines.append(f"| Sharpe | {r.get('sharpe', 'N/A')} |")
            report_lines.append(f"| IC | {r.get('ic', 'N/A')} |")
            report_lines.append(f"| Rank IC | {r.get('rank_ic', 'N/A')} |")
            report_lines.append(f"| Turnover | {r.get('turnover', 'N/A')}% |")
            report_lines.append(f"| Max DD | {r.get('max_dd', 'N/A')}% |")
            report_lines.append(f"| Composite | {round(r.get('composite', 0), 3)} |")
        else:
            report_lines.append(f"**Error:** {r.get('error', 'Unknown')}")
    return results, "\n".join(report_lines), raw_response
# ─────────────────────────────────────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────────────────────────────────────
# Top-level Gradio application: four tabs wired to the functions above.
# Built at import time so `demo` exists for Hugging Face Spaces / launch().
with gr.Blocks(title="WorldQuant Alpha Swarmβ„’", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🐟 MicroFish Swarmβ„’ β€” WorldQuant Alpha Discovery
### LLM-Powered Formulaic Alpha Generation with Real-Time Backtesting
""")
    # ── Tab 1: generation controls + results ────────────────────────────────
    with gr.Tab("🎯 Generate Alphas"):
        with gr.Row():
            with gr.Column(scale=1):
                backend = gr.Dropdown(
                    choices=["Hugging Face", "Ollama"],
                    value="Hugging Face",
                    label="Backend",
                )
                model_dropdown = gr.Dropdown(
                    choices=HF_MODELS,
                    value=HF_MODELS[0],
                    label="Model",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
                num_alphas = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=3,
                    step=1,
                    label="Number of Alphas to Generate",
                )
                domain_focus = gr.Dropdown(
                    choices=DOMAINS,
                    value=DOMAINS[0],
                    label="Domain Focus",
                )
            with gr.Column(scale=2):
                fields_select = gr.Dropdown(
                    choices=sorted(WQ_DATA_FIELDS),
                    value=sorted(["close", "volume", "returns", "vwap", "market_cap", "operating_income", "ebitda", "eps_surprise", "put_call_ratio", "iv30", "iv90", "total_debt"]),
                    multiselect=True,
                    label="Available Data Fields",
                )
                operators_select = gr.Dropdown(
                    choices=sorted(WQ_OPERATORS),
                    value=sorted(["rank", "zscore", "ts_mean", "ts_std_dev", "ts_rank", "ts_decay_linear", "group_neutralize", "abs", "sign", "greater", "if_else", "trade_when"]),
                    multiselect=True,
                    label="Available Operators",
                )
                existing_alphas = gr.Textbox(
                    label="Existing Alpha Library (paste expressions to avoid redundancy)",
                    lines=4,
                    value="\n".join(EXAMPLE_ALPHAS),
                )

        def update_models(backend_choice):
            # Swap the model list when the backend toggles.
            return gr.Dropdown(choices=HF_MODELS if backend_choice == "Hugging Face" else OLLAMA_MODELS)

        backend.change(update_models, inputs=backend, outputs=model_dropdown)
        generate_btn = gr.Button("πŸš€ Generate & Evaluate Alphas", variant="primary", size="lg")
        with gr.Row():
            with gr.Column(scale=1):
                results_json = gr.JSON(label="Structured Results", visible=True)
            with gr.Column(scale=2):
                report_md = gr.Markdown(label="Evaluation Report")
        with gr.Row():
            raw_output = gr.Textbox(label="Raw LLM Response (for debugging)", lines=6)
        generate_btn.click(
            fn=generate_alphas,
            inputs=[backend, model_dropdown, fields_select, operators_select, domain_focus, num_alphas, temperature, existing_alphas],
            outputs=[results_json, report_md, raw_output],
        )
    # ── Tab 2: manual expression backtest ───────────────────────────────────
    with gr.Tab("πŸ“Š Evaluate Custom Expression"):
        with gr.Row():
            with gr.Column(scale=2):
                custom_expr = gr.Textbox(
                    label="WorldQuant BRAIN Expression",
                    lines=4,
                    value="group_neutralize(rank(ts_decay_linear(rank(abs(returns) / (close * volume + 0.000001)), 3)), subindustry)",
                )
                eval_btn = gr.Button("πŸ“ˆ Evaluate", variant="primary")
            with gr.Column(scale=1):
                eval_result = gr.JSON(label="Metrics")

        def evaluate_custom(expr):
            # Reuses the cached synthetic panel; cheap after first call.
            data, fwd = get_synthetic_data()
            return evaluate_alpha(expr, data, fwd)

        eval_btn.click(fn=evaluate_custom, inputs=custom_expr, outputs=eval_result)
    # ── Tab 3: static operator/field reference ──────────────────────────────
    with gr.Tab("πŸ“– Reference"):
        gr.Markdown("""
## WorldQuant BRAIN Operator Reference
### Cross-Section Operators
| Operator | Description |
|----------|-------------|
| `rank(x)` | Percentile rank (0-1) across stocks |
| `zscore(x)` | Demean and scale to std=1 |
| `scale(x)` | Normalize to unit sum |
| `sign(x)` | Sign function |
| `abs(x)` | Absolute value |
| `max(x,y)` / `min(x,y)` | Element-wise max/min |
| `greater(x,y)` | 1 if x>y else 0 |
| `less(x,y)` | 1 if x<y else 0 |
| `if_else(c,x,y)` | x if c else y |
| `and(x,y)` / `or(x,y)` / `not(x)` | Boolean logic |
| `group_neutralize(x, level)` | Demean within group |
| `group_rank(x, level)` | Rank within group |
### Time-Series Operators
| Operator | Description |
|----------|-------------|
| `ts_mean(x, d)` | d-day rolling mean |
| `ts_std_dev(x, d)` | d-day rolling std |
| `ts_rank(x, d)` | Rolling rank within history |
| `ts_min(x, d)` / `ts_max(x, d)` | Rolling min/max |
| `ts_delta(x, d)` | x[t] - x[t-d] |
| `ts_delay(x, d)` | x[t-d] |
| `ts_return(x, d)` | x[t]/x[t-d] - 1 |
| `ts_corr(x, y, d)` | Rolling correlation |
| `ts_sum(x, d)` | Rolling sum |
| `ts_decay_linear(x, d)` | Linear decay-weighted average |
| `ts_decay_exp(x, d)` | Exponential decay-weighted |
| `ts_backfill(x, d)` | Forward fill within window |
| `trade_when(cond, x, y)` | x if cond else y |
### Key Data Fields
| Category | Fields |
|----------|--------|
| Price/Volume | `open`, `high`, `low`, `close`, `volume`, `vwap`, `returns`, `adv20`, `adv60` |
| Fundamentals | `market_cap`, `operating_income`, `ebitda`, `total_debt`, `total_assets`, `cash`, `equity`, `enterprise_value`, `sales`, `revenue`, `eps` |
| Analyst | `est_eps`, `eps_surprise`, `eps_surprise_pct`, `num_analysts`, `recommendation_mean` |
| Options | `implied_volatility_call_180`, `implied_volatility_put_180`, `iv30`, `iv60`, `iv90`, `put_call_ratio`, `option_volume` |
| Alternative | `realized_vol`, `volatility`, `skewness`, `kurtosis` |
## Tips for Strong Alphas
1. **Dimensionless** β€” rank or zscore before combining different metrics
2. **Guard divisions** β€” always add `+ 0.000001` to denominators
3. **Neutralize** β€” end with `group_neutralize(..., subindustry)`
4. **Decay smooth** β€” use `ts_decay_linear(expr, 3-10)` for noisy signals
5. **Multiplicative intersections** β€” `rank(a) * rank(b)` > `a + b` for orthogonal signals
6. **Cross-sectional** β€” the signal must differentiate stocks, not predict time
""")
    # ── Tab 4: setup / deployment notes ─────────────────────────────────────
    with gr.Tab("πŸ”§ Settings"):
        gr.Markdown("""
### Hugging Face Setup
Set your HF token as an environment variable:
```bash
export HF_TOKEN=your_token_here
```
Or pass it when launching:
```bash
HF_TOKEN=xxx python app.py
```
### Ollama Setup
1. Install Ollama: https://ollama.com
2. Pull a model: `ollama pull deepseek-r1:8b`
3. Ensure Ollama is running locally (default: http://localhost:11434)
### Deployment to Hugging Face Spaces
```bash
# Create a Space with Gradio SDK
# Push app.py + requirements.txt
# requirements.txt contents:
gradio>=4.0
numpy
pandas
scipy
huggingface_hub
ollama
```
""")
if __name__ == "__main__":
    # 0.0.0.0 binds all interfaces (needed in containers/Spaces); share=True
    # additionally requests a public Gradio tunnel for local runs.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)