Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| """ | |
| Generate training dataset for AI Fusion strategy. | |
| Fetches historical OHLCV, computes technical features, and labels data. | |
| Includes future returns for Profit/Loss backtesting. | |
| """ | |
| import sys | |
| import os | |
| import pandas as pd | |
| import numpy as np | |
| import logging | |
| import torch | |
| from tqdm.auto import tqdm | |
| # Add project root to path | |
| sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) | |
| from trading_cli.data.market import fetch_ohlcv_yfinance | |
| from trading_cli.strategy.signals import ( | |
| calculate_rsi, | |
| calculate_sma, | |
| calculate_atr, | |
| calculate_bollinger_bands | |
| ) | |
| # Configure logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
# ---------------------------------------------------------------------------
# Dataset configuration
# ---------------------------------------------------------------------------
# Ticker universe in yfinance symbol format: large-cap US equities across
# tech, semis, financials, energy, healthcare, consumer and industrials,
# plus crypto ("BTC-USD", "ETH-USD") and futures ("GC=F" gold, "CL=F" crude).
SYMBOLS = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "TSLA", "NVDA", "AMD", "META", "NFLX", "ADBE",
    "CRM", "INTC", "CSCO", "ORCL", "QCOM", "AVGO", "TXN", "AMAT", "MU", "LRCX",
    "JPM", "BAC", "WFC", "GS", "MS", "V", "MA", "AXP", "BLK", "BX",
    "XOM", "CVX", "COP", "SLB", "HAL", "MPC", "PSX", "VLO", "OXY", "HES",
    "JNJ", "PFE", "UNH", "ABBV", "MRK", "LLY", "TMO", "DHR", "ISRG", "GILD",
    "WMT", "COST", "HD", "LOW", "TGT", "PG", "KO", "PEP", "PM", "MO",
    "CAT", "DE", "HON", "GE", "MMM", "UPS", "FDX", "RTX", "LMT", "GD",
    "BTC-USD", "ETH-USD", "GC=F", "CL=F"
]
DAYS = 3652        # History to fetch: ~10 years of calendar days
LOOKAHEAD = 5      # Prediction window (days) used when labeling
TARGET_PCT = 0.02  # Profit target (2%) -> BUY label
STOP_PCT = 0.015   # Stop loss (1.5%) -> SELL label
SEQ_LEN = 30       # Sequence length: one month of trading days
def generate_features(df):
    """Build the normalized technical-indicator feature matrix for one symbol.

    Accepts an OHLCV frame with either lowercase or capitalized column names
    ("close"/"Close", "volume"/"Volume") and returns a DataFrame of eight
    features aligned on df's index, with indicator warm-up rows dropped.
    """
    price_key = "close" if "close" in df.columns else "Close"
    volume_key = "volume" if "volume" in df.columns else "Volume"
    close = df[price_key]
    volume = df[volume_key]

    # Moving averages and bands used by several features below.
    sma_20 = calculate_sma(close, 20)
    sma_50 = calculate_sma(close, 50)
    sma_200 = calculate_sma(close, 200)
    bb_upper, _bb_mid, bb_lower = calculate_bollinger_bands(close, 20, 2.0)
    volume_baseline = volume.rolling(20).mean()

    feature_columns = {
        # RSI(2): very short period; RSI(14): standard. Both scaled to [0, 1].
        "rsi2": calculate_rsi(close, 2) / 100.0,
        "rsi14": calculate_rsi(close, 14) / 100.0,
        # Relative distance of price from each moving average.
        "dist_sma20": (close / sma_20) - 1.0,
        "dist_sma50": (close / sma_50) - 1.0,
        "dist_sma200": (close / sma_200) - 1.0,
        # Position inside the Bollinger band (epsilon avoids zero division).
        "bb_pos": (close - bb_lower) / (bb_upper - bb_lower + 1e-6),
        # ATR as a fraction of price (volatility proxy).
        "atr_pct": calculate_atr(df, 14) / close,
        # Volume spike vs 20-day average, clipped then scaled to [0, 1].
        "vol_ratio": (volume / volume_baseline).clip(0, 5) / 5.0,
    }
    return pd.DataFrame(feature_columns, index=df.index).dropna()
def generate_labels(df, lookahead=None, target_pct=None, stop_pct=None):
    """Label each row with a Triple-Barrier signal and its forward return.

    A row is labeled by walking the next ``lookahead`` closes chronologically
    and taking the FIRST barrier touched:
        1 (BUY)  -- return reaches +target_pct first
        2 (SELL) -- return reaches -stop_pct first
        0 (HOLD) -- neither barrier touched within the window
    Fix: the previous implementation compared the window's max return before
    its min return, so whenever both barriers were touched it always labeled
    BUY even if the stop was hit first — a systematic long bias.

    Args:
        df: OHLCV frame with a "close" (or "Close") column.
        lookahead: window length in rows; defaults to module LOOKAHEAD.
        target_pct: upper barrier as a fraction; defaults to module TARGET_PCT.
        stop_pct: lower barrier as a positive fraction; defaults to STOP_PCT.

    Returns:
        (labels, future_rets): float numpy arrays of len(df). The last
        ``lookahead`` entries have no forward window and remain 0.
    """
    if lookahead is None:
        lookahead = LOOKAHEAD
    if target_pct is None:
        target_pct = TARGET_PCT
    if stop_pct is None:
        stop_pct = STOP_PCT
    close = df["close" if "close" in df.columns else "Close"].values
    n = len(close)
    labels = np.zeros(n)
    future_rets = np.zeros(n)
    for i in range(n - lookahead):
        entry = close[i]
        # First-touch walk: stop at the first barrier crossed, in time order.
        for price in close[i + 1 : i + lookahead + 1]:
            ret = (price - entry) / entry
            if ret >= target_pct:
                labels[i] = 1  # BUY: profit target touched first
                break
            if ret <= -stop_pct:
                labels[i] = 2  # SELL: stop loss touched first
                break
        # Raw return at the end of the window (for P/L backtesting).
        future_rets[i] = (close[i + lookahead] - entry) / entry
    return labels, future_rets
def build_dataset(symbols=SYMBOLS, days=DAYS, output_path="data/trading_dataset.pt"):
    """Fetch, feature-engineer, label, and sequence data for all symbols.

    For each symbol, builds sliding windows of SEQ_LEN timesteps over the
    feature matrix; each window is paired with the label/return of its LAST
    timestep. Saves a dict with tensors "X" (N, SEQ_LEN, F) float32,
    "y" (N,) long, "rets" (N,) float32, plus the symbol list, via torch.save.

    Args:
        symbols: tickers to fetch (yfinance format).
        days: history length in calendar days per symbol.
        output_path: destination .pt file; parent dirs are created if needed.

    Returns:
        The saved data dict, or None when no symbol yielded usable data.
    """
    all_X, all_y, all_rets = [], [], []
    for symbol in tqdm(symbols, desc="Building Global Dataset"):
        try:
            df = fetch_ohlcv_yfinance(symbol, days=days)
            # Need enough rows for sequences, the lookahead tail, and
            # indicator warm-up (SMA200 etc. produce leading NaNs).
            if len(df) < (SEQ_LEN + LOOKAHEAD + 50):
                continue
            features = generate_features(df)
            labels, rets = generate_labels(df)
            df_aligned = pd.DataFrame(index=df.index)
            df_aligned["label"] = labels
            df_aligned["future_ret"] = rets
            # NOTE(review): placeholder sentiment — unseeded random noise, so
            # the dataset is not reproducible run-to-run; seed or replace with
            # a real sentiment source before relying on this feature.
            df_aligned["sentiment"] = np.random.normal(0, 0.2, len(df))
            # FIX: the last LOOKAHEAD rows have no forward window, so
            # generate_labels leaves them as label=0 / ret=0; without trimming
            # they would enter training as fake HOLD samples.
            if LOOKAHEAD > 0:
                df_aligned = df_aligned.iloc[:-LOOKAHEAD]
            # Inner join aligns features (which lost warm-up rows to dropna)
            # with the labels/returns.
            df_combined = features.join(df_aligned, how="inner").dropna()
            if len(df_combined) < SEQ_LEN:
                continue
            feat_vals = df_combined.drop(columns=["label", "future_ret"]).values
            label_vals = df_combined["label"].values.astype(int)
            ret_vals = df_combined["future_ret"].values
            symbol_X, symbol_y, symbol_rets = [], [], []
            # FIX off-by-one: the last valid window starts at len - SEQ_LEN,
            # which range(len - SEQ_LEN) excluded, silently dropping a sample.
            for i in range(len(feat_vals) - SEQ_LEN + 1):
                symbol_X.append(feat_vals[i : i + SEQ_LEN])
                # Label/return belong to the final timestep of the window.
                symbol_y.append(label_vals[i + SEQ_LEN - 1])
                symbol_rets.append(ret_vals[i + SEQ_LEN - 1])
            if symbol_X:
                all_X.append(np.array(symbol_X))
                all_y.append(np.array(symbol_y))
                all_rets.append(np.array(symbol_rets))
        except Exception as e:
            # Best-effort per symbol: one bad ticker must not abort the run.
            logger.error(f"Error processing {symbol}: {e}")
    if not all_X:
        logger.error("No valid data collected!")
        return None
    X = np.concatenate(all_X, axis=0)
    y = np.concatenate(all_y, axis=0)
    rets = np.concatenate(all_rets, axis=0)
    data = {
        "X": torch.tensor(X, dtype=torch.float32),
        "y": torch.tensor(y, dtype=torch.long),
        "rets": torch.tensor(rets, dtype=torch.float32),
        "symbols": symbols
    }
    # FIX: os.path.dirname("") is "" and makedirs("") raises; only create
    # directories when output_path actually has a parent component.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    torch.save(data, output_path)
    logger.info(f"✅ Dataset saved to {output_path} | Shape: {X.shape}")
    return data
if __name__ == "__main__":
    # Script entry point: build and save the dataset with default settings.
    build_dataset()