Reality8081 committed
Commit 2ca41b8 · 1 Parent(s): fe81db6

Update src

Files changed (2)
  1. src/data_processing.py +103 -35
  2. src/train.py +26 -5
src/data_processing.py CHANGED
@@ -14,73 +14,119 @@ VOL_WINDOWS = [20, 50]
 LAGS = 3

 def load_data(symbols, market_symbol, start_date, end_date):
-    print(f"Downloading data from {start_date} to {end_date}...")
+    print("Downloading data for AAPL and market index (auto_adjust=True)...")
+
     df_market = yf.download(market_symbol, start=start_date, end=end_date, auto_adjust=True, progress=False)
+
     if isinstance(df_market.columns, pd.MultiIndex):
         df_market.columns = df_market.columns.droplevel(1)

     df_market = df_market.reset_index()[['Date', 'Close']].rename(columns={'Close': 'Market_Close'})
     dfs = []
-
     for symbol in symbols:
-        try:
-            df = yf.download(symbol, start=start_date, end=end_date, auto_adjust=True, progress=False)
-            if isinstance(df.columns, pd.MultiIndex):
-                df.columns = df.columns.droplevel(1)
-            df = df.reset_index()[['Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
-            df['Ticker'] = symbol
-            df = pd.merge(df, df_market, on='Date', how='left')
-            dfs.append(df)
-        except Exception as e:
-            print(f"Error while downloading data for {symbol}: {e}")
+        df = yf.download(symbol, start=start_date, end=end_date, auto_adjust=True, progress=False)
+        if isinstance(df.columns, pd.MultiIndex):
+            df.columns = df.columns.droplevel(1)
+        df = df.reset_index()[['Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
+        df['Ticker'] = symbol
+
+        df = pd.merge(df, df_market, on='Date', how='left')
+        dfs.append(df)

-    df_concat = pd.concat(dfs, ignore_index=True)
-    df_concat = df_concat.sort_values(['Ticker', 'Date']).reset_index(drop=True)
-    return df_concat
+    df = pd.concat(dfs, ignore_index=True)
+    df = df.sort_values(['Ticker', 'Date']).reset_index(drop=True)
+    print(f"Loaded raw panel data: {len(df)} rows | {len(symbols)} tickers | "
+          f"from {df['Date'].min().date()} to {df['Date'].max().date()}")
+    return df

 def clean_data(df):
     cleaned_dfs = []
     for ticker, group in df.groupby('Ticker'):
         group = group.set_index('Date').sort_index()
-        start_dt, end_dt = group.index.min(), group.index.max()
+        start_dt = group.index.min()
+        end_dt = group.index.max()
         all_business_days = pd.date_range(start=start_dt, end=end_dt, freq="B")
-        group = group.reindex(all_business_days).ffill().reset_index().rename(columns={'index': 'Date'})
+
+        group = group.reindex(all_business_days)
+        group = group.ffill()
+        group = group.reset_index().rename(columns={'index': 'Date'})
         group['Ticker'] = ticker
         cleaned_dfs.append(group)

-    df_cleaned = pd.concat(cleaned_dfs, ignore_index=True).sort_values(['Ticker', 'Date']).reset_index(drop=True)
+    df_cleaned = pd.concat(cleaned_dfs, ignore_index=True)
+    df_cleaned = df_cleaned.sort_values(['Ticker', 'Date']).reset_index(drop=True)
+
+    print(f"Data cleaned: {len(df_cleaned)} rows | "
+          f"from {df_cleaned['Date'].min().date()} to {df_cleaned['Date'].max().date()}")
     return df_cleaned

 def validate_data(df, stage="pre_feature"):
+
+    print(f"Validating data at stage: {stage}...")
     num_cols = df.select_dtypes(include=[np.number]).columns
-    if df[num_cols].isna().sum().sum() > 0:
-        print(f"WARNING: NaN values at {stage}")
+
+    nan_count = df[num_cols].isna().sum().sum()
+    inf_count = np.isinf(df[num_cols]).sum().sum()
+
+    if nan_count > 0:
+        print(f"WARNING: found {nan_count} NaN values at stage {stage}")
+    if inf_count > 0:
+        print(f"WARNING: found {inf_count} Inf values at stage {stage}")
+
+    if 'Date' in df.columns and 'Market_Return' in df.columns:
+        market_std_per_date = df.groupby('Date')['Market_Return'].std(ddof=0).max()
+        if pd.notna(market_std_per_date) and market_std_per_date > 1e-8:
+            print(f"WARNING: Cross-ticker contamination detected! "
+                  f"Max std of Market_Return per date: {market_std_per_date:.2e}")
+
+    # Quick check of return variance (should be > 0)
+    if 'Daily_Return' in df.columns:
+        for ticker, grp in df.groupby('Ticker'):
+            if len(grp) > 1 and grp['Daily_Return'].std(ddof=0) == 0:
+                print(f"WARNING: Ticker {ticker} has zero variance in Daily_Return!")
+
+    print(f"Validation passed at {stage} (no critical issues).")
     return df

 def generate_technical_features(df, is_inference=False):
+    """
+    Completely reworked feature engineering, following 5 requirements:
+    1. Corporate actions are already handled in load_data (auto_adjust=True)
+    2. ALL features are converted to stationary form (ratio, pct distance, normalized, 0-1 position)
+    3. Multi-timeframe: several windows so Linear_Regression can pick the strongest signals
+    4. Market regime & volatility: normalized ATR + rolling volatility
+    5. validate_data is called immediately before returning
+    """
     data = df.copy()

     def add_features(group):
         g = group.copy()
+
+        # === 1. BASIC RETURNS (always stationary) ===
         g['Daily_Return'] = g['Close'].pct_change()
         g['Log_Return'] = np.log(1 + g['Daily_Return'])
         g['Market_Return'] = g['Market_Close'].pct_change()
         g['Market_Log_Return'] = np.log(1 + g['Market_Return'])

+        # === 2. LAGGED FEATURES – lag returns ONLY (do NOT lag raw Close) ===
+        # Reason: raw Close and raw SMA are non-stationary, so Linear_Regression would learn the long-term trend instead of the real pattern.
         for i in range(1, LAGS + 1):
             g[f'Return_Lag_{i}'] = g['Daily_Return'].shift(i)
             g[f'Market_Return_Lag_{i}'] = g['Market_Return'].shift(i)

+        # === 3. MULTI-TIMEFRAME TECHNICAL INDICATORS (stationary versions) ===
+        # SMA & EMA → ratio + % distance (instead of absolute values)
         for w in SMA_WINDOWS:
             sma = g['Close'].rolling(window=w).mean()
             g[f'SMA_{w}_Ratio'] = g['Close'] / sma
-            g[f'SMA_{w}_Distance_pct'] = (g['Close'] - sma) / sma * 100
+            g[f'SMA_{w}_Distance_pct'] = (g['Close'] - sma) / sma * 100  # % distance from price to SMA

         for w in EMA_WINDOWS:
             ema = g['Close'].ewm(span=w, adjust=False).mean()
             g[f'EMA_{w}_Ratio'] = g['Close'] / ema
             g[f'EMA_{w}_Distance_pct'] = (g['Close'] - ema) / ema * 100

+        # RSI, multi-window (naturally stationary, 0-100)
         for w in RSI_WINDOWS:
             delta = g['Close'].diff()
             gain = delta.where(delta > 0, 0).rolling(w).mean()
@@ -88,46 +134,68 @@ def generate_technical_features(df, is_inference=False):
             rs = gain / loss
             g[f'RSI_{w}'] = 100 - (100 / (1 + rs))

+        # MACD: keep the classic structure but normalize the histogram as % of price (stationary)
         ema_fast = g['Close'].ewm(span=12, adjust=False).mean()
         ema_slow = g['Close'].ewm(span=26, adjust=False).mean()
         g['MACD_Line'] = ema_fast - ema_slow
         g['MACD_Signal'] = g['MACD_Line'].ewm(span=9, adjust=False).mean()
-        g['MACD_Hist'] = (g['MACD_Line'] - g['MACD_Signal'])
-        g['MACD_Hist_Normalized'] = g['MACD_Hist'] / g['Close'] * 100
+        g['MACD_Hist'] = g['MACD_Line'] - g['MACD_Signal']
+        g['MACD_Hist_Normalized'] = g['MACD_Hist'] / g['Close'] * 100  # % of price → stationary

+        # Bollinger Bands: width % + position (0-1) instead of absolute upper/lower bands
         for w in BB_WINDOWS:
             middle = g['Close'].rolling(w).mean()
             std_dev = g['Close'].rolling(w).std()
-            bb_range = (middle + 2 * std_dev) - (middle - 2 * std_dev)
-            g[f'BB_Width_{w}_pct'] = (bb_range / middle * 100)
-            g[f'BB_Position_{w}'] = (g['Close'] - (middle - 2 * std_dev)) / bb_range.where(bb_range > 0, 1)
+            upper = middle + 2 * std_dev
+            lower = middle - 2 * std_dev
+            bb_range = upper - lower
+
+            g[f'BB_Width_{w}_pct'] = bb_range / middle * 100  # % width (stationary)
+            g[f'BB_Position_{w}'] = (g['Close'] - lower) / bb_range.where(bb_range > 0, 1)  # 0-1 position

-        tr = pd.concat([g['High'] - g['Low'],
-                        abs(g['High'] - g['Close'].shift(1)),
-                        abs(g['Low'] - g['Close'].shift(1))], axis=1).max(axis=1)
+        # === 4. VOLATILITY & MARKET REGIME FEATURES ===
+        # True Range & normalized ATR
+        def calculate_true_range(high, low, close):
+            tr1 = high - low
+            tr2 = abs(high - close.shift(1))
+            tr3 = abs(low - close.shift(1))
+            return pd.concat([tr1, tr2, tr3], axis=1).max(axis=1)

+        tr = calculate_true_range(g['High'], g['Low'], g['Close'])
         for w in ATR_WINDOWS:
             atr = tr.rolling(w).mean()
-            g[f'ATR_Normalized_{w}'] = atr / g['Close']
             g[f'ATR_{w}'] = atr
+            g[f'ATR_Normalized_{w}'] = atr / g['Close']  # relative volatility → stationary

+        # Rolling volatility (market regime detection)
         for w in VOL_WINDOWS:
             g[f'Market_Rolling_Vol_{w}'] = g['Market_Return'].rolling(w).std()
             g[f'AAPL_Rolling_Vol_{w}'] = g['Daily_Return'].rolling(w).std()

+        # Relative volume
         g['Rel_Volume_20'] = g['Volume'] / g['Volume'].rolling(20).mean()
         return g

+    # Build features per ticker; NaN rows from the rolling windows and lags are dropped below
     data_list = [add_features(group) for _, group in data.groupby('Ticker')]
     data = pd.concat(data_list, ignore_index=True)

     if not is_inference:
         data['Target_Return'] = data.groupby('Ticker')['Daily_Return'].shift(-1)
         data = data.dropna().reset_index(drop=True)
-        data = validate_data(data, stage="post_feature")
-        X = data.drop(columns=['Date', 'Ticker', 'Market_Close', 'Target_Return'], errors='ignore')
-        y = data['Target_Return']
-        return data, X, y
+        # === 5. DATA VALIDATION BEFORE RETURNING ===
+        data = validate_data(data, stage="post_feature_engineering")
+
+        df_backtest = data.copy()
+        drop_cols = ['Date', 'Ticker', 'Market_Close', 'Target_Return']
+        X = data.drop(columns=drop_cols, errors='ignore')
+        y = data['Target_Return'].copy()
+
+        print(f"Generated stationary features & prepared ML data:\n"
+              f"  • Total rows: {len(data)} | Tickers: {data['Ticker'].nunique()}\n"
+              f"  • Features: {X.shape[1]} | X shape: {X.shape} | y shape: {y.shape}")
+
+        return df_backtest, X, y
     else:
         # In prediction mode, the last row of each ticker keeps a full feature set and is not dropped for lacking a target
         data = data.dropna().reset_index(drop=True)
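
A minimal sketch of how the updated src/data_processing.py pipeline might be driven end to end; the symbols, dates, and import path below are illustrative assumptions, not part of this commit:

    # Hypothetical driver; assumes these functions are importable from src/data_processing.
    from src.data_processing import load_data, clean_data, generate_technical_features

    SYMBOLS = ['AAPL']   # assumed universe (several features reference AAPL by name)
    MARKET = '^GSPC'     # assumed market index symbol
    df_raw = load_data(SYMBOLS, MARKET, start_date='2015-01-01', end_date='2024-12-31')
    df_clean = clean_data(df_raw)

    # Training mode: returns the backtest frame plus the model-ready X and y
    df_backtest, X, y = generate_technical_features(df_clean, is_inference=False)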
src/train.py CHANGED
@@ -32,7 +32,10 @@ def main():
     print("3. Tuning Ridge Regression hyperparameters...")
     def objective_lr(trial):
         alpha = trial.suggest_float('alpha', 1e-4, 1e4, log=True)
+
+        tscv = TimeSeriesSplit(n_splits=5)
         fold_scores = []
+
         for train_idx, val_idx in tscv.split(X):
             X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
             y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
@@ -44,7 +47,10 @@ def main():
             model = Ridge(alpha=alpha, random_state=42)
             model.fit(X_train_scaled, y_train)
             preds = model.predict(X_val_scaled)
-            fold_scores.append(np.sqrt(mean_squared_error(y_val, preds)))
+
+            rmse = np.sqrt(mean_squared_error(y_val, preds))
+            fold_scores.append(rmse)
+
         return np.mean(fold_scores)

     study_lr = optuna.create_study(direction='minimize')
@@ -54,9 +60,17 @@ def main():
     # === OPTIMIZE SVR ===
     print("4. Tuning SVR hyperparameters...")
     def objective_svr(trial):
-        C = trial.suggest_float('C', 1e-3, 10.0, log=True)
+        # Tune only the SVR hyperparameters
+        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
+        C = trial.suggest_float('C', 1e-3, 100.0, log=True)
         epsilon = trial.suggest_float('epsilon', 1e-3, 1.0, log=True)
+        gamma = trial.suggest_categorical('gamma', ['scale', 'auto']) if kernel == 'rbf' else 'scale'
+
+        # Prepare the data with the fixed feature set
+
+        tscv = TimeSeriesSplit(n_splits=5)
         fold_scores = []
+
         for train_idx, val_idx in tscv.split(X):
             X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
             y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
@@ -65,11 +79,18 @@ def main():
             X_train_scaled = scaler.fit_transform(X_train)
             X_val_scaled = scaler.transform(X_val)

-            # Cap max_iter so SVR runs faster during the search
-            model = SVR(kernel='rbf', C=C, epsilon=epsilon, gamma='scale', max_iter=2000)
-            model.fit(X_train_scaled, y_train)
+            X_train_scaled = X_train_scaled.astype('float32')
+            X_val_scaled = X_val_scaled.astype('float32')
+            y_train_f32 = y_train.values.astype('float32')
+            y_val_f32 = y_val.values.astype('float32')
+
+            model = SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma, max_iter=5000)
+            model.fit(X_train_scaled, y_train_f32)
             preds = model.predict(X_val_scaled)
-            fold_scores.append(np.sqrt(mean_squared_error(y_val, preds)))
+
+            rmse = np.sqrt(mean_squared_error(y_val_f32, preds))
+            fold_scores.append(rmse)
+
         return np.mean(fold_scores)

     study_svr = optuna.create_study(direction='minimize')
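
A minimal sketch of the step that typically follows both objectives, running the studies and instantiating final models from the best parameters; the n_trials values are assumptions, since this commit does not show them:

    # Hypothetical continuation of the code above; n_trials is an assumed setting.
    study_lr.optimize(objective_lr, n_trials=50)
    study_svr.optimize(objective_svr, n_trials=50)

    best_ridge = Ridge(alpha=study_lr.best_params['alpha'], random_state=42)
    # best_params holds kernel, C, epsilon, and gamma only when kernel == 'rbf';
    # SVR defaults to gamma='scale' otherwise, matching the objective.
    best_svr = SVR(max_iter=5000, **study_svr.best_params)

Both final models would still need to be fit on scaled training data, exactly as done inside the cross-validation folds.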