# SVR_Predict_Stocks / src/train.py
# Hugging Face file-page header: author Reality8081 (avatar: "Reality8081's picture"),
# commit 35beba6 ("Update src").
import os
import joblib
import optuna
import numpy as np
from datetime import datetime
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from huggingface_hub import HfApi
from src.data_processing import load_data, clean_data, generate_technical_features
# Ticker symbols and market symbol handed to load_data() by main().
SYMBOLS = ["AAPL", "MSFT", "GOOGL", "AMZN"]
MARKET_SYMBOL = "^GSPC"

# Date range handed to load_data(); the end date is the day the script runs.
START_DATE = "2010-01-01"
END_DATE = f"{datetime.now():%Y-%m-%d}"

# Hugging Face repo that main() uploads the "models" folder to.
# << CHANGE THIS LINE to point at your own repository.
REPO_ID = "Reality8081/Predict_Stock_SVR_Linear"

# Each value is passed as target_horizon to generate_technical_features()
# and appears in the saved artifact names as "<h>d".
HORIZONS = [1, 7, 21]
def _cv_rmse(model, X, y, n_splits=5):
    """Return the mean validation RMSE of *model* over ``TimeSeriesSplit`` folds.

    A fresh ``StandardScaler`` is fit on each training fold only, so the
    validation rows never influence the scaling statistics.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``; it is
            re-fit on every fold.
        X: Feature table indexed positionally via ``.iloc``.
        y: Target values indexed positionally via ``.iloc``.
        n_splits: Number of time-series folds.

    Returns:
        The mean of the per-fold RMSE values.
    """
    fold_scores = []
    for train_idx, val_idx in TimeSeriesSplit(n_splits=n_splits).split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_val_scaled)
        fold_scores.append(np.sqrt(mean_squared_error(y_val, preds)))
    return np.mean(fold_scores)


def _tune_ridge(X, y, n_trials=20):
    """Search the Ridge ``alpha`` with Optuna and return the best value found."""

    def objective(trial):
        alpha = trial.suggest_float('alpha', 1e-4, 1e4, log=True)
        return _cv_rmse(Ridge(alpha=alpha, random_state=42), X, y)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params['alpha']


def _tune_svr(X, y, n_trials=10):
    """Search SVR hyperparameters with Optuna and return the best trial's params.

    The returned dict always has ``kernel``, ``C`` and ``epsilon``; ``gamma``
    is present only when the best trial used the ``rbf`` kernel, because it is
    sampled only in that case.
    """

    def objective(trial):
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
        C = trial.suggest_float('C', 1e-3, 100.0, log=True)
        epsilon = trial.suggest_float('epsilon', 1e-3, 1.0, log=True)
        gamma = (
            trial.suggest_categorical('gamma', ['scale', 'auto'])
            if kernel == 'rbf'
            else 'scale'
        )
        # max_iter caps the solver during the search only, to bound trial time.
        # No float32 down-cast here: the final model is fit on the
        # full-precision scaled matrix, so tuning scores the same inputs.
        model = SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma, max_iter=5000)
        return _cv_rmse(model, X, y)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)  # keep the trial count moderate
    return study.best_params


def _svr_kwargs(best_params):
    """Translate the best SVR trial parameters into ``SVR`` constructor kwargs."""
    return {
        'kernel': best_params['kernel'],
        'C': best_params['C'],
        'epsilon': best_params['epsilon'],
        # gamma is absent when the winning kernel is linear; fall back to the
        # same 'scale' value the objective used for that kernel.
        'gamma': best_params.get('gamma', 'scale'),
    }


def main():
    """Train, save and optionally upload Ridge and SVR models per horizon.

    For every value in ``HORIZONS`` the function builds features, tunes a Ridge
    and an SVR model with Optuna under time-series cross-validation, refits
    both on all rows, and writes each model and its scaler to ``models/``.
    When the ``HF_TOKEN`` environment variable is set, the ``models`` folder is
    then uploaded to ``REPO_ID`` on the Hugging Face Hub; otherwise the upload
    is skipped.
    """
    print("1. Đang tải và làm sạch dữ liệu...")
    df_raw = load_data(SYMBOLS, MARKET_SYMBOL, START_DATE, END_DATE)
    df_clean = clean_data(df_raw)
    os.makedirs("models", exist_ok=True)

    for h in HORIZONS:
        print("2. Tạo đặc trưng (Features)...")
        _, X, y = generate_technical_features(df_clean, is_inference=False, target_horizon=h)

        # === Tune linear regression (Ridge) ===
        print("3. Tối ưu siêu tham số Ridge Regression...")
        best_alpha = _tune_ridge(X, y)

        # === Tune SVR ===
        print("4. Tối ưu siêu tham số SVR...")
        best_svr_params = _tune_svr(X, y)

        # === Train the final models on all rows and persist them ===
        print("5. Huấn luyện mô hình cuối và lưu trữ...")

        # Ridge
        scaler_lr = StandardScaler()
        X_scaled_lr = scaler_lr.fit_transform(X)
        model_lr = Ridge(alpha=best_alpha, random_state=42)
        model_lr.fit(X_scaled_lr, y)
        joblib.dump(scaler_lr, f'models/scaler_lr_{h}d.pkl')
        joblib.dump(model_lr, f'models/model_lr_{h}d.pkl')

        # SVR: build the model from the full best-trial configuration so the
        # saved model uses the kernel and gamma that C and epsilon were tuned for.
        scaler_svr = StandardScaler()
        X_scaled_svr = scaler_svr.fit_transform(X)
        model_svr = SVR(**_svr_kwargs(best_svr_params))
        model_svr.fit(X_scaled_svr, y)
        joblib.dump(scaler_svr, f'models/scaler_svr_{h}d.pkl')
        joblib.dump(model_svr, f'models/model_svr_{h}d.pkl')

    print("6. Tải mô hình lên Hugging Face Hub...")
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        api = HfApi()
        api.upload_folder(
            folder_path="models",
            repo_id=REPO_ID,
            repo_type="model",
            token=hf_token
        )
        print("Upload thành công!")
    else:
        print("Thiếu HF_TOKEN, bỏ qua bước upload.")


# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()