"""Tune, train and publish Ridge and SVR regressors for several horizons.

For every horizon in ``HORIZONS`` the script builds a feature matrix, tunes a
Ridge and an SVR model with Optuna using time-series cross-validation, refits
both on all available rows, stores the fitted scalers/models under ``models/``
and finally uploads that folder to the Hugging Face Hub when ``HF_TOKEN`` is
set in the environment.
"""

import os
from datetime import datetime

import joblib
import numpy as np
import optuna
from huggingface_hub import HfApi
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from src.data_processing import load_data, clean_data, generate_technical_features

SYMBOLS = ["AAPL", 'MSFT', 'GOOGL', 'AMZN']
MARKET_SYMBOL = "^GSPC"
START_DATE = "2010-01-01"
END_DATE = datetime.now().strftime('%Y-%m-%d')
REPO_ID = "Reality8081/Predict_Stock_SVR_Linear"  # << CHANGE THIS LINE
HORIZONS = [1, 7, 21]

MODEL_DIR = "models"
N_SPLITS = 5
RIDGE_TRIALS = 20
SVR_TRIALS = 10  # SVR fits are slow, so keep the trial count moderate.
SVR_TUNING_MAX_ITER = 5000  # Caps solver iterations during tuning only.


def _cv_rmse(make_model, X, y, n_splits=N_SPLITS):
    """Return the mean validation RMSE over ``TimeSeriesSplit`` folds.

    Args:
        make_model: Zero-argument callable returning a fresh, unfitted
            estimator for each fold.
        X: Feature table; indexed positionally with ``.iloc``.
        y: Target values aligned with ``X``; indexed with ``.iloc``.
        n_splits: Number of time-series folds.

    Returns:
        The arithmetic mean of the per-fold RMSE values.
    """
    fold_scores = []
    for train_idx, val_idx in TimeSeriesSplit(n_splits=n_splits).split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the scaler on the training fold only so that no statistics
        # from the validation fold leak into the model.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        model = make_model()
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_val_scaled)
        fold_scores.append(np.sqrt(mean_squared_error(y_val, preds)))
    return np.mean(fold_scores)


def _tune_ridge(X, y, n_trials=RIDGE_TRIALS):
    """Search the Ridge ``alpha`` with Optuna and return the best value."""

    def objective(trial):
        alpha = trial.suggest_float('alpha', 1e-4, 1e4, log=True)
        return _cv_rmse(lambda: Ridge(alpha=alpha, random_state=42), X, y)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params['alpha']


def _tune_svr(X, y, n_trials=SVR_TRIALS):
    """Search the SVR hyper-parameters with Optuna.

    Returns:
        A dict with the keys ``kernel``, ``C``, ``epsilon`` and ``gamma``
        taken from the best trial.
    """

    def objective(trial):
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
        C = trial.suggest_float('C', 1e-3, 100.0, log=True)
        epsilon = trial.suggest_float('epsilon', 1e-3, 1.0, log=True)
        # gamma only matters for the rbf kernel, so it is sampled
        # conditionally and is absent from the params of linear trials.
        if kernel == 'rbf':
            gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
        else:
            gamma = 'scale'

        # No float32 down-cast here: SVR validates its input as float64, so
        # a cast would only discard precision and make tuning differ from
        # the final fit.
        return _cv_rmse(
            lambda: SVR(
                kernel=kernel,
                C=C,
                epsilon=epsilon,
                gamma=gamma,
                max_iter=SVR_TUNING_MAX_ITER,
            ),
            X,
            y,
        )

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best = dict(study.best_params)
    # A best trial with the linear kernel never sampled gamma.
    best.setdefault('gamma', 'scale')
    return best


def _fit_and_save(model, X, y, tag, horizon):
    """Fit a scaler and ``model`` on all rows and dump both with joblib.

    Files are written to ``models/scaler_<tag>_<horizon>d.pkl`` and
    ``models/model_<tag>_<horizon>d.pkl``.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    model.fit(X_scaled, y)
    joblib.dump(scaler, f'{MODEL_DIR}/scaler_{tag}_{horizon}d.pkl')
    joblib.dump(model, f'{MODEL_DIR}/model_{tag}_{horizon}d.pkl')


def _upload_models():
    """Upload the model folder to the Hub, or skip when no token is set."""
    print("6. Tải mô hình lên Hugging Face Hub...")
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("Thiếu HF_TOKEN, bỏ qua bước upload.")
        return

    api = HfApi()
    api.upload_folder(
        folder_path=MODEL_DIR,
        repo_id=REPO_ID,
        repo_type="model",
        token=hf_token
    )
    print("Upload thành công!")


def main():
    """Run the full pipeline: load data, tune, train, save and upload."""
    print("1. Đang tải và làm sạch dữ liệu...")
    df_raw = load_data(SYMBOLS, MARKET_SYMBOL, START_DATE, END_DATE)
    df_clean = clean_data(df_raw)
    os.makedirs(MODEL_DIR, exist_ok=True)

    for h in HORIZONS:
        print("2. Tạo đặc trưng (Features)...")
        _, X, y = generate_technical_features(
            df_clean, is_inference=False, target_horizon=h
        )

        # === TUNE LINEAR REGRESSION (RIDGE) ===
        print("3. Tối ưu siêu tham số Ridge Regression...")
        best_alpha = _tune_ridge(X, y)

        # === TUNE SVR ===
        print("4. Tối ưu siêu tham số SVR...")
        svr_params = _tune_svr(X, y)

        # === TRAIN THE FINAL MODELS & SAVE THEM ===
        print("5. Huấn luyện mô hình cuối và lưu trữ...")
        _fit_and_save(Ridge(alpha=best_alpha, random_state=42), X, y, 'lr', h)
        # Use every tuned value (kernel and gamma included) so the saved
        # model matches the configuration that won cross-validation. The
        # iteration cap is not applied to the final fit.
        _fit_and_save(SVR(**svr_params), X, y, 'svr', h)

    # Upload once, after the artifacts for every horizon exist.
    _upload_models()


if __name__ == "__main__":
    main()