Spaces:
Sleeping
Sleeping
| import os | |
| import joblib | |
| import optuna | |
| import numpy as np | |
| from datetime import datetime | |
| from sklearn.svm import SVR | |
| from sklearn.linear_model import Ridge | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.metrics import mean_squared_error | |
| from sklearn.model_selection import TimeSeriesSplit | |
| from huggingface_hub import HfApi | |
| from src.data_processing import load_data, clean_data, generate_technical_features | |
# Symbols handed to load_data: the individual tickers and the market reference symbol.
SYMBOLS = ["AAPL", "MSFT", "GOOGL", "AMZN"]
MARKET_SYMBOL = "^GSPC"

# Date range handed to load_data; the end date is "today" formatted as YYYY-MM-DD.
START_DATE = "2010-01-01"
END_DATE = datetime.now().date().isoformat()

# Hugging Face Hub repository that receives the trained models.  << CHANGE THIS LINE
REPO_ID = "Reality8081/Predict_Stock_SVR_Linear"

# Values passed as target_horizon to feature generation; one Ridge and one SVR
# model is trained and saved per entry.
HORIZONS = [1, 7, 21]
def _cv_rmse(make_model, X, y, n_splits=5, dtype=None):
    """Return the mean validation RMSE of a model over time-series folds.

    Args:
        make_model: Zero-argument callable returning a fresh, unfitted
            estimator with ``fit`` and ``predict``.
        X: Feature table; must support ``.iloc`` row indexing.
        y: Target values; must support ``.iloc`` row indexing.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        dtype: Optional dtype the scaled feature matrices are cast to before
            being passed to the model. ``None`` leaves them unchanged.

    Returns:
        The mean of the per-fold RMSE values.
    """
    fold_scores = []
    for train_idx, val_idx in TimeSeriesSplit(n_splits=n_splits).split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # The scaler is fitted on the training fold only, so validation rows
        # never influence the scaling statistics.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        if dtype is not None:
            X_train_scaled = X_train_scaled.astype(dtype)
            X_val_scaled = X_val_scaled.astype(dtype)

        model = make_model()
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_val_scaled)
        fold_scores.append(np.sqrt(mean_squared_error(y_val, preds)))
    return np.mean(fold_scores)


def _tune_ridge(X, y, n_trials=20):
    """Search the Ridge ``alpha`` with Optuna and return the best parameters."""
    def objective(trial):
        alpha = trial.suggest_float('alpha', 1e-4, 1e4, log=True)
        return _cv_rmse(lambda: Ridge(alpha=alpha, random_state=42), X, y)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)
    return study.best_params


def _tune_svr(X, y, n_trials=10):
    """Search the SVR hyperparameters with Optuna and return the best parameters.

    ``gamma`` is only sampled for the rbf kernel, so the returned dict contains
    a ``'gamma'`` key only when the best trial used ``kernel='rbf'``.
    """
    def objective(trial):
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
        C = trial.suggest_float('C', 1e-3, 100.0, log=True)
        epsilon = trial.suggest_float('epsilon', 1e-3, 1.0, log=True)
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto']) if kernel == 'rbf' else 'scale'
        # max_iter caps the solver so a single trial cannot run unbounded.
        return _cv_rmse(
            lambda: SVR(kernel=kernel, C=C, epsilon=epsilon, gamma=gamma, max_iter=5000),
            X,
            y,
            dtype='float32',
        )

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)  # deliberately modest trial count
    return study.best_params


def _fit_and_save(model, X, y, tag, h):
    """Fit a scaler and ``model`` on all rows and dump both under ``models/``.

    Writes ``models/scaler_<tag>_<h>d.pkl`` and ``models/model_<tag>_<h>d.pkl``.
    """
    scaler = StandardScaler()
    model.fit(scaler.fit_transform(X), y)
    joblib.dump(scaler, f'models/scaler_{tag}_{h}d.pkl')
    joblib.dump(model, f'models/model_{tag}_{h}d.pkl')


def _upload_models():
    """Upload the ``models`` folder to ``REPO_ID`` when ``HF_TOKEN`` is set."""
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("Thiếu HF_TOKEN, bỏ qua bước upload.")
        return
    HfApi().upload_folder(
        folder_path="models",
        repo_id=REPO_ID,
        repo_type="model",
        token=hf_token,
    )
    print("Upload thành công!")


def main():
    """Train, save and optionally upload one Ridge and one SVR model per horizon.

    Steps:
        1. Load and clean the data once via ``load_data`` / ``clean_data``.
        2. For every value in ``HORIZONS``, build features and targets with
           ``generate_technical_features``.
        3. Tune Ridge and SVR hyperparameters with Optuna, scoring each trial
           by mean RMSE over time-series folds.
        4. Refit both models (each with its own scaler) on all rows and dump
           them into ``models/``.
        5. Upload the ``models`` folder to the Hugging Face Hub if the
           ``HF_TOKEN`` environment variable is set.
    """
    print("1. Đang tải và làm sạch dữ liệu...")
    df_raw = load_data(SYMBOLS, MARKET_SYMBOL, START_DATE, END_DATE)
    df_clean = clean_data(df_raw)
    os.makedirs("models", exist_ok=True)

    for h in HORIZONS:
        print("2. Tạo đặc trưng (Features)...")
        _, X, y = generate_technical_features(df_clean, is_inference=False, target_horizon=h)

        # === TUNE LINEAR REGRESSION (RIDGE) ===
        print("3. Tối ưu siêu tham số Ridge Regression...")
        best_lr = _tune_ridge(X, y, n_trials=20)

        # === TUNE SVR ===
        print("4. Tối ưu siêu tham số SVR...")
        best_svr = _tune_svr(X, y, n_trials=10)

        # === TRAIN THE FINAL MODELS & SAVE THEM ===
        print("5. Huấn luyện mô hình cuối và lưu trữ...")
        _fit_and_save(Ridge(alpha=best_lr['alpha'], random_state=42), X, y, 'lr', h)

        # Rebuild the SVR from *all* tuned parameters. C and epsilon were
        # selected together with the kernel (and gamma for rbf), so pairing
        # them with a hard-coded kernel would train a model that was never
        # the one scored during tuning.
        final_svr = SVR(
            kernel=best_svr['kernel'],
            C=best_svr['C'],
            epsilon=best_svr['epsilon'],
            gamma=best_svr.get('gamma', 'scale'),
        )
        _fit_and_save(final_svr, X, y, 'svr', h)

    # NOTE(review): the original nesting of this step was lost; it is run once
    # after all horizons are trained so the folder is uploaded a single time.
    print("6. Tải mô hình lên Hugging Face Hub...")
    _upload_models()
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()