File size: 9,206 Bytes
3eb5b88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ca41b8
 
3eb5b88
2ca41b8
3eb5b88
 
 
 
 
 
2ca41b8
 
 
 
 
 
 
 
3eb5b88
2ca41b8
 
 
 
 
3eb5b88
 
 
 
 
2ca41b8
 
3eb5b88
2ca41b8
 
 
 
3eb5b88
 
 
2ca41b8
 
 
 
 
 
3eb5b88
 
2ca41b8
 
3eb5b88
2ca41b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3eb5b88
 
35beba6
2ca41b8
 
 
 
 
 
 
 
3eb5b88
 
 
 
2ca41b8
 
3eb5b88
 
 
 
 
2ca41b8
 
3eb5b88
 
 
 
2ca41b8
 
3eb5b88
 
 
2ca41b8
3eb5b88
 
 
 
 
 
2ca41b8
3eb5b88
 
 
 
 
 
 
2ca41b8
3eb5b88
 
 
 
2ca41b8
 
3eb5b88
2ca41b8
3eb5b88
 
 
2ca41b8
 
 
 
 
 
3eb5b88
2ca41b8
 
 
 
 
 
 
3eb5b88
2ca41b8
3eb5b88
 
 
2ca41b8
3eb5b88
2ca41b8
3eb5b88
 
 
 
2ca41b8
3eb5b88
 
 
2ca41b8
3eb5b88
 
 
 
35beba6
3eb5b88
2ca41b8
35beba6
2ca41b8
 
 
 
 
 
35beba6
2ca41b8
 
 
 
3eb5b88
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import pandas as pd
import numpy as np
import yfinance as yf
import warnings

# Suppress every warning for the whole process: no category or module filter
# is supplied, so this also hides pandas/yfinance deprecation notices.
warnings.filterwarnings('ignore')

# Look-back settings consumed by generate_technical_features. Each window is
# a number of rows of the per-ticker frame.
# Rolling-mean windows of Close used for the SMA ratio / % distance features.
SMA_WINDOWS = [5, 10, 20, 50, 100]
# `span` values of the exponentially weighted means of Close (EMA features).
EMA_WINDOWS = [5, 10, 20, 50]
# Rolling windows over which gains and losses are averaged for RSI.
RSI_WINDOWS = [7, 14, 21]
# Rolling mean/std windows of Close used for the Bollinger Band features.
BB_WINDOWS = [10, 20, 50]
# Rolling-mean windows applied to the True Range (ATR features).
ATR_WINDOWS = [14, 20]
# Rolling-std windows applied to the daily and market returns.
VOL_WINDOWS = [20, 50]
# Number of lagged return columns created (lags 1..LAGS).
LAGS = 3

def _download_prices(symbol, start_date, end_date):
    """Download auto-adjusted bars for one symbol and return them with ``Date`` as a column."""
    frame = yf.download(symbol, start=start_date, end=end_date, auto_adjust=True, progress=False)

    # When the download comes back with MultiIndex columns, drop level 1 so the
    # remaining labels are plain names such as 'Close'.
    # NOTE(review): presumably level 1 is the ticker - confirm against the
    # installed yfinance version.
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = frame.columns.droplevel(1)

    return frame.reset_index()


def load_data(symbols, market_symbol, start_date, end_date):
    """Download OHLCV data for several tickers and attach the market close.

    Args:
        symbols: Iterable of ticker symbols. A single string is accepted and
            treated as one ticker instead of being iterated per character.
        market_symbol: Symbol of the market index; its close is merged into
            every ticker's rows as ``Market_Close``.
        start_date: Start of the download range, passed through to yfinance.
        end_date: End of the download range, passed through to yfinance.

    Returns:
        A long-format DataFrame with columns ``Date``, ``Open``, ``High``,
        ``Low``, ``Close``, ``Volume``, ``Ticker`` and ``Market_Close``, sorted
        by ``Ticker`` then ``Date`` with a fresh RangeIndex.

    Raises:
        ValueError: If ``symbols`` is empty.
    """
    # A bare string would otherwise be looped over one character at a time.
    if isinstance(symbols, str):
        symbols = [symbols]
    else:
        symbols = list(symbols)

    # Fail before any network access rather than inside pd.concat afterwards.
    if not symbols:
        raise ValueError("symbols must contain at least one ticker")

    print(f"Downloading data for {', '.join(map(str, symbols))} and market index "
          f"{market_symbol} (auto_adjust=True)...")

    df_market = _download_prices(market_symbol, start_date, end_date)
    df_market = df_market[['Date', 'Close']].rename(columns={'Close': 'Market_Close'})

    dfs = []
    for symbol in symbols:
        df = _download_prices(symbol, start_date, end_date)
        df = df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
        df['Ticker'] = symbol

        # Left join keeps every ticker row; dates missing from the market
        # series get NaN in Market_Close.
        df = pd.merge(df, df_market, on='Date', how='left')
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df = df.sort_values(['Ticker', 'Date']).reset_index(drop=True)
    print(f"Loaded raw panel data: {len(df)} rows | {len(symbols)} tickers | "
          f"from {df['Date'].min().date()} to {df['Date'].max().date()}")
    return df

def clean_data(df):
    """Put every ticker on a gap-free business-day calendar.

    Each ticker's rows are indexed by ``Date``, reindexed onto every business
    day (``freq="B"``) between that ticker's first and last date, and the
    newly created rows are forward-filled from the previous row. Rows whose
    date is not on that business-day calendar are not kept by the reindex.

    Args:
        df: Long-format frame with at least ``Date`` and ``Ticker`` columns.

    Returns:
        A new DataFrame sorted by ``Ticker`` then ``Date`` with a fresh
        RangeIndex. An empty input is returned as an empty copy.
    """
    # Nothing to align; also avoids pd.concat failing on an empty list.
    if df.empty:
        return df.copy()

    cleaned_dfs = []
    for ticker, group in df.groupby('Ticker'):
        group = group.set_index('Date').sort_index()
        all_business_days = pd.date_range(
            start=group.index.min(), end=group.index.max(), freq="B"
        )

        # Reindexing inserts all-NaN rows for missing days; ffill carries the
        # last known values forward into them.
        group = group.reindex(all_business_days).ffill()

        # Name the index explicitly instead of relying on the default
        # 'index' column name produced by reset_index.
        group = group.rename_axis('Date').reset_index()
        group['Ticker'] = ticker
        cleaned_dfs.append(group)

    df_cleaned = pd.concat(cleaned_dfs, ignore_index=True)
    df_cleaned = df_cleaned.sort_values(['Ticker', 'Date']).reset_index(drop=True)

    print(f"Data cleaned: {len(df_cleaned)} rows | "
          f"from {df_cleaned['Date'].min().date()} to {df_cleaned['Date'].max().date()}")
    # Return the cleaned frame; the original returned the untouched input.
    return df_cleaned

def validate_data(df, stage="pre_feature"):
    """Print diagnostic warnings about ``df`` and hand it back unchanged.

    Checks performed (each only prints, nothing is modified or raised):
      * total NaN and +/-Inf counts across the numeric columns;
      * when ``Date`` and ``Market_Return`` exist, whether ``Market_Return``
        differs between rows sharing a date (max per-date std above 1e-8);
      * when ``Daily_Return`` exists, whether any ticker with more than one
        row has exactly zero standard deviation in it.

    Args:
        df: Frame to inspect.
        stage: Label interpolated into the printed messages.

    Returns:
        The same ``df`` object that was passed in.
    """
    print(f"Validating data at stage: {stage}...")

    numeric = df[df.select_dtypes(include=[np.number]).columns]
    problem_counts = (
        ("NaN", numeric.isna().sum().sum()),
        ("Inf", np.isinf(numeric).sum().sum()),
    )
    for label, count in problem_counts:
        if count > 0:
            print(f"WARNING: Tìm thấy {count} {label} values tại stage {stage}")

    # The market return should be identical for every ticker on a given date.
    if all(col in df.columns for col in ('Date', 'Market_Return')):
        worst_spread = df.groupby('Date')['Market_Return'].std(ddof=0).max()
        if pd.notna(worst_spread) and worst_spread > 1e-8:
            print(f"WARNING: Cross-ticker contamination detected! "
                  f"Max std of Market_Return per date: {worst_spread:.2e}")

    # Quick sanity check on the return variance (it should be above zero).
    if 'Daily_Return' in df.columns:
        for ticker, returns in df.groupby('Ticker')['Daily_Return']:
            if len(returns) <= 1:
                continue
            if returns.std(ddof=0) == 0:
                print(f"WARNING: Ticker {ticker} has zero variance in Daily_Return!")

    print(f"Validation passed at {stage} (no critical issues).")
    return df

def _add_ticker_features(group):
    """Return a copy of one ticker's rows with every indicator column appended."""
    g = group.copy()

    # === 1. BASIC RETURNS ===
    g['Daily_Return'] = g['Close'].pct_change()
    g['Log_Return'] = np.log(1 + g['Daily_Return'])
    g['Market_Return'] = g['Market_Close'].pct_change()
    g['Market_Log_Return'] = np.log(1 + g['Market_Return'])

    # === 2. LAGGED FEATURES ===
    # Only returns are lagged, never raw Close: raw price levels are
    # non-stationary and would let a linear model fit the long-run trend
    # instead of a repeatable pattern.
    for i in range(1, LAGS + 1):
        g[f'Return_Lag_{i}'] = g['Daily_Return'].shift(i)
        g[f'Market_Return_Lag_{i}'] = g['Market_Return'].shift(i)

    # === 3. MULTI-TIMEFRAME TECHNICAL INDICATORS ===
    # SMA / EMA are expressed as a ratio and a % distance from price rather
    # than as absolute levels.
    for w in SMA_WINDOWS:
        sma = g['Close'].rolling(window=w).mean()
        g[f'SMA_{w}_Ratio'] = g['Close'] / sma
        g[f'SMA_{w}_Distance_pct'] = (g['Close'] - sma) / sma * 100

    for w in EMA_WINDOWS:
        ema = g['Close'].ewm(span=w, adjust=False).mean()
        g[f'EMA_{w}_Ratio'] = g['Close'] / ema
        g[f'EMA_{w}_Distance_pct'] = (g['Close'] - ema) / ema * 100

    # RSI built from plain rolling means of gains and losses. The price
    # change and its up/down split do not depend on the window, so they are
    # computed once. When both averages are zero the ratio is 0/0 and the
    # RSI is NaN.
    delta = g['Close'].diff()
    ups = delta.where(delta > 0, 0)
    downs = delta.where(delta < 0, 0)
    for w in RSI_WINDOWS:
        gain = ups.rolling(w).mean()
        loss = -downs.rolling(w).mean()
        rs = gain / loss
        g[f'RSI_{w}'] = 100 - (100 / (1 + rs))

    # MACD: the raw line/signal/histogram are kept, plus the histogram
    # expressed as a percentage of price.
    ema_fast = g['Close'].ewm(span=12, adjust=False).mean()
    ema_slow = g['Close'].ewm(span=26, adjust=False).mean()
    g['MACD_Line'] = ema_fast - ema_slow
    g['MACD_Signal'] = g['MACD_Line'].ewm(span=9, adjust=False).mean()
    g['MACD_Hist'] = g['MACD_Line'] - g['MACD_Signal']
    g['MACD_Hist_Normalized'] = g['MACD_Hist'] / g['Close'] * 100

    # Bollinger Bands: band width as % of the middle band and the position
    # of Close inside the band, instead of absolute upper/lower levels.
    for w in BB_WINDOWS:
        middle = g['Close'].rolling(w).mean()
        std_dev = g['Close'].rolling(w).std()
        upper = middle + 2 * std_dev
        lower = middle - 2 * std_dev
        bb_range = upper - lower

        g[f'BB_Width_{w}_pct'] = (bb_range / middle * 100)
        # A non-positive (or NaN) range is replaced by 1 to avoid dividing
        # by zero.
        g[f'BB_Position_{w}'] = (g['Close'] - lower) / bb_range.where(bb_range > 0, 1)

    # === 4. VOLATILITY & MARKET REGIME FEATURES ===
    # True Range: the largest of high-low, |high - prev close| and
    # |low - prev close|.
    prev_close = g['Close'].shift(1)
    tr = pd.concat(
        [g['High'] - g['Low'], (g['High'] - prev_close).abs(), (g['Low'] - prev_close).abs()],
        axis=1,
    ).max(axis=1)
    for w in ATR_WINDOWS:
        atr = tr.rolling(w).mean()
        g[f'ATR_{w}'] = atr
        g[f'ATR_Normalized_{w}'] = atr / g['Close']   # ATR relative to price

    # Rolling volatility of the market and of the ticker itself.
    for w in VOL_WINDOWS:
        g[f'Market_Rolling_Vol_{w}'] = g['Market_Return'].rolling(w).std()
        g[f'AAPL_Rolling_Vol_{w}'] = g['Daily_Return'].rolling(w).std()

    # Volume relative to its 20-row rolling mean.
    g['Rel_Volume_20'] = g['Volume'] / g['Volume'].rolling(20).mean()
    return g


def generate_technical_features(df, is_inference=False, target_horizon=1):
    """Build per-ticker technical features and, for training, the target.

    Features are computed independently for every ``Ticker`` group:
      1. Price adjustments for corporate actions are expected to be done
         upstream (``load_data`` downloads with ``auto_adjust=True``).
      2. Derived indicators are expressed in relative form (ratios, %
         distances, normalised values, band position).
      3. Several look-back windows are generated per indicator (see the
         module-level ``*_WINDOWS`` constants) so a downstream model can pick
         the informative ones.
      4. Volatility / regime features: ATR, ATR relative to price, rolling
         standard deviation of ticker and market returns, relative volume.
      5. In training mode ``validate_data`` is run on the final frame.

    Note that only ``Date``, ``Ticker``, ``Market_Close`` (and
    ``Target_Return``) are removed from ``X``; the raw ``Open``/``High``/
    ``Low``/``Close``/``Volume`` columns and the absolute ``MACD_Line``,
    ``MACD_Signal``, ``MACD_Hist`` and ``ATR_{w}`` columns remain in it.

    Args:
        df: Long-format frame with ``Ticker``, ``Open``, ``High``, ``Low``,
            ``Close``, ``Volume`` and ``Market_Close`` columns. When a
            ``Date`` column is present each ticker is sorted by it first.
        is_inference: When False (default) a ``Target_Return`` column is
            created and rows with any NaN are dropped. When True no target
            is built, so the most recent row of each ticker is not dropped
            for lacking a future price.
        target_horizon: Number of rows ahead used for the target return,
            ``Close[t + h] / Close[t] - 1``. Only used in training mode.

    Returns:
        ``(df_backtest, X, y)`` in training mode, where ``df_backtest`` is a
        copy of the full feature frame, ``X`` the feature matrix and ``y``
        the ``Target_Return`` series; ``(data, X, None)`` in inference mode.

    Raises:
        ValueError: If ``target_horizon`` is below 1 in training mode, or if
            ``df`` contains no ticker groups.
    """
    # A horizon of 0 gives an all-zero target and a negative one builds the
    # target from past prices; both silently produce a meaningless label.
    if not is_inference and target_horizon < 1:
        raise ValueError(f"target_horizon must be >= 1, got {target_horizon}")

    data = df.copy()

    data_list = []
    for _, group in data.groupby('Ticker'):
        # Rolling, EWM, lag and shift features all assume chronological
        # order. A stable sort leaves already-sorted input unchanged.
        if 'Date' in group.columns:
            group = group.sort_values('Date', kind='mergesort')
        data_list.append(_add_ticker_features(group))

    if not data_list:
        raise ValueError("df contains no ticker groups to build features from")

    data = pd.concat(data_list, ignore_index=True)

    if not is_inference:
        data['Target_Return'] = data.groupby('Ticker')['Close'].shift(-target_horizon) / data['Close'] - 1
        # Drop rows left incomplete by the rolling windows, the lags and the
        # forward-looking target.
        data = data.dropna().reset_index(drop=True)
        # === 5. DATA VALIDATION BEFORE RETURNING ===
        data = validate_data(data, f"post_feature_engineering_h{target_horizon}")

        df_backtest = data.copy()
        drop_cols = ['Date', 'Ticker', 'Market_Close', 'Target_Return']
        X = data.drop(columns=drop_cols, errors='ignore')
        y = data['Target_Return'].copy()

        print(f"Generated data for Horizon {target_horizon} days:\n"
            f"   • Total rows: {len(data)} | Tickers: {data['Ticker'].nunique()}\n"
            f"   • Features: {X.shape[1]} | X shape: {X.shape} | y shape: {y.shape}")

        return df_backtest, X, y
    else:
        # No target is built for prediction, so the last row of each ticker
        # keeps its features and is not dropped for a missing target.
        data = data.dropna().reset_index(drop=True)
        X = data.drop(columns=['Date', 'Ticker', 'Market_Close'], errors='ignore')
        return data, X, None