"""
Data processing pipeline for LOBPatternNet v2.
Fixed: proper normalization, balanced labeling, oversampling.
"""

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import os


def load_lob_data():
    """Load TRADES-LOB dataset from HF Hub."""
    ds = load_dataset("LeonardoBerti/TRADES-LOB", split="train")
    df = ds.to_pandas()
    print(f"Loaded dataset: {len(df)} rows")
    return df


def extract_and_normalize_features(df):
    """
    Extract and normalize LOB features properly.
    
    Approach: 
    1. Separate price and size features
    2. Prices: normalize relative to mid-price (basis points)
    3. Sizes: log-transform then z-score
    4. Replace invalid values with 0
    5. Final z-score normalization per feature
    
    Returns: (N, 40) normalized features
    """
    N = len(df)
    
    # Collect raw features
    ask_prices = np.zeros((N, 10), dtype=np.float64)
    ask_sizes = np.zeros((N, 10), dtype=np.float64)
    bid_prices = np.zeros((N, 10), dtype=np.float64)
    bid_sizes = np.zeros((N, 10), dtype=np.float64)
    
    for i in range(10):
        ask_prices[:, i] = df[f'ask_price_{i+1}'].values.astype(np.float64)
        ask_sizes[:, i] = df[f'ask_size_{i+1}'].values.astype(np.float64)
        bid_prices[:, i] = df[f'bid_price_{i+1}'].values.astype(np.float64)
        bid_sizes[:, i] = df[f'bid_size_{i+1}'].values.astype(np.float64)
    
    # Mark sentinel/invalid values
    SENTINEL = 1e9
    ask_p_valid = np.abs(ask_prices) < SENTINEL
    ask_s_valid = np.abs(ask_sizes) < SENTINEL
    bid_p_valid = np.abs(bid_prices) < SENTINEL
    bid_s_valid = np.abs(bid_sizes) < SENTINEL
    
    n_invalid = (~ask_p_valid).sum() + (~bid_p_valid).sum() + (~ask_s_valid).sum() + (~bid_s_valid).sum()
    print(f"Found {n_invalid} invalid/sentinel values")
    
    # Compute mid-price from valid best bid/ask
    best_ask = ask_prices[:, 0].copy()
    best_bid = bid_prices[:, 0].copy()
    both_valid = ask_p_valid[:, 0] & bid_p_valid[:, 0]
    mid_price = np.where(both_valid, (best_ask + best_bid) / 2.0, 0.0)
    
    # Forward-fill mid_price where it's 0 (leading zeros before the first
    # valid quote stay 0 and are masked out below via mid_price > 0)
    for i in range(1, N):
        if mid_price[i] == 0 and mid_price[i-1] != 0:
            mid_price[i] = mid_price[i-1]
    
    # Normalize prices: (price - mid) / mid * 10000 = basis points
    norm_ask_p = np.zeros_like(ask_prices)
    norm_bid_p = np.zeros_like(bid_prices)
    
    for i in range(10):
        valid_a = ask_p_valid[:, i] & (mid_price > 0)
        valid_b = bid_p_valid[:, i] & (mid_price > 0)
        norm_ask_p[valid_a, i] = (ask_prices[valid_a, i] - mid_price[valid_a]) / mid_price[valid_a] * 10000
        norm_bid_p[valid_b, i] = (bid_prices[valid_b, i] - mid_price[valid_b]) / mid_price[valid_b] * 10000
    
    # Normalize sizes: log1p here; the z-score is applied later with all features
    norm_ask_s = np.zeros_like(ask_sizes)
    norm_bid_s = np.zeros_like(bid_sizes)
    
    for i in range(10):
        valid_a = ask_s_valid[:, i] & (ask_sizes[:, i] > 0)
        valid_b = bid_s_valid[:, i] & (bid_sizes[:, i] > 0)
        norm_ask_s[valid_a, i] = np.log1p(ask_sizes[valid_a, i])
        norm_bid_s[valid_b, i] = np.log1p(bid_sizes[valid_b, i])
    
    # Assemble into (N, 40) array: [ask_p_1, ask_s_1, bid_p_1, bid_s_1, ...]
    features = np.zeros((N, 40), dtype=np.float32)
    for i in range(10):
        features[:, i*4] = norm_ask_p[:, i]
        features[:, i*4+1] = norm_ask_s[:, i]
        features[:, i*4+2] = norm_bid_p[:, i]
        features[:, i*4+3] = norm_bid_s[:, i]
    
    # Final z-score normalization per feature (critical for model convergence)
    means = features.mean(axis=0)
    stds = features.std(axis=0)
    stds[stds < 1e-8] = 1.0  # avoid division by 0
    features = (features - means) / stds
    
    # Replace any remaining NaN/inf
    features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
    
    print(f"Feature shape: {features.shape}")
    print(f"Feature range: [{features.min():.4f}, {features.max():.4f}]")
    print(f"Feature mean: {features.mean():.6f}, std: {features.std():.4f}")
    
    return features, means, stds
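

# Illustrative sketch (assumed toy numbers, not taken from the dataset): how the
# basis-point price normalization above behaves for a single snapshot. Not part
# of the pipeline; call manually if desired.
def _bps_example():
    """Hedged example of the (price - mid) / mid * 10000 transform."""
    best_bid, best_ask = 100.00, 100.02
    mid = (best_bid + best_ask) / 2.0            # 100.01
    ask_bps = (best_ask - mid) / mid * 10000     # ~ +1.0 bps above mid
    bid_bps = (best_bid - mid) / mid * 10000     # ~ -1.0 bps below mid
    print(f"ask: {ask_bps:+.2f} bps, bid: {bid_bps:+.2f} bps")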


def rolling_sum(arr, window):
    """Fully vectorized trailing rolling sum using the cumsum trick.

    result[i] = sum(arr[i - window + 1 : i + 1]) for i >= window - 1;
    the first window - 1 entries are left as 0.
    """
    cum = np.cumsum(arr)
    result = np.zeros_like(cum)
    result[window - 1] = cum[window - 1]  # first complete window
    result[window:] = cum[window:] - cum[:-window]
    return result
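

# Quick hedged sanity check: rolling_sum should match an explicit loop on toy
# data at every index with a complete window. Not part of the pipeline.
def _check_rolling_sum():
    arr = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    window = 2
    fast = rolling_sum(arr, window)
    naive = np.zeros_like(arr)
    for i in range(window - 1, len(arr)):
        naive[i] = arr[i - window + 1:i + 1].sum()
    assert np.allclose(fast[window - 1:], naive[window - 1:])
    print("rolling_sum matches naive loop:", fast)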


def construct_labels_vectorized(df, window=50, ofi_threshold=0.15, percentile=85):
    """
    Fully vectorized label construction for institutional trading detection.
    Uses rolling windows and relaxed thresholds for better class balance.
    """
    N = len(df)
    buy_sell = df['BUY_SELL_FLAG'].values.astype(np.float32)  # 1=buy, 0=sell
    sizes = df['SIZE'].values.astype(np.float32)
    types = df['TYPE'].values
    
    print(f"Constructing labels from {N} events, window={window}...")
    
    # Signed volume
    signed_vol = sizes * (2 * buy_sell - 1)
    
    # Rolling sums (vectorized)
    roll_signed = rolling_sum(signed_vol, window)
    roll_total = rolling_sum(sizes, window)
    norm_ofi = roll_signed / (roll_total + 1e-8)
    
    # Large orders
    is_large = (sizes > np.percentile(sizes, percentile)).astype(np.float32)
    roll_large_buy = rolling_sum(is_large * buy_sell, window)
    roll_large_sell = rolling_sum(is_large * (1 - buy_sell), window)
    
    # Cancellation rate
    is_cancel = (types == 'ORDER_CANCELLED').astype(np.float32)
    roll_cancel = rolling_sum(is_cancel, window) / window
    
    # Combined scores
    # Scale by the rough expected count of large orders per window
    large_diff = (roll_large_buy - roll_large_sell) / (window * 0.1 + 1e-8)
    buy_score = norm_ofi + 0.3 * large_diff + 0.2 * roll_cancel
    sell_score = -norm_ofi - 0.3 * large_diff + 0.2 * roll_cancel
    
    # Use percentile thresholds for ~15-20% per class
    valid = np.arange(window, N)
    buy_threshold = np.percentile(buy_score[valid], 80)
    sell_threshold = np.percentile(sell_score[valid], 80)
    
    print(f"Buy threshold (p80): {buy_threshold:.4f}, Sell threshold (p80): {sell_threshold:.4f}")
    
    labels = np.ones(N, dtype=np.int64)
    labels[(buy_score > buy_threshold) & (norm_ofi > ofi_threshold)] = 0
    labels[(sell_score > sell_threshold) & (norm_ofi < -ofi_threshold)] = 2
    
    unique, counts = np.unique(labels, return_counts=True)
    label_names = {0: 'institutional buy', 1: 'neutral', 2: 'institutional sell'}
    print("Label distribution:")
    for u, c in zip(unique, counts):
        print(f"  {u} ({label_names.get(u, '?')}): {c} ({c/N*100:.1f}%)")
    
    return labels
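

# Hedged sketch of the normalized OFI used above, on assumed toy trades:
# signed volume counts buys as positive and sells as negative, so equal buy
# and sell volume yields an OFI of 0 (balanced flow). Illustrative only.
def _ofi_example():
    sizes = np.array([10.0, 5.0, 20.0, 5.0], dtype=np.float32)
    buy_sell = np.array([1, 1, 0, 1], dtype=np.float32)  # 1=buy, 0=sell
    signed = sizes * (2 * buy_sell - 1)                  # [10, 5, -20, 5]
    ofi = signed.sum() / (sizes.sum() + 1e-8)            # 0 / 40 = 0.0
    print(f"normalized OFI: {ofi:.2f}")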


def create_sequences(features, labels, seq_len=100, stride=20):
    """Create sliding-window sequences; each window is labeled by its last event."""
    N = len(features)
    F = features.shape[1]
    
    # One window per stride step, copied into a pre-allocated array
    starts = np.arange(0, N - seq_len, stride)
    n_sequences = len(starts)
    
    print(f"Creating {n_sequences} sequences of length {seq_len}, stride {stride}...")
    
    X = np.zeros((n_sequences, seq_len, F), dtype=np.float32)
    y = np.zeros(n_sequences, dtype=np.int64)
    
    for idx, start in enumerate(starts):
        X[idx] = features[start:start + seq_len]
        y[idx] = labels[start + seq_len - 1]
    
    print(f"Created {n_sequences} sequences, memory: {X.nbytes / 1e6:.1f} MB")
    return X, y
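

# Hedged shape check for create_sequences on random toy inputs: with N=250,
# seq_len=100, stride=20 there are 8 window starts (0, 20, ..., 140).
def _sequence_shape_example():
    feats = np.random.randn(250, 40).astype(np.float32)
    labs = np.random.randint(0, 3, size=250)
    X, y = create_sequences(feats, labs, seq_len=100, stride=20)
    assert X.shape == (8, 100, 40) and y.shape == (8,)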


class LOBDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


def get_weighted_sampler(y_train):
    """Create WeightedRandomSampler to oversample minority classes."""
    class_counts = np.bincount(y_train)
    class_weights = 1.0 / class_counts
    sample_weights = class_weights[y_train]
    sampler = WeightedRandomSampler(
        weights=torch.from_numpy(sample_weights).double(),
        num_samples=len(y_train),
        replacement=True
    )
    return sampler
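

# Hedged sketch: with a 10/80/10 class split, the weighted sampler should draw
# all three classes at roughly equal rates. Toy labels, illustrative only.
def _sampler_balance_example():
    y = np.array([0] * 10 + [1] * 80 + [2] * 10)
    sampler = get_weighted_sampler(y)
    drawn = np.fromiter(sampler, dtype=np.int64)
    counts = np.bincount(y[drawn], minlength=3)
    print("resampled class counts (expect roughly uniform):", counts)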


def prepare_data(seq_len=100, stride=5, window=50, ofi_threshold=0.2,
                 percentile=90, test_size=0.15, val_size=0.15, 
                 random_state=42, batch_size=64):
    """
    Full data preparation pipeline.
    Returns train, val, test DataLoaders with balanced sampling.
    """
    cache_path = f"/app/data_v2_{seq_len}_{stride}_{window}.npz"
    
    if os.path.exists(cache_path):
        print(f"Loading cached data from {cache_path}")
        data = np.load(cache_path, allow_pickle=True)
        X_train, y_train = data['X_train'], data['y_train']
        X_val, y_val = data['X_val'], data['y_val']
        X_test, y_test = data['X_test'], data['y_test']
    else:
        # Load raw data
        df = load_lob_data()
        
        # Extract and normalize features
        features, means, stds = extract_and_normalize_features(df)
        
        # Construct labels
        labels = construct_labels_vectorized(df, window=window, 
                                             ofi_threshold=ofi_threshold,
                                             percentile=percentile)
        
        # Create sequences
        X, y = create_sequences(features, labels, seq_len=seq_len, stride=stride)
        
        # Split (stratified)
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, test_size=test_size + val_size, random_state=random_state, stratify=y)
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=test_size / (test_size + val_size),
            random_state=random_state, stratify=y_temp)
        
        # Save cache
        np.savez_compressed(cache_path, 
                           X_train=X_train, y_train=y_train,
                           X_val=X_val, y_val=y_val,
                           X_test=X_test, y_test=y_test,
                           means=means, stds=stds)
        print(f"Cached to {cache_path}")
    
    print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
    
    # Print label distributions
    for name, ys in [("Train", y_train), ("Val", y_val), ("Test", y_test)]:
        unique, counts = np.unique(ys, return_counts=True)
        dist = {u: c for u, c in zip(unique, counts)}
        print(f"  {name}: {dist}")
    
    # Create datasets
    train_dataset = LOBDataset(X_train, y_train)
    val_dataset = LOBDataset(X_val, y_val)
    test_dataset = LOBDataset(X_test, y_test)
    
    # Weighted sampler for training (oversamples minority classes)
    train_sampler = get_weighted_sampler(y_train)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    
    return train_loader, val_loader, test_loader, y_train


if __name__ == "__main__":
    train_loader, val_loader, test_loader, y_train = prepare_data()
    
    # Check a batch
    for X_batch, y_batch in train_loader:
        print(f"Batch X: {X_batch.shape}, y: {y_batch.shape}")
        print(f"Batch labels: {y_batch[:20]}")
        print(f"Batch X range: [{X_batch.min():.4f}, {X_batch.max():.4f}]")
        break