ashesh8500 committed on
Commit
d5789cd
·
verified ·
1 Parent(s): ab570e3

Upload data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data.py +155 -0
data.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+ from torch.utils.data import Dataset, DataLoader
5
+ from typing import Optional, Tuple, List, Dict
6
+
7
def compute_technical_indicators(df):
    """Return a copy of ``df`` with derived return/volatility/trend columns added.

    The input must provide ``close``, ``high``, ``low`` and ``volume`` columns.
    All rows are treated as one ordered series; nothing here separates
    different instruments.

    Added columns: ``ret``, ``log_ret``, ``volatility_5``, ``volatility_20``,
    ``ma_5``, ``ma_20``, ``rsi``, ``macd``, ``macd_signal``, ``vol_ma_5``,
    ``volume_ratio`` and ``atr``.

    Args:
        df: pandas DataFrame of price/volume rows. It is not modified.

    Returns:
        A new DataFrame in which every NaN (rolling warm-up rows) and every
        +/-inf in a numeric column (e.g. a return computed from a zero price)
        has been replaced by 0.
    """
    df = df.copy()
    close = df['close']
    non_finite = [np.inf, -np.inf]

    # A zero previous close yields +/-inf here; fillna() alone would leave
    # those infinities in place and they would later poison normalisation.
    with np.errstate(divide='ignore', invalid='ignore'):
        df['ret'] = close.pct_change().replace(non_finite, np.nan)
        df['log_ret'] = np.log(close / close.shift(1)).replace(non_finite, np.nan)

    # Rolling volatility of simple returns and rolling mean price.
    df['volatility_5'] = df['ret'].rolling(5).std()
    df['volatility_20'] = df['ret'].rolling(20).std()
    df['ma_5'] = close.rolling(5).mean()
    df['ma_20'] = close.rolling(20).mean()

    # RSI from simple 14-row means of gains and losses. When there are no
    # losses in the window rs is +inf and the RSI evaluates to exactly 100.
    delta = close.diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(14).mean()
    avg_loss = loss.rolling(14).mean()
    rs = avg_gain / avg_loss
    df['rsi'] = 100 - (100 / (1 + rs))

    # MACD: EMA(12) - EMA(26), with a 9-span EMA of that difference as signal.
    ema_12 = close.ewm(span=12).mean()
    ema_26 = close.ewm(span=26).mean()
    df['macd'] = ema_12 - ema_26
    df['macd_signal'] = df['macd'].ewm(span=9).mean()

    # Volume relative to its own 5-row mean.
    df['vol_ma_5'] = df['volume'].rolling(5).mean()
    df['volume_ratio'] = df['volume'] / df['vol_ma_5']

    # ATR: 14-row mean of the true range (largest of the three candidates).
    high_low = df['high'] - df['low']
    high_close = np.abs(df['high'] - close.shift())
    low_close = np.abs(df['low'] - close.shift())
    tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['atr'] = tr.rolling(14).mean()

    # Only numeric columns can hold infinities; leave any other columns alone.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].replace(non_finite, np.nan)
    df = df.fillna(0)
    return df
35
+
36
def normalize_features(arr, eps=1e-6):
    """Standardise ``arr`` along axis 0 (zero mean, unit variance per column).

    Args:
        arr: array-like; statistics are taken over axis 0.
        eps: small constant added to the standard deviation so constant
            columns map to 0 instead of dividing by zero.

    Returns:
        ``(arr - mean) / (std + eps)`` with the same shape as ``arr``. An
        empty input is returned as an empty copy, because the mean/std of
        zero rows would only produce NaN and runtime warnings.
    """
    arr = np.asarray(arr)
    if arr.size == 0:
        return arr.copy()
    mean = arr.mean(axis=0, keepdims=True)
    std = arr.std(axis=0, keepdims=True) + eps
    return (arr - mean) / std
40
+
41
class FinancialTrajectoryDataset(Dataset):
    """Sliding-window dataset over a table of per-row features.

    Each sample is ``context_window`` consecutive feature rows (the context)
    followed by the next ``target_window`` rows (the target), plus labels
    derived from the ``ret`` column over the target span.

    Args:
        data: pandas DataFrame; must contain a ``ret`` column. Its index is
            reset, so rows are addressed by position.
        n_assets: length of the ``weights`` and ``signals`` vectors returned
            per sample.
        context_window: number of rows in the context slice.
        target_window: number of rows in the target slice.
        feature_cols: columns to use as features; names missing from ``data``
            are silently dropped. Defaults to the OHLCV + indicator set below.
        stride: step between consecutive window start positions.
        normalize: if true, features are standardised with statistics
            computed over all rows of ``data``.
    """

    def __init__(self, data, n_assets=1, context_window=60, target_window=5,
                 feature_cols=None, stride=1, normalize=True):
        self.data = data.reset_index(drop=True)
        self.n_assets = n_assets
        self.context_window = context_window
        self.target_window = target_window
        self.stride = stride
        self.normalize = normalize
        if feature_cols is None:
            feature_cols = ['open', 'high', 'low', 'close', 'volume', 'ret', 'log_ret',
                            'volatility_5', 'volatility_20', 'rsi', 'macd', 'macd_signal',
                            'volume_ratio', 'atr']
        # Keep only the requested columns that actually exist in the frame.
        self.feature_cols = [c for c in feature_cols if c in self.data.columns]
        self.n_features = len(self.feature_cols)
        self.features = self.data[self.feature_cols].values.astype(np.float32)
        if normalize:
            self.features = normalize_features(self.features)
        self.returns = self.data['ret'].values.astype(np.float32)
        self.total_len = len(self.data)
        # The last usable start is the one whose target slice ends exactly at
        # total_len, so the upper bound is inclusive (hence the + 1). When the
        # frame is shorter than one full window the range is simply empty.
        last_start = self.total_len - context_window - target_window
        self.indices = list(range(0, last_start + 1, stride))

    def __len__(self):
        """Return the number of available windows."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Build the sample for window number ``idx``.

        Returns:
            dict with
            ``context``  float tensor of ``context_window`` feature rows,
            ``target``   float tensor of the following ``target_window`` rows,
            ``weights``  float32 vector of length ``n_assets`` (``[1.0]`` for
                         a single asset, otherwise a fresh Dirichlet draw from
                         the global NumPy RNG on every call),
            ``signals``  int64 vector of length ``n_assets`` holding one class
                         id repeated: 0 if the mean ``ret`` over the target
                         span is > 0.01, 1 if it is < -0.01, else 2,
            ``hedge``    long scalar, always 0.
        """
        start = self.indices[idx]
        ctx_end = start + self.context_window
        tgt_end = ctx_end + self.target_window
        context = self.features[start:ctx_end]
        target = self.features[ctx_end:tgt_end]
        future_ret = self.returns[ctx_end:tgt_end]
        avg_ret = future_ret.mean() if len(future_ret) > 0 else 0.0
        if self.n_assets == 1:
            weights = np.array([1.0], dtype=np.float32)
        else:
            weights = np.random.dirichlet(np.ones(self.n_assets)).astype(np.float32)
        if avg_ret > 0.01:
            signal = 0
        elif avg_ret < -0.01:
            signal = 1
        else:
            signal = 2
        signals = np.array([signal] * self.n_assets, dtype=np.int64)
        hedge = 0
        return {
            "context": torch.from_numpy(context),
            "target": torch.from_numpy(target),
            "weights": torch.from_numpy(weights),
            "signals": torch.from_numpy(signals),
            "hedge": torch.tensor(hedge, dtype=torch.long),
        }
93
+
94
def build_dataloaders(data, n_assets=1, context_window=60, target_window=5,
                      batch_size=64, train_ratio=0.8, val_ratio=0.1, num_workers=0):
    """Split ``data`` by row position and wrap each part in a DataLoader.

    Rows ``[0, train_end)`` become the training split, ``[train_end, val_end)``
    the validation split and the remainder the test split, where
    ``train_end = int(n * train_ratio)`` and
    ``val_end = int(n * (train_ratio + val_ratio))``. Each split gets its own
    ``FinancialTrajectoryDataset`` built with that class's default feature
    columns, stride and normalisation.

    Args:
        data: pandas DataFrame accepted by ``FinancialTrajectoryDataset``.
        n_assets, context_window, target_window: forwarded to the dataset.
        batch_size: batch size for all three loaders.
        train_ratio, val_ratio: fractions of rows for train and validation.
        num_workers: forwarded to every ``DataLoader``.

    Returns:
        dict with ``"train"``, ``"val"`` and ``"test"`` DataLoaders. Only the
        training loader shuffles and drops the last incomplete batch; the
        evaluation loaders keep every sample, so a split with fewer windows
        than ``batch_size`` still yields one (short) batch.
    """
    n = len(data)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))
    splits = {
        "train": data.iloc[:train_end],
        "val": data.iloc[train_end:val_end],
        "test": data.iloc[val_end:],
    }

    loaders = {}
    for split_name, frame in splits.items():
        dataset = FinancialTrajectoryDataset(frame, n_assets, context_window, target_window)
        is_train = split_name == "train"
        loaders[split_name] = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=is_train,
            num_workers=num_workers,
            # Dropping the ragged final batch is only wanted while training;
            # during evaluation it would silently discard samples.
            drop_last=is_train,
        )
    return loaders
110
+
111
def load_hf_stock_data(dataset_name="paperswithbacktest/Stocks-Daily-Price", symbols=None, max_rows=100_000):
    """Stream daily price rows from a Hugging Face dataset and add indicators.

    Args:
        dataset_name: dataset identifier passed to ``datasets.load_dataset``.
        symbols: optional collection of ticker symbols to keep (a single
            string is treated as one symbol). ``None`` keeps every row.
        max_rows: number of rows read from the stream. Rows rejected by the
            ``symbols`` filter count toward this limit, so fewer rows may be
            returned.

    Returns:
        DataFrame with ``symbol``, ``date``, OHLCV, ``adj_close`` and the
        columns added by ``compute_technical_indicators``. Indicators are
        computed separately per symbol so rolling windows never span two
        instruments; rows are grouped by symbol in order of first appearance
        and keep their stream order within each symbol (presumably
        chronological, which is not checked here).

        On any failure (missing ``datasets`` package, network error, no
        matching rows, ...) the error is printed and synthetic data from
        ``generate_synthetic_data`` is returned instead.
    """
    try:
        from datasets import load_dataset

        if symbols is None:
            wanted = None
        elif isinstance(symbols, str):
            # A bare string would otherwise be matched by substring / per character.
            wanted = {symbols}
        else:
            # Set membership keeps the per-row filter O(1).
            wanted = set(symbols)

        ds = load_dataset(dataset_name, split="train", streaming=True)
        rows = []
        for i, row in enumerate(ds):
            if i >= max_rows:
                break
            if wanted is not None and row["symbol"] not in wanted:
                continue
            rows.append({
                "symbol": row["symbol"],
                "date": row["date"],
                "open": row["open"],
                "high": row["high"],
                "low": row["low"],
                "close": row["close"],
                "volume": row["volume"],
                "adj_close": row.get("adj_close", row["close"]),
            })
        if not rows:
            # Fail with a readable message instead of a KeyError from the
            # indicator code; handled by the fallback below like any other error.
            raise ValueError(f"no rows collected from {dataset_name!r} for symbols={symbols!r}")

        df = pd.DataFrame(rows)
        per_symbol = [
            compute_technical_indicators(group)
            for _, group in df.groupby("symbol", sort=False, dropna=False)
        ]
        return pd.concat(per_symbol, ignore_index=True)
    except Exception as e:
        # Deliberate best-effort boundary: any loading problem falls back to
        # synthetic data so callers always receive a usable frame.
        print(f"Error loading HF dataset: {e}")
        return generate_synthetic_data(n_timesteps=max_rows, n_assets=1 if symbols is None else len(symbols))
137
+
138
def generate_synthetic_data(n_timesteps=5000, n_assets=1, seed=42):
    """Generate a random-walk OHLCV series and add technical indicators.

    The close price starts at 100 and compounds per-step returns drawn from
    ``Normal(0.0002, 0.02)``. High/low are the close moved up/down by
    ``|Normal(0, 0.005)|``, the open is the close perturbed by
    ``Normal(0, 0.003)`` and volume is the integer part of
    ``LogNormal(15, 0.5)``.

    Args:
        n_timesteps: number of rows to generate.
        n_assets: accepted for interface compatibility; currently unused, a
            single series is always produced.
        seed: seed for a private random generator, so calling this function
            does not reseed or advance NumPy's global RNG.

    Returns:
        DataFrame with ``open``, ``high``, ``low``, ``close``, ``volume`` and
        ``adj_close`` plus the columns added by
        ``compute_technical_indicators``.
    """
    # RandomState(seed) yields the same stream as np.random.seed(seed) followed
    # by the module-level functions, without touching global state.
    rng = np.random.RandomState(seed)
    price = 100.0
    records = []
    for _ in range(n_timesteps):
        ret = rng.normal(0.0002, 0.02)
        price *= (1 + ret)
        high = price * (1 + abs(rng.normal(0, 0.005)))
        low = price * (1 - abs(rng.normal(0, 0.005)))
        open_p = price * (1 + rng.normal(0, 0.003))
        vol = int(rng.lognormal(15, 0.5))
        records.append({
            "open": open_p,
            # The open is drawn independently of high/low, so widen the range
            # when needed to keep low <= open <= high on every bar.
            "high": max(high, open_p),
            "low": min(low, open_p),
            "close": price,
            "volume": vol,
            "adj_close": price,
        })
    # Explicit columns keep the frame well-formed even when n_timesteps == 0.
    df = pd.DataFrame(
        records,
        columns=["open", "high", "low", "close", "volume", "adj_close"],
    )
    df = compute_technical_indicators(df)
    return df