huntergemmer committed on
Commit
9387baa
·
verified ·
1 Parent(s): af2e733

Upload train.py

Browse files
Files changed (1) hide show
  1. train.py +182 -0
train.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ End-to-end pipeline: load crypto data -> engineer features -> train 15-min direction classifier.
3
+ Single script to avoid intermediate file issues.
4
+ """
5
+
6
+ import os
7
+ import numpy as np
8
+ import pandas as pd
9
+ from datasets import load_dataset
10
+ from sklearn.ensemble import RandomForestClassifier
11
+ from sklearn.linear_model import LogisticRegression
12
+ from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
13
+ import json
14
+ import pickle
15
+
16
# Seed handed to the subsampling RNG and to both classifiers.
SEED = 42

# Number of consecutive rows that make up one input window.
LOOKBACK = 60

# Prediction horizon, in rows, used when the target column is built.
AHEAD = 15

# Upper bound on rows requested from each dataset's train split.
MAX_ROWS = 300_000

# Per-split caps enforced by random subsampling before training.
MAX_TRAIN_SAMPLES = 30_000
MAX_VAL_SAMPLES = 6_000
MAX_TEST_SAMPLES = 6_000

# Destination directory for the pickled model, scaler arrays and metrics.
OUT_DIR = "/app/outputs"
os.makedirs(OUT_DIR, exist_ok=True)
25
+
26
def _load_symbol(repo_id, prefix, max_rows):
    """Load one symbol's rows and prefix every non-timestamp column.

    Args:
        repo_id: Dataset identifier passed to ``load_dataset``.
        prefix: Short tag (e.g. ``"btc"``) prepended as ``f"{prefix}_"``.
        max_rows: Number of leading rows of the ``train`` split to request.

    Returns:
        A DataFrame whose columns, except ``timestamp``, are renamed to
        ``<prefix>_<original name>``.
    """
    ds = load_dataset(repo_id, split=f"train[:{max_rows}]")
    frame = ds.to_pandas()
    # Only "open" and "volume" are coerced; unparsable values become NaN.
    # NOTE(review): the remaining price columns are presumably numeric
    # already -- confirm against the dataset schema.
    for col in ("open", "volume"):
        frame[col] = pd.to_numeric(frame[col], errors="coerce")
    return frame.rename(
        columns={c: f"{prefix}_{c}" for c in frame.columns if c != "timestamp"}
    )


def load_data(max_rows=MAX_ROWS):
    """Load BTC and ETH rows and join them on ``timestamp``.

    Args:
        max_rows: Number of leading rows requested from each dataset's
            ``train`` split before merging.

    Returns:
        A DataFrame sorted by ``timestamp`` containing only timestamps
        present in both datasets (inner join), with ``btc_``/``eth_``
        prefixed columns and no missing ``btc_close``/``eth_close`` values.
    """
    print("Loading BTC...")
    df_btc = _load_symbol("WinkingFace/CryptoLM-Bitcoin-BTC-USDT", "btc", max_rows)

    print("Loading ETH...")
    df_eth = _load_symbol("WinkingFace/CryptoLM-Ethereum-ETH-USDT", "eth", max_rows)

    df = (
        pd.merge(df_btc, df_eth, on="timestamp", how="inner")
        .sort_values("timestamp")
        .reset_index(drop=True)
    )
    df = df.dropna(subset=["btc_close", "eth_close"]).reset_index(drop=True)
    print(f"Merged rows: {len(df)}")
    return df
45
+
46
def engineer_features(df):
    """Add derived features and the binary direction target.

    The input frame is left untouched; all new columns are added to a new
    frame so callers that keep a reference to ``df`` do not see it change.

    Added columns, in order: ``eth_btc_ratio``, ``btc_ret_1m``,
    ``eth_ret_1m``, ``btc_vol_ma20``, ``eth_vol_ma20``, ``btc_range``,
    ``eth_range`` and ``target``. ``target`` is 1 when ``btc_close``
    ``AHEAD`` rows later is strictly greater than the current ``btc_close``
    and 0 otherwise.

    Args:
        df: Merged frame with ``btc_*`` and ``eth_*`` close/high/low/volume
            columns.

    Returns:
        A new DataFrame without its last ``AHEAD`` rows, whose target would
        compare against a missing future price.
    """
    print("Engineering features...")
    btc_close = df["btc_close"]
    eth_close = df["eth_close"]

    out = df.assign(
        eth_btc_ratio=eth_close / btc_close,
        btc_ret_1m=btc_close.pct_change(),
        eth_ret_1m=eth_close.pct_change(),
        btc_vol_ma20=df["btc_volume"].rolling(20).mean(),
        eth_vol_ma20=df["eth_volume"].rolling(20).mean(),
        btc_range=(df["btc_high"] - df["btc_low"]) / btc_close,
        eth_range=(df["eth_high"] - df["eth_low"]) / eth_close,
        # A comparison against the NaN produced by the shift is False, so
        # the trailing rows get a meaningless 0 and are trimmed below.
        target=(btc_close.shift(-AHEAD) > btc_close).astype(int),
    )

    # An explicit stop index keeps AHEAD == 0 from emptying the frame, which
    # ``iloc[:-0]`` would do.
    keep = max(len(out) - AHEAD, 0)
    return out.iloc[:keep].copy()
58
+
59
def build_windows(df, lookback=LOOKBACK):
    """Slice the feature frame into fixed-length windows with aligned labels.

    Every column except ``timestamp``, ``btc_month``, ``eth_month`` and
    ``target`` is used as a feature. Rows with a missing feature or target
    are dropped first, then each run of ``lookback`` consecutive remaining
    rows becomes one sample.

    The label of a window is the ``target`` value of the window's LAST row.
    ``target`` already encodes the look-ahead (it compares the close
    ``AHEAD`` rows later with the current close), so reading it a further
    ``AHEAD`` rows past the window would shift the label twice and leave a
    gap between the features and the move being predicted.

    Args:
        df: Frame containing the feature columns and a ``target`` column.
        lookback: Number of rows per window; must be at least 1.

    Returns:
        ``(X, y)`` where ``X`` is float32 with shape
        ``(n_samples, lookback, n_features)`` and ``y`` is int64 with shape
        ``(n_samples,)``. Both are empty, but correctly shaped, when fewer
        than ``lookback`` usable rows exist.

    Raises:
        ValueError: If ``lookback`` is smaller than 1.
    """
    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")

    print("Building windows...")
    exclude = {"timestamp", "btc_month", "eth_month", "target"}
    feat_cols = [c for c in df.columns if c not in exclude]
    df = df.dropna(subset=feat_cols + ["target"]).reset_index(drop=True)

    data = df[feat_cols].values.astype(np.float32)
    targets = df["target"].values.astype(np.int64)
    n_rows, n_feats = data.shape

    n_windows = n_rows - lookback + 1
    if n_windows <= 0:
        X = np.empty((0, lookback, n_feats), dtype=np.float32)
        y = np.empty(0, dtype=np.int64)
    else:
        # Defensive NaN screen: a window is kept only if none of its rows
        # contains NaN. Prefix sums make the per-window count O(1).
        bad = np.isnan(data).any(axis=1)
        bad_before = np.concatenate(([0], np.cumsum(bad)))
        starts = np.flatnonzero(bad_before[lookback:] == bad_before[:-lookback])

        # sliding_window_view yields (n_windows, n_feats, lookback); swap the
        # last two axes so each sample equals ``data[i:i + lookback]``.
        windows = np.lib.stride_tricks.sliding_window_view(data, lookback, axis=0)
        X = np.ascontiguousarray(windows.transpose(0, 2, 1)[starts])
        y = targets[starts + lookback - 1]

    pos_rate = float(y.mean()) if len(y) else float("nan")
    print(f"Samples: {X.shape}, pos_rate={pos_rate:.3f}")
    return X, y
82
+
83
def subsample(X, y, max_n, rng):
    """Randomly keep at most ``max_n`` paired samples.

    Args:
        X: Indexable sample container; its length decides whether to sample.
        y: Labels indexed with the same positions as ``X``.
        max_n: Maximum number of samples to keep.
        rng: Object exposing ``choice(n, size, replace=...)``, e.g. a
            ``numpy.random.RandomState``.

    Returns:
        ``(X, y)`` unchanged when ``len(X) <= max_n``; otherwise the pair
        indexed by ``max_n`` distinct positions drawn from ``rng`` (in draw
        order, not sorted).
    """
    total = len(X)
    if total <= max_n:
        return X, y

    picked = rng.choice(total, max_n, replace=False)
    return X[picked], y[picked]
88
+
89
def evaluate_model(name, model, X_test, y_test, results):
    """Score a fitted binary classifier and record its metrics.

    Args:
        name: Key under which the metrics are stored in ``results``.
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels for ``X_test``.
        results: Dict updated in place with
            ``{name: {"accuracy", "f1", "auc"}}``.

    Returns:
        The same ``results`` dict, after the update.
    """
    hard_preds = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for the AUC.
    pos_scores = model.predict_proba(X_test)[:, 1]

    scores = {
        "accuracy": float(accuracy_score(y_test, hard_preds)),
        "f1": float(f1_score(y_test, hard_preds)),
        "auc": float(roc_auc_score(y_test, pos_scores)),
    }
    results[name] = scores
    print(
        f" {name} test: acc={scores['accuracy']:.4f} "
        f"f1={scores['f1']:.4f} auc={scores['auc']:.4f}"
    )
    return results
98
+
99
def main():
    """Run the full pipeline: load, featurize, window, train, evaluate, save.

    Steps:
        1. Build windowed samples and split them in order 70/15/15 into
           train/validation/test, then randomly subsample each split.
        2. Flatten each window, drop flattened columns that contain a
           non-finite value in any split, and standardize with train-split
           statistics.
        3. Fit a random forest and a logistic regression, pick the winner by
           VALIDATION AUC, and report test metrics for both.
        4. Write ``model.pkl``, ``feature_mean.npy``, ``feature_std.npy``,
           ``valid_cols.npy`` and ``metrics.json`` to ``OUT_DIR``.

    Raises:
        RuntimeError: If no samples were produced.
    """
    df = load_data()
    df = engineer_features(df)
    X, y = build_windows(df)

    n = len(X)
    if n == 0:
        raise RuntimeError("No samples were produced; cannot train.")
    te = int(n * 0.70)
    ve = int(n * 0.85)

    # Ordered (non-shuffled) split so later samples never land in train.
    # NOTE(review): windows next to a split boundary still share rows with
    # the neighbouring split; consider leaving a gap between splits.
    X_train, y_train = X[:te], y[:te]
    X_val, y_val = X[te:ve], y[te:ve]
    X_test, y_test = X[ve:], y[ve:]
    print(f"Split: train={len(X_train)}, val={len(X_val)}, test={len(X_test)}")

    rng = np.random.RandomState(SEED)
    X_train, y_train = subsample(X_train, y_train, MAX_TRAIN_SAMPLES, rng)
    X_val, y_val = subsample(X_val, y_val, MAX_VAL_SAMPLES, rng)
    X_test, y_test = subsample(X_test, y_test, MAX_TEST_SAMPLES, rng)
    print(f"Subsampled: train={len(X_train)}, val={len(X_val)}, test={len(X_test)}")

    def flat(X):
        # Collapse each (lookback, n_features) window into one row vector.
        return X.reshape(X.shape[0], -1)

    X_train_f = flat(X_train)
    X_val_f = flat(X_val)
    X_test_f = flat(X_test)

    # Keep only flattened columns that are finite in every split.
    valid = (np.isfinite(X_train_f).all(axis=0) &
             np.isfinite(X_val_f).all(axis=0) &
             np.isfinite(X_test_f).all(axis=0))
    X_train_f = X_train_f[:, valid]
    X_val_f = X_val_f[:, valid]
    X_test_f = X_test_f[:, valid]
    print(f"Valid features: {X_train_f.shape[1]}")

    # Standardize with train statistics only; the epsilon avoids dividing by
    # zero for constant columns.
    mean = X_train_f.mean(axis=0)
    std = X_train_f.std(axis=0) + 1e-8
    X_train_f = (X_train_f - mean) / std
    X_val_f = (X_val_f - mean) / std
    X_test_f = (X_test_f - mean) / std

    candidates = [
        (
            "RandomForest",
            "Random Forest",
            RandomForestClassifier(n_estimators=200, max_depth=12, min_samples_leaf=5, n_jobs=-1, random_state=SEED),
        ),
        (
            "LogisticRegression",
            "Logistic Regression",
            LogisticRegression(max_iter=500, random_state=SEED),
        ),
    ]

    results = {}
    val_auc = {}
    fitted = {}
    for name, label, model in candidates:
        print(f"\nTraining {label}...")
        model.fit(X_train_f, y_train)
        fitted[name] = model
        # Validation AUC drives model selection so the test split stays an
        # untouched estimate of generalization.
        val_auc[name] = float(roc_auc_score(y_val, model.predict_proba(X_val_f)[:, 1]))
        print(f" {name} val: auc={val_auc[name]:.4f}")
        results = evaluate_model(name, model, X_test_f, y_test, results)

    # On a tie, max() keeps the first candidate (RandomForest).
    best_name = max(val_auc, key=val_auc.get)
    print(f"\nBest model: {best_name} (AUC={results[best_name]['auc']:.4f})")
    best_model = fitted[best_name]

    with open(os.path.join(OUT_DIR, "model.pkl"), "wb") as f:
        pickle.dump(best_model, f)
    np.save(os.path.join(OUT_DIR, "feature_mean.npy"), mean)
    np.save(os.path.join(OUT_DIR, "feature_std.npy"), std)
    np.save(os.path.join(OUT_DIR, "valid_cols.npy"), valid)

    preds = best_model.predict(X_test_f)
    print("\nBest Model Classification Report (Test):")
    print(classification_report(y_test, preds, target_names=["down", "up"], digits=4))

    metrics = {
        "best_model": best_name,
        "train_samples": int(len(X_train_f)),
        "val_samples": int(len(X_val_f)),
        "test_samples": int(len(X_test_f)),
        "n_features": int(X_train_f.shape[1]),
        "results": results,
        "val_auc": val_auc,
        "best_test_accuracy": results[best_name]["accuracy"],
        "best_test_f1": results[best_name]["f1"],
        "best_test_auc": results[best_name]["auc"],
    }
    with open(os.path.join(OUT_DIR, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)
    print(f"\nArtifacts saved to {OUT_DIR}")


if __name__ == "__main__":
    main()