Upload train.py
train.py
ADDED
"""
End-to-end pipeline: load crypto data -> engineer features -> train a 15-minute direction classifier.
Single script to avoid intermediate-file issues.
"""

import json
import os
import pickle

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score

SEED = 42
LOOKBACK = 60            # minutes of history per window
AHEAD = 15               # prediction horizon in minutes
MAX_ROWS = 300_000       # rows loaded from each dataset
MAX_TRAIN_SAMPLES = 30_000
MAX_VAL_SAMPLES = 6_000
MAX_TEST_SAMPLES = 6_000
OUT_DIR = "/app/outputs"
os.makedirs(OUT_DIR, exist_ok=True)


def load_data(max_rows=MAX_ROWS):
    print("Loading BTC...")
    ds = load_dataset("WinkingFace/CryptoLM-Bitcoin-BTC-USDT", split=f"train[:{max_rows}]")
    df_btc = ds.to_pandas()
    # The raw columns may arrive as strings; coerce all OHLCV fields, since
    # high/low/close feed the range features and the target downstream.
    for c in ["open", "high", "low", "close", "volume"]:
        df_btc[c] = pd.to_numeric(df_btc[c], errors="coerce")
    df_btc = df_btc.rename(columns={c: f"btc_{c}" for c in df_btc.columns if c != "timestamp"})

    print("Loading ETH...")
    ds = load_dataset("WinkingFace/CryptoLM-Ethereum-ETH-USDT", split=f"train[:{max_rows}]")
    df_eth = ds.to_pandas()
    for c in ["open", "high", "low", "close", "volume"]:
        df_eth[c] = pd.to_numeric(df_eth[c], errors="coerce")
    df_eth = df_eth.rename(columns={c: f"eth_{c}" for c in df_eth.columns if c != "timestamp"})

    # Inner join on timestamp keeps only minutes where both symbols have data.
    df = pd.merge(df_btc, df_eth, on="timestamp", how="inner").sort_values("timestamp").reset_index(drop=True)
    df = df.dropna(subset=["btc_close", "eth_close"]).reset_index(drop=True)
    print(f"Merged rows: {len(df)}")
    return df


def engineer_features(df):
    print("Engineering features...")
    df["eth_btc_ratio"] = df["eth_close"] / df["btc_close"]
    df["btc_ret_1m"] = df["btc_close"].pct_change()
    df["eth_ret_1m"] = df["eth_close"].pct_change()
    df["btc_vol_ma20"] = df["btc_volume"].rolling(20).mean()
    df["eth_vol_ma20"] = df["eth_volume"].rolling(20).mean()
    df["btc_range"] = (df["btc_high"] - df["btc_low"]) / df["btc_close"]
    df["eth_range"] = (df["eth_high"] - df["eth_low"]) / df["eth_close"]
    # Binary target: does BTC close higher AHEAD minutes from now?
    df["target"] = (df["btc_close"].shift(-AHEAD) > df["btc_close"]).astype(int)
    df = df.iloc[:-AHEAD].copy()  # the last AHEAD rows have no lookahead price
    return df


def build_windows(df, lookback=LOOKBACK):
    print("Building windows...")
    exclude = {"timestamp", "btc_month", "eth_month", "target"}
    feat_cols = [c for c in df.columns if c not in exclude]
    df = df.dropna(subset=feat_cols + ["target"]).reset_index(drop=True)

    data = df[feat_cols].values.astype(np.float32)
    targets = df["target"].values.astype(np.int64)
    n = len(df)
    valid = ~np.isnan(data).any(axis=1)

    # Each sample is the lookback window ending at row end - 1; its label is that
    # same row's target, i.e. the AHEAD-minute direction measured from the
    # window's last observation.
    X_list, y_list = [], []
    for i in range(n - lookback + 1):
        end = i + lookback
        if valid[i:end].all():
            X_list.append(data[i:end])
            y_list.append(targets[end - 1])
    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.int64)
    print(f"Samples: {X.shape}, pos_rate={y.mean():.3f}")
    return X, y


def subsample(X, y, max_n, rng):
    # Cap a split at max_n samples, drawn without replacement.
    if len(X) > max_n:
        idx = rng.choice(len(X), max_n, replace=False)
        return X[idx], y[idx]
    return X, y


def evaluate_model(name, model, X_test, y_test, results):
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)
    results[name] = {"accuracy": float(acc), "f1": float(f1), "auc": float(auc)}
    print(f" {name} test: acc={acc:.4f} f1={f1:.4f} auc={auc:.4f}")
    return results


def main():
    df = load_data()
    df = engineer_features(df)
    X, y = build_windows(df)

    # Chronological 70/15/15 split so val and test lie strictly after train.
    n = len(X)
    te = int(n * 0.70)
    ve = int(n * 0.85)

    X_train, y_train = X[:te], y[:te]
    X_val, y_val = X[te:ve], y[te:ve]
    X_test, y_test = X[ve:], y[ve:]
    print(f"Split: train={len(X_train)}, val={len(X_val)}, test={len(X_test)}")

    rng = np.random.RandomState(SEED)
    X_train, y_train = subsample(X_train, y_train, MAX_TRAIN_SAMPLES, rng)
    X_val, y_val = subsample(X_val, y_val, MAX_VAL_SAMPLES, rng)
    X_test, y_test = subsample(X_test, y_test, MAX_TEST_SAMPLES, rng)
    print(f"Subsampled: train={len(X_train)}, val={len(X_val)}, test={len(X_test)}")

    def flat(X):
        # (n_samples, lookback, n_features) -> (n_samples, lookback * n_features)
        return X.reshape(X.shape[0], -1)

    X_train_f = flat(X_train)
    X_val_f = flat(X_val)
    X_test_f = flat(X_test)

    # Keep only columns that are finite in every split.
    valid = (np.isfinite(X_train_f).all(axis=0) &
             np.isfinite(X_val_f).all(axis=0) &
             np.isfinite(X_test_f).all(axis=0))
    X_train_f = X_train_f[:, valid]
    X_val_f = X_val_f[:, valid]
    X_test_f = X_test_f[:, valid]
    print(f"Valid features: {X_train_f.shape[1]}")

    # Standardize with train-set statistics only, to avoid leaking val/test info.
    mean = X_train_f.mean(axis=0)
    std = X_train_f.std(axis=0) + 1e-8
    X_train_f = (X_train_f - mean) / std
    X_val_f = (X_val_f - mean) / std
    X_test_f = (X_test_f - mean) / std

    results = {}

    print("\nTraining Random Forest...")
    rf = RandomForestClassifier(n_estimators=200, max_depth=12, min_samples_leaf=5, n_jobs=-1, random_state=SEED)
    rf.fit(X_train_f, y_train)
    results = evaluate_model("RandomForest", rf, X_test_f, y_test, results)

    print("\nTraining Logistic Regression...")
    lr = LogisticRegression(max_iter=500, random_state=SEED)
    lr.fit(X_train_f, y_train)
    results = evaluate_model("LogisticRegression", lr, X_test_f, y_test, results)

    best_name = max(results, key=lambda k: results[k]["auc"])
    print(f"\nBest model: {best_name} (AUC={results[best_name]['auc']:.4f})")
    best_model = rf if best_name == "RandomForest" else lr

    # Persist the model plus everything needed to reproduce the feature pipeline.
    with open(os.path.join(OUT_DIR, "model.pkl"), "wb") as f:
        pickle.dump(best_model, f)
    np.save(os.path.join(OUT_DIR, "feature_mean.npy"), mean)
    np.save(os.path.join(OUT_DIR, "feature_std.npy"), std)
    np.save(os.path.join(OUT_DIR, "valid_cols.npy"), valid)

    preds = best_model.predict(X_test_f)
    print("\nBest Model Classification Report (Test):")
    print(classification_report(y_test, preds, target_names=["down", "up"], digits=4))

    metrics = {
        "best_model": best_name,
        "train_samples": int(len(X_train_f)),
        "val_samples": int(len(X_val_f)),
        "test_samples": int(len(X_test_f)),
        "n_features": int(X_train_f.shape[1]),
        "results": results,
        "best_test_accuracy": results[best_name]["accuracy"],
        "best_test_f1": results[best_name]["f1"],
        "best_test_auc": results[best_name]["auc"],
    }
    with open(os.path.join(OUT_DIR, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)
    print(f"\nArtifacts saved to {OUT_DIR}")


if __name__ == "__main__":
    main()
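
A minimal sketch of scoring with the saved artifacts, assuming a `recent` array shaped like one training window (same feature columns, same order); `recent` and `predict_direction` are illustrative names, not defined in train.py:

import pickle

import numpy as np

OUT_DIR = "/app/outputs"

with open(f"{OUT_DIR}/model.pkl", "rb") as f:
    model = pickle.load(f)
mean = np.load(f"{OUT_DIR}/feature_mean.npy")
std = np.load(f"{OUT_DIR}/feature_std.npy")
valid = np.load(f"{OUT_DIR}/valid_cols.npy")


def predict_direction(recent):
    # recent: hypothetical (LOOKBACK, n_features) window, built like a training sample.
    x = recent.reshape(1, -1)        # flatten exactly as train.py does
    x = (x[:, valid] - mean) / std   # apply the train-set column mask and scaling
    return float(model.predict_proba(x)[0, 1])  # P(price up AHEAD minutes later)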