GoshawkVortexAI committed on
Commit
47584e0
·
verified ·
1 Parent(s): 0fd33e0

Create labeler.py

Browse files
Files changed (1) hide show
  1. labeler.py +159 -0
labeler.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ labeler.py — Supervised learning target construction for crypto trading.
3
+
4
+ Target definition (binary):
5
+ y = 1 if the trade hits RR=1:2 target BEFORE stop within LABEL_FORWARD_BARS
6
+ y = 0 if stop is hit first OR neither hits within the window
7
+
8
+ Design decisions:
9
+ - Stop and target computed from ATR at signal bar (no lookahead)
10
+ - Realistic costs (fees + slippage) added to the target distance, so price must move further to count as a win
11
+ - Both long and short labeling supported (direction from rule engine)
12
+ - Time-series integrity: labeling uses only forward prices from bar+1
13
+ - NaN label produced when insufficient forward bars exist (dropped later)
14
+
15
+ Target horizon N = 24 bars (1H timeframe = 1 full trading day):
16
+ - Short enough to avoid regime change within the trade
17
+ - Long enough for 1:2 RR to fully play out
18
+ - Empirically: >24 bars introduces too many confounding events
19
+ - <12 bars under-samples legitimate continuation moves
20
+ """
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+ from typing import Optional
25
+
26
+ from ml_config import (
27
+ LABEL_FORWARD_BARS,
28
+ STOP_MULT,
29
+ TARGET_RR,
30
+ ROUND_TRIP_COST,
31
+ )
32
+
33
+
34
def label_single_trade(
    df: pd.DataFrame,
    signal_idx: int,
    atr: float,
    direction: int,  # +1 = long, -1 = short
    forward_bars: int = LABEL_FORWARD_BARS,
) -> Optional[int]:
    """
    Label a single trade signal.

    The entry is the close of the signal bar. The stop sits
    ``atr * STOP_MULT`` away from entry; the target sits
    ``stop_distance * TARGET_RR`` plus the round-trip cost
    (``entry_price * ROUND_TRIP_COST``) away on the other side.
    Only bars strictly after the signal bar are scanned.

    Args:
        df: OHLCV DataFrame; the "close", "high" and "low" columns are read
            positionally (assumed sorted ascending by time).
        signal_idx: Integer position of the signal bar in df.
        atr: ATR value at the signal bar.
        direction: +1 long, -1 short.
        forward_bars: Max number of bars after the signal bar to check.

    Returns:
        1 = win (target hit before stop)
        0 = loss (stop hit first, both hit in the same bar, or timeout)
        None = trade cannot be labeled: no forward bars, signal_idx out of
               range, direction not +1/-1, or ATR / entry price that is
               NaN, infinite, or (for ATR) non-positive.
    """
    # Anything other than +1/-1 used to fall through to the short branch;
    # refuse to label instead of guessing a side.
    if direction not in (1, -1):
        return None

    # A NaN or non-positive ATR yields meaningless stop/target levels.
    if not np.isfinite(atr) or atr <= 0:
        return None

    n_rows = len(df)
    # Negative positions would wrap around via iloc; treat as out of range.
    if signal_idx < 0 or signal_idx + 1 >= n_rows:
        return None

    entry_price = float(df["close"].iloc[signal_idx])
    if not np.isfinite(entry_price):
        return None

    stop_distance = atr * STOP_MULT

    # Cost-adjusted threshold: price must move further than the naive RR
    # target so that the win still covers fees and slippage.
    cost_offset = entry_price * ROUND_TRIP_COST
    target_distance = stop_distance * TARGET_RR + cost_offset

    end_idx = min(signal_idx + 1 + forward_bars, n_rows)
    highs = df["high"].iloc[signal_idx + 1 : end_idx].to_numpy(dtype="float64")
    lows = df["low"].iloc[signal_idx + 1 : end_idx].to_numpy(dtype="float64")

    if len(highs) == 0:
        return None

    if direction == 1:  # long: stop below entry, target above
        stop_hit = lows <= entry_price - stop_distance
        target_hit = highs >= entry_price + target_distance
    else:  # short: stop above entry, target below
        stop_hit = highs >= entry_price + stop_distance
        target_hit = lows <= entry_price - target_distance

    resolved = stop_hit | target_hit
    if not resolved.any():
        # Neither hit within window = loss (opportunity cost + fees)
        return 0

    # First bar on which either level is touched decides the outcome.
    # Pessimistic ordering: if both are touched in that bar, the stop wins.
    first_bar = int(np.argmax(resolved))
    return 0 if stop_hit[first_bar] else 1
98
+
99
+
100
def label_dataframe(
    df: pd.DataFrame,
    signal_mask: pd.Series,
    atr_series: pd.Series,
    direction_series: pd.Series,
    forward_bars: int = LABEL_FORWARD_BARS,
    min_bars_remaining: int = LABEL_FORWARD_BARS,
) -> pd.Series:
    """
    Label all signal bars in a DataFrame.

    All series are read positionally, so they are expected to be aligned
    row-for-row with df.

    Args:
        df: Full OHLCV DataFrame
        signal_mask: Boolean series, True where a setup was flagged.
            Missing entries (NaN / pd.NA) are treated as "no signal".
        atr_series: ATR at each bar (aligned to df index)
        direction_series: +1/-1 for each signal bar
        forward_bars: Max forward window
        min_bars_remaining: Drop labels too close to end of data

    Returns:
        Series of {1, 0, NaN} aligned to df.index. NaN marks bars that are
        not signals, are too close to the end of the data, have a missing
        ATR, a missing or zero direction, or that label_single_trade
        could not label.
    """
    n = len(df)
    label_values = np.full(n, np.nan, dtype="float64")

    # Build a strict boolean mask. np.where on a float/object array treats
    # NaN as truthy, which would turn missing mask entries into signals.
    raw_mask = signal_mask.to_numpy()
    present = ~pd.isna(raw_mask)
    is_signal = np.zeros(len(raw_mask), dtype=bool)
    is_signal[present] = raw_mask[present].astype(bool)

    for pos in np.flatnonzero(is_signal):
        # Drop signals too close to end of data (insufficient forward bars)
        if pos + min_bars_remaining >= n:
            continue

        atr_raw = atr_series.iloc[pos]
        direction_raw = direction_series.iloc[pos]

        # Check for missing values before converting: int(NaN) raises.
        if pd.isna(atr_raw) or pd.isna(direction_raw):
            continue

        atr_val = float(atr_raw)
        direction = int(direction_raw)
        if direction == 0:
            continue

        label = label_single_trade(df, pos, atr_val, direction, forward_bars)
        if label is not None:
            label_values[pos] = float(label)

    return pd.Series(label_values, index=df.index, dtype="float64")
143
+
144
+
145
def compute_label_stats(labels: pd.Series) -> dict:
    """
    Return win rate, class balance, and label counts for diagnostics.

    Args:
        labels: Series of labels; NaN entries are ignored.

    Returns:
        Dict with keys:
            total_labels: number of non-NaN labels
            wins: count of labels equal to 1
            losses: count of labels equal to 0
            win_rate: wins / total_labels rounded to 4 places
                (0.0 when there are no labels)
            class_imbalance_ratio: wins / losses rounded to 3 places;
                inf when there are wins but no losses, and 0.0 when there
                are neither wins nor losses.
    """
    valid = labels.dropna()
    total = len(valid)
    wins = int((valid == 1).sum())
    losses = int((valid == 0).sum())

    win_rate = wins / total if total > 0 else 0.0

    if losses > 0:
        class_imbalance = wins / losses
    elif wins > 0:
        # All labelled trades are wins: the ratio is unbounded.
        class_imbalance = float("inf")
    else:
        # No wins and no losses (e.g. empty input): report 0.0, matching the
        # win_rate fallback, rather than inf which would read as "all wins".
        class_imbalance = 0.0

    return {
        "total_labels": total,
        "wins": wins,
        "losses": losses,
        "win_rate": round(win_rate, 4),
        "class_imbalance_ratio": round(class_imbalance, 3),
    }