Spaces:

GoshawkVortexAI
/

Goshawk_Hedge_Pro

Sleeping

App Files Files Community

GoshawkVortexAI committed on Feb 27

Commit

47584e0

verified ·

1 Parent(s): 0fd33e0

Create labeler.py

Browse files

Files changed (1) hide show

labeler.py +159 -0

labeler.py ADDED Viewed

	@@ -0,0 +1,159 @@

+"""
+labeler.py — Supervised learning target construction for crypto trading.
+Target definition (binary):
+    y = 1 if the trade hits RR=1:2 target BEFORE stop within LABEL_FORWARD_BARS
+    y = 0 if stop is hit first OR neither hits within the window
+Design decisions:
+  - Stop and target computed from ATR at signal bar (no lookahead)
+  - Realistic costs (fees + slippage) added to the target distance, so price must move further for a win
+  - Both long and short labeling supported (direction from rule engine)
+  - Time-series integrity: labeling uses only forward prices from bar+1
+  - NaN label produced when insufficient forward bars exist (dropped later)
+Target horizon N = 24 bars (1H timeframe = 1 full trading day):
+  - Short enough to avoid regime change within the trade
+  - Long enough for 1:2 RR to fully play out
+  - Empirically: >24 bars introduces too many confounding events
+  - <12 bars under-samples legitimate continuation moves
+"""
+import numpy as np
+import pandas as pd
+from typing import Optional
+from ml_config import (
+    LABEL_FORWARD_BARS,
+    STOP_MULT,
+    TARGET_RR,
+    ROUND_TRIP_COST,
+)
def label_single_trade(
    df: pd.DataFrame,
    signal_idx: int,
    atr: float,
    direction: int,  # +1 = long, -1 = short
    forward_bars: int = LABEL_FORWARD_BARS,
) -> Optional[int]:
    """
    Label a single trade signal as a win (1) or a loss (0).

    The entry price is the close of the signal bar. The stop sits
    ``atr * STOP_MULT`` away from entry; the target sits
    ``stop_distance * TARGET_RR`` away plus ``entry_price * ROUND_TRIP_COST``.
    Only bars strictly after the signal bar are inspected.

    Args:
        df: OHLCV DataFrame; the "close", "high" and "low" columns are read.
        signal_idx: Integer position of the signal bar in df.
        atr: ATR value at the signal bar.
        direction: +1 long, -1 short.
        forward_bars: Maximum number of bars after the signal bar to check.

    Returns:
        1 = target reached before the stop.
        0 = stop reached first (including the same bar as the target), or
            neither level reached within the inspected bars.
        None = the trade cannot be evaluated: no forward bars, a signal
            position outside df, a non-positive forward window, a direction
            other than +1/-1, or a non-finite / non-positive ATR or entry.
    """
    # A direction other than +/-1 has no defined stop/target side; previously
    # anything that was not +1 fell through to the short branch.
    if direction not in (1, -1):
        return None
    # NaN/inf/non-positive ATR gives meaningless thresholds (a NaN threshold
    # makes every comparison False and would silently label a loss).
    if not np.isfinite(atr) or atr <= 0:
        return None
    # Negative values would wrap around via iloc and select unrelated bars.
    if signal_idx < 0 or forward_bars <= 0:
        return None
    if signal_idx + 1 >= len(df):
        return None

    entry_price = float(df["close"].iloc[signal_idx])
    if not np.isfinite(entry_price):
        return None

    stop_distance = atr * STOP_MULT
    # Cost-adjusted target: price has to travel further than the naive RR
    # distance to cover the round-trip cost (expressed in price units).
    round_trip_cost = entry_price * ROUND_TRIP_COST
    target_distance = stop_distance * TARGET_RR + round_trip_cost

    start = signal_idx + 1
    end = min(start + forward_bars, len(df))
    highs = df["high"].iloc[start:end].to_numpy(dtype=float)
    lows = df["low"].iloc[start:end].to_numpy(dtype=float)
    if highs.size == 0:
        return None

    if direction == 1:
        stop_hit = lows <= entry_price - stop_distance
        target_hit = highs >= entry_price + target_distance
    else:
        stop_hit = highs >= entry_price + stop_distance
        target_hit = lows <= entry_price - target_distance

    resolved = stop_hit | target_hit
    if not resolved.any():
        # Neither level reached within the inspected bars = loss.
        return 0

    # argmax on a boolean array yields the first True, i.e. the first bar on
    # which either level was touched. Pessimistic ordering: when both levels
    # are touched on that bar, the stop is assumed to have been hit first.
    first_hit = int(np.argmax(resolved))
    return 0 if stop_hit[first_hit] else 1
def label_dataframe(
    df: pd.DataFrame,
    signal_mask: pd.Series,
    atr_series: pd.Series,
    direction_series: pd.Series,
    forward_bars: int = LABEL_FORWARD_BARS,
    min_bars_remaining: int = LABEL_FORWARD_BARS,
) -> pd.Series:
    """
    Label every flagged signal bar in a DataFrame.

    All inputs are matched to df by integer position, not by index label.

    Args:
        df: Full OHLCV DataFrame.
        signal_mask: Boolean series, True where a setup was flagged.
        atr_series: ATR at each bar (same length/order as df).
        direction_series: +1/-1 at each signal bar; 0 or missing means
            "no trade" and the bar is left unlabeled.
        forward_bars: Max forward window passed to label_single_trade.
        min_bars_remaining: Signals with fewer than this many bars after
            them are left unlabeled.

    Returns:
        float64 Series of {1.0, 0.0, NaN} aligned to df.index. NaN marks bars
        that were not flagged or could not be labeled.
    """
    n = len(df)
    # Collect results in a plain array; the Series is built once at the end.
    label_values = np.full(n, np.nan, dtype="float64")

    signal_positions = np.flatnonzero(np.asarray(signal_mask))
    for pos in signal_positions:
        # Drop signals too close to end of data (insufficient forward bars).
        if pos + min_bars_remaining >= n:
            continue

        raw_atr = atr_series.iloc[pos]
        raw_direction = direction_series.iloc[pos]
        # Check for missing values BEFORE converting: int(NaN) raises
        # ValueError, which previously aborted the whole labeling run.
        if pd.isna(raw_atr) or pd.isna(raw_direction):
            continue

        direction = int(raw_direction)
        if direction == 0:
            continue

        label = label_single_trade(
            df, int(pos), float(raw_atr), direction, forward_bars
        )
        if label is not None:
            label_values[pos] = float(label)

    return pd.Series(label_values, index=df.index, dtype="float64")
def compute_label_stats(labels: pd.Series) -> dict:
    """
    Summarize a label series for diagnostics.

    NaN entries are ignored. The result holds the number of non-NaN labels,
    the counts of 1s and 0s, the win rate (wins / total, 0.0 when there are
    no labels, rounded to 4 places) and the wins-to-losses ratio (infinity
    when there are no losses, rounded to 3 places).
    """
    observed = labels.dropna()
    n_total = len(observed)
    n_wins = int(observed.eq(1).sum())
    n_losses = int(observed.eq(0).sum())

    if n_total > 0:
        hit_rate = n_wins / n_total
    else:
        hit_rate = 0.0

    if n_losses > 0:
        wins_per_loss = n_wins / n_losses
    else:
        wins_per_loss = float("inf")

    stats = {}
    stats["total_labels"] = n_total
    stats["wins"] = n_wins
    stats["losses"] = n_losses
    stats["win_rate"] = round(hit_rate, 4)
    stats["class_imbalance_ratio"] = round(wins_per_loss, 3)
    return stats