zirobtc committed on
Commit
85e02a7
·
1 Parent(s): 4fdcff3

Upload folder using huggingface_hub

Browse files
.claude/settings.local.json CHANGED
@@ -9,7 +9,12 @@
9
  "Bash(python3:*)",
10
  "Bash(dir:*)",
11
  "Bash(cmd /c \"dir /s /b\")",
12
- "Bash(python -c:*)"
 
 
 
 
 
13
  ]
14
  }
15
  }
 
9
  "Bash(python3:*)",
10
  "Bash(dir:*)",
11
  "Bash(cmd /c \"dir /s /b\")",
12
+ "Bash(python -c:*)",
13
+ "Bash(pip install:*)",
14
+ "Bash(huggingface-cli login:*)",
15
+ "Bash(hf whoami:*)",
16
+ "Bash(huggingface-cli whoami:*)",
17
+ "Bash(python -m huggingface_hub.commands.huggingface_cli:*)"
18
  ]
19
  }
20
  }
data/data_loader.py CHANGED
@@ -1080,36 +1080,51 @@ class OracleDataset(Dataset):
1080
 
1081
  def __getitem__(self, idx: int) -> Optional[Dict[str, Any]]:
1082
  """
1083
- Loads raw data from cache, samples a random T_cutoff, and generates a training sample.
1084
 
1085
- OPTIMIZED: Uses fully cached wallet/graph/image data for zero DB calls during training.
 
 
 
1086
  """
1087
  import time as _time
1088
  _timings = {}
1089
  _total_start = _time.perf_counter()
1090
 
1091
- # --- REMOVED: No more fetcher initialization during training ---
1092
- # We use fully offline mode with pre-cached data
1093
- _timings['fetcher_init'] = 0.0
1094
-
1095
  # --- TIMING: Cache load ---
1096
  _t0 = _time.perf_counter()
1097
- raw_data = None
1098
- if self.cache_dir:
1099
- if idx >= len(self.cached_files):
1100
- raise IndexError(f"Index {idx} out of range for {len(self.cached_files)} cached files.")
1101
- filepath = self.cached_files[idx]
1102
- try:
1103
- raw_data = torch.load(filepath, map_location='cpu', weights_only=False)
1104
- except Exception as e:
1105
- raise RuntimeError(f"ERROR: Could not load cached item {filepath}: {e}")
1106
- else:
1107
- # Strict Offline Mode: No dynamic generation fallback
1108
- raise RuntimeError(f"Offline mode required. No cache directory provided or configured.")
1109
  _timings['cache_load'] = _time.perf_counter() - _t0
1110
 
1111
- if not raw_data:
1112
- raise RuntimeError(f"No raw data loaded for index {idx}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1113
 
1114
  required_keys = [
1115
  "mint_timestamp",
@@ -2347,3 +2362,408 @@ class OracleDataset(Dataset):
2347
  'labels_mask': torch.tensor(mask_values, dtype=torch.float32),
2348
  'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32)
2349
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1080
 
1081
  def __getitem__(self, idx: int) -> Optional[Dict[str, Any]]:
1082
  """
1083
+ Loads data from cache. Behavior depends on cache mode:
1084
 
1085
+ - RAW MODE: Loads raw token data, samples T_cutoff at runtime, applies H/B/H
1086
+ - CONTEXT MODE: Loads pre-computed training context directly (fully offline)
1087
+
1088
+ The cache mode is auto-detected from the cached file's 'cache_mode' field.
1089
  """
1090
  import time as _time
1091
  _timings = {}
1092
  _total_start = _time.perf_counter()
1093
 
 
 
 
 
1094
  # --- TIMING: Cache load ---
1095
  _t0 = _time.perf_counter()
1096
+ if not self.cache_dir:
1097
+ raise RuntimeError("Offline mode required. No cache directory provided.")
1098
+
1099
+ if idx >= len(self.cached_files):
1100
+ raise IndexError(f"Index {idx} out of range for {len(self.cached_files)} cached files.")
1101
+
1102
+ filepath = self.cached_files[idx]
1103
+ try:
1104
+ cached_data = torch.load(filepath, map_location='cpu', weights_only=False)
1105
+ except Exception as e:
1106
+ raise RuntimeError(f"ERROR: Could not load cached item {filepath}: {e}")
 
1107
  _timings['cache_load'] = _time.perf_counter() - _t0
1108
 
1109
+ if not cached_data:
1110
+ raise RuntimeError(f"No data loaded for index {idx}")
1111
+
1112
+ # Auto-detect cache mode
1113
+ cache_mode = cached_data.get('cache_mode', 'raw')
1114
+
1115
+ if cache_mode == 'context':
1116
+ # CONTEXT MODE: Return pre-computed training context directly
1117
+ # This is fully deterministic - no runtime sampling or processing
1118
+ _timings['total'] = _time.perf_counter() - _total_start
1119
+
1120
+ if idx % 100 == 0:
1121
+ print(f"[Sample {idx}] CONTEXT mode | cache_load: {_timings['cache_load']*1000:.1f}ms | "
1122
+ f"total: {_timings['total']*1000:.1f}ms | events: {len(cached_data.get('event_sequence', []))}")
1123
+
1124
+ return cached_data
1125
+
1126
+ # RAW MODE: Fall through to original __getitem__ logic with runtime T_cutoff sampling
1127
+ raw_data = cached_data
1128
 
1129
  required_keys = [
1130
  "mint_timestamp",
 
2362
  'labels_mask': torch.tensor(mask_values, dtype=torch.float32),
2363
  'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32)
2364
  }
2365
+
2366
+ def __cacheitem_context__(self, idx: int, num_samples_per_token: int = 1) -> List[Optional[Dict[str, Any]]]:
2367
+ """
2368
+ Generates fully processed training contexts for caching.
2369
+
2370
+ This method:
2371
+ 1. Fetches raw token data (like __cacheitem__)
2372
+ 2. Samples T_cutoff(s) using the weight sampling logic
2373
+ 3. Applies H/B/H dynamic sampling based on max_seq_len
2374
+ 4. Returns complete training-ready samples that can be loaded directly
2375
+
2376
+ This moves ALL non-determinism into cache time, making training fully offline
2377
+ and avoiding caching tokens that would never be seen (98% garbage filtered out
2378
+ by weight sampling and T_cutoff eligibility).
2379
+
2380
+ Args:
2381
+ idx: Index into sampled_mints
2382
+ num_samples_per_token: Number of different T_cutoff samples to generate per token
2383
+
2384
+ Returns:
2385
+ List of training-ready samples (may be fewer than num_samples_per_token if
2386
+ some T_cutoffs are invalid)
2387
+ """
2388
+ import time as _time
2389
+
2390
+ if not self.sampled_mints:
2391
+ raise RuntimeError("Dataset has no mint records loaded.")
2392
+ if idx >= len(self.sampled_mints):
2393
+ raise IndexError(f"Index {idx} exceeds mint count {len(self.sampled_mints)}.")
2394
+
2395
+ initial_mint_record = self.sampled_mints[idx]
2396
+ t0 = initial_mint_record["timestamp"]
2397
+ if isinstance(t0, datetime.datetime) and t0.tzinfo is None:
2398
+ t0 = t0.replace(tzinfo=datetime.timezone.utc)
2399
+ creator_address = initial_mint_record['creator_address']
2400
+ token_address = initial_mint_record['mint_address']
2401
+
2402
+ print(f"\n--- Caching CONTEXT for token: {token_address} (generating {num_samples_per_token} samples) ---")
2403
+
2404
+ if not self.fetcher:
2405
+ raise RuntimeError("Dataset has no data fetcher.")
2406
+
2407
+ # --- STEP 1: Fetch raw data (same as __cacheitem__) ---
2408
+ raw_data = self.fetcher.fetch_raw_token_data(
2409
+ token_address=token_address,
2410
+ creator_address=creator_address,
2411
+ mint_timestamp=t0,
2412
+ max_horizon_seconds=self.max_cache_horizon_seconds,
2413
+ include_wallet_data=False,
2414
+ include_graph=False,
2415
+ min_trades=10,
2416
+ full_history=True,
2417
+ prune_failed=False,
2418
+ prune_transfers=False
2419
+ )
2420
+
2421
+ if raw_data is None:
2422
+ print(f" SKIP: No raw data for {token_address}")
2423
+ return []
2424
+
2425
+ def _timestamp_to_order_value(ts_value) -> float:
2426
+ if isinstance(ts_value, datetime.datetime):
2427
+ if ts_value.tzinfo is None:
2428
+ ts_value = ts_value.replace(tzinfo=datetime.timezone.utc)
2429
+ return ts_value.timestamp()
2430
+ try:
2431
+ return float(ts_value)
2432
+ except:
2433
+ return 0.0
2434
+
2435
+ # --- STEP 2: Validate trades and find eligible T_cutoff indices ---
2436
+ all_trades_raw = raw_data.get('trades', [])
2437
+ if not all_trades_raw:
2438
+ print(f" SKIP: No trades for {token_address}")
2439
+ return []
2440
+
2441
+ all_trades_sorted = sorted(
2442
+ [t for t in all_trades_raw if t.get('timestamp') is not None],
2443
+ key=lambda t: _timestamp_to_order_value(t.get('timestamp'))
2444
+ )
2445
+
2446
+ min_context_trades = 10
2447
+ if len(all_trades_sorted) < (min_context_trades + 1):
2448
+ print(f" SKIP: Not enough trades ({len(all_trades_sorted)}) for {token_address}")
2449
+ return []
2450
+
2451
+ # Find successful trade indices
2452
+ successful_indices = [
2453
+ i for i, t in enumerate(all_trades_sorted)
2454
+ if t.get('success', False) and float(t.get('price_usd', 0) or 0) > 0
2455
+ ]
2456
+
2457
+ if len(successful_indices) < 2:
2458
+ print(f" SKIP: Not enough successful trades for {token_address}")
2459
+ return []
2460
+
2461
+ max_horizon_seconds = max(self.horizons_seconds) if self.horizons_seconds else 0
2462
+ min_idx = min_context_trades - 1
2463
+ max_idx = len(all_trades_sorted) - 2
2464
+
2465
+ if max_idx < min_idx:
2466
+ print(f" SKIP: Invalid index range for {token_address}")
2467
+ return []
2468
+
2469
+ # Build lookup arrays
2470
+ last_successful_before = [-1] * len(all_trades_sorted)
2471
+ last_seen = -1
2472
+ succ_set = set(successful_indices)
2473
+ for i in range(len(all_trades_sorted)):
2474
+ if i in succ_set:
2475
+ last_seen = i
2476
+ last_successful_before[i] = last_seen
2477
+
2478
+ next_successful_after = [-1] * len(all_trades_sorted)
2479
+ next_seen = -1
2480
+ for i in range(len(all_trades_sorted) - 1, -1, -1):
2481
+ if i in succ_set:
2482
+ next_seen = i
2483
+ next_successful_after[i] = next_seen
2484
+
2485
+ # Find all eligible T_cutoff indices
2486
+ eligible_indices = []
2487
+ for i in range(min_idx, max_idx + 1):
2488
+ anchor_idx = last_successful_before[i]
2489
+ next_idx = next_successful_after[i + 1] if i + 1 < len(all_trades_sorted) else -1
2490
+ if anchor_idx < 0 or next_idx < 0:
2491
+ continue
2492
+ cutoff_ts = _timestamp_to_order_value(all_trades_sorted[i].get('timestamp'))
2493
+ next_ts = _timestamp_to_order_value(all_trades_sorted[next_idx].get('timestamp'))
2494
+ if next_ts <= cutoff_ts + max_horizon_seconds:
2495
+ eligible_indices.append(i)
2496
+
2497
+ if not eligible_indices:
2498
+ print(f" SKIP: No eligible T_cutoff indices for {token_address}")
2499
+ return []
2500
+
2501
+ print(f" INFO: Found {len(eligible_indices)} eligible T_cutoff positions")
2502
+
2503
+ # --- STEP 3: Generate OHLC and holder snapshots (same as __cacheitem__) ---
2504
+ trades = raw_data.get('trades', [])
2505
+ trade_ts_values = [_timestamp_to_order_value(t.get('timestamp')) for t in trades]
2506
+ t0_val = _timestamp_to_order_value(t0)
2507
+ last_trade_ts_val = max(trade_ts_values)
2508
+
2509
+ duration_seconds = int(last_trade_ts_val - t0_val) + 120
2510
+ ohlc_1s = torch.zeros((duration_seconds, 2), dtype=torch.float32)
2511
+
2512
+ trades.sort(key=lambda x: _timestamp_to_order_value(x['timestamp']))
2513
+ trades_by_sec = defaultdict(list)
2514
+ for t in trades:
2515
+ ts = _timestamp_to_order_value(t['timestamp'])
2516
+ sec_idx = int(ts - t0_val)
2517
+ if 0 <= sec_idx < duration_seconds:
2518
+ trades_by_sec[sec_idx].append(t['price_usd'])
2519
+
2520
+ last_close = float(trades[0]['price_usd'])
2521
+ for i in range(duration_seconds):
2522
+ if i in trades_by_sec:
2523
+ prices = trades_by_sec[i]
2524
+ op, cl = prices[0], prices[-1]
2525
+ last_close = cl
2526
+ else:
2527
+ op = cl = last_close
2528
+ ohlc_1s[i, 0] = float(op)
2529
+ ohlc_1s[i, 1] = float(cl)
2530
+
2531
+ raw_data['ohlc_1s'] = ohlc_1s
2532
+
2533
+ # Generate holder snapshots
2534
+ interval = 300
2535
+ num_intervals = (duration_seconds // interval) + 1
2536
+ snapshot_stats = torch.zeros((num_intervals, 6), dtype=torch.float32)
2537
+
2538
+ buckets = defaultdict(list)
2539
+ for t in trades:
2540
+ ts = _timestamp_to_order_value(t['timestamp'])
2541
+ bucket_idx = int(ts - t0_val) // interval
2542
+ if bucket_idx >= 0:
2543
+ buckets[bucket_idx].append(t)
2544
+
2545
+ holder_snapshots_list = []
2546
+ for i in range(num_intervals):
2547
+ bucket_trades = buckets[i]
2548
+ vol = sum(t.get('total_usd', 0.0) for t in bucket_trades)
2549
+ tx = len(bucket_trades)
2550
+ buys = sum(1 for t in bucket_trades if t.get('trade_direction') == 0 or t.get('trade_type') == 0)
2551
+ sells = tx - buys
2552
+
2553
+ snapshot_ts = t0 + datetime.timedelta(seconds=(i+1)*interval)
2554
+ count, top_holders = self.fetcher.fetch_holder_snapshot_stats_for_token(
2555
+ token_address, snapshot_ts, limit=HOLDER_SNAPSHOT_TOP_K
2556
+ )
2557
+
2558
+ total_supply = raw_data.get('total_supply', 0) or 1
2559
+ if raw_data.get('decimals'):
2560
+ total_supply /= (10 ** raw_data['decimals'])
2561
+
2562
+ top10_bal = sum(h.get('current_balance', 0) for h in top_holders[:10])
2563
+ top10_pct = (top10_bal / total_supply) if total_supply > 0 else 0.0
2564
+
2565
+ snapshot_stats[i, 0] = float(vol)
2566
+ snapshot_stats[i, 1] = float(tx)
2567
+ snapshot_stats[i, 2] = float(buys)
2568
+ snapshot_stats[i, 3] = float(sells)
2569
+ snapshot_stats[i, 4] = float(count)
2570
+ snapshot_stats[i, 5] = float(top10_pct)
2571
+
2572
+ holder_snapshots_list.append({
2573
+ 'timestamp': int(snapshot_ts.timestamp()),
2574
+ 'holders': top_holders
2575
+ })
2576
+
2577
+ raw_data['snapshots_5m'] = snapshot_stats
2578
+ raw_data['holder_snapshots_list'] = holder_snapshots_list
2579
+ raw_data['protocol_id'] = initial_mint_record.get('protocol')
2580
+
2581
+ # --- STEP 4: Collect ALL wallets and pre-fetch their data ---
2582
+ all_wallets = set()
2583
+ all_wallets.add(creator_address)
2584
+
2585
+ for trade in raw_data.get('trades', []):
2586
+ if trade.get('maker'):
2587
+ all_wallets.add(trade['maker'])
2588
+ for transfer in raw_data.get('transfers', []):
2589
+ if transfer.get('source'):
2590
+ all_wallets.add(transfer['source'])
2591
+ if transfer.get('destination'):
2592
+ all_wallets.add(transfer['destination'])
2593
+ for pool in raw_data.get('pool_creations', []):
2594
+ if pool.get('creator_address'):
2595
+ all_wallets.add(pool['creator_address'])
2596
+ for liq in raw_data.get('liquidity_changes', []):
2597
+ if liq.get('lp_provider'):
2598
+ all_wallets.add(liq['lp_provider'])
2599
+ for snapshot in holder_snapshots_list:
2600
+ for holder in snapshot.get('holders', []):
2601
+ if holder.get('wallet_address'):
2602
+ all_wallets.add(holder['wallet_address'])
2603
+
2604
+ all_wallets.discard(None)
2605
+ all_wallets.discard('')
2606
+ wallet_list = list(all_wallets)
2607
+
2608
+ max_T_cutoff = datetime.datetime.fromtimestamp(last_trade_ts_val, tz=datetime.timezone.utc)
2609
+
2610
+ try:
2611
+ cached_profiles, cached_socials = self.fetcher.fetch_wallet_profiles_and_socials(wallet_list, max_T_cutoff)
2612
+ except Exception as e:
2613
+ print(f" WARN: Failed to fetch wallet profiles/socials: {e}")
2614
+ cached_profiles, cached_socials = {}, {}
2615
+
2616
+ try:
2617
+ cached_holdings = self.fetcher.fetch_wallet_holdings(wallet_list, max_T_cutoff)
2618
+ except Exception as e:
2619
+ print(f" WARN: Failed to fetch wallet holdings: {e}")
2620
+ cached_holdings = {}
2621
+
2622
+ try:
2623
+ cached_graph_entities, cached_graph_links = self.fetcher.fetch_graph_links(
2624
+ wallet_list, max_T_cutoff, max_degrees=1
2625
+ )
2626
+ except Exception as e:
2627
+ print(f" WARN: Failed to fetch graph links: {e}")
2628
+ cached_graph_entities, cached_graph_links = {}, {}
2629
+
2630
+ # Fetch token image
2631
+ cached_image_bytes = None
2632
+ try:
2633
+ bullx_image_url = f"https://image.bullx.io/1399811149/{token_address}?retry=0"
2634
+ resp = self.http_session.get(bullx_image_url, timeout=5)
2635
+ if resp.status_code == 200:
2636
+ cached_image_bytes = resp.content
2637
+ except:
2638
+ pass
2639
+
2640
+ # --- STEP 5: Sample T_cutoffs and generate complete training contexts ---
2641
+ results = []
2642
+
2643
+ # Sample indices (with replacement if needed)
2644
+ if num_samples_per_token >= len(eligible_indices):
2645
+ sampled_indices = eligible_indices.copy()
2646
+ else:
2647
+ sampled_indices = random.sample(eligible_indices, num_samples_per_token)
2648
+
2649
+ print(f" INFO: Generating {len(sampled_indices)} training contexts...")
2650
+
2651
+ for sample_num, sample_idx in enumerate(sampled_indices):
2652
+ sample_trade = all_trades_sorted[sample_idx]
2653
+ sample_offset_ts = _timestamp_to_order_value(sample_trade.get('timestamp'))
2654
+ T_cutoff = datetime.datetime.fromtimestamp(sample_offset_ts, tz=datetime.timezone.utc)
2655
+ cutoff_ts = sample_offset_ts
2656
+
2657
+ # Collect wallets visible at T_cutoff
2658
+ wallets_to_fetch = set()
2659
+ wallets_to_fetch.add(creator_address)
2660
+
2661
+ for trade in raw_data.get('trades', []):
2662
+ if _timestamp_to_order_value(trade.get('timestamp')) <= cutoff_ts:
2663
+ if trade.get('maker'):
2664
+ wallets_to_fetch.add(trade['maker'])
2665
+
2666
+ for transfer in raw_data.get('transfers', []):
2667
+ if _timestamp_to_order_value(transfer.get('timestamp')) <= cutoff_ts:
2668
+ if transfer.get('source'):
2669
+ wallets_to_fetch.add(transfer['source'])
2670
+ if transfer.get('destination'):
2671
+ wallets_to_fetch.add(transfer['destination'])
2672
+
2673
+ for pool in raw_data.get('pool_creations', []):
2674
+ if _timestamp_to_order_value(pool.get('timestamp')) <= cutoff_ts:
2675
+ if pool.get('creator_address'):
2676
+ wallets_to_fetch.add(pool['creator_address'])
2677
+
2678
+ for liq in raw_data.get('liquidity_changes', []):
2679
+ if _timestamp_to_order_value(liq.get('timestamp')) <= cutoff_ts:
2680
+ if liq.get('lp_provider'):
2681
+ wallets_to_fetch.add(liq['lp_provider'])
2682
+
2683
+ # Get holder snapshot at T_cutoff
2684
+ elapsed = (T_cutoff - t0).total_seconds()
2685
+ snap_idx = int(elapsed // 300)
2686
+ if 0 <= snap_idx < len(holder_snapshots_list):
2687
+ snapshot_data = holder_snapshots_list[snap_idx]
2688
+ for holder in snapshot_data.get('holders', []):
2689
+ if holder.get('wallet_address'):
2690
+ wallets_to_fetch.add(holder['wallet_address'])
2691
+
2692
+ wallets_to_fetch.discard(None)
2693
+ wallets_to_fetch.discard('')
2694
+
2695
+ # Build offline data for this context
2696
+ pooler = EmbeddingPooler()
2697
+
2698
+ # Process token data offline
2699
+ offline_token_data = {token_address: raw_data.copy()}
2700
+ if cached_image_bytes:
2701
+ try:
2702
+ cached_image = Image.open(BytesIO(cached_image_bytes))
2703
+ offline_token_data[token_address]['_cached_image_pil'] = cached_image
2704
+ except:
2705
+ pass
2706
+
2707
+ main_token_data = self._process_token_data_offline(
2708
+ [token_address], pooler, T_cutoff, token_data=offline_token_data
2709
+ )
2710
+
2711
+ if not main_token_data:
2712
+ continue
2713
+
2714
+ # Process wallet data offline
2715
+ wallet_data, all_token_data = self._process_wallet_data(
2716
+ list(wallets_to_fetch),
2717
+ main_token_data.copy(),
2718
+ pooler,
2719
+ T_cutoff,
2720
+ profiles_override=cached_profiles,
2721
+ socials_override=cached_socials,
2722
+ holdings_override=cached_holdings
2723
+ )
2724
+
2725
+ # Generate the complete training item (with H/B/H applied via _generate_dataset_item)
2726
+ mint_event = {
2727
+ 'event_type': 'Mint',
2728
+ 'timestamp': int(t0.timestamp()),
2729
+ 'relative_ts': 0,
2730
+ 'wallet_address': creator_address,
2731
+ 'token_address': token_address,
2732
+ 'protocol_id': raw_data.get('protocol_id', 0)
2733
+ }
2734
+
2735
+ result = self._generate_dataset_item(
2736
+ token_address=token_address,
2737
+ t0=t0,
2738
+ T_cutoff=T_cutoff,
2739
+ mint_event=mint_event,
2740
+ trade_records=raw_data['trades'],
2741
+ transfer_records=raw_data['transfers'],
2742
+ pool_creation_records=raw_data['pool_creations'],
2743
+ liquidity_change_records=raw_data['liquidity_changes'],
2744
+ fee_collection_records=raw_data['fee_collections'],
2745
+ burn_records=raw_data['burns'],
2746
+ supply_lock_records=raw_data['supply_locks'],
2747
+ migration_records=raw_data['migrations'],
2748
+ wallet_data=wallet_data,
2749
+ all_token_data=all_token_data,
2750
+ graph_links=cached_graph_links,
2751
+ graph_seed_entities=wallets_to_fetch,
2752
+ all_graph_entities=cached_graph_entities,
2753
+ future_trades_for_labels=raw_data['trades'],
2754
+ pooler=pooler,
2755
+ sample_idx=idx,
2756
+ cached_holders_list=holder_snapshots_list,
2757
+ cached_ohlc_1s=ohlc_1s,
2758
+ quality_score=None # Will be injected by cache_dataset.py
2759
+ )
2760
+
2761
+ if result is not None:
2762
+ # Store the T_cutoff used for this sample (for reproducibility tracking)
2763
+ result['cached_t_cutoff_ts'] = sample_offset_ts
2764
+ result['cached_sample_num'] = sample_num
2765
+ results.append(result)
2766
+ print(f" + Context {sample_num}: T_cutoff={T_cutoff.isoformat()}, events={len(result.get('event_sequence', []))}")
2767
+
2768
+ print(f" INFO: Generated {len(results)} valid training contexts for {token_address}")
2769
+ return results
pre_cache.sh CHANGED
@@ -1,7 +1,48 @@
1
  #!/bin/bash
2
  # Pre-caches the dataset for training
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  echo "Starting dataset caching..."
 
5
  python3 scripts/cache_dataset.py \
6
- --ohlc_stats_path "/workspace/apollo/data/ohlc_stats.npz"
 
 
 
 
 
 
 
7
  echo "Done!"
 
 
1
  #!/bin/bash
2
  # Pre-caches the dataset for training
3
+ #
4
+ # Usage:
5
+ # ./pre_cache.sh # Raw mode (old behavior)
6
+ # ./pre_cache.sh --cache_mode context # Context mode (new fully offline)
7
+ #
8
+ # Context mode arguments:
9
+ # --context_length N Max sequence length, triggers H/B/H when exceeded (default: 8192)
10
+ # --min_trades N Minimum trades for T_cutoff sampling (default: 10)
11
+ # --samples_per_token N Number of T_cutoff samples per token (default: 1)
12
+
13
+ set -e
14
+
15
+ # Default values
16
+ CACHE_MODE="${CACHE_MODE:-raw}"
17
+ CONTEXT_LENGTH="${CONTEXT_LENGTH:-8192}"
18
+ MIN_TRADES="${MIN_TRADES:-10}"
19
+ SAMPLES_PER_TOKEN="${SAMPLES_PER_TOKEN:-1}"
20
+ OHLC_STATS_PATH="${OHLC_STATS_PATH:-/workspace/apollo/data/ohlc_stats.npz}"
21
+ OUTPUT_DIR="${OUTPUT_DIR:-data/cache}"
22
+
23
+ echo "========================================"
24
+ echo "Apollo Dataset Pre-Caching"
25
+ echo "========================================"
26
+ echo "Cache Mode: $CACHE_MODE"
27
+ if [ "$CACHE_MODE" = "context" ]; then
28
+ echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
29
+ echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
30
+ echo "Samples per Token: $SAMPLES_PER_TOKEN"
31
+ fi
32
+ echo "Output Directory: $OUTPUT_DIR"
33
+ echo "OHLC Stats Path: $OHLC_STATS_PATH"
34
+ echo "========================================"
35
 
36
  echo "Starting dataset caching..."
37
+
38
  python3 scripts/cache_dataset.py \
39
+ --ohlc_stats_path "$OHLC_STATS_PATH" \
40
+ --output_dir "$OUTPUT_DIR" \
41
+ --cache_mode "$CACHE_MODE" \
42
+ --context_length "$CONTEXT_LENGTH" \
43
+ --min_trades "$MIN_TRADES" \
44
+ --samples_per_token "$SAMPLES_PER_TOKEN" \
45
+ "$@"
46
+
47
  echo "Done!"
48
+ echo "Cache saved to: $OUTPUT_DIR"
scripts/cache_dataset.py CHANGED
@@ -194,14 +194,24 @@ def main():
194
  parser.add_argument("--start_date", type=str, default=None, help="Start date (YYYY-MM-DD) for fetching new mints")
195
  parser.add_argument("--ohlc_stats_path", type=str, default="data/ohlc_stats.npz")
196
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
197
-
 
 
 
 
 
 
 
 
 
 
198
  # DB Args
199
  parser.add_argument("--clickhouse_host", type=str, default=os.getenv("CLICKHOUSE_HOST", "localhost"))
200
  parser.add_argument("--clickhouse_port", type=int, default=int(os.getenv("CLICKHOUSE_PORT", 9000)))
201
  parser.add_argument("--neo4j_uri", type=str, default=os.getenv("NEO4J_URI", "bolt://localhost:7687"))
202
  parser.add_argument("--neo4j_user", type=str, default=os.getenv("NEO4J_USER", "neo4j"))
203
  parser.add_argument("--neo4j_password", type=str, default=os.getenv("NEO4J_PASSWORD", "password"))
204
-
205
  args = parser.parse_args()
206
 
207
  output_dir = Path(args.output_dir)
@@ -240,7 +250,8 @@ def main():
240
  ohlc_stats_path=args.ohlc_stats_path,
241
  horizons_seconds=[60, 180, 300, 600, 1800, 3600, 7200],
242
  quantiles=[0.5],
243
- min_trade_usd=args.min_trade_usd
 
244
  )
245
 
246
  if len(dataset) == 0:
@@ -262,84 +273,186 @@ def main():
262
  if len(dataset) == 0:
263
  print("WARNING: No tokens remain after filtering by return_class_map.")
264
  return
265
-
266
- # --- 3. Iterate and cache each item ---
267
- print(f"INFO: Starting to generate and cache {len(dataset)} samples...")
268
-
 
269
  skipped_count = 0
270
  cached_count = 0
271
-
272
- for i in tqdm(range(len(dataset)), desc="Caching samples"):
273
- mint_addr = dataset.sampled_mints[i]['mint_address']
274
-
275
- # (No need to check if in return_class_map anymore, we filtered)
276
- class_id = return_class_map[mint_addr]
277
 
278
- try:
279
- item = dataset.__cacheitem__(i)
280
- if item is None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  skipped_count += 1
282
  continue
283
-
284
- # Require quality score only for samples that will be cached
285
- if mint_addr not in quality_scores_map:
286
- reason = quality_missing_reason(mint_addr)
287
- raise RuntimeError(
288
- f"Missing quality score for mint {mint_addr}. Reason: {reason}. "
289
- "Refusing to cache without quality_score."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  )
291
- q_score = quality_scores_map[mint_addr]
292
-
293
- # INJECT QUALITY SCORE INTO TENSOR DICT
294
- item["quality_score"] = q_score
295
- item["class_id"] = class_id
296
-
297
- filename = f"sample_{i}.pt"
298
- output_path = output_dir / filename
299
- torch.save(item, output_path)
300
-
301
- cached_count += 1
302
-
303
- # Log progress details (reflect all cached event lists)
304
- n_trades = len(item.get("trades", []))
305
- n_transfers = len(item.get("transfers", []))
306
- n_pool_creations = len(item.get("pool_creations", []))
307
- n_liquidity_changes = len(item.get("liquidity_changes", []))
308
- n_fee_collections = len(item.get("fee_collections", []))
309
- n_burns = len(item.get("burns", []))
310
- n_supply_locks = len(item.get("supply_locks", []))
311
- n_migrations = len(item.get("migrations", []))
312
- n_mints = 1 if item.get("mint_timestamp") else 0
313
- n_ohlc = len(item.get("ohlc_1s", [])) if item.get("ohlc_1s") is not None else 0
314
- n_snapshots_5m = len(item.get("snapshots_5m", []))
315
- n_holders = len(item.get("holder_snapshots_list", []))
316
-
317
- tqdm.write(
318
- f" + Cached: {mint_addr} | Class: {class_id} | Q: {q_score:.4f} | "
319
- f"Events: Mint {n_mints}, Trades {n_trades}, Transfers {n_transfers}, Pool Creations {n_pool_creations}, "
320
- f"Liquidity Changes {n_liquidity_changes}, Fee Collections {n_fee_collections}, "
321
- f"Burns {n_burns}, Supply Locks {n_supply_locks}, Migrations {n_migrations} | "
322
- f"Derived: Ohlc 1s {n_ohlc}, Snapshots 5m {n_snapshots_5m}, Holder Snapshots {n_holders}"
323
- )
324
-
325
  except Exception as e:
326
- error_msg = str(e)
327
- # If a FATAL error occurs (e.g. persistent DB auth failure), stop the script immediately.
328
- if "FATAL" in error_msg or "AuthenticationRateLimit" in error_msg:
329
- print(f"\nCRITICAL: Fatal error encountered processing sample {i}. Stopping execution.\nError: {e}", file=sys.stderr)
330
- sys.exit(1)
331
-
332
- print(f"\nERROR: Failed to generate or save sample {i} for mint '{mint_addr}'. Error: {e}", file=sys.stderr)
333
- # print traceback
334
- import traceback
335
- traceback.print_exc()
336
- skipped_count += 1
337
- continue
338
-
 
 
 
 
339
  print(f"\n--- Caching Complete ---")
340
- print(f"Successfully cached: {cached_count} items.")
341
- print(f"Filtered (Invalid/High Return): {filtered_count} items.")
342
- print(f"Skipped (Errors/Empty): {skipped_count} items.")
 
 
343
  print(f"Cache location: {output_dir.resolve()}")
344
 
345
  finally:
 
194
  parser.add_argument("--start_date", type=str, default=None, help="Start date (YYYY-MM-DD) for fetching new mints")
195
  parser.add_argument("--ohlc_stats_path", type=str, default="data/ohlc_stats.npz")
196
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
197
+
198
+ # NEW: Context caching mode args
199
+ parser.add_argument("--cache_mode", type=str, default="raw", choices=["raw", "context"],
200
+ help="Cache mode: 'raw' caches raw token data (old behavior), 'context' caches fully processed training contexts (new behavior)")
201
+ parser.add_argument("--context_length", type=int, default=8192,
202
+ help="Max sequence length for context caching mode. Triggers H/B/H dynamic sampling when events exceed this limit.")
203
+ parser.add_argument("--min_trades", type=int, default=10,
204
+ help="Minimum number of trades required for T_cutoff sampling. Tokens with fewer trades are skipped.")
205
+ parser.add_argument("--samples_per_token", type=int, default=1,
206
+ help="Number of different T_cutoff samples to generate per token in context mode.")
207
+
208
  # DB Args
209
  parser.add_argument("--clickhouse_host", type=str, default=os.getenv("CLICKHOUSE_HOST", "localhost"))
210
  parser.add_argument("--clickhouse_port", type=int, default=int(os.getenv("CLICKHOUSE_PORT", 9000)))
211
  parser.add_argument("--neo4j_uri", type=str, default=os.getenv("NEO4J_URI", "bolt://localhost:7687"))
212
  parser.add_argument("--neo4j_user", type=str, default=os.getenv("NEO4J_USER", "neo4j"))
213
  parser.add_argument("--neo4j_password", type=str, default=os.getenv("NEO4J_PASSWORD", "password"))
214
+
215
  args = parser.parse_args()
216
 
217
  output_dir = Path(args.output_dir)
 
250
  ohlc_stats_path=args.ohlc_stats_path,
251
  horizons_seconds=[60, 180, 300, 600, 1800, 3600, 7200],
252
  quantiles=[0.5],
253
+ min_trade_usd=args.min_trade_usd,
254
+ max_seq_len=args.context_length # Pass context_length for H/B/H threshold
255
  )
256
 
257
  if len(dataset) == 0:
 
273
  if len(dataset) == 0:
274
  print("WARNING: No tokens remain after filtering by return_class_map.")
275
  return
276
+
277
+ # --- 3. Iterate and cache based on mode ---
278
+ print(f"INFO: Cache mode: {args.cache_mode}")
279
+ print(f"INFO: Starting to generate and cache from {len(dataset)} tokens...")
280
+
281
  skipped_count = 0
282
  cached_count = 0
283
+ global_sample_idx = 0 # Global counter for unique sample filenames
 
 
 
 
 
284
 
285
+ # Track class distribution for balanced sampling metadata
286
+ class_distribution = {}
287
+
288
+ if args.cache_mode == "context":
289
+ # =========================================================================
290
+ # CONTEXT MODE: Cache fully processed training contexts
291
+ # - Samples T_cutoff during caching (non-determinism moved to cache time)
292
+ # - Applies H/B/H dynamic sampling based on context_length
293
+ # - Avoids caching tokens that won't be seen (garbage filtered out)
294
+ # - Training becomes fully deterministic (just loads cached contexts)
295
+ # =========================================================================
296
+ print(f"INFO: Context mode settings:")
297
+ print(f" - context_length (H/B/H threshold): {args.context_length}")
298
+ print(f" - min_trades (T_cutoff threshold): {args.min_trades}")
299
+ print(f" - samples_per_token: {args.samples_per_token}")
300
+
301
+ for i in tqdm(range(len(dataset)), desc="Caching contexts"):
302
+ mint_addr = dataset.sampled_mints[i]['mint_address']
303
+ class_id = return_class_map[mint_addr]
304
+
305
+ try:
306
+ # Generate multiple training contexts per token
307
+ contexts = dataset.__cacheitem_context__(i, num_samples_per_token=args.samples_per_token)
308
+
309
+ if not contexts:
310
+ skipped_count += 1
311
+ continue
312
+
313
+ # Require quality score
314
+ if mint_addr not in quality_scores_map:
315
+ reason = quality_missing_reason(mint_addr)
316
+ raise RuntimeError(
317
+ f"Missing quality score for mint {mint_addr}. Reason: {reason}."
318
+ )
319
+ q_score = quality_scores_map[mint_addr]
320
+
321
+ # Save each context as a separate sample
322
+ for ctx in contexts:
323
+ ctx["quality_score"] = q_score
324
+ ctx["class_id"] = class_id
325
+ ctx["source_token"] = mint_addr # Track origin for debugging
326
+ ctx["cache_mode"] = "context"
327
+
328
+ filename = f"sample_{global_sample_idx}.pt"
329
+ output_path = output_dir / filename
330
+ torch.save(ctx, output_path)
331
+
332
+ # Track class distribution
333
+ class_distribution[class_id] = class_distribution.get(class_id, 0) + 1
334
+
335
+ global_sample_idx += 1
336
+ cached_count += 1
337
+
338
+ n_events = len(contexts[0].get("event_sequence", [])) if contexts else 0
339
+ tqdm.write(
340
+ f" + Cached {len(contexts)} contexts: {mint_addr} | Class: {class_id} | Q: {q_score:.4f} | Events: {n_events}"
341
+ )
342
+
343
+ except Exception as e:
344
+ error_msg = str(e)
345
+ if "FATAL" in error_msg or "AuthenticationRateLimit" in error_msg:
346
+ print(f"\nCRITICAL: Fatal error processing sample {i}. Stopping.\nError: {e}", file=sys.stderr)
347
+ sys.exit(1)
348
+
349
+ print(f"\nERROR: Failed to cache contexts for {mint_addr}. Error: {e}", file=sys.stderr)
350
+ import traceback
351
+ traceback.print_exc()
352
  skipped_count += 1
353
  continue
354
+
355
+ else:
356
+ # =========================================================================
357
+ # RAW MODE: Cache raw token data (original behavior)
358
+ # - T_cutoff sampling happens at runtime
359
+ # - H/B/H applied at runtime
360
+ # - Non-deterministic training
361
+ # =========================================================================
362
+ for i in tqdm(range(len(dataset)), desc="Caching raw samples"):
363
+ mint_addr = dataset.sampled_mints[i]['mint_address']
364
+ class_id = return_class_map[mint_addr]
365
+
366
+ try:
367
+ item = dataset.__cacheitem__(i)
368
+ if item is None:
369
+ skipped_count += 1
370
+ continue
371
+
372
+ if mint_addr not in quality_scores_map:
373
+ reason = quality_missing_reason(mint_addr)
374
+ raise RuntimeError(
375
+ f"Missing quality score for mint {mint_addr}. Reason: {reason}."
376
+ )
377
+ q_score = quality_scores_map[mint_addr]
378
+
379
+ item["quality_score"] = q_score
380
+ item["class_id"] = class_id
381
+ item["cache_mode"] = "raw"
382
+
383
+ filename = f"sample_{i}.pt"
384
+ output_path = output_dir / filename
385
+ torch.save(item, output_path)
386
+
387
+ # Track class distribution
388
+ class_distribution[class_id] = class_distribution.get(class_id, 0) + 1
389
+
390
+ cached_count += 1
391
+
392
+ n_trades = len(item.get("trades", []))
393
+ n_transfers = len(item.get("transfers", []))
394
+ n_pool_creations = len(item.get("pool_creations", []))
395
+ n_liquidity_changes = len(item.get("liquidity_changes", []))
396
+ n_fee_collections = len(item.get("fee_collections", []))
397
+ n_burns = len(item.get("burns", []))
398
+ n_supply_locks = len(item.get("supply_locks", []))
399
+ n_migrations = len(item.get("migrations", []))
400
+ n_mints = 1 if item.get("mint_timestamp") else 0
401
+ n_ohlc = len(item.get("ohlc_1s", [])) if item.get("ohlc_1s") is not None else 0
402
+ n_snapshots_5m = len(item.get("snapshots_5m", []))
403
+ n_holders = len(item.get("holder_snapshots_list", []))
404
+
405
+ tqdm.write(
406
+ f" + Cached: {mint_addr} | Class: {class_id} | Q: {q_score:.4f} | "
407
+ f"Events: Mint {n_mints}, Trades {n_trades}, Transfers {n_transfers}, Pool Creations {n_pool_creations}, "
408
+ f"Liquidity Changes {n_liquidity_changes}, Fee Collections {n_fee_collections}, "
409
+ f"Burns {n_burns}, Supply Locks {n_supply_locks}, Migrations {n_migrations} | "
410
+ f"Derived: Ohlc 1s {n_ohlc}, Snapshots 5m {n_snapshots_5m}, Holder Snapshots {n_holders}"
411
  )
412
+
413
+ except Exception as e:
414
+ error_msg = str(e)
415
+ if "FATAL" in error_msg or "AuthenticationRateLimit" in error_msg:
416
+ print(f"\nCRITICAL: Fatal error processing sample {i}. Stopping.\nError: {e}", file=sys.stderr)
417
+ sys.exit(1)
418
+
419
+ print(f"\nERROR: Failed to cache sample {i} for {mint_addr}. Error: {e}", file=sys.stderr)
420
+ import traceback
421
+ traceback.print_exc()
422
+ skipped_count += 1
423
+ continue
424
+
425
+ # --- Save class metadata for balanced sampling ---
426
+ # Build file_class_map for the metadata cache
427
+ file_class_map = {}
428
+ for sample_file in sorted(output_dir.glob("sample_*.pt")):
429
+ try:
430
+ sample_data = torch.load(sample_file, map_location="cpu", weights_only=False)
431
+ file_class_map[sample_file.name] = sample_data.get("class_id", 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  except Exception as e:
433
+ print(f"WARN: Could not read class_id from {sample_file.name}: {e}")
434
+
435
+ metadata_path = output_dir / "class_metadata.json"
436
+ try:
437
+ with open(metadata_path, 'w') as f:
438
+ json.dump({
439
+ 'file_class_map': file_class_map,
440
+ 'class_distribution': class_distribution,
441
+ 'cache_mode': args.cache_mode,
442
+ 'context_length': args.context_length if args.cache_mode == "context" else None,
443
+ 'min_trades': args.min_trades if args.cache_mode == "context" else None,
444
+ 'samples_per_token': args.samples_per_token if args.cache_mode == "context" else None,
445
+ }, f, indent=2)
446
+ print(f"INFO: Saved class metadata to {metadata_path}")
447
+ except Exception as e:
448
+ print(f"WARN: Failed to save class metadata: {e}")
449
+
450
  print(f"\n--- Caching Complete ---")
451
+ print(f"Cache mode: {args.cache_mode}")
452
+ print(f"Successfully cached: {cached_count} samples.")
453
+ print(f"Filtered (Invalid/High Return): {filtered_count} tokens.")
454
+ print(f"Skipped (Errors/Empty): {skipped_count} tokens.")
455
+ print(f"Class distribution: {class_distribution}")
456
  print(f"Cache location: {output_dir.resolve()}")
457
 
458
  finally: