zirobtc commited on
Commit
86523f8
·
1 Parent(s): e125fa3

Upload folder using huggingface_hub

Browse files
collator_dump.json ADDED
The diff for this file is too large to render. See raw diff
 
data/data_collator.py CHANGED
@@ -276,11 +276,25 @@ class MemecoinCollator:
276
  unique_wallets_data.update(item.get('wallets', {}))
277
  unique_tokens_data.update(item.get('tokens', {}))
278
 
279
- # Create mappings needed for indexing
280
- wallet_list_data = list(unique_wallets_data.values())
281
- token_list_data = list(unique_tokens_data.values())
282
- wallet_addr_to_batch_idx = {feat.get('profile', {}).get('wallet_address', f'__error_{i}'): i+1 for i, feat in enumerate(wallet_list_data)}
283
- token_addr_to_batch_idx = {feat.get('address', f'__error_{i}'): i+1 for i, feat in enumerate(token_list_data)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
  # Collate Static Raw Features (Tokens, Wallets, Graph)
286
  token_encoder_inputs = self._collate_features_for_encoder(token_list_data, ['name'], self.device, "token")
@@ -297,7 +311,8 @@ class MemecoinCollator:
297
 
298
  # Initialize sequence tensors
299
  event_type_ids = torch.full((B, L), PAD_IDX_SEQ, dtype=torch.long, device=self.device)
300
- timestamps_float = torch.zeros((B, L), dtype=torch.float32, device=self.device)
 
301
  # Store relative_ts in float32 for stability; model will scale/log/normalize
302
  relative_ts = torch.zeros((B, L, 1), dtype=torch.float32, device=self.device)
303
  attention_mask = torch.zeros((B, L), dtype=torch.long, device=self.device)
@@ -601,6 +616,9 @@ class MemecoinCollator:
601
  ]
602
  boosted_token_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
603
 
 
 
 
604
  elif event_type == 'HolderSnapshot':
605
  # --- FIXED: Store raw holder data, not an index ---
606
  raw_holders = event.get('holders', [])
 
276
  unique_wallets_data.update(item.get('wallets', {}))
277
  unique_tokens_data.update(item.get('tokens', {}))
278
 
279
+ # Create mappings needed for indexing (use dict keys as source of truth)
280
+ wallet_items = list(unique_wallets_data.items())
281
+ token_items = list(unique_tokens_data.items())
282
+
283
+ wallet_list_data = []
284
+ for addr, feat in wallet_items:
285
+ profile = feat.get('profile', {})
286
+ if not profile.get('wallet_address'):
287
+ profile['wallet_address'] = addr
288
+ wallet_list_data.append(feat)
289
+
290
+ token_list_data = []
291
+ for addr, feat in token_items:
292
+ if not feat.get('address'):
293
+ feat['address'] = addr
294
+ token_list_data.append(feat)
295
+
296
+ wallet_addr_to_batch_idx = {addr: i + 1 for i, (addr, _) in enumerate(wallet_items)}
297
+ token_addr_to_batch_idx = {addr: i + 1 for i, (addr, _) in enumerate(token_items)}
298
 
299
  # Collate Static Raw Features (Tokens, Wallets, Graph)
300
  token_encoder_inputs = self._collate_features_for_encoder(token_list_data, ['name'], self.device, "token")
 
311
 
312
  # Initialize sequence tensors
313
  event_type_ids = torch.full((B, L), PAD_IDX_SEQ, dtype=torch.long, device=self.device)
314
+ # Use float64 to preserve second-level precision for large Unix timestamps.
315
+ timestamps_float = torch.zeros((B, L), dtype=torch.float64, device=self.device)
316
  # Store relative_ts in float32 for stability; model will scale/log/normalize
317
  relative_ts = torch.zeros((B, L, 1), dtype=torch.float32, device=self.device)
318
  attention_mask = torch.zeros((B, L), dtype=torch.long, device=self.device)
 
616
  ]
617
  boosted_token_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
618
 
619
+ elif event_type == 'Migrated':
620
+ migrated_protocol_ids[i, j] = event.get('protocol_id', 0)
621
+
622
  elif event_type == 'HolderSnapshot':
623
  # --- FIXED: Store raw holder data, not an index ---
624
  raw_holders = event.get('holders', [])
data/data_fetcher.py CHANGED
@@ -1014,6 +1014,45 @@ class DataFetcher:
1014
  except Exception as e:
1015
  print(f"ERROR: Failed to count total holders for token {token_address}: {e}")
1016
  return 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1017
  def fetch_raw_token_data(
1018
  self,
1019
  token_address: str,
 
1014
  except Exception as e:
1015
  print(f"ERROR: Failed to count total holders for token {token_address}: {e}")
1016
  return 0
1017
+
1018
+ def fetch_holder_snapshot_stats_for_token(self, token_address: str, T_cutoff: datetime.datetime, limit: int = 200) -> Tuple[int, List[Dict[str, Any]]]:
1019
+ """
1020
+ Fetch total holder count and top holders in a single query.
1021
+ Returns (count, top_holders_list).
1022
+ """
1023
+ if not token_address:
1024
+ return 0, []
1025
+ query = """
1026
+ WITH point_in_time_holdings AS (
1027
+ SELECT
1028
+ wallet_address,
1029
+ argMax(current_balance, updated_at) AS bal
1030
+ FROM wallet_holdings
1031
+ WHERE mint_address = %(token)s AND updated_at <= %(T_cutoff)s
1032
+ GROUP BY wallet_address
1033
+ )
1034
+ SELECT
1035
+ (SELECT count() FROM point_in_time_holdings WHERE bal > 0) AS holder_count,
1036
+ (SELECT groupArray((wallet_address, bal))
1037
+ FROM (
1038
+ SELECT wallet_address, bal
1039
+ FROM point_in_time_holdings
1040
+ WHERE bal > 0
1041
+ ORDER BY bal DESC
1042
+ LIMIT %(limit)s
1043
+ )) AS top_holders
1044
+ """
1045
+ params = {'token': token_address, 'T_cutoff': T_cutoff, 'limit': int(limit)}
1046
+ try:
1047
+ rows = self.db_client.execute(query, params)
1048
+ if not rows:
1049
+ return 0, []
1050
+ holder_count, top_holders = rows[0]
1051
+ top_list = [{'wallet_address': wa, 'current_balance': bal} for wa, bal in (top_holders or [])]
1052
+ return int(holder_count or 0), top_list
1053
+ except Exception as e:
1054
+ print(f"ERROR: Failed to fetch holder snapshot stats for token {token_address}: {e}")
1055
+ return 0, []
1056
  def fetch_raw_token_data(
1057
  self,
1058
  token_address: str,
data/data_loader.py CHANGED
@@ -61,6 +61,7 @@ MIN_AMOUNT_TRANSFER_SUPPLY = 0.0 # 1.0% of total supply
61
  # Interval for HolderSnapshot events (seconds)
62
  HOLDER_SNAPSHOT_INTERVAL_SEC = 300
63
  HOLDER_SNAPSHOT_TOP_K = 200
 
64
 
65
 
66
  class EmbeddingPooler:
@@ -114,6 +115,7 @@ class OracleDataset(Dataset):
114
  """
115
  def __init__(self,
116
  data_fetcher: Optional[DataFetcher] = None, # OPTIONAL: Only needed for caching (Writer)
 
117
  horizons_seconds: List[int] = [],
118
  quantiles: List[float] = [],
119
  max_samples: Optional[int] = None,
@@ -129,18 +131,11 @@ class OracleDataset(Dataset):
129
 
130
  # --- NEW: Create a persistent requests session for efficiency ---
131
  # Configure robust HTTP session
132
- self.http_session = requests.Session()
133
- retry_strategy = Retry(
134
- total=3,
135
- backoff_factor=1,
136
- status_forcelist=[429, 500, 502, 503, 504],
137
- allowed_methods=["HEAD", "GET", "OPTIONS"]
138
- )
139
- adapter = HTTPAdapter(max_retries=retry_strategy)
140
- self.http_session.mount("http://", adapter)
141
- self.http_session.mount("https://", adapter)
142
 
143
  self.fetcher = data_fetcher
 
144
  self.cache_dir = Path(cache_dir) if cache_dir else None
145
  # Always define these so DataLoader workers don't crash with AttributeError if
146
  # initialization falls through an unexpected branch.
@@ -271,6 +266,51 @@ class OracleDataset(Dataset):
271
  print("INFO: No OHLC stats path provided. Using default normalization.")
272
 
273
  self.min_trade_usd = min_trade_usd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
  def __len__(self) -> int:
276
  return self.num_samples
@@ -287,6 +327,16 @@ class OracleDataset(Dataset):
287
  denom = self.ohlc_price_std if abs(self.ohlc_price_std) > 1e-9 else 1.0
288
  return [(float(v) - self.ohlc_price_mean) / denom for v in values]
289
 
 
 
 
 
 
 
 
 
 
 
290
  def _apply_dynamic_sampling(self, events: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
291
  """
292
  Applies dynamic context sampling to fit events within max_seq_len.
@@ -482,8 +532,11 @@ class OracleDataset(Dataset):
482
  holders_end = len(cached_holders_list[i])
483
  elif self.fetcher:
484
  cutoff_dt_ts = datetime.datetime.fromtimestamp(ts_value, tz=datetime.timezone.utc)
485
- holder_records_ts = self.fetcher.fetch_token_holders_for_snapshot(token_address, cutoff_dt_ts, limit=HOLDER_SNAPSHOT_TOP_K)
486
- holders_end = self.fetcher.fetch_total_holders_count_for_token(token_address, cutoff_dt_ts)
 
 
 
487
  else:
488
  holder_records_ts = []
489
  holders_end = 0
@@ -784,6 +837,9 @@ class OracleDataset(Dataset):
784
  # --- FIXED: Only add to pooler if data is valid ---
785
  image = None
786
  token_uri = data.get('token_uri')
 
 
 
787
 
788
  # --- NEW: Use multiple IPFS gateways for reliability ---
789
  if token_uri and isinstance(token_uri, str) and token_uri.strip():
@@ -841,11 +897,15 @@ class OracleDataset(Dataset):
841
  else: # If all gateways fail for the image
842
  raise RuntimeError(f"All IPFS gateways failed for image: {image_url}")
843
  else: # Handle regular HTTP image URLs
 
 
844
  image_resp = self.http_session.get(image_url, timeout=10)
845
  image_resp.raise_for_status()
846
  image = Image.open(BytesIO(image_resp.content))
847
  except (requests.RequestException, ValueError, IOError) as e:
848
-
 
 
849
  print(f"WARN: Could not fetch or process image for token {addr} from URI {token_uri}. Reason: {e}")
850
  image = None
851
 
@@ -954,6 +1014,9 @@ class OracleDataset(Dataset):
954
  """
955
  Loads raw data from cache, samples a random T_cutoff, and generates a training sample.
956
  """
 
 
 
957
  raw_data = None
958
  if self.cache_dir:
959
  if idx >= len(self.cached_files):
@@ -1024,8 +1087,8 @@ class OracleDataset(Dataset):
1024
  # ============================================================================
1025
  # 1. Use ALL trades (sorted by timestamp) for context
1026
  # 2. Find indices of SUCCESSFUL trades (needed for label computation)
1027
- # 3. Sample interval: [24, last_successful_idx - 1]
1028
- # 4. This guarantees: 24+ trades for context, 1+ successful trade for labels
1029
  # ============================================================================
1030
 
1031
  all_trades_raw = raw_data.get('trades', [])
@@ -1038,7 +1101,8 @@ class OracleDataset(Dataset):
1038
  key=lambda t: _timestamp_to_order_value(t.get('timestamp'))
1039
  )
1040
 
1041
- if len(all_trades_sorted) < 26: # Need at least 24 for context + 2 for cutoff+label
 
1042
  return None
1043
 
1044
  # Find indices of SUCCESSFUL trades (valid for label computation)
@@ -1052,7 +1116,7 @@ class OracleDataset(Dataset):
1052
 
1053
  max_horizon_seconds = max(self.horizons_seconds) if self.horizons_seconds else 0
1054
  # Define sampling interval
1055
- min_idx = 24 # At least 24 trades for context
1056
  max_idx = len(all_trades_sorted) - 2 # Need at least 1 trade after cutoff
1057
 
1058
  if max_idx < min_idx:
@@ -1148,38 +1212,57 @@ class OracleDataset(Dataset):
1148
  _add_wallet(holder.get('wallet_address'), wallets_to_fetch)
1149
 
1150
  pooler = EmbeddingPooler()
1151
- # Prepare offline token data
1152
- offline_token_data = {token_address: raw_data} # Assuming raw_data contains token metadata at root
1153
- main_token_data = self._process_token_data([token_address], pooler, T_cutoff, token_data=offline_token_data)
 
 
 
 
 
 
 
1154
  if not main_token_data:
1155
  return None
1156
 
1157
- # Prepare offline wallet data
1158
- # raw_data['socials'] structure: {'profiles': {...}, 'socials': {...}} usually.
1159
- # But wait, cached raw_data['socials'] might be just the dict we need?
1160
- # Let's handle graceful empty if not found.
1161
- cached_social_bundle = raw_data.get('socials', {})
1162
- offline_profiles = cached_social_bundle.get('profiles', {})
1163
- offline_socials = cached_social_bundle.get('socials', {})
1164
- offline_holdings = {} # Holdings not cached usually due to size
1165
-
1166
- wallet_data, all_token_data = self._process_wallet_data(
1167
- list(wallets_to_fetch),
1168
- main_token_data.copy(),
1169
- pooler,
1170
- T_cutoff,
1171
- profiles_override=offline_profiles,
1172
- socials_override=offline_socials,
1173
- holdings_override=offline_holdings
1174
- )
 
 
 
 
 
 
 
1175
 
 
1176
  graph_entities = {}
1177
  graph_links = {}
1178
- graph_entities = {}
1179
- graph_links = {}
1180
- # if wallets_to_fetch:
1181
- # graph_entities, graph_links = self.fetcher.fetch_graph_links(...)
1182
- # Offline Graph: check if raw_data has graph? Assuming no for now.
 
 
 
 
1183
 
1184
  # Generate the item
1185
  return self._generate_dataset_item(
@@ -1244,7 +1327,7 @@ class OracleDataset(Dataset):
1244
  max_horizon_seconds=self.max_cache_horizon_seconds,
1245
  include_wallet_data=False,
1246
  include_graph=False,
1247
- min_trades=25, # Enforce min trades for context
1248
  full_history=True, # Bypass H/B/H limits
1249
  prune_failed=False, # Keep failed trades for realistic simulation
1250
  prune_transfers=False # Keep transfers for snapshot reconstruction
@@ -1350,10 +1433,12 @@ class OracleDataset(Dataset):
1350
  # Time is end of bucket
1351
  snapshot_ts = t0 + datetime.timedelta(seconds=(i+1)*interval)
1352
 
1353
- # These queries can be slow.
1354
- count = self.fetcher.fetch_total_holders_count_for_token(token_address, snapshot_ts)
1355
- # Fetch Top 200 as per constant
1356
- top_holders = self.fetcher.fetch_token_holders_for_snapshot(token_address, snapshot_ts, limit=HOLDER_SNAPSHOT_TOP_K)
 
 
1357
 
1358
  total_supply = raw_data.get('total_supply', 0) or 1
1359
  if raw_data.get('decimals'):
@@ -1470,6 +1555,7 @@ class OracleDataset(Dataset):
1470
 
1471
  # 3. Process Trades (Events + Chart)
1472
  trade_events = []
 
1473
  aggregation_trades = []
1474
  high_def_chart_trades = []
1475
  middle_chart_trades = []
@@ -1563,6 +1649,40 @@ class OracleDataset(Dataset):
1563
  _register_event(trade_event, trade_sort_key)
1564
  trade_events.append(trade_event)
1565
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1566
  # 4. Generate Chart Events
1567
  def _finalize_chart(t_list):
1568
  t_list.sort(key=lambda x: x['sort_key'])
@@ -1625,31 +1745,194 @@ class OracleDataset(Dataset):
1625
  chart_events.extend(_emit_chart_segments(high_def_chart_trades, HIGH_DEF_INTERVAL, "chart-hd", precomputed_ohlc=ohlc_1s_precomputed))
1626
  chart_events.extend(_emit_chart_segments(middle_chart_trades, MIDDLE_INTERVAL, "chart-mid"))
1627
 
1628
- # 5. Process Other Records (Pool, Liquidity, etc.) using filtering
1629
- # Note: We need to port the logic that converts raw records to events
1630
- # For simplicity, assuming these records are already processed or we add the logic here.
1631
- # Given the space constraint, I'll add a simplified pass for pool creation.
1632
- # Ideally we refactor this into helper methods too.
1633
-
1634
  for pool_record in pool_creation_records:
1635
- pool_ts = int(_timestamp_to_order_value(pool_record.get('timestamp')))
1636
- # ... process pool ...
1637
- # Simple placeholder for now:
 
 
 
 
 
 
 
 
 
 
 
 
 
1638
  pool_event = {
1639
  'event_type': 'PoolCreated',
1640
  'timestamp': pool_ts,
1641
- 'relative_ts': pool_ts - t0_timestamp,
1642
  'wallet_address': pool_record.get('creator_address'),
1643
  'token_address': token_address,
1644
- # ... other fields ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1645
  }
1646
- # _register_event(pool_event, val)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1647
 
1648
  # 6. Generate Snapshots
1649
  self._generate_onchain_snapshots(
1650
  token_address, int(t0_timestamp), T_cutoff,
1651
  300, # Interval
1652
- trade_events, [], # Transfer events
1653
  aggregation_trades,
1654
  wallet_data,
1655
  total_supply_dec,
 
61
  # Interval for HolderSnapshot events (seconds)
62
  HOLDER_SNAPSHOT_INTERVAL_SEC = 300
63
  HOLDER_SNAPSHOT_TOP_K = 200
64
+ DEAD_URI_RETRY_LIMIT = 2
65
 
66
 
67
  class EmbeddingPooler:
 
115
  """
116
  def __init__(self,
117
  data_fetcher: Optional[DataFetcher] = None, # OPTIONAL: Only needed for caching (Writer)
118
+ fetcher_config: Optional[Dict[str, Any]] = None,
119
  horizons_seconds: List[int] = [],
120
  quantiles: List[float] = [],
121
  max_samples: Optional[int] = None,
 
131
 
132
  # --- NEW: Create a persistent requests session for efficiency ---
133
  # Configure robust HTTP session
134
+ self.http_session = None
135
+ self._init_http_session()
 
 
 
 
 
 
 
 
136
 
137
  self.fetcher = data_fetcher
138
+ self.fetcher_config = fetcher_config
139
  self.cache_dir = Path(cache_dir) if cache_dir else None
140
  # Always define these so DataLoader workers don't crash with AttributeError if
141
  # initialization falls through an unexpected branch.
 
266
  print("INFO: No OHLC stats path provided. Using default normalization.")
267
 
268
  self.min_trade_usd = min_trade_usd
269
+ self._uri_fail_counts: Dict[str, int] = {}
270
+
271
+ def _init_http_session(self) -> None:
272
+ # Configure robust HTTP session
273
+ self.http_session = requests.Session()
274
+ retry_strategy = Retry(
275
+ total=3,
276
+ backoff_factor=1,
277
+ status_forcelist=[429, 500, 502, 503, 504],
278
+ allowed_methods=["HEAD", "GET", "OPTIONS"]
279
+ )
280
+ adapter = HTTPAdapter(max_retries=retry_strategy)
281
+ self.http_session.mount("http://", adapter)
282
+ self.http_session.mount("https://", adapter)
283
+
284
+ def init_fetcher(self) -> None:
285
+ """
286
+ Initialize DataFetcher from stored config (for DataLoader workers).
287
+ """
288
+ if self.fetcher is not None or not self.fetcher_config:
289
+ return
290
+ from clickhouse_driver import Client as ClickHouseClient
291
+ from neo4j import GraphDatabase
292
+ cfg = self.fetcher_config
293
+ clickhouse_client = ClickHouseClient(
294
+ host=cfg.get("clickhouse_host", "localhost"),
295
+ port=int(cfg.get("clickhouse_port", 9000)),
296
+ )
297
+ neo4j_driver = GraphDatabase.driver(
298
+ cfg.get("neo4j_uri", "bolt://localhost:7687"),
299
+ auth=(cfg.get("neo4j_user", "neo4j"), cfg.get("neo4j_password", "password"))
300
+ )
301
+ self.fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
302
+
303
+ def __getstate__(self):
304
+ state = self.__dict__.copy()
305
+ # Drop non-pickleable objects
306
+ state["fetcher"] = None
307
+ state["http_session"] = None
308
+ return state
309
+
310
+ def __setstate__(self, state):
311
+ self.__dict__.update(state)
312
+ if self.http_session is None:
313
+ self._init_http_session()
314
 
315
  def __len__(self) -> int:
316
  return self.num_samples
 
327
  denom = self.ohlc_price_std if abs(self.ohlc_price_std) > 1e-9 else 1.0
328
  return [(float(v) - self.ohlc_price_mean) / denom for v in values]
329
 
330
+ def _is_dead_uri(self, uri: Optional[str]) -> bool:
331
+ if not uri:
332
+ return False
333
+ return self._uri_fail_counts.get(uri, 0) >= DEAD_URI_RETRY_LIMIT
334
+
335
+ def _mark_uri_failure(self, uri: Optional[str]) -> None:
336
+ if not uri:
337
+ return
338
+ self._uri_fail_counts[uri] = self._uri_fail_counts.get(uri, 0) + 1
339
+
340
  def _apply_dynamic_sampling(self, events: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
341
  """
342
  Applies dynamic context sampling to fit events within max_seq_len.
 
532
  holders_end = len(cached_holders_list[i])
533
  elif self.fetcher:
534
  cutoff_dt_ts = datetime.datetime.fromtimestamp(ts_value, tz=datetime.timezone.utc)
535
+ holders_end, holder_records_ts = self.fetcher.fetch_holder_snapshot_stats_for_token(
536
+ token_address,
537
+ cutoff_dt_ts,
538
+ limit=HOLDER_SNAPSHOT_TOP_K
539
+ )
540
  else:
541
  holder_records_ts = []
542
  holders_end = 0
 
837
  # --- FIXED: Only add to pooler if data is valid ---
838
  image = None
839
  token_uri = data.get('token_uri')
840
+ if self._is_dead_uri(token_uri):
841
+ image = None
842
+ token_uri = None
843
 
844
  # --- NEW: Use multiple IPFS gateways for reliability ---
845
  if token_uri and isinstance(token_uri, str) and token_uri.strip():
 
897
  else: # If all gateways fail for the image
898
  raise RuntimeError(f"All IPFS gateways failed for image: {image_url}")
899
  else: # Handle regular HTTP image URLs
900
+ if self._is_dead_uri(image_url):
901
+ raise requests.RequestException("Skipping dead image URI after repeated failures.")
902
  image_resp = self.http_session.get(image_url, timeout=10)
903
  image_resp.raise_for_status()
904
  image = Image.open(BytesIO(image_resp.content))
905
  except (requests.RequestException, ValueError, IOError) as e:
906
+ self._mark_uri_failure(token_uri)
907
+ if isinstance(metadata.get('image') if 'metadata' in locals() and isinstance(metadata, dict) else None, str):
908
+ self._mark_uri_failure(metadata.get('image'))
909
  print(f"WARN: Could not fetch or process image for token {addr} from URI {token_uri}. Reason: {e}")
910
  image = None
911
 
 
1014
  """
1015
  Loads raw data from cache, samples a random T_cutoff, and generates a training sample.
1016
  """
1017
+ if self.fetcher is None and self.fetcher_config:
1018
+ # Lazy init in main or worker if not initialized.
1019
+ self.init_fetcher()
1020
  raw_data = None
1021
  if self.cache_dir:
1022
  if idx >= len(self.cached_files):
 
1087
  # ============================================================================
1088
  # 1. Use ALL trades (sorted by timestamp) for context
1089
  # 2. Find indices of SUCCESSFUL trades (needed for label computation)
1090
+ # 3. Sample interval: [min_context_trades-1, last_successful_idx - 1]
1091
+ # 4. This guarantees: N trades for context, 1+ successful trade for labels
1092
  # ============================================================================
1093
 
1094
  all_trades_raw = raw_data.get('trades', [])
 
1101
  key=lambda t: _timestamp_to_order_value(t.get('timestamp'))
1102
  )
1103
 
1104
+ min_context_trades = 10
1105
+ if len(all_trades_sorted) < (min_context_trades + 1): # context + 1 trade after cutoff
1106
  return None
1107
 
1108
  # Find indices of SUCCESSFUL trades (valid for label computation)
 
1116
 
1117
  max_horizon_seconds = max(self.horizons_seconds) if self.horizons_seconds else 0
1118
  # Define sampling interval
1119
+ min_idx = min_context_trades - 1 # At least N trades for context
1120
  max_idx = len(all_trades_sorted) - 2 # Need at least 1 trade after cutoff
1121
 
1122
  if max_idx < min_idx:
 
1212
  _add_wallet(holder.get('wallet_address'), wallets_to_fetch)
1213
 
1214
  pooler = EmbeddingPooler()
1215
+ # Token data: fetch time-aware data when fetcher is available.
1216
+ if self.fetcher:
1217
+ main_token_data = self._process_token_data([token_address], pooler, T_cutoff, token_data=None)
1218
+ # Fallback to cached raw data if DB returned nothing
1219
+ if not main_token_data:
1220
+ offline_token_data = {token_address: raw_data} # raw_data contains token metadata at root
1221
+ main_token_data = self._process_token_data([token_address], pooler, T_cutoff, token_data=offline_token_data)
1222
+ else:
1223
+ offline_token_data = {token_address: raw_data} # raw_data contains token metadata at root
1224
+ main_token_data = self._process_token_data([token_address], pooler, T_cutoff, token_data=offline_token_data)
1225
  if not main_token_data:
1226
  return None
1227
 
1228
+ # Wallet data: fetch time-aware data when fetcher is available.
1229
+ if self.fetcher:
1230
+ wallet_data, all_token_data = self._process_wallet_data(
1231
+ list(wallets_to_fetch),
1232
+ main_token_data.copy(),
1233
+ pooler,
1234
+ T_cutoff,
1235
+ profiles_override=None,
1236
+ socials_override=None,
1237
+ holdings_override=None
1238
+ )
1239
+ else:
1240
+ cached_social_bundle = raw_data.get('socials', {})
1241
+ offline_profiles = cached_social_bundle.get('profiles', {})
1242
+ offline_socials = cached_social_bundle.get('socials', {})
1243
+ offline_holdings = {} # Holdings not cached usually due to size
1244
+ wallet_data, all_token_data = self._process_wallet_data(
1245
+ list(wallets_to_fetch),
1246
+ main_token_data.copy(),
1247
+ pooler,
1248
+ T_cutoff,
1249
+ profiles_override=offline_profiles,
1250
+ socials_override=offline_socials,
1251
+ holdings_override=offline_holdings
1252
+ )
1253
 
1254
+ # Graph links: fetch time-aware graph when fetcher is available.
1255
  graph_entities = {}
1256
  graph_links = {}
1257
+ if self.fetcher and wallets_to_fetch:
1258
+ try:
1259
+ graph_entities, graph_links = self.fetcher.fetch_graph_links(
1260
+ list(wallets_to_fetch),
1261
+ T_cutoff,
1262
+ max_degrees=1
1263
+ )
1264
+ except Exception as e:
1265
+ print(f"ERROR: Failed to fetch graph links for {token_address}: {e}")
1266
 
1267
  # Generate the item
1268
  return self._generate_dataset_item(
 
1327
  max_horizon_seconds=self.max_cache_horizon_seconds,
1328
  include_wallet_data=False,
1329
  include_graph=False,
1330
+ min_trades=10, # Enforce min trades for context
1331
  full_history=True, # Bypass H/B/H limits
1332
  prune_failed=False, # Keep failed trades for realistic simulation
1333
  prune_transfers=False # Keep transfers for snapshot reconstruction
 
1433
  # Time is end of bucket
1434
  snapshot_ts = t0 + datetime.timedelta(seconds=(i+1)*interval)
1435
 
1436
+ # These queries can be slow; use single-call combined stats.
1437
+ count, top_holders = self.fetcher.fetch_holder_snapshot_stats_for_token(
1438
+ token_address,
1439
+ snapshot_ts,
1440
+ limit=HOLDER_SNAPSHOT_TOP_K
1441
+ )
1442
 
1443
  total_supply = raw_data.get('total_supply', 0) or 1
1444
  if raw_data.get('decimals'):
 
1555
 
1556
  # 3. Process Trades (Events + Chart)
1557
  trade_events = []
1558
+ transfer_events = []
1559
  aggregation_trades = []
1560
  high_def_chart_trades = []
1561
  middle_chart_trades = []
 
1649
  _register_event(trade_event, trade_sort_key)
1650
  trade_events.append(trade_event)
1651
 
1652
+ # 3b. Process Transfers
1653
+ for transfer in transfer_records:
1654
+ transfer_ts_val = _timestamp_to_order_value(transfer.get('timestamp'))
1655
+ transfer_ts_int = int(transfer_ts_val)
1656
+ amount_dec = float(transfer.get('amount_decimal', 0.0) or 0.0)
1657
+ source_balance = float(transfer.get('source_balance', 0.0) or 0.0)
1658
+ denom = source_balance + amount_dec if source_balance > 0 else 0.0
1659
+ transfer_pct_of_holding = (amount_dec / denom) if denom > 1e-9 else 0.0
1660
+ transfer_pct_of_supply = (amount_dec / total_supply_dec) if total_supply_dec > 0 else 0.0
1661
+ is_large_transfer = transfer_pct_of_supply >= LARGE_TRANSFER_SUPPLY_PCT_THRESHOLD
1662
+
1663
+ transfer_event = {
1664
+ 'event_type': 'LargeTransfer' if is_large_transfer else 'Transfer',
1665
+ 'timestamp': transfer_ts_int,
1666
+ 'relative_ts': transfer_ts_val - t0_timestamp,
1667
+ 'wallet_address': transfer.get('source'),
1668
+ 'destination_wallet_address': transfer.get('destination'),
1669
+ 'token_address': token_address,
1670
+ 'token_amount': amount_dec,
1671
+ 'transfer_pct_of_total_supply': transfer_pct_of_supply,
1672
+ 'transfer_pct_of_holding': transfer_pct_of_holding,
1673
+ 'priority_fee': transfer.get('priority_fee', 0.0),
1674
+ 'success': transfer.get('success', False)
1675
+ }
1676
+ _register_event(
1677
+ transfer_event,
1678
+ _event_execution_sort_key(
1679
+ transfer.get('timestamp'),
1680
+ slot=transfer.get('slot', 0),
1681
+ signature=transfer.get('signature', '')
1682
+ )
1683
+ )
1684
+ transfer_events.append(transfer_event)
1685
+
1686
  # 4. Generate Chart Events
1687
  def _finalize_chart(t_list):
1688
  t_list.sort(key=lambda x: x['sort_key'])
 
1745
  chart_events.extend(_emit_chart_segments(high_def_chart_trades, HIGH_DEF_INTERVAL, "chart-hd", precomputed_ohlc=ohlc_1s_precomputed))
1746
  chart_events.extend(_emit_chart_segments(middle_chart_trades, MIDDLE_INTERVAL, "chart-mid"))
1747
 
1748
+ # 5. Process Other Records (Pool, Liquidity, Fees, Burns, Locks, Migrations)
1749
+ pool_meta_by_address = {}
 
 
 
 
1750
  for pool_record in pool_creation_records:
1751
+ pool_addr = pool_record.get('pool_address')
1752
+ if pool_addr:
1753
+ pool_meta_by_address[pool_addr] = pool_record
1754
+
1755
+ pool_ts_val = _timestamp_to_order_value(pool_record.get('timestamp'))
1756
+ pool_ts = int(pool_ts_val)
1757
+ base_decimals = pool_record.get('base_decimals')
1758
+ quote_decimals = pool_record.get('quote_decimals')
1759
+ base_decimals = int(base_decimals) if base_decimals is not None else 0
1760
+ quote_decimals = int(quote_decimals) if quote_decimals is not None else 0
1761
+
1762
+ base_amount_raw = pool_record.get('initial_base_liquidity', 0) or 0
1763
+ quote_amount_raw = pool_record.get('initial_quote_liquidity', 0) or 0
1764
+ base_amount = float(base_amount_raw) / (10 ** base_decimals) if base_decimals > 0 else float(base_amount_raw)
1765
+ quote_amount = float(quote_amount_raw) / (10 ** quote_decimals) if quote_decimals > 0 else float(quote_amount_raw)
1766
+
1767
  pool_event = {
1768
  'event_type': 'PoolCreated',
1769
  'timestamp': pool_ts,
1770
+ 'relative_ts': pool_ts_val - t0_timestamp,
1771
  'wallet_address': pool_record.get('creator_address'),
1772
  'token_address': token_address,
1773
+ 'quote_token_address': pool_record.get('quote_address'),
1774
+ 'protocol_id': pool_record.get('protocol', 0),
1775
+ 'pool_address': pool_addr,
1776
+ 'base_amount': base_amount,
1777
+ 'quote_amount': quote_amount,
1778
+ 'priority_fee': pool_record.get('priority_fee', 0.0),
1779
+ 'success': pool_record.get('success', False)
1780
+ }
1781
+ _register_event(
1782
+ pool_event,
1783
+ _event_execution_sort_key(
1784
+ pool_record.get('timestamp'),
1785
+ slot=pool_record.get('slot', 0),
1786
+ signature=pool_record.get('signature', '')
1787
+ )
1788
+ )
1789
+
1790
+ for liq_record in liquidity_change_records:
1791
+ liq_ts_val = _timestamp_to_order_value(liq_record.get('timestamp'))
1792
+ liq_ts = int(liq_ts_val)
1793
+ pool_addr = liq_record.get('pool_address')
1794
+ pool_meta = pool_meta_by_address.get(pool_addr, {})
1795
+ quote_decimals = pool_meta.get('quote_decimals')
1796
+ quote_decimals = int(quote_decimals) if quote_decimals is not None else 0
1797
+
1798
+ quote_amount_raw = liq_record.get('quote_amount', 0) or 0
1799
+ quote_amount = float(quote_amount_raw) / (10 ** quote_decimals) if quote_decimals > 0 else float(quote_amount_raw)
1800
+
1801
+ liq_event = {
1802
+ 'event_type': 'LiquidityChange',
1803
+ 'timestamp': liq_ts,
1804
+ 'relative_ts': liq_ts_val - t0_timestamp,
1805
+ 'wallet_address': liq_record.get('lp_provider'),
1806
+ 'token_address': token_address,
1807
+ 'quote_token_address': pool_meta.get('quote_address'),
1808
+ 'protocol_id': liq_record.get('protocol', 0),
1809
+ 'change_type_id': liq_record.get('change_type', 0),
1810
+ 'quote_amount': quote_amount,
1811
+ 'priority_fee': liq_record.get('priority_fee', 0.0),
1812
+ 'success': liq_record.get('success', False)
1813
  }
1814
+ _register_event(
1815
+ liq_event,
1816
+ _event_execution_sort_key(
1817
+ liq_record.get('timestamp'),
1818
+ slot=liq_record.get('slot', 0),
1819
+ signature=liq_record.get('signature', '')
1820
+ )
1821
+ )
1822
+
1823
+ for fee_record in fee_collection_records:
1824
+ fee_ts_val = _timestamp_to_order_value(fee_record.get('timestamp'))
1825
+ fee_ts = int(fee_ts_val)
1826
+ amount = 0.0
1827
+ if fee_record.get('token_0_mint_address') == token_address:
1828
+ amount = float(fee_record.get('token_0_amount', 0.0) or 0.0)
1829
+ elif fee_record.get('token_1_mint_address') == token_address:
1830
+ amount = float(fee_record.get('token_1_amount', 0.0) or 0.0)
1831
+
1832
+ fee_event = {
1833
+ 'event_type': 'FeeCollected',
1834
+ 'timestamp': fee_ts,
1835
+ 'relative_ts': fee_ts_val - t0_timestamp,
1836
+ 'wallet_address': fee_record.get('recipient_address'),
1837
+ 'token_address': token_address,
1838
+ 'sol_amount': amount,
1839
+ 'protocol_id': fee_record.get('protocol', 0),
1840
+ 'priority_fee': fee_record.get('priority_fee', 0.0),
1841
+ 'success': fee_record.get('success', False)
1842
+ }
1843
+ _register_event(
1844
+ fee_event,
1845
+ _event_execution_sort_key(
1846
+ fee_record.get('timestamp'),
1847
+ slot=fee_record.get('slot', 0),
1848
+ signature=fee_record.get('signature', '')
1849
+ )
1850
+ )
1851
+
1852
+ for burn_record in burn_records:
1853
+ burn_ts_val = _timestamp_to_order_value(burn_record.get('timestamp'))
1854
+ burn_ts = int(burn_ts_val)
1855
+ amount_dec = float(burn_record.get('amount_decimal', 0.0) or 0.0)
1856
+ amount_pct = (amount_dec / total_supply_dec) if total_supply_dec > 0 else 0.0
1857
+
1858
+ burn_event = {
1859
+ 'event_type': 'TokenBurn',
1860
+ 'timestamp': burn_ts,
1861
+ 'relative_ts': burn_ts_val - t0_timestamp,
1862
+ 'wallet_address': burn_record.get('source'),
1863
+ 'token_address': token_address,
1864
+ 'amount_pct_of_total_supply': amount_pct,
1865
+ 'amount_tokens_burned': amount_dec,
1866
+ 'priority_fee': burn_record.get('priority_fee', 0.0),
1867
+ 'success': burn_record.get('success', False)
1868
+ }
1869
+ _register_event(
1870
+ burn_event,
1871
+ _event_execution_sort_key(
1872
+ burn_record.get('timestamp'),
1873
+ slot=burn_record.get('slot', 0),
1874
+ signature=burn_record.get('signature', '')
1875
+ )
1876
+ )
1877
+
1878
+ for lock_record in supply_lock_records:
1879
+ lock_ts_val = _timestamp_to_order_value(lock_record.get('timestamp'))
1880
+ lock_ts = int(lock_ts_val)
1881
+ total_locked_amount = float(lock_record.get('total_locked_amount', 0.0) or 0.0)
1882
+ amount_pct = (total_locked_amount / total_supply_dec) if total_supply_dec > 0 else 0.0
1883
+ final_unlock_ts = lock_record.get('final_unlock_timestamp', 0) or 0
1884
+ lock_duration = float(final_unlock_ts) - float(lock_ts_val)
1885
+ if lock_duration < 0:
1886
+ lock_duration = 0.0
1887
+
1888
+ lock_event = {
1889
+ 'event_type': 'SupplyLock',
1890
+ 'timestamp': lock_ts,
1891
+ 'relative_ts': lock_ts_val - t0_timestamp,
1892
+ 'wallet_address': lock_record.get('sender'),
1893
+ 'token_address': token_address,
1894
+ 'amount_pct_of_total_supply': amount_pct,
1895
+ 'lock_duration': lock_duration,
1896
+ 'protocol_id': lock_record.get('protocol', 0),
1897
+ 'priority_fee': lock_record.get('priority_fee', 0.0),
1898
+ 'success': lock_record.get('success', False)
1899
+ }
1900
+ _register_event(
1901
+ lock_event,
1902
+ _event_execution_sort_key(
1903
+ lock_record.get('timestamp'),
1904
+ slot=lock_record.get('slot', 0),
1905
+ signature=lock_record.get('signature', '')
1906
+ )
1907
+ )
1908
+
1909
+ for migration_record in migration_records:
1910
+ mig_ts_val = _timestamp_to_order_value(migration_record.get('timestamp'))
1911
+ mig_ts = int(mig_ts_val)
1912
+ mig_event = {
1913
+ 'event_type': 'Migrated',
1914
+ 'timestamp': mig_ts,
1915
+ 'relative_ts': mig_ts_val - t0_timestamp,
1916
+ 'wallet_address': None,
1917
+ 'token_address': token_address,
1918
+ 'protocol_id': migration_record.get('protocol', 0),
1919
+ 'priority_fee': migration_record.get('priority_fee', 0.0),
1920
+ 'success': migration_record.get('success', False)
1921
+ }
1922
+ _register_event(
1923
+ mig_event,
1924
+ _event_execution_sort_key(
1925
+ migration_record.get('timestamp'),
1926
+ slot=migration_record.get('slot', 0),
1927
+ signature=migration_record.get('signature', '')
1928
+ )
1929
+ )
1930
 
1931
  # 6. Generate Snapshots
1932
  self._generate_onchain_snapshots(
1933
  token_address, int(t0_timestamp), T_cutoff,
1934
  300, # Interval
1935
+ trade_events, transfer_events,
1936
  aggregation_trades,
1937
  wallet_data,
1938
  total_supply_dec,
data/ohlc_stats.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46809f070aa1dfcb4f53d7390b1b6ff370e6828e198df4c0df5632ac6fa9f607
3
  size 1660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:038695f9d26e59395a2e69b52f9da029cf8796b06e4d503f0c18191288ad2a02
3
  size 1660
database.sh ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Refresh the local database: pull the newest epoch artifacts, then ingest
# them. NOTE(review): the epoch number is hard-coded here — confirm whether
# 844 should instead be passed in or derived from the downloaded artifacts.
python scripts/download_epoch_artifacts.py
python scripts/ingest_epoch.py --epoch 844
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41885991264f1522ec8b539dd4f3f738d537102a65103a800578229feef13880
3
- size 18007
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f53ccd1f7ea7868893c879fabdea0b5939a3780cc5086790edb79d878e8121
3
+ size 32490
models/helper_encoders.py CHANGED
@@ -43,8 +43,8 @@ class ContextualTimeEncoder(nn.Module):
43
  half_dim = d_model // 2
44
 
45
  # Calculations for sinusoidal encoding are more stable in float32
46
- div_term = torch.exp(torch.arange(0, half_dim, device=device).float() * -(math.log(10000.0) / half_dim))
47
- args = values.float().unsqueeze(-1) * div_term
48
 
49
  return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
50
 
@@ -63,15 +63,16 @@ class ContextualTimeEncoder(nn.Module):
63
 
64
  # 1. Store original shape (e.g., [B, L]) and flatten
65
  original_shape = timestamps.shape
66
- timestamps_flat = timestamps.flatten().float() # Shape [N_total]
 
67
 
68
  # 2. Sinusoidal encode (already vectorized)
69
  ts_encoding = self._sinusoidal_encode(timestamps_flat, self.ts_dim)
70
 
71
  # 3. List comprehension (this is the only non-vectorized part)
72
  # This loop is now correct, as it iterates over the 1D flat tensor
73
- hours = torch.tensor([datetime.datetime.fromtimestamp(ts.item(), tz=datetime.timezone.utc).hour for ts in timestamps_flat], device=device, dtype=torch.float32)
74
- days = torch.tensor([datetime.datetime.fromtimestamp(ts.item(), tz=datetime.timezone.utc).weekday() for ts in timestamps_flat], device=device, dtype=torch.float32)
75
 
76
  # 4. Cyclical encode (already vectorized)
77
  hour_encoding = self._cyclical_encode(hours, self.hour_dim, max_val=24.0)
 
43
  half_dim = d_model // 2
44
 
45
  # Calculations for sinusoidal encoding are done in float64 to avoid precision loss on large timestamp values
46
+ div_term = torch.exp(torch.arange(0, half_dim, device=device, dtype=torch.float64) * -(math.log(10000.0) / half_dim))
47
+ args = values.double().unsqueeze(-1) * div_term
48
 
49
  return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
50
 
 
63
 
64
  # 1. Store original shape (e.g., [B, L]) and flatten
65
  original_shape = timestamps.shape
66
+ # Preserve precision for large Unix timestamps.
67
+ timestamps_flat = timestamps.flatten().double() # Shape [N_total]
68
 
69
  # 2. Sinusoidal encode (already vectorized)
70
  ts_encoding = self._sinusoidal_encode(timestamps_flat, self.ts_dim)
71
 
72
  # 3. List comprehension (this is the only non-vectorized part)
73
  # This loop is now correct, as it iterates over the 1D flat tensor
74
+ hours = torch.tensor([datetime.datetime.fromtimestamp(float(ts), tz=datetime.timezone.utc).hour for ts in timestamps_flat], device=device, dtype=torch.float32)
75
+ days = torch.tensor([datetime.datetime.fromtimestamp(float(ts), tz=datetime.timezone.utc).weekday() for ts in timestamps_flat], device=device, dtype=torch.float32)
76
 
77
  # 4. Cyclical encode (already vectorized)
78
  hour_encoding = self._cyclical_encode(hours, self.hour_dim, max_val=24.0)
pre_cache.sh CHANGED
@@ -3,7 +3,5 @@
3
 
4
  echo "Starting dataset caching..."
5
  python3 scripts/cache_dataset.py \
6
- --ohlc_stats_path "/workspace/apollo/data/ohlc_stats.npz" \
7
- --max_samples 50
8
-
9
  echo "Done!"
 
3
 
4
  echo "Starting dataset caching..."
5
  python3 scripts/cache_dataset.py \
6
+ --ohlc_stats_path "/workspace/apollo/data/ohlc_stats.npz"
 
 
7
  echo "Done!"
scripts/inspect_collator.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List
7
+
8
+ import torch
9
+
10
# Ensure repo root is on sys.path so the `data.*` and `models.*` imports
# below resolve when this script is run directly from anywhere.
REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO_ROOT))
13
+
14
+ from data.data_loader import OracleDataset
15
+ from data.data_fetcher import DataFetcher
16
+ from data.data_collator import MemecoinCollator
17
+ import models.vocabulary as vocab
18
+
19
+
20
def _decode_events(event_type_ids: torch.Tensor) -> List[str]:
    """Translate a 1-D tensor of event-type ids into vocabulary names.

    Id 0 is the padding slot and maps to ``"__PAD__"``; ids missing from
    ``vocab.ID_TO_EVENT`` render as ``"UNK_<id>"`` so gaps are visible in dumps.
    """
    return [
        "__PAD__" if type_id == 0 else vocab.ID_TO_EVENT.get(type_id, f"UNK_{type_id}")
        for type_id in event_type_ids.tolist()
    ]
28
+
29
+
30
+ def _tensor_to_list(t: torch.Tensor) -> List:
31
+ return t.detach().cpu().tolist()
32
+
33
+
34
def main() -> None:
    """Run the MemecoinCollator on cached dataset samples and dump the result.

    Loads the requested sample indices from the on-disk cache, collates them
    into a batch, and writes a JSON-friendly snapshot of every sequence,
    pointer, numerical, and categorical tensor (embeddings are omitted) to
    ``--out``. Requires reachable ClickHouse and Neo4j instances (configured
    via environment variables / .env) for time-aware wallet/token fetches.
    """
    parser = argparse.ArgumentParser(description="Inspect MemecoinCollator outputs on cached samples.")
    parser.add_argument("--cache_dir", type=str, default="data/cache")
    parser.add_argument("--idx", type=int, nargs="+", default=[0], help="Sample indices to inspect")
    parser.add_argument("--max_seq_len", type=int, default=16000)
    parser.add_argument("--out", type=str, default="collator_dump.json")
    args = parser.parse_args()

    cache_dir = Path(args.cache_dir)
    # Optional: enable time-aware fetches if DB env is set.
    # NOTE: `os` is imported at module level; the previous function-local
    # re-import was redundant and has been removed.
    from dotenv import load_dotenv
    from clickhouse_driver import Client as ClickHouseClient
    from neo4j import GraphDatabase

    load_dotenv()
    clickhouse_host = os.getenv("CLICKHOUSE_HOST", "localhost")
    # Prefer the native-protocol port; fall back to the generic port setting.
    clickhouse_port = int(os.getenv("CLICKHOUSE_NATIVE_PORT", os.getenv("CLICKHOUSE_PORT", 9000)))
    neo4j_uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
    neo4j_user = os.getenv("NEO4J_USER", "neo4j")
    neo4j_password = os.getenv("NEO4J_PASSWORD", "password")
    clickhouse_client = ClickHouseClient(host=clickhouse_host, port=clickhouse_port)
    neo4j_driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
    data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)

    dataset = OracleDataset(
        data_fetcher=data_fetcher,
        cache_dir=str(cache_dir),
        horizons_seconds=[30, 60, 120, 240, 420],
        quantiles=[0.1, 0.5, 0.9],
        max_samples=None,
        max_seq_len=args.max_seq_len,
    )
    if hasattr(dataset, "init_fetcher"):
        dataset.init_fetcher()

    collator = MemecoinCollator(
        event_type_to_id=vocab.EVENT_TO_ID,
        device=torch.device("cpu"),
        dtype=torch.float32,
        max_seq_len=args.max_seq_len,
    )

    batch_items = [dataset[i] for i in args.idx]
    batch = collator(batch_items)

    # Build JSON-friendly dump (no truncation of events; embeddings are omitted)
    dump: Dict[str, Any] = {
        "batch_size": len(args.idx),
        "token_addresses": batch.get("token_addresses"),
        "t_cutoffs": batch.get("t_cutoffs"),
        "sample_indices": batch.get("sample_indices"),
        "raw_events": [item.get("event_sequence", []) for item in batch_items],
    }
    # Raw event type counts per sample, for a quick sanity overview.
    event_counts = []
    for item in batch_items:
        counts: Dict[str, int] = {}
        for ev in item.get("event_sequence", []):
            et = ev.get("event_type", "UNKNOWN")
            counts[et] = counts.get(et, 0) + 1
        event_counts.append(counts)
    dump["raw_event_counts"] = event_counts

    # Core sequence + features (full length)
    dump["event_type_ids"] = _tensor_to_list(batch["event_type_ids"])
    dump["event_type_names"] = [
        _decode_events(batch["event_type_ids"][i].cpu())
        for i in range(batch["event_type_ids"].shape[0])
    ]
    dump["timestamps_float"] = _tensor_to_list(batch["timestamps_float"])
    dump["relative_ts"] = _tensor_to_list(batch["relative_ts"])
    dump["attention_mask"] = _tensor_to_list(batch["attention_mask"])
    dump["wallet_addr_to_batch_idx"] = batch.get("wallet_addr_to_batch_idx", {})

    # Pointer tensors (per-timestep indices into the batch-level entity lists).
    for key in [
        "wallet_indices",
        "token_indices",
        "quote_token_indices",
        "trending_token_indices",
        "boosted_token_indices",
        "dest_wallet_indices",
        "original_author_indices",
        "ohlc_indices",
        "holder_snapshot_indices",
        "textual_event_indices",
    ]:
        if key in batch:
            dump[key] = _tensor_to_list(batch[key])

    # Numerical feature tensors; also track nonzero counts to spot dead features.
    nonzero_summary = {}
    for key in [
        "transfer_numerical_features",
        "trade_numerical_features",
        "deployer_trade_numerical_features",
        "smart_wallet_trade_numerical_features",
        "pool_created_numerical_features",
        "liquidity_change_numerical_features",
        "fee_collected_numerical_features",
        "token_burn_numerical_features",
        "supply_lock_numerical_features",
        "onchain_snapshot_numerical_features",
        "trending_token_numerical_features",
        "boosted_token_numerical_features",
        "dexboost_paid_numerical_features",
        "dexprofile_updated_flags",
        "global_trending_numerical_features",
        "chainsnapshot_numerical_features",
        "lighthousesnapshot_numerical_features",
    ]:
        if key in batch:
            t = batch[key]
            dump[key] = _tensor_to_list(t)
            nonzero_summary[key] = int(torch.count_nonzero(t).item())

    # Categorical feature tensors
    for key in [
        "trade_dex_ids",
        "trade_direction_ids",
        "trade_mev_protection_ids",
        "trade_is_bundle_ids",
        "pool_created_protocol_ids",
        "liquidity_change_type_ids",
        "trending_token_source_ids",
        "trending_token_timeframe_ids",
        "lighthousesnapshot_protocol_ids",
        "lighthousesnapshot_timeframe_ids",
        "migrated_protocol_ids",
        "alpha_group_ids",
        "channel_ids",
        "exchange_ids",
    ]:
        if key in batch:
            t = batch[key]
            dump[key] = _tensor_to_list(t)
            nonzero_summary[key] = int(torch.count_nonzero(t).item())

    # Labels (may be absent when inspecting unlabeled samples).
    if batch.get("labels") is not None:
        dump["labels"] = _tensor_to_list(batch["labels"])
    if batch.get("labels_mask") is not None:
        dump["labels_mask"] = _tensor_to_list(batch["labels_mask"])
    if batch.get("quality_score") is not None:
        dump["quality_score"] = _tensor_to_list(batch["quality_score"])

    dump["nonzero_summary"] = nonzero_summary

    # Raw wallet/token feature payloads used by encoders
    wallet_inputs = batch.get("wallet_encoder_inputs", {})
    token_inputs = batch.get("token_encoder_inputs", {})
    dump["wallet_encoder_inputs"] = {
        "profile_rows": wallet_inputs.get("profile_rows", []),
        "social_rows": wallet_inputs.get("social_rows", []),
        "holdings_batch": wallet_inputs.get("holdings_batch", []),
        "username_embed_indices": _tensor_to_list(wallet_inputs.get("username_embed_indices")) if "username_embed_indices" in wallet_inputs else [],
    }
    dump["token_encoder_inputs"] = {
        "addresses_for_lookup": token_inputs.get("_addresses_for_lookup", []),
        "protocol_ids": _tensor_to_list(token_inputs.get("protocol_ids")) if "protocol_ids" in token_inputs else [],
        "is_vanity_flags": _tensor_to_list(token_inputs.get("is_vanity_flags")) if "is_vanity_flags" in token_inputs else [],
        "name_embed_indices": _tensor_to_list(token_inputs.get("name_embed_indices")) if "name_embed_indices" in token_inputs else [],
        "symbol_embed_indices": _tensor_to_list(token_inputs.get("symbol_embed_indices")) if "symbol_embed_indices" in token_inputs else [],
        "image_embed_indices": _tensor_to_list(token_inputs.get("image_embed_indices")) if "image_embed_indices" in token_inputs else [],
    }
    dump["wallet_set_encoder_inputs"] = {
        "holdings_batch": wallet_inputs.get("holdings_batch", []),
        "token_vibe_lookup_keys": token_inputs.get("_addresses_for_lookup", []),
    }

    out_path = Path(args.out)

    def _json_default(o):
        # Best-effort serializer: ISO-format datetimes, stringify everything
        # else rather than failing the whole dump on one odd value.
        if isinstance(o, (str, int, float, bool)) or o is None:
            return o
        try:
            import datetime as _dt
            if isinstance(o, (_dt.datetime, _dt.date)):
                return o.isoformat()
        except Exception:
            pass
        try:
            return str(o)
        except Exception:
            return "<unserializable>"

    with out_path.open("w") as f:
        json.dump(dump, f, indent=2, default=_json_default)

    print(f"Wrote collator dump to {out_path.resolve()}")


if __name__ == "__main__":
    main()
train.py CHANGED
@@ -118,6 +118,15 @@ def quantile_pinball_loss(preds: torch.Tensor,
118
  return sum(losses) / mask.sum().clamp_min(1.0)
119
 
120
 
 
 
 
 
 
 
 
 
 
121
  def filtered_collate(collator: MemecoinCollator,
122
  batch: List[Optional[Dict[str, Any]]]) -> Optional[Dict[str, Any]]:
123
  """Filter out None items from the dataset before collating."""
@@ -304,13 +313,18 @@ def main() -> None:
304
  max_seq_len=max_seq_len
305
  )
306
 
307
- # DB Connections - REMOVED for Training (Using Cache)
308
- # clickhouse_client = ClickHouseClient(...)
309
- # neo4j_driver = GraphDatabase.driver(...)
310
- # data_fetcher = DataFetcher(...)
 
 
 
 
311
 
312
  dataset = OracleDataset(
313
- data_fetcher=None, # Training Mode (Reader Only)
 
314
  horizons_seconds=horizons,
315
  quantiles=quantiles,
316
  max_samples=args.max_samples,
@@ -339,6 +353,10 @@ def main() -> None:
339
  else:
340
  logger.info("INFO: Weights found but shuffle=False. Ignoring weights (sequential mode).")
341
 
 
 
 
 
342
  dl_kwargs = dict(
343
  dataset=dataset,
344
  batch_size=batch_size,
@@ -353,6 +371,9 @@ def main() -> None:
353
  # re-initializes heavy per-worker state (e.g. SigLIP MultiModalEncoder).
354
  dl_kwargs["persistent_workers"] = True
355
  dl_kwargs["prefetch_factor"] = 2
 
 
 
356
  dataloader = DataLoader(**dl_kwargs)
357
 
358
  # --- 3. Model Init ---
@@ -599,7 +620,6 @@ def main() -> None:
599
  logger.warning(f"Epoch {epoch+1}: No valid batches processed.")
600
 
601
  accelerator.end_training()
602
- # neo4j_driver.close() # REMOVED
603
 
604
  if __name__ == "__main__":
605
  main()
 
118
  return sum(losses) / mask.sum().clamp_min(1.0)
119
 
120
 
121
def init_worker_fetcher(worker_id: int) -> None:
    """Initialize per-worker DataFetcher for cached datasets.

    Intended as a DataLoader ``worker_init_fn``: in the main process (no
    worker context) it is a no-op; in a worker it calls the dataset's
    ``init_fetcher`` hook when one is defined.
    """
    info = torch.utils.data.get_worker_info()
    if info is None:
        return
    worker_dataset = info.dataset
    if hasattr(worker_dataset, "init_fetcher"):
        worker_dataset.init_fetcher()
128
+
129
+
130
  def filtered_collate(collator: MemecoinCollator,
131
  batch: List[Optional[Dict[str, Any]]]) -> Optional[Dict[str, Any]]:
132
  """Filter out None items from the dataset before collating."""
 
313
  max_seq_len=max_seq_len
314
  )
315
 
316
+ # DB config (for time-aware wallet/token/graph features during training)
317
+ fetcher_config = {
318
+ "clickhouse_host": os.getenv("CLICKHOUSE_HOST", "localhost"),
319
+ "clickhouse_port": int(os.getenv("CLICKHOUSE_PORT", 9000)),
320
+ "neo4j_uri": os.getenv("NEO4J_URI", "bolt://localhost:7687"),
321
+ "neo4j_user": os.getenv("NEO4J_USER", "neo4j"),
322
+ "neo4j_password": os.getenv("NEO4J_PASSWORD", "password"),
323
+ }
324
 
325
  dataset = OracleDataset(
326
+ data_fetcher=None,
327
+ fetcher_config=fetcher_config, # Training Mode (Cache + time-aware fetch per worker)
328
  horizons_seconds=horizons,
329
  quantiles=quantiles,
330
  max_samples=args.max_samples,
 
353
  else:
354
  logger.info("INFO: Weights found but shuffle=False. Ignoring weights (sequential mode).")
355
 
356
+ # Initialize DataFetcher in main process when not using workers.
357
+ if int(args.num_workers) == 0:
358
+ dataset.init_fetcher()
359
+
360
  dl_kwargs = dict(
361
  dataset=dataset,
362
  batch_size=batch_size,
 
371
  # re-initializes heavy per-worker state (e.g. SigLIP MultiModalEncoder).
372
  dl_kwargs["persistent_workers"] = True
373
  dl_kwargs["prefetch_factor"] = 2
374
+ if int(args.num_workers) > 0:
375
+ dl_kwargs["worker_init_fn"] = init_worker_fetcher
376
+
377
  dataloader = DataLoader(**dl_kwargs)
378
 
379
  # --- 3. Model Init ---
 
620
  logger.warning(f"Epoch {epoch+1}: No valid batches processed.")
621
 
622
  accelerator.end_training()
 
623
 
624
  if __name__ == "__main__":
625
  main()