Upload folder using huggingface_hub

Browse files

Files changed (9) hide show

data/data_collator.py +15 -45
data/data_loader.py +65 -1
data/ohlc_stats.npz +1 -1
log.log +2 -2
models/multi_modal_processor.py +18 -2
pre_cache.sh +1 -8
scripts/cache_dataset.py +26 -1
scripts/dump_cache_sample.py +18 -1
train.py +6 -2

data/data_collator.py CHANGED Viewed

@@ -6,31 +6,7 @@ from torch.nn.utils.rnn import pad_sequence
 from typing import List, Dict, Any, Tuple, Optional, Union
 from collections import defaultdict
 from PIL import Image
-# --- GLOBAL SINGLETON FOR WORKER PROCESSES ---
-_WORKER_ENCODER = None
-def _set_worker_encoder(encoder):
-    """
-    Pre-set the encoder for workers (called from main process before forking).
-    This avoids lazy-loading on first batch.
-    """
-    global _WORKER_ENCODER
-    _WORKER_ENCODER = encoder
-def _get_worker_encoder(model_id: str, dtype: torch.dtype, device: torch.device):
-    """
-    Lazy-loads the encoder on the worker process.
-    FORCED TO CPU to save VRAM when using multiple workers.
-    """
-    global _WORKER_ENCODER
-    if _WORKER_ENCODER is None:
-        print(f"[Worker] Initializing MultiModalEncoder (SigLIP) on CPU (VRAM optimization)...")
-        # Local import to avoid top-level dependency issues
-        from models.multi_modal_processor import MultiModalEncoder
-        # Explicitly pass device="cpu"
-        _WORKER_ENCODER = MultiModalEncoder(model_id=model_id, dtype=dtype, device="cpu")
-    return _WORKER_ENCODER
 import models.vocabulary as vocab
 from data.data_loader import EmbeddingPooler
@@ -235,27 +211,21 @@ class MemecoinCollator:
         # --- 2. Create the single, batch-wide embedding pool tensor ---
         all_items_sorted = batch_wide_pooler.get_all_items()
-        texts_to_encode = [d['item'] for d in all_items_sorted if isinstance(d['item'], str)]
-        images_to_encode = [d['item'] for d in all_items_sorted if isinstance(d['item'], Image.Image)]
-        # LAZY LOAD ENCODER
-        encoder = _get_worker_encoder(self.model_id, self.dtype, self.device)
-        text_embeds = encoder(texts_to_encode).to(self.device) if texts_to_encode else torch.empty(0)
-        image_embeds = encoder(images_to_encode).to(self.device) if images_to_encode else torch.empty(0)
-        # Create the final lookup tensor and fill it based on original item type
-        batch_embedding_pool = torch.zeros(len(all_items_sorted), encoder.embedding_dim, device=self.device, dtype=self.dtype)
-        text_cursor, image_cursor = 0, 0
-        for i, item_data in enumerate(all_items_sorted):
-            if isinstance(item_data['item'], str):
-                if text_embeds.numel() > 0:
-                    batch_embedding_pool[i] = text_embeds[text_cursor]
-                    text_cursor += 1
-            elif isinstance(item_data['item'], Image.Image):
-                if image_embeds.numel() > 0:
-                    batch_embedding_pool[i] = image_embeds[image_cursor]
-                    image_cursor += 1
         # --- 3. Remap all indices in the batch data ---
         for i, item in enumerate(batch):

 from typing import List, Dict, Any, Tuple, Optional, Union
 from collections import defaultdict
 from PIL import Image
+# NOTE: the per-worker encoder singleton was removed; the collator now expects pre-computed embedding tensors from the cache.
 import models.vocabulary as vocab
 from data.data_loader import EmbeddingPooler
         # --- 2. Create the single, batch-wide embedding pool tensor ---
         all_items_sorted = batch_wide_pooler.get_all_items()
+        if not all_items_sorted:
+            # Handle edge case of absolutely no embeddings in batch
+            # Create a dummy empty tensor
+            batch_embedding_pool = torch.empty(0, 768, device=self.device, dtype=self.dtype) # Default SigLIP dim is 1152 actually, but standard is 768. Better to infer or default.
+            # Actually, if empty, it doesn't matter much as long as it's not accessed.
+        else:
+            first_item = all_items_sorted[0]['item']
+            if not isinstance(first_item, torch.Tensor):
+                 raise RuntimeError(f"Collator expects pre-computed embeddings (torch.Tensor), found {type(first_item)}. Please rebuild cache.")
+            # Stack all embeddings
+            # They should already be CPU tensors from the loader
+            # Move to device and cast to dtype
+            batch_embedding_pool = torch.stack([d['item'] for d in all_items_sorted]).to(device=self.device, dtype=self.dtype)
         # --- 3. Remap all indices in the batch data ---
         for i, item in enumerate(batch):

data/data_loader.py CHANGED Viewed

@@ -90,6 +90,8 @@ class EmbeddingPooler:
             key = item.strip()  # use normalized text key
         elif isinstance(item, Image.Image):
             key = id(item)  # unique memory address for images
         else:
             key = item  # fallback: use object itself if hashable
@@ -142,6 +144,8 @@ class OracleDataset(Dataset):
         # initialization falls through an unexpected branch.
         self.cached_files = []
         self.weights_list = []
         # If a fetcher is provided, we can determine the number of samples.
         # Otherwise, we are likely in a test mode where __len__ might not be called
@@ -2475,7 +2479,58 @@ class OracleDataset(Dataset):
             'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32)
         }
-    def __cacheitem_context__(self, idx: int, num_samples_per_token: int = 1) -> List[Optional[Dict[str, Any]]]:
         """
         Generates fully processed training contexts for caching.
@@ -2968,5 +3023,14 @@ class OracleDataset(Dataset):
                 results.append(result)
                 pass  # Per-context verbose logging removed for caching speed
         # Final count logged via tqdm in cache_dataset.py
         return results

             key = item.strip()  # use normalized text key
         elif isinstance(item, Image.Image):
             key = id(item)  # unique memory address for images
+        elif isinstance(item, torch.Tensor):
+            key = id(item)  # unique memory address for tensors
         else:
             key = item  # fallback: use object itself if hashable
         # initialization falls through an unexpected branch.
         self.cached_files = []
         self.weights_list = []
         # If a fetcher is provided, we can determine the number of samples.
         # Otherwise, we are likely in a test mode where __len__ might not be called
             'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32)
         }
+    def _embed_context(self, context: Dict[str, Any], encoder: Any) -> None:
+        """
+        Helper to replace raw items in the embedding pooler with pre-computed embeddings
+        using the provided encoder (on GPU).
+        """
+        pooler = context.get('embedding_pooler')
+        if not pooler:
+            return  # Missing (or falsy) pooler: nothing to embed for this context.
+        # pool_map is mutated in place below; its entries are read via their 'item' and 'idx' keys.
+        keys_to_embed_img = []
+        images_to_embed = []
+        keys_to_embed_text = []
+        texts_to_embed = []
+        for key, entry in pooler.pool_map.items():
+            item = entry['item']
+            if isinstance(item, str):
+                # Text items are queued so they can be encoded in one batched call below.
+                keys_to_embed_text.append(key)
+                texts_to_embed.append(item)
+            elif hasattr(item, 'resize') and not isinstance(item, torch.Tensor): # Duck-typed image check (anything exposing .resize); tensors are excluded, so entries that are already tensors are left untouched.
+                keys_to_embed_img.append(key)
+                images_to_embed.append(item)
+        # Batch encode images: a single encoder call covers every queued image.
+        if images_to_embed:
+            # Inference only, so gradient tracking is disabled around the encoder call.
+            with torch.no_grad():
+                img_embeddings = encoder(images_to_embed)
+            # Write each embedding back under its key, keeping the entry's original 'idx'. NOTE(review): zip() stops at the shorter input, so if the encoder returns fewer rows than images the remaining entries silently keep their raw item - confirm this is intended.
+            for i, (key, emb) in enumerate(zip(keys_to_embed_img, img_embeddings)):  # NOTE(review): `i` is never used.
+                if key in pooler.pool_map:  # Keys were collected from this same map above.
+                    old_entry = pooler.pool_map[key]
+                    pooler.pool_map[key] = {'item': emb.cpu().clone(), 'idx': old_entry['idx']}  # Stored as a cloned CPU tensor; presumably so each entry owns its data - TODO confirm.
+        # Batch encode text: a single encoder call covers every queued string.
+        if texts_to_embed:
+            # Inference only, so gradient tracking is disabled around the encoder call.
+            with torch.no_grad():
+                text_embeddings = encoder(texts_to_embed)
+            # Write each embedding back under its key, keeping the entry's original 'idx'. NOTE(review): same zip() truncation caveat as the image path - a short encoder result leaves raw strings in the pool.
+            for i, (key, emb) in enumerate(zip(keys_to_embed_text, text_embeddings)):  # NOTE(review): `i` is never used.
+                if key in pooler.pool_map:  # Keys were collected from this same map above.
+                    old_entry = pooler.pool_map[key]
+                    pooler.pool_map[key] = {'item': emb.cpu().clone(), 'idx': old_entry['idx']}  # Stored as a cloned CPU tensor; presumably so each entry owns its data - TODO confirm.
+    def __cacheitem_context__(self, idx: int, num_samples_per_token: int = 1, encoder: Optional[Any] = None) -> List[Optional[Dict[str, Any]]]:
         """
         Generates fully processed training contexts for caching.
                 results.append(result)
                 pass  # Per-context verbose logging removed for caching speed
+        # --- OPTIONAL: Pre-compute Embeddings (if encoder provided) ---
+        if encoder is not None:
+            # print(f"DEBUG: Encoder provided to loader for {len(results)} contexts", flush=True)
+            for ctx in results:
+                self._embed_context(ctx, encoder)
+        else:
+             if idx == 0:
+                 print("DEBUG: No encoder provided to __cacheitem_context__", flush=True)
         # Final count logged via tqdm in cache_dataset.py
         return results

data/ohlc_stats.npz CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:92f50d146182941b8b01be19b4699c1b0ebe37bac1ff155580b20a8755994070
 size 1660

 version https://git-lfs.github.com/spec/v1
+oid sha256:0ecfddd649b981eacb14b68ac183e53a273cab3571ec566eaa64700bbd871e42
 size 1660

log.log CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e4ceaef802908dd650ce1ade210a0e827ec433b904adc3bf17c3d8a877e59ae6
-size 2854

 version https://git-lfs.github.com/spec/v1
+oid sha256:e6751984e91418c4e318a9a01327bc60f6d0ee18282e7f12b42fe35562611c0d
+size 13919

models/multi_modal_processor.py CHANGED Viewed

@@ -88,9 +88,24 @@ class MultiModalEncoder:
                     inputs = self.processor(text=x, return_tensors="pt", padding=True, truncation=True).to(self.device)
                     embeddings = self.model.get_text_features(**inputs)
                 else:
-                    inputs = self.processor(images=x, return_tensors="pt").to(self.device)
                     embeddings = self.model.get_image_features(**inputs)
                 # Normalize in float32 for numerical stability
                 embeddings = F.normalize(embeddings.float(), p=2, dim=-1)
@@ -99,7 +114,8 @@ class MultiModalEncoder:
             except Exception as e:
                 # Silently fail or log debug only if needed
-                # traceback.print_exc()
                 return torch.empty(0, self.embedding_dim).to(self.device)
 # --- Test block (SigLIP) ---

                     inputs = self.processor(text=x, return_tensors="pt", padding=True, truncation=True).to(self.device)
                     embeddings = self.model.get_text_features(**inputs)
                 else:
+                    # Ensure all images are RGB to avoid "Unable to infer channel dimension format"
+                    valid_images = [img.convert("RGB") for img in x]
+                    inputs = self.processor(images=valid_images, return_tensors="pt").to(self.device)
                     embeddings = self.model.get_image_features(**inputs)
+                # EXTRACT TENSOR IF OUTPUT IS A MODEL OUTPUT OBJECT
+                if not isinstance(embeddings, torch.Tensor):
+                    if hasattr(embeddings, 'pooler_output'):
+                         embeddings = embeddings.pooler_output
+                    elif hasattr(embeddings, 'last_hidden_state'):
+                         # Fallback for models without pooler_output but with hidden state (e.g. usage of [CLS] or mean pooling needed?)
+                         # For SigLIP/CLIP get_image_features, it should return the features.
+                         # If it returns an object, it might be the raw output.
+                         # Let's try to assume it matches the expected embedding dim.
+                         embeddings = embeddings.last_hidden_state
+                    elif isinstance(embeddings, (tuple, list)):
+                         embeddings = embeddings[0]
                 # Normalize in float32 for numerical stability
                 embeddings = F.normalize(embeddings.float(), p=2, dim=-1)
             except Exception as e:
                 # Silently fail or log debug only if needed
+                print(f"ERROR in MultiModalEncoder: {e}", flush=True)
+                traceback.print_exc()
                 return torch.empty(0, self.embedding_dim).to(self.device)
 # --- Test block (SigLIP) ---

pre_cache.sh CHANGED Viewed

@@ -1,11 +1,3 @@
-#!/bin/bash
-# Pre-caches the dataset for training in context mode
-#
-# Usage:
-#   ./pre_cache.sh
-set -euo pipefail
 # =========================
 # Hardcoded cache settings
 # =========================
@@ -48,6 +40,7 @@ python3 scripts/cache_dataset.py \
     --num_workers "$NUM_WORKERS" \
     --horizons_seconds "${HORIZONS_SECONDS[@]}" \
     --quantiles "${QUANTILES[@]}" \
     "$@"
 echo "Done!"

 # =========================
 # Hardcoded cache settings
 # =========================
     --num_workers "$NUM_WORKERS" \
     --horizons_seconds "${HORIZONS_SECONDS[@]}" \
     --quantiles "${QUANTILES[@]}" \
+    --max_samples 150000 \
     "$@"
 echo "Done!"

scripts/cache_dataset.py CHANGED Viewed

@@ -14,6 +14,7 @@ import huggingface_hub
 import logging
 from concurrent.futures import ProcessPoolExecutor, as_completed
 import multiprocessing as mp
 logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.getLogger("transformers").setLevel(logging.ERROR)
@@ -30,6 +31,7 @@ from neo4j import GraphDatabase
 _worker_dataset = None
 _worker_return_class_map = None
 _worker_quality_scores_map = None
 def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map):
@@ -40,6 +42,20 @@ def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map
     clickhouse_client = ClickHouseClient(host=db_config['clickhouse_host'], port=db_config['clickhouse_port'])
     neo4j_driver = GraphDatabase.driver(db_config['neo4j_uri'], auth=(db_config['neo4j_user'], db_config['neo4j_password']))
     data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
     _worker_dataset = OracleDataset(
         data_fetcher=data_fetcher,
@@ -63,7 +79,15 @@ def _process_single_token_context(args):
         class_id = _worker_return_class_map.get(mint_addr)
         if class_id is None:
             return {'status': 'skipped', 'reason': 'not in class map', 'mint': mint_addr}
-        contexts = _worker_dataset.__cacheitem_context__(idx, num_samples_per_token=samples_per_token)
         if not contexts:
             return {'status': 'skipped', 'reason': 'no valid contexts', 'mint': mint_addr}
         q_score = _worker_quality_scores_map.get(mint_addr)
@@ -75,6 +99,7 @@ def _process_single_token_context(args):
             ctx["class_id"] = class_id
             filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
             output_path = Path(output_dir) / filename
             torch.save(ctx, output_path)
             saved_files.append(filename)
         return {'status': 'success', 'mint': mint_addr, 'class_id': class_id, 'q_score': q_score, 'n_contexts': len(contexts), 'n_events': len(contexts[0].get('event_sequence', [])) if contexts else 0, 'files': saved_files}

 import logging
 from concurrent.futures import ProcessPoolExecutor, as_completed
 import multiprocessing as mp
+from PIL import Image
 logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.getLogger("transformers").setLevel(logging.ERROR)
 _worker_dataset = None
 _worker_return_class_map = None
 _worker_quality_scores_map = None
+_worker_encoder = None
 def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map):
     clickhouse_client = ClickHouseClient(host=db_config['clickhouse_host'], port=db_config['clickhouse_port'])
     neo4j_driver = GraphDatabase.driver(db_config['neo4j_uri'], auth=(db_config['neo4j_user'], db_config['neo4j_password']))
     data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
+    # --- NEW: Init Encoder on GPU ---
+    from models.multi_modal_processor import MultiModalEncoder
+    # Using float16 for efficiency on GPU
+    global _worker_encoder
+    try:
+        _worker_encoder = MultiModalEncoder(
+            model_id="google/siglip-so400m-patch16-256-i18n",
+            device="cuda",
+            dtype=torch.float16
+        )
+    except Exception as e:
+        print(f"WARN: Failed to initialize MultiModalEncoder on worker: {e}")
+        _worker_encoder = None
     _worker_dataset = OracleDataset(
         data_fetcher=data_fetcher,
         class_id = _worker_return_class_map.get(mint_addr)
         if class_id is None:
             return {'status': 'skipped', 'reason': 'not in class map', 'mint': mint_addr}
+        # Pass the global encoder (if initialized) to pre-compute embeddings
+        global _worker_encoder
+        encoder = _worker_encoder
+        # print(f"DEBUG: Worker encoder status: {type(encoder)}", flush=True) # Commented out to reduce noise if it works
+        if encoder is None:
+             print(f"ERROR: Worker encoder is None for mint {mint_addr}!", flush=True)
+        contexts = _worker_dataset.__cacheitem_context__(idx, num_samples_per_token=samples_per_token, encoder=encoder)
         if not contexts:
             return {'status': 'skipped', 'reason': 'no valid contexts', 'mint': mint_addr}
         q_score = _worker_quality_scores_map.get(mint_addr)
             ctx["class_id"] = class_id
             filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
             output_path = Path(output_dir) / filename
             torch.save(ctx, output_path)
             saved_files.append(filename)
         return {'status': 'success', 'mint': mint_addr, 'class_id': class_id, 'q_score': q_score, 'n_contexts': len(contexts), 'n_events': len(contexts[0].get('event_sequence', [])) if contexts else 0, 'files': saved_files}

scripts/dump_cache_sample.py CHANGED Viewed

@@ -36,7 +36,24 @@ def convert_to_serializable(obj):
     if isinstance(obj, np.ndarray):
         return {"__type__": "ndarray", "shape": list(obj.shape), "dtype": str(obj.dtype), "data": obj.tolist()}
     if isinstance(obj, torch.Tensor):
-        return {"__type__": "tensor", "shape": list(obj.shape), "dtype": str(obj.dtype), "data": obj.tolist()}
     if isinstance(obj, datetime):
         return {"__type__": "datetime", "value": obj.isoformat()}
     if isinstance(obj, bytes):

     if isinstance(obj, np.ndarray):
         return {"__type__": "ndarray", "shape": list(obj.shape), "dtype": str(obj.dtype), "data": obj.tolist()}
     if isinstance(obj, torch.Tensor):
+        data = obj.tolist()
+        # Truncate large tensors for readability
+        if obj.numel() > 50:
+            flat = obj.flatten().tolist()
+            data = flat[:20] + [f"... ({obj.numel()} elements total)"]
+        return {"__type__": "tensor", "shape": list(obj.shape), "dtype": str(obj.dtype), "data": data}
+    # Handle EmbeddingPooler specifically
+    if type(obj).__name__ == 'EmbeddingPooler':
+        try:
+            items = obj.get_all_items()
+            return {
+                "__type__": "EmbeddingPooler",
+                "count": len(items),
+                "items": [convert_to_serializable(item) for item in items]
+            }
+        except:
+            return {"__type__": "EmbeddingPooler", "repr": str(obj)}
     if isinstance(obj, datetime):
         return {"__type__": "datetime", "value": obj.isoformat()}
     if isinstance(obj, bytes):

train.py CHANGED Viewed

@@ -132,8 +132,12 @@ def create_balanced_split(dataset, n_val_per_class: int = 1, seed: int = 42):
     # Group indices by class_id - use dataset's existing map if available
     class_to_indices = defaultdict(list)
-    # Fast path: use dataset's file_class_map (already loaded during init)
-    if hasattr(dataset, 'file_class_map') and dataset.file_class_map:
         for idx, cached_file in enumerate(dataset.cached_files):
             # file_class_map uses filename strings as keys, cached_files are Path objects
             fname = cached_file.name if hasattr(cached_file, 'name') else str(cached_file)

     # Group indices by class_id - use dataset's existing map if available
     class_to_indices = defaultdict(list)
+    # Fast path: use dataset's sample_labels (aligned with __getitem__)
+    if hasattr(dataset, 'sample_labels') and dataset.sample_labels:
+         for idx, class_id in enumerate(dataset.sample_labels):
+             class_to_indices[class_id].append(idx)
+    # Legacy path: use dataset's file_class_map (for 1-file-1-sample datasets)
+    elif hasattr(dataset, 'file_class_map') and dataset.file_class_map:
         for idx, cached_file in enumerate(dataset.cached_files):
             # file_class_map uses filename strings as keys, cached_files are Path objects
             fname = cached_file.name if hasattr(cached_file, 'name') else str(cached_file)