zirobtc commited on
Commit
d195287
·
verified ·
1 Parent(s): c471f42

Upload folder using huggingface_hub

Browse files
audit_cache.py CHANGED
@@ -1,151 +1,249 @@
1
- import os
2
- import torch
3
- import math
4
  import argparse
 
 
5
  from pathlib import Path
6
- from collections import defaultdict
7
- import glob
8
  from tqdm import tqdm
9
 
10
- def audit_cache(cache_dir, num_samples=10000):
11
- files = glob.glob(os.path.join(cache_dir, "sample_*.pt"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  if not files:
13
- print(f"No .pt files found in {cache_dir}")
14
  return
15
 
16
- files = files[:num_samples]
17
-
18
- issues = defaultdict(int)
19
- total_files = len(files)
20
-
 
 
 
 
 
 
21
  stats = {
22
- 'max_label_return': -float('inf'),
23
- 'min_label_return': float('inf'),
24
- 'nan_labels': 0,
25
- 'nan_masks': 0,
26
- 'missing_quality_score': 0,
27
- 'negative_quality_score': 0,
28
- 'empty_event_sequence': 0,
29
- 'missing_wallets': 0,
30
- 'nan_in_wallet_profile': 0,
31
- 'nan_in_events': 0,
32
- 'inf_in_events': 0,
33
- 'invalid_pool_idx': 0,
34
- 'max_seq_len_exceeded': 0,
35
- 'negative_prices': 0,
 
 
 
 
36
  }
37
 
38
- for fpath in tqdm(files, desc="Auditing"):
39
  try:
40
- try:
41
- data = torch.load(fpath, map_location="cpu", weights_only=False)
42
- except Exception:
43
- issues['load_error'] += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  continue
45
-
46
- # 1. Quality score
47
- q_score = data.get("quality_score")
48
- if q_score is None:
49
- stats['missing_quality_score'] += 1
50
- elif math.isnan(q_score):
51
- issues['nan_quality_score'] += 1
52
- elif q_score < 0:
53
- stats['negative_quality_score'] += 1
54
-
55
- # 2. Labels & Masks
56
- labels = data.get("future_return_labels")
57
- masks = data.get("future_return_masks")
58
- if labels is not None:
59
- for v in labels.tolist():
60
- if math.isnan(v):
61
- stats['nan_labels'] += 1
62
- break
63
- stats['max_label_return'] = max(stats['max_label_return'], float(v))
64
- stats['min_label_return'] = min(stats['min_label_return'], float(v))
65
- if masks is not None:
66
- for v in masks.tolist():
67
- if math.isnan(v):
68
- stats['nan_masks'] += 1
69
- break
70
-
71
- # 3. Events
72
- events = data.get("event_sequence", [])
73
- if not events:
74
- stats['empty_event_sequence'] += 1
75
- elif len(events) > 8192:
76
- stats['max_seq_len_exceeded'] += 1
77
-
78
- has_nan_event = False
79
- has_inf_event = False
80
- has_neg_price = False
81
-
82
- for event in events:
83
- for k, v in event.items():
84
- if isinstance(v, float):
85
- if math.isnan(v):
86
- has_nan_event = True
87
- elif math.isinf(v):
88
- has_inf_event = True
89
-
90
- # Check chart segments specifically
91
- if event.get("event_type") == "Chart_Segment":
92
- opens = event.get("opens", [])
93
- closes = event.get("closes", [])
94
- # The user report said spread is zero due to z-score
95
- # Let's check if prices are negative
96
- if opens and min(opens) < 0:
97
- has_neg_price = True
98
-
99
- if has_nan_event:
100
- stats['nan_in_events'] += 1
101
- if has_inf_event:
102
- stats['inf_in_events'] += 1
103
- if has_neg_price:
104
- stats['negative_prices'] += 1
105
-
106
- # 4. Wallets
107
- wallets = data.get("wallets")
108
- if not wallets:
109
- stats['missing_wallets'] += 1
110
- else:
111
- has_nan_wallet = False
112
- for w_addr, w_data in wallets.items():
113
- profile = w_data.get("profile", {})
114
- for k, v in profile.items():
115
- if isinstance(v, float) and math.isnan(v):
116
- has_nan_wallet = True
117
- if has_nan_wallet:
118
- stats['nan_in_wallet_profile'] += 1
119
-
120
- # 5. Pool index out of bounds
121
- pool = data.get("embedding_pool", [])
122
- pool_idxs = [item.get("idx") for item in pool]
123
- max_idx = max(pool_idxs) if pool_idxs else 0
124
-
125
- for event in events:
126
- for k, v in event.items():
127
- if k.endswith("_idx") and isinstance(v, int):
128
- if v > max_idx:
129
- stats['invalid_pool_idx'] += 1
130
- break
131
-
132
- except Exception as e:
133
- issues[f'processing_error_{type(e).__name__}'] += 1
134
-
135
-
136
- print("\n--- Cache Audit Report (New Issues) ---")
137
- print(f"Audited {total_files} files.")
138
- for k, v in stats.items():
139
- print(f"{k}: {v}")
140
-
141
- print("\nIssues encountered:")
142
- for k, v in issues.items():
143
- print(f"{k}: {v}")
144
 
145
  if __name__ == "__main__":
146
  parser = argparse.ArgumentParser()
147
  parser.add_argument("--cache_dir", type=str, default="/workspace/apollo/data/cache")
148
- parser.add_argument("--num", type=int, default=1000)
149
  args = parser.parse_args()
150
-
151
  audit_cache(args.cache_dir, args.num)
 
 
 
 
1
  import argparse
2
+ import math
3
+ from collections import Counter, defaultdict
4
  from pathlib import Path
5
+
6
+ import torch
7
  from tqdm import tqdm
8
 
9
+ from data.data_loader import summarize_context_window
10
+ from data.quant_ohlc_feature_schema import FEATURE_VERSION, NUM_QUANT_OHLC_FEATURES, TOKENS_PER_SEGMENT
11
+
12
+
13
+ REQUIRED_CONTEXT_FIELDS = [
14
+ "event_sequence",
15
+ "wallets",
16
+ "tokens",
17
+ "labels",
18
+ "labels_mask",
19
+ "quality_score",
20
+ "class_id",
21
+ "source_token",
22
+ "context_bucket",
23
+ "context_score",
24
+ "quant_ohlc_features",
25
+ "quant_feature_version",
26
+ ]
27
+
28
+
29
+ def _to_list(value):
30
+ if value is None:
31
+ return []
32
+ if isinstance(value, torch.Tensor):
33
+ return value.tolist()
34
+ return list(value)
35
+
36
+
37
+ def _safe_float(value):
38
+ if isinstance(value, torch.Tensor):
39
+ if value.numel() != 1:
40
+ raise ValueError("Expected scalar tensor.")
41
+ return float(value.item())
42
+ return float(value)
43
+
44
+
45
+ def audit_cache(cache_dir, num_samples=None):
46
+ cache_path = Path(cache_dir)
47
+ files = sorted(cache_path.glob("sample_*.pt"))
48
  if not files:
49
+ print(f"No sample_*.pt files found in {cache_path}")
50
  return
51
 
52
+ if num_samples is not None and num_samples > 0:
53
+ files = files[:num_samples]
54
+
55
+ issues = Counter()
56
+ class_counts = Counter()
57
+ bucket_counts = Counter()
58
+ class_bucket_counts = defaultdict(Counter)
59
+ token_counts_by_class = defaultdict(Counter)
60
+ samples_per_token = Counter()
61
+ missing_fields = Counter()
62
+
63
  stats = {
64
+ "files_audited": len(files),
65
+ "empty_event_sequence": 0,
66
+ "missing_wallets": 0,
67
+ "missing_tokens": 0,
68
+ "nan_labels": 0,
69
+ "nan_masks": 0,
70
+ "nan_quality_score": 0,
71
+ "negative_quality_score": 0,
72
+ "max_label_return": -float("inf"),
73
+ "min_label_return": float("inf"),
74
+ "max_events": 0,
75
+ "min_events": float("inf"),
76
+ "contexts_with_no_valid_horizons": 0,
77
+ "context_bucket_mismatch": 0,
78
+ "context_score_mismatch": 0,
79
+ "quant_feature_version_mismatch": 0,
80
+ "chart_events_missing_quant": 0,
81
+ "quant_segments_total": 0,
82
  }
83
 
84
+ for filepath in tqdm(files, desc="Auditing cache", unit="file"):
85
  try:
86
+ data = torch.load(filepath, map_location="cpu", weights_only=False)
87
+ except Exception:
88
+ issues["load_error"] += 1
89
+ continue
90
+
91
+ if not isinstance(data, dict):
92
+ issues["not_dict"] += 1
93
+ continue
94
+
95
+ missing_for_file = []
96
+ for field in REQUIRED_CONTEXT_FIELDS:
97
+ if field not in data:
98
+ missing_for_file.append(field)
99
+ missing_fields[field] += 1
100
+
101
+ if missing_for_file:
102
+ issues["missing_required_fields"] += 1
103
+ continue
104
+
105
+ class_id = int(data["class_id"])
106
+ source_token = str(data["source_token"])
107
+ context_bucket = str(data["context_bucket"])
108
+
109
+ class_counts[class_id] += 1
110
+ bucket_counts[context_bucket] += 1
111
+ class_bucket_counts[class_id][context_bucket] += 1
112
+ token_counts_by_class[class_id][source_token] += 1
113
+ samples_per_token[source_token] += 1
114
+
115
+ events = data.get("event_sequence") or []
116
+ wallets = data.get("wallets") or {}
117
+ tokens = data.get("tokens") or {}
118
+ labels = _to_list(data.get("labels"))
119
+ masks = _to_list(data.get("labels_mask"))
120
+
121
+ if not events:
122
+ stats["empty_event_sequence"] += 1
123
+ stats["max_events"] = max(stats["max_events"], len(events))
124
+ stats["min_events"] = min(stats["min_events"], len(events))
125
+
126
+ if not wallets:
127
+ stats["missing_wallets"] += 1
128
+ if not tokens:
129
+ stats["missing_tokens"] += 1
130
+
131
+ has_nan_label = False
132
+ for value in labels:
133
+ if math.isnan(float(value)):
134
+ has_nan_label = True
135
+ break
136
+ stats["max_label_return"] = max(stats["max_label_return"], float(value))
137
+ stats["min_label_return"] = min(stats["min_label_return"], float(value))
138
+ if has_nan_label:
139
+ stats["nan_labels"] += 1
140
+
141
+ has_nan_mask = False
142
+ for value in masks:
143
+ if math.isnan(float(value)):
144
+ has_nan_mask = True
145
+ break
146
+ if has_nan_mask:
147
+ stats["nan_masks"] += 1
148
+
149
+ try:
150
+ quality_score = _safe_float(data.get("quality_score"))
151
+ if math.isnan(quality_score):
152
+ stats["nan_quality_score"] += 1
153
+ elif quality_score < 0:
154
+ stats["negative_quality_score"] += 1
155
+ except Exception:
156
+ issues["invalid_quality_score"] += 1
157
+
158
+ try:
159
+ summary = summarize_context_window(data.get("labels"), data.get("labels_mask"))
160
+ if summary["valid_horizons"] == 0:
161
+ stats["contexts_with_no_valid_horizons"] += 1
162
+ if summary["context_bucket"] != context_bucket:
163
+ stats["context_bucket_mismatch"] += 1
164
+ stored_score = _safe_float(data.get("context_score"))
165
+ if not math.isclose(summary["context_score"], stored_score, rel_tol=1e-6, abs_tol=1e-6):
166
+ stats["context_score_mismatch"] += 1
167
+ except Exception:
168
+ issues["context_summary_error"] += 1
169
+
170
+ if data.get("quant_feature_version") != FEATURE_VERSION:
171
+ stats["quant_feature_version_mismatch"] += 1
172
+
173
+ chart_events = [event for event in events if event.get("event_type") == "Chart_Segment"]
174
+ stats["quant_segments_total"] += len(chart_events)
175
+ for event in chart_events:
176
+ quant_payload = event.get("quant_ohlc_features")
177
+ if not isinstance(quant_payload, list):
178
+ stats["chart_events_missing_quant"] += 1
179
  continue
180
+ if len(quant_payload) > TOKENS_PER_SEGMENT:
181
+ issues["quant_too_many_tokens"] += 1
182
+ for token_payload in quant_payload:
183
+ vec = token_payload.get("feature_vector")
184
+ if not isinstance(vec, list) or len(vec) != NUM_QUANT_OHLC_FEATURES:
185
+ issues["quant_bad_vector_shape"] += 1
186
+ break
187
+
188
+ if stats["min_events"] == float("inf"):
189
+ stats["min_events"] = 0
190
+ if stats["min_label_return"] == float("inf"):
191
+ stats["min_label_return"] = 0.0
192
+ if stats["max_label_return"] == -float("inf"):
193
+ stats["max_label_return"] = 0.0
194
+
195
+ unique_tokens_total = len(samples_per_token)
196
+ duplicate_tokens_total = sum(1 for count in samples_per_token.values() if count > 1)
197
+
198
+ print("\n=== Cache Audit ===")
199
+ print(f"Cache dir: {cache_path}")
200
+ print(f"Files audited: {stats['files_audited']}")
201
+ print(f"Unique source tokens: {unique_tokens_total}")
202
+ print(f"Tokens with >1 cached context: {duplicate_tokens_total}")
203
+ print(f"Samples per token max: {max(samples_per_token.values()) if samples_per_token else 0}")
204
+
205
+ print("\n--- Class Counts ---")
206
+ for class_id in sorted(class_counts):
207
+ unique_tokens = len(token_counts_by_class[class_id])
208
+ print(f"Class {class_id}: samples={class_counts[class_id]} unique_tokens={unique_tokens}")
209
+
210
+ print("\n--- Context Buckets ---")
211
+ for bucket, count in sorted(bucket_counts.items()):
212
+ print(f"{bucket}: {count}")
213
+
214
+ print("\n--- Class x Context Bucket ---")
215
+ for class_id in sorted(class_bucket_counts):
216
+ bucket_summary = dict(sorted(class_bucket_counts[class_id].items()))
217
+ print(f"Class {class_id}: {bucket_summary}")
218
+
219
+ print("\n--- General Stats ---")
220
+ for key, value in stats.items():
221
+ print(f"{key}: {value}")
222
+
223
+ print("\n--- Missing Fields ---")
224
+ if missing_fields:
225
+ for field, count in sorted(missing_fields.items()):
226
+ print(f"{field}: {count}")
227
+ else:
228
+ print("none")
229
+
230
+ print("\n--- Issues ---")
231
+ if issues:
232
+ for key, value in sorted(issues.items()):
233
+ print(f"{key}: {value}")
234
+ else:
235
+ print("none")
236
+
237
+ print("\n--- Duplicate-Heavy Tokens ---")
238
+ heavy_tokens = sorted(samples_per_token.items(), key=lambda item: (-item[1], item[0]))[:20]
239
+ for token, count in heavy_tokens:
240
+ print(f"{token}: {count}")
241
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
  if __name__ == "__main__":
244
  parser = argparse.ArgumentParser()
245
  parser.add_argument("--cache_dir", type=str, default="/workspace/apollo/data/cache")
246
+ parser.add_argument("--num", type=int, default=None, help="Audit only the first N files.")
247
  args = parser.parse_args()
248
+
249
  audit_cache(args.cache_dir, args.num)
data/data_collator.py CHANGED
@@ -10,6 +10,7 @@ from PIL import Image
10
 
11
  import models.vocabulary as vocab
12
  from data.data_loader import EmbeddingPooler
 
13
 
14
  NATIVE_MINT = "So11111111111111111111111111111111111111112"
15
  QUOTE_MINTS = {
@@ -40,6 +41,8 @@ class MemecoinCollator:
40
  self.device = device
41
  self.dtype = dtype
42
  self.ohlc_seq_len = 300 # HARDCODED
 
 
43
  self.max_seq_len = max_seq_len
44
 
45
  def _collate_features_for_encoder(self, entities: List[Dict], feature_keys: List[str], device: torch.device, entity_type: str) -> Dict[str, Any]:
@@ -86,10 +89,16 @@ class MemecoinCollator:
86
  if not chart_events:
87
  return {
88
  'price_tensor': torch.empty(0, 2, self.ohlc_seq_len, device=self.device, dtype=self.dtype),
89
- 'interval_ids': torch.empty(0, device=self.device, dtype=torch.long)
 
 
 
90
  }
91
  ohlc_tensors = []
92
  interval_ids_list = []
 
 
 
93
  seq_len = self.ohlc_seq_len
94
  unknown_id = vocab.INTERVAL_TO_ID.get("Unknown", 0)
95
  for segment_data in chart_events:
@@ -105,9 +114,36 @@ class MemecoinCollator:
105
  ohlc_tensors.append(torch.stack([o, c]))
106
  interval_id = vocab.INTERVAL_TO_ID.get(interval_str, unknown_id)
107
  interval_ids_list.append(interval_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  return {
109
  'price_tensor': torch.stack(ohlc_tensors).to(self.device),
110
- 'interval_ids': torch.tensor(interval_ids_list, device=self.device, dtype=torch.long)
 
 
 
111
  }
112
 
113
  def _collate_graph_links(self,
@@ -677,6 +713,9 @@ class MemecoinCollator:
677
  'wallet_encoder_inputs': wallet_encoder_inputs, # ADDED BACK
678
  'ohlc_price_tensors': ohlc_inputs_dict['price_tensor'],
679
  'ohlc_interval_ids': ohlc_inputs_dict['interval_ids'],
 
 
 
680
  'graph_updater_links': graph_updater_links,
681
  'wallet_addr_to_batch_idx': wallet_addr_to_batch_idx, # NEW: Pass the mapping
682
 
 
10
 
11
  import models.vocabulary as vocab
12
  from data.data_loader import EmbeddingPooler
13
+ from data.quant_ohlc_feature_schema import FEATURE_VERSION, FEATURE_VERSION_ID, NUM_QUANT_OHLC_FEATURES, TOKENS_PER_SEGMENT
14
 
15
  NATIVE_MINT = "So11111111111111111111111111111111111111112"
16
  QUOTE_MINTS = {
 
41
  self.device = device
42
  self.dtype = dtype
43
  self.ohlc_seq_len = 300 # HARDCODED
44
+ self.quant_ohlc_tokens = TOKENS_PER_SEGMENT
45
+ self.quant_ohlc_num_features = NUM_QUANT_OHLC_FEATURES
46
  self.max_seq_len = max_seq_len
47
 
48
  def _collate_features_for_encoder(self, entities: List[Dict], feature_keys: List[str], device: torch.device, entity_type: str) -> Dict[str, Any]:
 
89
  if not chart_events:
90
  return {
91
  'price_tensor': torch.empty(0, 2, self.ohlc_seq_len, device=self.device, dtype=self.dtype),
92
+ 'interval_ids': torch.empty(0, device=self.device, dtype=torch.long),
93
+ 'quant_feature_tensors': torch.empty(0, self.quant_ohlc_tokens, self.quant_ohlc_num_features, device=self.device, dtype=self.dtype),
94
+ 'quant_feature_mask': torch.empty(0, self.quant_ohlc_tokens, device=self.device, dtype=self.dtype),
95
+ 'quant_feature_version_ids': torch.empty(0, device=self.device, dtype=torch.long),
96
  }
97
  ohlc_tensors = []
98
  interval_ids_list = []
99
+ quant_feature_tensors = []
100
+ quant_feature_masks = []
101
+ quant_feature_version_ids = []
102
  seq_len = self.ohlc_seq_len
103
  unknown_id = vocab.INTERVAL_TO_ID.get("Unknown", 0)
104
  for segment_data in chart_events:
 
114
  ohlc_tensors.append(torch.stack([o, c]))
115
  interval_id = vocab.INTERVAL_TO_ID.get(interval_str, unknown_id)
116
  interval_ids_list.append(interval_id)
117
+ quant_payload = segment_data.get('quant_ohlc_features')
118
+ if quant_payload is None:
119
+ raise RuntimeError("Chart_Segment missing quant_ohlc_features. Rebuild cache with quantitative chart features.")
120
+ if not isinstance(quant_payload, list):
121
+ raise RuntimeError("Chart_Segment quant_ohlc_features must be a list.")
122
+ feature_rows = []
123
+ feature_mask = []
124
+ for token_idx in range(self.quant_ohlc_tokens):
125
+ if token_idx < len(quant_payload):
126
+ payload = quant_payload[token_idx]
127
+ vec = payload.get('feature_vector')
128
+ if not isinstance(vec, list) or len(vec) != self.quant_ohlc_num_features:
129
+ raise RuntimeError(
130
+ f"Chart_Segment quant feature vector must have length {self.quant_ohlc_num_features}."
131
+ )
132
+ feature_rows.append(vec)
133
+ feature_mask.append(1.0)
134
+ else:
135
+ feature_rows.append([0.0] * self.quant_ohlc_num_features)
136
+ feature_mask.append(0.0)
137
+ quant_feature_tensors.append(torch.tensor(feature_rows, device=self.device, dtype=self.dtype))
138
+ quant_feature_masks.append(torch.tensor(feature_mask, device=self.device, dtype=self.dtype))
139
+ version = segment_data.get('quant_feature_version', FEATURE_VERSION)
140
+ quant_feature_version_ids.append(FEATURE_VERSION_ID if version == FEATURE_VERSION else 0)
141
  return {
142
  'price_tensor': torch.stack(ohlc_tensors).to(self.device),
143
+ 'interval_ids': torch.tensor(interval_ids_list, device=self.device, dtype=torch.long),
144
+ 'quant_feature_tensors': torch.stack(quant_feature_tensors).to(self.device),
145
+ 'quant_feature_mask': torch.stack(quant_feature_masks).to(self.device),
146
+ 'quant_feature_version_ids': torch.tensor(quant_feature_version_ids, device=self.device, dtype=torch.long),
147
  }
148
 
149
  def _collate_graph_links(self,
 
713
  'wallet_encoder_inputs': wallet_encoder_inputs, # ADDED BACK
714
  'ohlc_price_tensors': ohlc_inputs_dict['price_tensor'],
715
  'ohlc_interval_ids': ohlc_inputs_dict['interval_ids'],
716
+ 'quant_ohlc_feature_tensors': ohlc_inputs_dict['quant_feature_tensors'],
717
+ 'quant_ohlc_feature_mask': ohlc_inputs_dict['quant_feature_mask'],
718
+ 'quant_ohlc_feature_version_ids': ohlc_inputs_dict['quant_feature_version_ids'],
719
  'graph_updater_links': graph_updater_links,
720
  'wallet_addr_to_batch_idx': wallet_addr_to_batch_idx, # NEW: Pass the mapping
721
 
data/data_loader.py CHANGED
@@ -18,6 +18,19 @@ import models.vocabulary as vocab
18
  from models.multi_modal_processor import MultiModalEncoder
19
  from data.data_fetcher import DataFetcher # NEW: Import the DataFetcher
20
  from data.context_targets import derive_movement_targets
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  from requests.adapters import HTTPAdapter
22
  from urllib3.util.retry import Retry
23
 
@@ -1278,14 +1291,186 @@ class OracleDataset(Dataset):
1278
 
1279
  return full_ohlc
1280
 
1281
- def __getitem__(self, idx: int) -> Optional[Dict[str, Any]]:
1282
- """
1283
- Loads data from cache. Behavior depends on cache mode:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1284
 
1285
- - RAW MODE: Loads raw token data, samples T_cutoff at runtime, applies H/B/H
1286
- - CONTEXT MODE: Loads pre-computed training context directly (fully offline)
1287
 
1288
- The cache mode is auto-detected from the cached file's 'cache_mode' field.
 
 
1289
  """
1290
  import time as _time
1291
  _timings = {}
@@ -1309,23 +1494,17 @@ class OracleDataset(Dataset):
1309
  if not cached_data:
1310
  raise RuntimeError(f"No data loaded for index {idx}")
1311
 
1312
- # Auto-detect cache mode. New compact context cache may omit 'cache_mode'.
1313
- if 'cache_mode' in cached_data:
1314
- cache_mode = cached_data.get('cache_mode', 'raw')
1315
- else:
1316
- has_context_shape = (
1317
- isinstance(cached_data, dict) and
1318
- 'event_sequence' in cached_data and
1319
- 'tokens' in cached_data and
1320
- 'wallets' in cached_data and
1321
- 'labels' in cached_data and
1322
- 'labels_mask' in cached_data
1323
- )
1324
- cache_mode = 'context' if has_context_shape else 'raw'
1325
 
1326
- if cache_mode == 'context':
1327
- # CONTEXT MODE: Return pre-computed training context directly
1328
- # This is fully deterministic - no runtime sampling or processing
1329
  _timings['total'] = _time.perf_counter() - _total_start
1330
 
1331
  if 'movement_class_targets' not in cached_data and 'labels' in cached_data and 'labels_mask' in cached_data:
@@ -1346,318 +1525,16 @@ class OracleDataset(Dataset):
1346
  )
1347
 
1348
  if idx % 100 == 0:
1349
- print(f"[Sample {idx}] CONTEXT mode | cache_load: {_timings['cache_load']*1000:.1f}ms | "
1350
  f"total: {_timings['total']*1000:.1f}ms | events: {len(cached_data.get('event_sequence', []))}")
1351
 
1352
  return cached_data
1353
 
1354
- # RAW MODE: Fall through to original __getitem__ logic with runtime T_cutoff sampling
1355
- raw_data = cached_data
1356
-
1357
- required_keys = [
1358
- "mint_timestamp",
1359
- "max_limit_time",
1360
- "token_address",
1361
- "creator_address",
1362
- "trades",
1363
- "transfers",
1364
- "pool_creations",
1365
- "liquidity_changes",
1366
- "fee_collections",
1367
- "burns",
1368
- "supply_locks",
1369
- "migrations",
1370
- "quality_score"
1371
- ]
1372
- missing_keys = [key for key in required_keys if key not in raw_data]
1373
- if missing_keys:
1374
- raise RuntimeError(
1375
- f"Cached sample missing raw fields ({missing_keys}). Rebuild cache with raw caching enabled."
1376
- )
1377
-
1378
- # --- CHECK: Determine if we have new-style complete cache ---
1379
- has_complete_cache = 'cached_wallet_data' in raw_data and 'cached_graph_data' in raw_data
1380
-
1381
- # --- TIMING: T_cutoff sampling prep ---
1382
- _t0 = _time.perf_counter()
1383
-
1384
- def _timestamp_to_order_value(ts_value: Any) -> float:
1385
- if isinstance(ts_value, datetime.datetime):
1386
- if ts_value.tzinfo is None:
1387
- ts_value = ts_value.replace(tzinfo=datetime.timezone.utc)
1388
- return ts_value.timestamp()
1389
- try:
1390
- return float(ts_value)
1391
- except (TypeError, ValueError):
1392
- return 0.0
1393
-
1394
- # --- DYNAMIC SAMPLING LOGIC ---
1395
- mint_timestamp = raw_data['mint_timestamp']
1396
- if isinstance(mint_timestamp, datetime.datetime) and mint_timestamp.tzinfo is None:
1397
- mint_timestamp = mint_timestamp.replace(tzinfo=datetime.timezone.utc)
1398
-
1399
- min_window = 30 # seconds
1400
- horizons = sorted(self.horizons_seconds)
1401
- first_horizon = horizons[0] if horizons else 60
1402
- min_label = max(60, first_horizon)
1403
- preferred_horizon = horizons[1] if len(horizons) > 1 else min_label
1404
-
1405
- mint_ts_value = _timestamp_to_order_value(mint_timestamp)
1406
-
1407
- # ============================================================================
1408
- # T_CUTOFF SAMPLING: Index-based with Successful Trade Guarantee
1409
- # ============================================================================
1410
- # 1. Use ALL trades (sorted by timestamp) for context
1411
- # 2. Find indices of SUCCESSFUL trades (needed for label computation)
1412
- # 3. Sample interval: [min_context_trades-1, last_successful_idx - 1]
1413
- # 4. This guarantees: N trades for context, 1+ successful trade for labels
1414
- # ============================================================================
1415
-
1416
- all_trades_raw = raw_data.get('trades', [])
1417
- if not all_trades_raw:
1418
- return None
1419
-
1420
- # Sort ALL trades by timestamp
1421
- all_trades_sorted = sorted(
1422
- [t for t in all_trades_raw if t.get('timestamp') is not None],
1423
- key=lambda t: _timestamp_to_order_value(t.get('timestamp'))
1424
  )
1425
 
1426
- min_context_trades = self.min_trades
1427
- if len(all_trades_sorted) < (min_context_trades + 1): # context + 1 trade after cutoff
1428
- return None
1429
-
1430
- # Find indices of SUCCESSFUL trades (valid for label computation)
1431
- successful_indices = [
1432
- i for i, t in enumerate(all_trades_sorted)
1433
- if t.get('success', False) and float(t.get('price_usd', 0) or 0) > 0
1434
- ]
1435
-
1436
- if len(successful_indices) < 2: # Need at least 2 successful: anchor + future
1437
- return None
1438
-
1439
- max_horizon_seconds = max(self.horizons_seconds) if self.horizons_seconds else 0
1440
- # Define sampling interval
1441
- min_idx = min_context_trades - 1 # At least N trades for context
1442
- max_idx = len(all_trades_sorted) - 2 # Need at least 1 trade after cutoff
1443
-
1444
- if max_idx < min_idx:
1445
- return None
1446
-
1447
- # Precompute last successful index <= i and next successful index > i
1448
- last_successful_before = [-1] * len(all_trades_sorted)
1449
- last_seen = -1
1450
- succ_set = set(successful_indices)
1451
- for i in range(len(all_trades_sorted)):
1452
- if i in succ_set:
1453
- last_seen = i
1454
- last_successful_before[i] = last_seen
1455
-
1456
- next_successful_after = [-1] * len(all_trades_sorted)
1457
- next_seen = -1
1458
- for i in range(len(all_trades_sorted) - 1, -1, -1):
1459
- if i in succ_set:
1460
- next_seen = i
1461
- next_successful_after[i] = next_seen
1462
-
1463
- # Build eligible indices that guarantee:
1464
- # 1) anchor successful trade at or before cutoff
1465
- # 2) successful trade within max_horizon_seconds after cutoff
1466
- eligible_indices = []
1467
- for i in range(min_idx, max_idx + 1):
1468
- anchor_idx = last_successful_before[i]
1469
- next_idx = next_successful_after[i + 1] if i + 1 < len(all_trades_sorted) else -1
1470
- if anchor_idx < 0 or next_idx < 0:
1471
- continue
1472
- cutoff_ts = _timestamp_to_order_value(all_trades_sorted[i].get('timestamp'))
1473
- next_ts = _timestamp_to_order_value(all_trades_sorted[next_idx].get('timestamp'))
1474
- if next_ts <= cutoff_ts + max_horizon_seconds:
1475
- eligible_indices.append(i)
1476
-
1477
- if not eligible_indices:
1478
- return None
1479
-
1480
- # Sample random index within eligible interval
1481
- sample_idx = random.choice(eligible_indices)
1482
-
1483
- # T_cutoff = timestamp of the sampled trade
1484
- sample_trade = all_trades_sorted[sample_idx]
1485
- sample_offset_ts = _timestamp_to_order_value(sample_trade.get('timestamp'))
1486
- T_cutoff = datetime.datetime.fromtimestamp(sample_offset_ts, tz=datetime.timezone.utc)
1487
- _timings['t_cutoff_sampling'] = _time.perf_counter() - _t0
1488
-
1489
- # --- TIMING: Wallet collection ---
1490
- _t0 = _time.perf_counter()
1491
- token_address = raw_data['token_address']
1492
- creator_address = raw_data['creator_address']
1493
- cutoff_ts = _timestamp_to_order_value(T_cutoff)
1494
-
1495
- def _add_wallet(addr: Optional[str], wallet_set: set):
1496
- if addr:
1497
- wallet_set.add(addr)
1498
-
1499
- wallets_to_fetch = set()
1500
- _add_wallet(creator_address, wallets_to_fetch)
1501
-
1502
- for trade in raw_data.get('trades', []):
1503
- if _timestamp_to_order_value(trade.get('timestamp')) <= cutoff_ts:
1504
- _add_wallet(trade.get('maker'), wallets_to_fetch)
1505
-
1506
- for transfer in raw_data.get('transfers', []):
1507
- if _timestamp_to_order_value(transfer.get('timestamp')) <= cutoff_ts:
1508
- _add_wallet(transfer.get('source'), wallets_to_fetch)
1509
- _add_wallet(transfer.get('destination'), wallets_to_fetch)
1510
-
1511
- for pool in raw_data.get('pool_creations', []):
1512
- if _timestamp_to_order_value(pool.get('timestamp')) <= cutoff_ts:
1513
- _add_wallet(pool.get('creator_address'), wallets_to_fetch)
1514
-
1515
- for liq in raw_data.get('liquidity_changes', []):
1516
- if _timestamp_to_order_value(liq.get('timestamp')) <= cutoff_ts:
1517
- _add_wallet(liq.get('lp_provider'), wallets_to_fetch)
1518
-
1519
- # Offline Holder Lookup using raw_data['holder_snapshots_list']
1520
- # We need the snapshot corresponding to T_cutoff.
1521
- # Intervals are every 300s from mint_ts.
1522
- # idx = (T_cutoff - mint) // 300
1523
- elapsed = (T_cutoff - mint_timestamp).total_seconds()
1524
- snap_idx = int(elapsed // 300)
1525
- holder_records = []
1526
- cached_holders_list = raw_data.get('holder_snapshots_list')
1527
- if not isinstance(cached_holders_list, list):
1528
- raise RuntimeError("Invalid cache: holder_snapshots_list must be a list.")
1529
- if not (0 <= snap_idx < len(cached_holders_list)):
1530
- raise RuntimeError(
1531
- f"Invalid cache: holder_snapshots_list index out of range (snap_idx={snap_idx}, len={len(cached_holders_list)})."
1532
- )
1533
- snapshot_data = cached_holders_list[snap_idx]
1534
- if not isinstance(snapshot_data, dict) or not isinstance(snapshot_data.get('holders'), list):
1535
- raise RuntimeError("Invalid cache: holder snapshot entry must be a dict with list field 'holders'.")
1536
- holder_records = snapshot_data['holders']
1537
- for holder in holder_records:
1538
- if not isinstance(holder, dict) or 'wallet_address' not in holder or 'current_balance' not in holder:
1539
- raise RuntimeError("Invalid cache: each holder record must include wallet_address and current_balance.")
1540
- _add_wallet(holder['wallet_address'], wallets_to_fetch)
1541
- _timings['wallet_collection'] = _time.perf_counter() - _t0
1542
- _timings['num_wallets'] = len(wallets_to_fetch)
1543
-
1544
- pooler = EmbeddingPooler()
1545
-
1546
- # --- TIMING: Token data (OFFLINE - uses cached image bytes) ---
1547
- _t0 = _time.perf_counter()
1548
-
1549
- # Build minimal main token metadata from cache (no HTTP calls)
1550
- offline_token_data = {token_address: self._build_main_token_seed(token_address, raw_data)}
1551
-
1552
- # If we have cached image bytes, convert to PIL Image for the pooler
1553
- cached_image_bytes = raw_data.get('cached_image_bytes')
1554
- if cached_image_bytes:
1555
- try:
1556
- cached_image = Image.open(BytesIO(cached_image_bytes))
1557
- offline_token_data[token_address]['_cached_image_pil'] = cached_image
1558
- except Exception as e:
1559
- pass # Image decoding failed, will use None
1560
-
1561
- main_token_data = self._process_token_data_offline(
1562
- [token_address], pooler, T_cutoff, token_data=offline_token_data
1563
- )
1564
- _timings['token_data_and_images'] = _time.perf_counter() - _t0
1565
-
1566
- if not main_token_data:
1567
- return None
1568
-
1569
- # --- TIMING: Wallet data (OFFLINE - uses pre-cached profiles/socials/holdings) ---
1570
- _t0 = _time.perf_counter()
1571
-
1572
- if has_complete_cache:
1573
- # Use new complete cache format
1574
- cached_wallet_bundle = raw_data.get('cached_wallet_data', {})
1575
- offline_profiles = cached_wallet_bundle.get('profiles', {})
1576
- offline_socials = cached_wallet_bundle.get('socials', {})
1577
- offline_holdings = cached_wallet_bundle.get('holdings', {})
1578
- else:
1579
- # Fallback to old partial cache format
1580
- cached_social_bundle = raw_data.get('socials', {})
1581
- offline_profiles = cached_social_bundle.get('profiles', {})
1582
- offline_socials = cached_social_bundle.get('socials', {})
1583
- offline_holdings = {}
1584
-
1585
- wallet_data, all_token_data = self._process_wallet_data(
1586
- list(wallets_to_fetch),
1587
- main_token_data.copy(),
1588
- pooler,
1589
- T_cutoff,
1590
- profiles_override=offline_profiles,
1591
- socials_override=offline_socials,
1592
- holdings_override=offline_holdings
1593
- )
1594
- _timings['wallet_data'] = _time.perf_counter() - _t0
1595
- _timings['num_tokens_in_holdings'] = len(all_token_data) - 1 if all_token_data else 0
1596
-
1597
- # --- TIMING: Graph links (OFFLINE - uses pre-cached graph data) ---
1598
- _t0 = _time.perf_counter()
1599
-
1600
- if has_complete_cache:
1601
- # Use new complete cache format
1602
- cached_graph_bundle = raw_data.get('cached_graph_data', {})
1603
- graph_entities = cached_graph_bundle.get('entities', {})
1604
- graph_links = cached_graph_bundle.get('links', {})
1605
- else:
1606
- # No graph data in old cache format
1607
- graph_entities = {}
1608
- graph_links = {}
1609
-
1610
- _timings['graph_links'] = _time.perf_counter() - _t0
1611
- _timings['num_graph_entities'] = len(graph_entities)
1612
-
1613
- # --- TIMING: Generate dataset item ---
1614
- _t0 = _time.perf_counter()
1615
- # Generate the item
1616
- result = self._generate_dataset_item(
1617
- token_address=token_address,
1618
- t0=mint_timestamp,
1619
- T_cutoff=T_cutoff,
1620
- mint_event={ # Reconstruct simplified mint event
1621
- 'event_type': 'Mint',
1622
- 'timestamp': int(mint_timestamp.timestamp()),
1623
- 'relative_ts': 0,
1624
- 'wallet_address': creator_address,
1625
- 'token_address': token_address,
1626
- 'protocol_id': raw_data.get('protocol_id', 0)
1627
- },
1628
- trade_records=raw_data['trades'],
1629
- transfer_records=raw_data['transfers'],
1630
- pool_creation_records=raw_data['pool_creations'],
1631
- liquidity_change_records=raw_data['liquidity_changes'],
1632
- fee_collection_records=raw_data['fee_collections'],
1633
- burn_records=raw_data['burns'],
1634
- supply_lock_records=raw_data['supply_locks'],
1635
- migration_records=raw_data['migrations'],
1636
- wallet_data=wallet_data,
1637
- all_token_data=all_token_data,
1638
- graph_links=graph_links,
1639
- graph_seed_entities=wallets_to_fetch,
1640
- all_graph_entities=graph_entities,
1641
- future_trades_for_labels=raw_data['trades'], # We utilize full trade history for labels!
1642
- pooler=pooler,
1643
- sample_idx=idx,
1644
- cached_holders_list=raw_data.get('holder_snapshots_list'),
1645
- cached_ohlc_1s=raw_data.get('ohlc_1s'),
1646
- quality_score=raw_data.get('quality_score')
1647
- )
1648
- _timings['generate_item'] = _time.perf_counter() - _t0
1649
-
1650
- # --- TIMING: Total and summary ---
1651
- _timings['total'] = _time.perf_counter() - _total_start
1652
-
1653
- # Only print timing summary occasionally to reduce log spam
1654
- if idx % 100 == 0:
1655
- print(f"[Sample {idx}] OFFLINE mode | cache_load: {_timings['cache_load']*1000:.1f}ms | "
1656
- f"total: {_timings['total']*1000:.1f}ms | wallets: {_timings['num_wallets']} | "
1657
- f"graph_entities: {_timings['num_graph_entities']}")
1658
-
1659
- return result
1660
-
1661
  def _process_token_data_offline(self, token_addresses: List[str], pooler: EmbeddingPooler,
1662
  T_cutoff: datetime.datetime, token_data: Optional[Dict] = None) -> Dict[str, Dict[str, Any]]:
1663
  """
@@ -2301,7 +2178,9 @@ class OracleDataset(Dataset):
2301
  'relative_ts': int(last_ts) - int(t0_timestamp),
2302
  'opens': self._normalize_price_series(opens_raw),
2303
  'closes': self._normalize_price_series(closes_raw),
2304
- 'i': interval_label
 
 
2305
  }
2306
  emitted_events.append(chart_event)
2307
  return emitted_events
@@ -2601,22 +2480,22 @@ class OracleDataset(Dataset):
2601
 
2602
  if not all_trades:
2603
  # No valid trades for label computation
2604
- movement_targets = derive_movement_targets(
2605
- [0.0] * len(self.horizons_seconds),
2606
- [0.0] * len(self.horizons_seconds),
2607
- movement_label_config=self.movement_label_config,
2608
- )
2609
  return {
2610
  'event_sequence': event_sequence,
2611
  'wallets': wallet_data,
2612
  'tokens': all_token_data,
2613
  'graph_links': graph_links,
2614
  'embedding_pooler': pooler,
 
 
2615
  'labels': torch.zeros(len(self.horizons_seconds), dtype=torch.float32),
2616
  'labels_mask': torch.zeros(len(self.horizons_seconds), dtype=torch.float32),
2617
  'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32),
2618
- 'movement_class_targets': torch.tensor(movement_targets['movement_class_targets'], dtype=torch.long),
2619
- 'movement_class_mask': torch.tensor(movement_targets['movement_class_mask'], dtype=torch.long),
2620
  }
2621
 
2622
  # Ensure sorted
@@ -2696,12 +2575,11 @@ class OracleDataset(Dataset):
2696
 
2697
  # DEBUG: Mask summaries removed after validation
2698
 
2699
- movement_targets = derive_movement_targets(
2700
- label_values,
2701
- mask_values,
2702
- movement_label_config=self.movement_label_config,
2703
- )
2704
-
2705
  return {
2706
  'sample_idx': sample_idx if sample_idx is not None else -1, # Debug trace
2707
  'token_address': token_address, # For debugging
@@ -2711,11 +2589,11 @@ class OracleDataset(Dataset):
2711
  'tokens': all_token_data,
2712
  'graph_links': graph_links,
2713
  'embedding_pooler': pooler,
 
 
2714
  'labels': torch.tensor(label_values, dtype=torch.float32),
2715
  'labels_mask': torch.tensor(mask_values, dtype=torch.float32),
2716
  'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32),
2717
- 'movement_class_targets': torch.tensor(movement_targets['movement_class_targets'], dtype=torch.long),
2718
- 'movement_class_mask': torch.tensor(movement_targets['movement_class_mask'], dtype=torch.long),
2719
  }
2720
 
2721
  def _embed_context(self, context: Dict[str, Any], encoder: Any) -> None:
 
18
  from models.multi_modal_processor import MultiModalEncoder
19
  from data.data_fetcher import DataFetcher # NEW: Import the DataFetcher
20
  from data.context_targets import derive_movement_targets
21
+ from data.quant_ohlc_feature_schema import (
22
+ FEATURE_VERSION,
23
+ FEATURE_VERSION_ID,
24
+ LOOKBACK_SECONDS,
25
+ TOKENS_PER_SEGMENT,
26
+ WINDOW_SECONDS,
27
+ empty_feature_dict,
28
+ feature_dict_to_vector,
29
+ )
30
+ from signals.patterns import compute_pattern_features
31
+ from signals.rolling_quant import compute_rolling_quant_features
32
+ from signals.support_resistance import compute_support_resistance_features
33
+ from signals.trendlines import compute_trendline_features
34
  from requests.adapters import HTTPAdapter
35
  from urllib3.util.retry import Retry
36
 
 
1291
 
1292
  return full_ohlc
1293
 
1294
+ def _compute_quant_rolling_features(
1295
+ self,
1296
+ closes: List[float],
1297
+ end_idx: int,
1298
+ ) -> Dict[str, float]:
1299
+ return compute_rolling_quant_features(closes, end_idx)
1300
+
1301
+ def _compute_support_resistance_features(
1302
+ self,
1303
+ closes: List[float],
1304
+ highs: List[float],
1305
+ lows: List[float],
1306
+ end_idx: int,
1307
+ window_start: int,
1308
+ window_end: int,
1309
+ timestamps: List[int],
1310
+ ) -> Dict[str, float]:
1311
+ return compute_support_resistance_features(
1312
+ closes=closes,
1313
+ highs=highs,
1314
+ lows=lows,
1315
+ end_idx=end_idx,
1316
+ window_start=window_start,
1317
+ window_end=window_end,
1318
+ timestamps=timestamps,
1319
+ )
1320
+
1321
+ def _compute_trendline_features(
1322
+ self,
1323
+ closes: List[float],
1324
+ highs: List[float],
1325
+ lows: List[float],
1326
+ end_idx: int,
1327
+ ) -> Dict[str, float]:
1328
+ return compute_trendline_features(closes, highs, lows, end_idx)
1329
+
1330
+ def _compute_optional_pattern_flags(
1331
+ self,
1332
+ closes: List[float],
1333
+ highs: List[float],
1334
+ lows: List[float],
1335
+ end_idx: int,
1336
+ ) -> Dict[str, float]:
1337
+ return compute_pattern_features(closes, highs, lows, end_idx)
1338
+
1339
+ def _extract_quant_ohlc_features_for_segment(
1340
+ self,
1341
+ segment: List[tuple],
1342
+ interval_label: str,
1343
+ ) -> List[Dict[str, Any]]:
1344
+ del interval_label
1345
+ if not segment:
1346
+ return []
1347
+
1348
+ timestamps = [int(row[0]) for row in segment]
1349
+ opens = [float(row[1]) for row in segment]
1350
+ closes = [float(row[2]) for row in segment]
1351
+ highs = [max(o, c) for o, c in zip(opens, closes)]
1352
+ lows = [min(o, c) for o, c in zip(opens, closes)]
1353
+ log_closes = np.log(np.clip(np.asarray(closes, dtype=np.float64), 1e-8, None))
1354
+ one_sec_returns = np.diff(log_closes)
1355
+ feature_windows: List[Dict[str, Any]] = []
1356
+
1357
+ for window_idx in range(TOKENS_PER_SEGMENT):
1358
+ window_start = window_idx * WINDOW_SECONDS
1359
+ if window_start >= len(segment):
1360
+ break
1361
+ window_end = min(len(segment), window_start + WINDOW_SECONDS)
1362
+ current_end_idx = window_end - 1
1363
+ window_returns = one_sec_returns[window_start:max(window_start, current_end_idx)]
1364
+ window_closes = closes[window_start:window_end]
1365
+ window_highs = highs[window_start:window_end]
1366
+ window_lows = lows[window_start:window_end]
1367
+ features = empty_feature_dict()
1368
+
1369
+ if window_closes:
1370
+ window_close_arr = np.asarray(window_closes, dtype=np.float64)
1371
+ window_return_sum = float(np.sum(window_returns)) if window_returns.size > 0 else 0.0
1372
+ range_width = max(max(window_highs) - min(window_lows), 0.0)
1373
+ first_close = float(window_close_arr[0])
1374
+ last_close = float(window_close_arr[-1])
1375
+ accel_proxy = 0.0
1376
+ if window_returns.size >= 2:
1377
+ accel_proxy = float(window_returns[-1] - window_returns[0])
1378
+ features.update({
1379
+ "cum_log_return": window_return_sum,
1380
+ "mean_log_return_1s": float(np.mean(window_returns)) if window_returns.size > 0 else 0.0,
1381
+ "std_log_return_1s": float(np.std(window_returns)) if window_returns.size > 0 else 0.0,
1382
+ "max_up_1s": float(np.max(window_returns)) if window_returns.size > 0 else 0.0,
1383
+ "max_down_1s": float(np.min(window_returns)) if window_returns.size > 0 else 0.0,
1384
+ "realized_vol": float(np.sqrt(np.sum(np.square(window_returns)))) if window_returns.size > 0 else 0.0,
1385
+ "window_range_frac": range_width / max(abs(last_close), 1e-8),
1386
+ "close_to_close_slope": (last_close - first_close) / max(abs(first_close), 1e-8),
1387
+ "accel_proxy": accel_proxy,
1388
+ "frac_pos_1s": float(np.mean(window_returns > 0)) if window_returns.size > 0 else 0.0,
1389
+ "frac_neg_1s": float(np.mean(window_returns < 0)) if window_returns.size > 0 else 0.0,
1390
+ })
1391
+
1392
+ current_price = closes[current_end_idx]
1393
+ current_high = highs[current_end_idx]
1394
+ current_low = lows[current_end_idx]
1395
+ for lookback in LOOKBACK_SECONDS:
1396
+ prefix = f"lb_{lookback}s"
1397
+ lookback_start = max(0, current_end_idx - lookback + 1)
1398
+ hist_closes = closes[lookback_start: current_end_idx + 1]
1399
+ hist_highs = highs[lookback_start: current_end_idx + 1]
1400
+ hist_lows = lows[lookback_start: current_end_idx + 1]
1401
+ hist_range = max(max(hist_highs) - min(hist_lows), 1e-8)
1402
+ rolling_high = max(hist_highs)
1403
+ rolling_low = min(hist_lows)
1404
+ hist_returns = np.diff(np.log(np.clip(np.asarray(hist_closes, dtype=np.float64), 1e-8, None)))
1405
+ current_width = max(max(window_highs) - min(window_lows), 0.0)
1406
+ prev_hist_width = max(max(hist_highs[:-len(window_highs)]) - min(hist_lows[:-len(window_lows)]), 0.0) if len(hist_highs) > len(window_highs) else current_width
1407
+ prev_close = closes[current_end_idx - 1] if current_end_idx > 0 else current_price
1408
+
1409
+ features.update({
1410
+ f"{prefix}_dist_high": (rolling_high - current_price) / max(abs(current_price), 1e-8),
1411
+ f"{prefix}_dist_low": (current_price - rolling_low) / max(abs(current_price), 1e-8),
1412
+ f"{prefix}_drawdown_high": (current_price - rolling_high) / max(abs(rolling_high), 1e-8),
1413
+ f"{prefix}_rebound_low": (current_price - rolling_low) / max(abs(rolling_low), 1e-8),
1414
+ f"{prefix}_pos_in_range": (current_price - rolling_low) / hist_range,
1415
+ f"{prefix}_range_width": hist_range / max(abs(current_price), 1e-8),
1416
+ f"{prefix}_compression_ratio": current_width / max(prev_hist_width, 1e-8),
1417
+ f"{prefix}_breakout_high": 1.0 if current_high > rolling_high and prev_close <= rolling_high else 0.0,
1418
+ f"{prefix}_breakdown_low": 1.0 if current_low < rolling_low and prev_close >= rolling_low else 0.0,
1419
+ f"{prefix}_reclaim_breakdown": 1.0 if current_low < rolling_low and current_price >= rolling_low else 0.0,
1420
+ f"{prefix}_rejection_breakout": 1.0 if current_high > rolling_high and current_price <= rolling_high else 0.0,
1421
+ })
1422
+
1423
+ features.update(self._compute_support_resistance_features(
1424
+ closes=closes,
1425
+ highs=highs,
1426
+ lows=lows,
1427
+ end_idx=current_end_idx,
1428
+ window_start=window_start,
1429
+ window_end=window_end,
1430
+ timestamps=timestamps,
1431
+ ))
1432
+ features.update(self._compute_trendline_features(
1433
+ closes=closes,
1434
+ highs=highs,
1435
+ lows=lows,
1436
+ end_idx=current_end_idx,
1437
+ ))
1438
+ features.update(self._compute_quant_rolling_features(
1439
+ closes=closes,
1440
+ end_idx=current_end_idx,
1441
+ ))
1442
+ features.update(self._compute_optional_pattern_flags(
1443
+ closes=closes,
1444
+ highs=highs,
1445
+ lows=lows,
1446
+ end_idx=current_end_idx,
1447
+ ))
1448
+
1449
+ feature_windows.append({
1450
+ "start_ts": timestamps[window_start],
1451
+ "end_ts": timestamps[current_end_idx],
1452
+ "window_seconds": WINDOW_SECONDS,
1453
+ "feature_vector": feature_dict_to_vector(features),
1454
+ "feature_names_version": FEATURE_VERSION,
1455
+ "feature_version_id": FEATURE_VERSION_ID,
1456
+ "level_snapshot": {
1457
+ "support_distance": features.get("nearest_support_dist", 0.0),
1458
+ "resistance_distance": features.get("nearest_resistance_dist", 0.0),
1459
+ "support_strength": features.get("support_strength", 0.0),
1460
+ "resistance_strength": features.get("resistance_strength", 0.0),
1461
+ },
1462
+ "pattern_flags": {
1463
+ key.replace("pattern_", "").replace("_confidence", ""): features[key]
1464
+ for key in features.keys()
1465
+ if key.startswith("pattern_") and key.endswith("_confidence")
1466
+ },
1467
+ })
1468
 
1469
+ return feature_windows
 
1470
 
1471
+ def __getitem__(self, idx: int) -> Optional[Dict[str, Any]]:
1472
+ """
1473
+ Loads data from cache.
1474
  """
1475
  import time as _time
1476
  _timings = {}
 
1494
  if not cached_data:
1495
  raise RuntimeError(f"No data loaded for index {idx}")
1496
 
1497
+ has_context_shape = (
1498
+ isinstance(cached_data, dict) and
1499
+ 'event_sequence' in cached_data and
1500
+ 'tokens' in cached_data and
1501
+ 'wallets' in cached_data and
1502
+ 'labels' in cached_data and
1503
+ 'labels_mask' in cached_data
1504
+ )
 
 
 
 
 
1505
 
1506
+ if has_context_shape:
1507
+ # Return pre-computed training context directly.
 
1508
  _timings['total'] = _time.perf_counter() - _total_start
1509
 
1510
  if 'movement_class_targets' not in cached_data and 'labels' in cached_data and 'labels_mask' in cached_data:
 
1525
  )
1526
 
1527
  if idx % 100 == 0:
1528
+ print(f"[Sample {idx}] context cache | cache_load: {_timings['cache_load']*1000:.1f}ms | "
1529
  f"total: {_timings['total']*1000:.1f}ms | events: {len(cached_data.get('event_sequence', []))}")
1530
 
1531
  return cached_data
1532
 
1533
+ raise RuntimeError(
1534
+ f"Cached item at {filepath} is not a valid context cache. "
1535
+ "Rebuild the cache with scripts/cache_dataset.py."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1536
  )
1537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1538
  def _process_token_data_offline(self, token_addresses: List[str], pooler: EmbeddingPooler,
1539
  T_cutoff: datetime.datetime, token_data: Optional[Dict] = None) -> Dict[str, Dict[str, Any]]:
1540
  """
 
2178
  'relative_ts': int(last_ts) - int(t0_timestamp),
2179
  'opens': self._normalize_price_series(opens_raw),
2180
  'closes': self._normalize_price_series(closes_raw),
2181
+ 'i': interval_label,
2182
+ 'quant_ohlc_features': self._extract_quant_ohlc_features_for_segment(segment, interval_label) if interval_label == "1s" else [],
2183
+ 'quant_feature_version': FEATURE_VERSION,
2184
  }
2185
  emitted_events.append(chart_event)
2186
  return emitted_events
 
2480
 
2481
  if not all_trades:
2482
  # No valid trades for label computation
2483
+ quant_payload = [
2484
+ event.get('quant_ohlc_features', [])
2485
+ for event in event_sequence
2486
+ if event.get('event_type') == 'Chart_Segment'
2487
+ ]
2488
  return {
2489
  'event_sequence': event_sequence,
2490
  'wallets': wallet_data,
2491
  'tokens': all_token_data,
2492
  'graph_links': graph_links,
2493
  'embedding_pooler': pooler,
2494
+ 'quant_ohlc_features': quant_payload,
2495
+ 'quant_feature_version': FEATURE_VERSION,
2496
  'labels': torch.zeros(len(self.horizons_seconds), dtype=torch.float32),
2497
  'labels_mask': torch.zeros(len(self.horizons_seconds), dtype=torch.float32),
2498
  'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32),
 
 
2499
  }
2500
 
2501
  # Ensure sorted
 
2575
 
2576
  # DEBUG: Mask summaries removed after validation
2577
 
2578
+ quant_payload = [
2579
+ event.get('quant_ohlc_features', [])
2580
+ for event in event_sequence
2581
+ if event.get('event_type') == 'Chart_Segment'
2582
+ ]
 
2583
  return {
2584
  'sample_idx': sample_idx if sample_idx is not None else -1, # Debug trace
2585
  'token_address': token_address, # For debugging
 
2589
  'tokens': all_token_data,
2590
  'graph_links': graph_links,
2591
  'embedding_pooler': pooler,
2592
+ 'quant_ohlc_features': quant_payload,
2593
+ 'quant_feature_version': FEATURE_VERSION,
2594
  'labels': torch.tensor(label_values, dtype=torch.float32),
2595
  'labels_mask': torch.tensor(mask_values, dtype=torch.float32),
2596
  'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32),
 
 
2597
  }
2598
 
2599
  def _embed_context(self, context: Dict[str, Any], encoder: Any) -> None:
data/quant_ohlc_feature_schema.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ from typing import Dict, Iterable, List
3
+
4
+
5
+ FEATURE_VERSION = "qohlc_v1"
6
+ FEATURE_VERSION_ID = 1
7
+ WINDOW_SECONDS = 5
8
+ SEGMENT_SECONDS = 300
9
+ TOKENS_PER_SEGMENT = SEGMENT_SECONDS // WINDOW_SECONDS
10
+ PATTERN_NAMES = [
11
+ "double_top",
12
+ "double_bottom",
13
+ "ascending_triangle",
14
+ "descending_triangle",
15
+ "head_shoulders",
16
+ "inverse_head_shoulders",
17
+ ]
18
+ LOOKBACK_SECONDS = [15, 30, 60, 120]
19
+
20
+
21
+ FEATURE_NAMES: List[str] = [
22
+ "cum_log_return",
23
+ "mean_log_return_1s",
24
+ "std_log_return_1s",
25
+ "max_up_1s",
26
+ "max_down_1s",
27
+ "realized_vol",
28
+ "window_range_frac",
29
+ "close_to_close_slope",
30
+ "accel_proxy",
31
+ "frac_pos_1s",
32
+ "frac_neg_1s",
33
+ ]
34
+
35
+ for lookback in LOOKBACK_SECONDS:
36
+ prefix = f"lb_{lookback}s"
37
+ FEATURE_NAMES.extend([
38
+ f"{prefix}_dist_high",
39
+ f"{prefix}_dist_low",
40
+ f"{prefix}_drawdown_high",
41
+ f"{prefix}_rebound_low",
42
+ f"{prefix}_pos_in_range",
43
+ f"{prefix}_range_width",
44
+ f"{prefix}_compression_ratio",
45
+ f"{prefix}_breakout_high",
46
+ f"{prefix}_breakdown_low",
47
+ f"{prefix}_reclaim_breakdown",
48
+ f"{prefix}_rejection_breakout",
49
+ ])
50
+
51
+ FEATURE_NAMES.extend([
52
+ "nearest_support_dist",
53
+ "nearest_resistance_dist",
54
+ "support_touch_count",
55
+ "resistance_touch_count",
56
+ "support_age_sec",
57
+ "resistance_age_sec",
58
+ "support_strength",
59
+ "resistance_strength",
60
+ "inside_support_zone",
61
+ "inside_resistance_zone",
62
+ "support_swept",
63
+ "resistance_swept",
64
+ "support_reclaim",
65
+ "resistance_reject",
66
+ "lower_trendline_slope",
67
+ "upper_trendline_slope",
68
+ "dist_to_lower_line",
69
+ "dist_to_upper_line",
70
+ "trend_channel_width",
71
+ "trend_convergence",
72
+ "trend_breakout_upper",
73
+ "trend_breakdown_lower",
74
+ "trend_reentry",
75
+ "ema_fast",
76
+ "ema_medium",
77
+ "sma_fast",
78
+ "sma_medium",
79
+ "price_minus_ema_fast",
80
+ "price_minus_ema_medium",
81
+ "ema_spread",
82
+ "price_zscore",
83
+ "mean_reversion_score",
84
+ "rolling_vol_zscore",
85
+ ])
86
+
87
+ for pattern_name in PATTERN_NAMES:
88
+ FEATURE_NAMES.append(f"pattern_{pattern_name}_confidence")
89
+
90
+ FEATURE_NAMES.extend([
91
+ "sr_available",
92
+ "trendline_available",
93
+ "pattern_available",
94
+ ])
95
+
96
+ FEATURE_INDEX = {name: idx for idx, name in enumerate(FEATURE_NAMES)}
97
+ NUM_QUANT_OHLC_FEATURES = len(FEATURE_NAMES)
98
+
99
+ FEATURE_GROUPS = OrderedDict([
100
+ ("price_path", [
101
+ "cum_log_return",
102
+ "mean_log_return_1s",
103
+ "std_log_return_1s",
104
+ "max_up_1s",
105
+ "max_down_1s",
106
+ "realized_vol",
107
+ "window_range_frac",
108
+ "close_to_close_slope",
109
+ "accel_proxy",
110
+ "frac_pos_1s",
111
+ "frac_neg_1s",
112
+ ]),
113
+ ("relative_structure", [name for name in FEATURE_NAMES if name.startswith("lb_")]),
114
+ ("levels_breaks", [
115
+ "nearest_support_dist",
116
+ "nearest_resistance_dist",
117
+ "support_touch_count",
118
+ "resistance_touch_count",
119
+ "support_age_sec",
120
+ "resistance_age_sec",
121
+ "support_strength",
122
+ "resistance_strength",
123
+ "inside_support_zone",
124
+ "inside_resistance_zone",
125
+ "support_swept",
126
+ "resistance_swept",
127
+ "support_reclaim",
128
+ "resistance_reject",
129
+ ]),
130
+ ("trendlines", [
131
+ "lower_trendline_slope",
132
+ "upper_trendline_slope",
133
+ "dist_to_lower_line",
134
+ "dist_to_upper_line",
135
+ "trend_channel_width",
136
+ "trend_convergence",
137
+ "trend_breakout_upper",
138
+ "trend_breakdown_lower",
139
+ "trend_reentry",
140
+ ]),
141
+ ("rolling_quant", [
142
+ "ema_fast",
143
+ "ema_medium",
144
+ "sma_fast",
145
+ "sma_medium",
146
+ "price_minus_ema_fast",
147
+ "price_minus_ema_medium",
148
+ "ema_spread",
149
+ "price_zscore",
150
+ "mean_reversion_score",
151
+ "rolling_vol_zscore",
152
+ ]),
153
+ ("patterns", [name for name in FEATURE_NAMES if name.startswith("pattern_")]),
154
+ ("availability", [
155
+ "sr_available",
156
+ "trendline_available",
157
+ "pattern_available",
158
+ ]),
159
+ ])
160
+
161
+
162
def empty_feature_dict() -> Dict[str, float]:
    """Return a fresh mapping of every schema feature name to 0.0."""
    return dict.fromkeys(FEATURE_NAMES, 0.0)
164
+
165
+
166
def feature_dict_to_vector(features: Dict[str, float]) -> List[float]:
    """Pack a feature dict into a list ordered by ``FEATURE_NAMES``.

    Missing keys, and values that cannot be coerced with ``float()`` (e.g.
    ``None`` or a non-numeric string), are emitted as ``0.0`` so the result
    always has exactly ``NUM_QUANT_OHLC_FEATURES`` entries.
    """
    out: List[float] = []
    for name in FEATURE_NAMES:
        value = features.get(name, 0.0)
        try:
            out.append(float(value))
        except (TypeError, ValueError):
            # float() raises exactly these for bad payloads; narrowed from a
            # bare `except Exception` that could mask unrelated bugs.
            out.append(0.0)
    return out
175
+
176
+
177
def group_feature_indices(group_names: Iterable[str]) -> List[int]:
    """Return the sorted, de-duplicated vector indices for the given groups.

    Raises ``KeyError`` when a group name is not present in ``FEATURE_GROUPS``.
    """
    collected = {
        FEATURE_INDEX[feature_name]
        for group_name in group_names
        for feature_name in FEATURE_GROUPS[group_name]
    }
    return sorted(collected)
inference.py CHANGED
@@ -15,7 +15,9 @@ from models.token_encoder import TokenEncoder
15
  from models.wallet_encoder import WalletEncoder
16
  from models.graph_updater import GraphUpdater
17
  from models.ohlc_embedder import OHLCEmbedder
 
18
  import models.vocabulary as vocab
 
19
 
20
  # --- NEW: Import database clients ---
21
  from clickhouse_driver import Client as ClickHouseClient
@@ -56,7 +58,11 @@ if __name__ == "__main__":
56
 
57
  real_ohlc_emb = OHLCEmbedder(
58
  num_intervals=vocab.NUM_OHLC_INTERVALS,
59
- sequence_length=OHLC_SEQ_LEN,
 
 
 
 
60
  dtype=dtype
61
  )
62
 
@@ -96,7 +102,8 @@ if __name__ == "__main__":
96
  quantiles=_test_quantiles,
97
  horizons_seconds=_test_horizons,
98
  dtype=dtype,
99
- ohlc_embedder=real_ohlc_emb
 
100
  ).to(device)
101
  model.eval()
102
  print(f"Oracle d_model: {model.d_model}")
 
15
  from models.wallet_encoder import WalletEncoder
16
  from models.graph_updater import GraphUpdater
17
  from models.ohlc_embedder import OHLCEmbedder
18
+ from models.quant_ohlc_embedder import QuantOHLCEmbedder
19
  import models.vocabulary as vocab
20
+ from data.quant_ohlc_feature_schema import NUM_QUANT_OHLC_FEATURES, TOKENS_PER_SEGMENT
21
 
22
  # --- NEW: Import database clients ---
23
  from clickhouse_driver import Client as ClickHouseClient
 
58
 
59
  real_ohlc_emb = OHLCEmbedder(
60
  num_intervals=vocab.NUM_OHLC_INTERVALS,
61
+ dtype=dtype
62
+ )
63
+ real_quant_ohlc_emb = QuantOHLCEmbedder(
64
+ num_features=NUM_QUANT_OHLC_FEATURES,
65
+ sequence_length=TOKENS_PER_SEGMENT,
66
  dtype=dtype
67
  )
68
 
 
102
  quantiles=_test_quantiles,
103
  horizons_seconds=_test_horizons,
104
  dtype=dtype,
105
+ ohlc_embedder=real_ohlc_emb,
106
+ quant_ohlc_embedder=real_quant_ohlc_emb
107
  ).to(device)
108
  model.eval()
109
  print(f"Oracle d_model: {model.d_model}")
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df78e2b44dd97a148be762f91e3b00f397651f8e7e43ee21f938492291fdfa3a
3
- size 83447
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49c4d500d58301a7c158851716c6cf0c7e6bc60cdf59f7a166fcb92b4e77b04a
3
+ size 390587
models/model.py CHANGED
@@ -14,6 +14,7 @@ from models.token_encoder import TokenEncoder
14
  from models.wallet_encoder import WalletEncoder
15
  from models.graph_updater import GraphUpdater
16
  from models.ohlc_embedder import OHLCEmbedder
 
17
  from models.HoldersEncoder import HolderDistributionEncoder # NEW
18
  from models.SocialEncoders import SocialEncoder # NEW
19
  import models.vocabulary as vocab # For vocab sizes
@@ -28,6 +29,7 @@ class Oracle(nn.Module):
28
  wallet_encoder: WalletEncoder,
29
  graph_updater: GraphUpdater,
30
  ohlc_embedder: OHLCEmbedder, # NEW
 
31
  time_encoder: ContextualTimeEncoder,
32
  num_event_types: int,
33
  multi_modal_dim: int,
@@ -124,7 +126,8 @@ class Oracle(nn.Module):
124
  self.token_encoder = token_encoder
125
  self.wallet_encoder = wallet_encoder
126
  self.graph_updater = graph_updater
127
- self.ohlc_embedder = ohlc_embedder
 
128
  self.time_encoder = time_encoder # Store time_encoder
129
 
130
  self.social_encoder = SocialEncoder(d_model=self.d_model, dtype=self.dtype) # Now self.d_model is defined
@@ -140,7 +143,7 @@ class Oracle(nn.Module):
140
  # --- 5. Define Entity Padding (Learnable) ---
141
  self.pad_wallet_emb = nn.Parameter(torch.zeros(1, self.wallet_encoder.d_model))
142
  self.pad_token_emb = nn.Parameter(torch.zeros(1, self.token_encoder.output_dim))
143
- self.pad_ohlc_emb = nn.Parameter(torch.zeros(1, self.ohlc_embedder.output_dim))
144
  self.pad_precomputed_emb = nn.Parameter(torch.zeros(1, self.multi_modal_dim)) # NEW: For text/images
145
 
146
  # --- NEW: Instantiate HolderDistributionEncoder internally ---
@@ -157,7 +160,16 @@ class Oracle(nn.Module):
157
  self.rel_ts_norm = nn.LayerNorm(1)
158
  self.wallet_proj = nn.Linear(self.wallet_encoder.d_model, self.d_model)
159
  self.token_proj = nn.Linear(self.token_encoder.output_dim, self.d_model)
160
- self.ohlc_proj = nn.Linear(self.ohlc_embedder.output_dim, self.d_model)
 
 
 
 
 
 
 
 
 
161
  # self.holder_snapshot_proj is no longer needed as HolderDistributionEncoder outputs directly to d_model
162
 
163
 
@@ -309,7 +321,7 @@ class Oracle(nn.Module):
309
 
310
  @classmethod
311
  def from_pretrained(cls, load_directory: str,
312
- token_encoder, wallet_encoder, graph_updater, ohlc_embedder, time_encoder):
313
  """
314
  Loads the Oracle model from a saved directory.
315
  Note: You must still provide the initialized sub-encoders (or we can refactor to save them too).
@@ -329,6 +341,7 @@ class Oracle(nn.Module):
329
  wallet_encoder=wallet_encoder,
330
  graph_updater=graph_updater,
331
  ohlc_embedder=ohlc_embedder,
 
332
  time_encoder=time_encoder,
333
  num_event_types=config["num_event_types"],
334
  multi_modal_dim=config["multi_modal_dim"],
@@ -431,6 +444,14 @@ class Oracle(nn.Module):
431
  neginf=0.0
432
  )
433
  ohlc_interval_ids = batch['ohlc_interval_ids'].to(device)
 
 
 
 
 
 
 
 
434
  graph_updater_links = batch['graph_updater_links']
435
 
436
  # 1a. Encode Tokens
@@ -490,9 +511,39 @@ class Oracle(nn.Module):
490
 
491
  # 1c. Encode OHLC
492
  if ohlc_price_tensors.shape[0] > 0:
493
- batch_ohlc_embeddings_raw = self.ohlc_embedder(ohlc_price_tensors, ohlc_interval_ids)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
  else:
495
- batch_ohlc_embeddings_raw = torch.empty(0, self.ohlc_embedder.output_dim, device=device, dtype=self.dtype)
496
 
497
  # 1d. Run Graph Updater
498
  pad_wallet_raw = self.pad_wallet_emb.to(self.dtype)
 
14
  from models.wallet_encoder import WalletEncoder
15
  from models.graph_updater import GraphUpdater
16
  from models.ohlc_embedder import OHLCEmbedder
17
+ from models.quant_ohlc_embedder import QuantOHLCEmbedder
18
  from models.HoldersEncoder import HolderDistributionEncoder # NEW
19
  from models.SocialEncoders import SocialEncoder # NEW
20
  import models.vocabulary as vocab # For vocab sizes
 
29
  wallet_encoder: WalletEncoder,
30
  graph_updater: GraphUpdater,
31
  ohlc_embedder: OHLCEmbedder, # NEW
32
+ quant_ohlc_embedder: QuantOHLCEmbedder,
33
  time_encoder: ContextualTimeEncoder,
34
  num_event_types: int,
35
  multi_modal_dim: int,
 
126
  self.token_encoder = token_encoder
127
  self.wallet_encoder = wallet_encoder
128
  self.graph_updater = graph_updater
129
+ self.ohlc_embedder = ohlc_embedder
130
+ self.quant_ohlc_embedder = quant_ohlc_embedder
131
  self.time_encoder = time_encoder # Store time_encoder
132
 
133
  self.social_encoder = SocialEncoder(d_model=self.d_model, dtype=self.dtype) # Now self.d_model is defined
 
143
  # --- 5. Define Entity Padding (Learnable) ---
144
  self.pad_wallet_emb = nn.Parameter(torch.zeros(1, self.wallet_encoder.d_model))
145
  self.pad_token_emb = nn.Parameter(torch.zeros(1, self.token_encoder.output_dim))
146
+ self.pad_ohlc_emb = nn.Parameter(torch.zeros(1, self.quant_ohlc_embedder.output_dim))
147
  self.pad_precomputed_emb = nn.Parameter(torch.zeros(1, self.multi_modal_dim)) # NEW: For text/images
148
 
149
  # --- NEW: Instantiate HolderDistributionEncoder internally ---
 
160
  self.rel_ts_norm = nn.LayerNorm(1)
161
  self.wallet_proj = nn.Linear(self.wallet_encoder.d_model, self.d_model)
162
  self.token_proj = nn.Linear(self.token_encoder.output_dim, self.d_model)
163
+ self.ohlc_proj = nn.Linear(self.quant_ohlc_embedder.output_dim, self.d_model)
164
+ self.chart_interval_fusion_embedding = nn.Embedding(vocab.NUM_OHLC_INTERVALS, 32, padding_idx=0)
165
+ fusion_input_dim = self.ohlc_embedder.output_dim + self.quant_ohlc_embedder.output_dim + 32
166
+ self.chart_fusion = nn.Sequential(
167
+ nn.Linear(fusion_input_dim, self.quant_ohlc_embedder.output_dim),
168
+ nn.GELU(),
169
+ nn.LayerNorm(self.quant_ohlc_embedder.output_dim),
170
+ nn.Linear(self.quant_ohlc_embedder.output_dim, self.quant_ohlc_embedder.output_dim),
171
+ nn.LayerNorm(self.quant_ohlc_embedder.output_dim),
172
+ )
173
  # self.holder_snapshot_proj is no longer needed as HolderDistributionEncoder outputs directly to d_model
174
 
175
 
 
321
 
322
  @classmethod
323
  def from_pretrained(cls, load_directory: str,
324
+ token_encoder, wallet_encoder, graph_updater, ohlc_embedder, quant_ohlc_embedder, time_encoder):
325
  """
326
  Loads the Oracle model from a saved directory.
327
  Note: You must still provide the initialized sub-encoders (or we can refactor to save them too).
 
341
  wallet_encoder=wallet_encoder,
342
  graph_updater=graph_updater,
343
  ohlc_embedder=ohlc_embedder,
344
+ quant_ohlc_embedder=quant_ohlc_embedder,
345
  time_encoder=time_encoder,
346
  num_event_types=config["num_event_types"],
347
  multi_modal_dim=config["multi_modal_dim"],
 
444
  neginf=0.0
445
  )
446
  ohlc_interval_ids = batch['ohlc_interval_ids'].to(device)
447
+ quant_ohlc_feature_tensors = torch.nan_to_num(
448
+ batch['quant_ohlc_feature_tensors'].to(device, self.dtype),
449
+ nan=0.0,
450
+ posinf=0.0,
451
+ neginf=0.0
452
+ )
453
+ quant_ohlc_feature_mask = batch['quant_ohlc_feature_mask'].to(device)
454
+ quant_ohlc_feature_version_ids = batch['quant_ohlc_feature_version_ids'].to(device)
455
  graph_updater_links = batch['graph_updater_links']
456
 
457
  # 1a. Encode Tokens
 
511
 
512
  # 1c. Encode OHLC
513
  if ohlc_price_tensors.shape[0] > 0:
514
+ raw_chart_embeddings = self.ohlc_embedder(ohlc_price_tensors, ohlc_interval_ids)
515
+ else:
516
+ raw_chart_embeddings = torch.empty(0, self.ohlc_embedder.output_dim, device=device, dtype=self.dtype)
517
+ if quant_ohlc_feature_tensors.shape[0] > 0:
518
+ quant_chart_embeddings = self.quant_ohlc_embedder(
519
+ quant_ohlc_feature_tensors,
520
+ quant_ohlc_feature_mask,
521
+ quant_ohlc_feature_version_ids,
522
+ )
523
+ else:
524
+ quant_chart_embeddings = torch.empty(0, self.quant_ohlc_embedder.output_dim, device=device, dtype=self.dtype)
525
+ num_chart_segments = max(raw_chart_embeddings.shape[0], quant_chart_embeddings.shape[0])
526
+ if num_chart_segments > 0:
527
+ if raw_chart_embeddings.shape[0] == 0:
528
+ raw_chart_embeddings = torch.zeros(
529
+ num_chart_segments,
530
+ self.ohlc_embedder.output_dim,
531
+ device=device,
532
+ dtype=self.dtype,
533
+ )
534
+ if quant_chart_embeddings.shape[0] == 0:
535
+ quant_chart_embeddings = torch.zeros(
536
+ num_chart_segments,
537
+ self.quant_ohlc_embedder.output_dim,
538
+ device=device,
539
+ dtype=self.dtype,
540
+ )
541
+ interval_embeds = self.chart_interval_fusion_embedding(ohlc_interval_ids[:num_chart_segments]).to(self.dtype)
542
+ batch_ohlc_embeddings_raw = self.chart_fusion(
543
+ torch.cat([raw_chart_embeddings, quant_chart_embeddings, interval_embeds], dim=-1)
544
+ )
545
  else:
546
+ batch_ohlc_embeddings_raw = torch.empty(0, self.quant_ohlc_embedder.output_dim, device=device, dtype=self.dtype)
547
 
548
  # 1d. Run Graph Updater
549
  pad_wallet_raw = self.pad_wallet_emb.to(self.dtype)
models/ohlc_embedder.py CHANGED
@@ -19,11 +19,11 @@ class OHLCEmbedder(nn.Module):
19
  num_intervals: int,
20
  input_channels: int = 2, # Open, Close
21
  # sequence_length: int = 300, # REMOVED: HARDCODED
22
- cnn_channels: List[int] = [16, 32, 64],
23
  kernel_sizes: List[int] = [3, 3, 3],
24
  # --- NEW: Interval embedding dim ---
25
- interval_embed_dim: int = 32,
26
- output_dim: int = 4096,
27
  dtype: torch.dtype = torch.float16
28
  ):
29
  super().__init__()
@@ -116,4 +116,3 @@ class OHLCEmbedder(nn.Module):
116
  # Shape: [batch_size, output_dim]
117
 
118
  return x
119
-
 
19
  num_intervals: int,
20
  input_channels: int = 2, # Open, Close
21
  # sequence_length: int = 300, # REMOVED: HARDCODED
22
+ cnn_channels: List[int] = [8, 16, 32],
23
  kernel_sizes: List[int] = [3, 3, 3],
24
  # --- NEW: Interval embedding dim ---
25
+ interval_embed_dim: int = 16,
26
+ output_dim: int = 512,
27
  dtype: torch.dtype = torch.float16
28
  ):
29
  super().__init__()
 
116
  # Shape: [batch_size, output_dim]
117
 
118
  return x
 
models/quant_ohlc_embedder.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class QuantOHLCEmbedder(nn.Module):
    """Transformer encoder that embeds fixed-length windows of quantitative OHLC features.

    Each sample is a window of ``sequence_length`` per-bar feature vectors
    (``num_features`` wide). Tokens are LayerNorm'd and projected to
    ``hidden_dim``, combined with a learned positional embedding and a
    per-sample feature-schema ("version") embedding, encoded with a
    pre-norm Transformer, masked-mean pooled over valid positions, and
    projected to ``output_dim``. Samples whose mask is entirely zero
    produce an all-zero embedding (and no NaNs).
    """

    def __init__(
        self,
        num_features: int,
        sequence_length: int = 60,
        version_vocab_size: int = 4,
        hidden_dim: int = 320,
        num_layers: int = 3,
        num_heads: int = 8,
        output_dim: int = 1536,
        dtype: torch.dtype = torch.float16,
    ):
        super().__init__()
        self.num_features = num_features
        self.sequence_length = sequence_length
        self.output_dim = output_dim
        self.dtype = dtype

        # Per-token projection: normalize raw features, then lift to hidden_dim.
        self.feature_proj = nn.Sequential(
            nn.LayerNorm(num_features),
            nn.Linear(num_features, hidden_dim),
            nn.GELU(),
        )
        # Learned absolute positions for the fixed-length window.
        self.position_embedding = nn.Parameter(torch.zeros(1, sequence_length, hidden_dim))
        # Feature-schema version embedding; id 0 is reserved as padding.
        self.version_embedding = nn.Embedding(version_vocab_size, hidden_dim, padding_idx=0)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim * 4,
            dropout=0.0,
            batch_first=True,
            activation="gelu",
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.output_head = nn.Sequential(
            nn.LayerNorm(hidden_dim),
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.GELU(),
            nn.LayerNorm(hidden_dim * 2),
            nn.Linear(hidden_dim * 2, output_dim),
            nn.LayerNorm(output_dim),
        )
        self.to(dtype)

    def forward(
        self,
        feature_tokens: torch.Tensor,
        feature_mask: torch.Tensor,
        version_ids: torch.Tensor,
    ) -> torch.Tensor:
        """Embed a batch of feature windows.

        Args:
            feature_tokens: [B, sequence_length, num_features] feature values.
            feature_mask: [B, sequence_length]; > 0 marks valid positions.
            version_ids: [B] feature-schema version id per sample.

        Returns:
            [B, output_dim] embeddings; rows are all-zero for fully padded samples.

        Raises:
            ValueError: if ``feature_tokens`` is not [B, sequence_length, num_features].
        """
        if feature_tokens.ndim != 3:
            raise ValueError(f"Expected [B, T, F], got {feature_tokens.shape}")
        if feature_tokens.shape[1] != self.sequence_length:
            raise ValueError(f"Expected T={self.sequence_length}, got {feature_tokens.shape[1]}")
        if feature_tokens.shape[2] != self.num_features:
            raise ValueError(f"Expected F={self.num_features}, got {feature_tokens.shape[2]}")

        x = self.feature_proj(feature_tokens.to(self.dtype))
        version_embed = self.version_embedding(version_ids).unsqueeze(1)
        x = x + self.position_embedding[:, : x.shape[1], :].to(x.dtype) + version_embed

        # True = position is ignored. BUGFIX: a row that is ALL True makes
        # every attention softmax operate over only -inf, producing NaNs
        # that the final masking cannot remove (NaN * 0 == NaN). Un-mask a
        # single position for fully padded rows so the encoder stays finite;
        # their output is zeroed below via `valid_any`.
        key_padding_mask = ~(feature_mask > 0)
        fully_padded = key_padding_mask.all(dim=1)
        if fully_padded.any():
            key_padding_mask = key_padding_mask.clone()
            key_padding_mask[fully_padded, 0] = False
        x = self.encoder(x, src_key_padding_mask=key_padding_mask)

        # Masked mean over valid positions; denominator floored at 1 so
        # fully padded rows divide by 1 instead of 0.
        mask = feature_mask.to(x.dtype).unsqueeze(-1)
        valid_any = (feature_mask.sum(dim=1, keepdim=True) > 0).to(x.dtype)
        denom = mask.sum(dim=1).clamp_min(1.0)
        pooled = (x * mask).sum(dim=1) / denom
        out = self.output_head(pooled)
        # Zero the embedding for samples with no valid positions at all.
        return out * valid_any
pre_cache.sh CHANGED
@@ -4,6 +4,7 @@
4
  CONTEXT_LENGTH=4096
5
  MIN_TRADES=10
6
  SAMPLES_PER_TOKEN=1
 
7
  NUM_WORKERS=1
8
 
9
  OUTPUT_DIR="data/cache"
@@ -20,6 +21,7 @@ echo "========================================"
20
  echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
21
  echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
22
  echo "Samples per Token: $SAMPLES_PER_TOKEN"
 
23
  echo "Num Workers: $NUM_WORKERS"
24
  echo "Horizons (sec): ${HORIZONS_SECONDS[*]}"
25
  echo "Quantiles: ${QUANTILES[*]}"
@@ -36,10 +38,10 @@ python3 scripts/cache_dataset.py \
36
  --context_length "$CONTEXT_LENGTH" \
37
  --min_trades "$MIN_TRADES" \
38
  --samples_per_token "$SAMPLES_PER_TOKEN" \
 
39
  --num_workers "$NUM_WORKERS" \
40
  --horizons_seconds "${HORIZONS_SECONDS[@]}" \
41
  --quantiles "${QUANTILES[@]}" \
42
- --max_samples 10 \
43
  "$@"
44
 
45
  echo "Done!"
 
4
  CONTEXT_LENGTH=4096
5
  MIN_TRADES=10
6
  SAMPLES_PER_TOKEN=1
7
+ TARGET_CONTEXTS_PER_CLASS=10
8
  NUM_WORKERS=1
9
 
10
  OUTPUT_DIR="data/cache"
 
21
  echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
22
  echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
23
  echo "Samples per Token: $SAMPLES_PER_TOKEN"
24
+ echo "Target Contexts per Class: $TARGET_CONTEXTS_PER_CLASS"
25
  echo "Num Workers: $NUM_WORKERS"
26
  echo "Horizons (sec): ${HORIZONS_SECONDS[*]}"
27
  echo "Quantiles: ${QUANTILES[*]}"
 
38
  --context_length "$CONTEXT_LENGTH" \
39
  --min_trades "$MIN_TRADES" \
40
  --samples_per_token "$SAMPLES_PER_TOKEN" \
41
+ --target_contexts_per_class "$TARGET_CONTEXTS_PER_CLASS" \
42
  --num_workers "$NUM_WORKERS" \
43
  --horizons_seconds "${HORIZONS_SECONDS[@]}" \
44
  --quantiles "${QUANTILES[@]}" \
 
45
  "$@"
46
 
47
  echo "Done!"
sample_2kGqvM18kGLby9bY_5.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/cache_dataset.py CHANGED
@@ -6,15 +6,13 @@ import argparse
6
  import datetime
7
  import torch
8
  import json
9
- import math
10
  from pathlib import Path
11
  from tqdm import tqdm
12
  from dotenv import load_dotenv
13
  import huggingface_hub
14
  import logging
15
- from concurrent.futures import ProcessPoolExecutor, as_completed
16
  import multiprocessing as mp
17
- from PIL import Image
18
 
19
  logging.getLogger("httpx").setLevel(logging.WARNING)
20
  logging.getLogger("transformers").setLevel(logging.ERROR)
@@ -23,7 +21,9 @@ logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
23
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
24
 
25
  from scripts.analyze_distribution import get_return_class_map, compute_p99_clamps
26
- from scripts.compute_quality_score import get_token_quality_scores, fetch_token_metrics, _bucket_id, _midrank_percentiles, EPS
 
 
27
 
28
  from clickhouse_driver import Client as ClickHouseClient
29
  from neo4j import GraphDatabase
@@ -58,6 +58,32 @@ def _representative_context_polarity(context):
58
  return "positive" if max(valid_returns) > 0.0 else "negative"
59
 
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def _select_contexts_by_polarity(contexts, max_keep, desired_positive=None, desired_negative=None):
62
  if len(contexts) <= max_keep:
63
  polarity_counts = {}
@@ -113,68 +139,6 @@ def _select_contexts_by_polarity(contexts, max_keep, desired_positive=None, desi
113
  return selected[:max_keep], polarity_counts
114
 
115
 
116
- def _allocate_class_targets(mints_by_class, target_total, positive_balance_min_class, positive_ratio):
117
- from collections import defaultdict
118
- import random
119
-
120
- class_ids = sorted(mints_by_class.keys())
121
- if not class_ids:
122
- return {}, {}, {}
123
-
124
- target_per_class = target_total // len(class_ids)
125
- remainder = target_total % len(class_ids)
126
-
127
- token_plans = {}
128
- class_targets = {}
129
- class_polarity_targets = {}
130
-
131
- for pos, class_id in enumerate(class_ids):
132
- class_target = target_per_class + (1 if pos < remainder else 0)
133
- class_targets[class_id] = class_target
134
-
135
- token_list = list(mints_by_class[class_id])
136
- random.shuffle(token_list)
137
- if not token_list or class_target <= 0:
138
- class_polarity_targets[class_id] = {"positive": 0, "negative": 0}
139
- continue
140
-
141
- if class_id >= positive_balance_min_class:
142
- positive_target = int(round(class_target * positive_ratio))
143
- positive_target = min(max(positive_target, 0), class_target)
144
- else:
145
- positive_target = 0
146
- negative_target = class_target - positive_target
147
- class_polarity_targets[class_id] = {
148
- "positive": positive_target,
149
- "negative": negative_target,
150
- }
151
-
152
- assigned_positive = 0
153
- assigned_negative = 0
154
- token_count = len(token_list)
155
- for sample_num in range(class_target):
156
- token_idx, mint_record = token_list[sample_num % token_count]
157
- mint_addr = mint_record["mint_address"]
158
- plan_key = (token_idx, mint_addr)
159
- if plan_key not in token_plans:
160
- token_plans[plan_key] = {
161
- "samples_to_keep": 0,
162
- "desired_positive": 0,
163
- "desired_negative": 0,
164
- "class_id": class_id,
165
- }
166
- token_plans[plan_key]["samples_to_keep"] += 1
167
-
168
- if assigned_positive < positive_target:
169
- token_plans[plan_key]["desired_positive"] += 1
170
- assigned_positive += 1
171
- else:
172
- token_plans[plan_key]["desired_negative"] += 1
173
- assigned_negative += 1
174
-
175
- return token_plans, class_targets, class_polarity_targets
176
-
177
-
178
  def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map):
179
  global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
180
  from data.data_loader import OracleDataset
@@ -200,7 +164,6 @@ def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map
200
 
201
  _worker_dataset = OracleDataset(
202
  data_fetcher=data_fetcher,
203
- max_samples=dataset_config['max_samples'],
204
  start_date=dataset_config['start_date'],
205
  horizons_seconds=dataset_config['horizons_seconds'],
206
  quantiles=dataset_config['quantiles'],
@@ -214,7 +177,7 @@ def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map
214
 
215
 
216
  def _process_single_token_context(args):
217
- idx, mint_addr, samples_per_token, output_dir, oversample_factor, desired_positive, desired_negative = args
218
  global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
219
  try:
220
  class_id = _worker_return_class_map.get(mint_addr)
@@ -244,15 +207,6 @@ def _process_single_token_context(args):
244
  q_score = _worker_quality_scores_map.get(mint_addr)
245
  if q_score is None:
246
  return {'status': 'skipped', 'reason': 'no quality score', 'mint': mint_addr}
247
- saved_files = []
248
- for ctx_idx, ctx in enumerate(contexts):
249
- ctx["quality_score"] = q_score
250
- ctx["class_id"] = class_id
251
- filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
252
- output_path = Path(output_dir) / filename
253
-
254
- torch.save(ctx, output_path)
255
- saved_files.append(filename)
256
  return {
257
  'status': 'success',
258
  'mint': mint_addr,
@@ -260,7 +214,7 @@ def _process_single_token_context(args):
260
  'q_score': q_score,
261
  'n_contexts': len(contexts),
262
  'n_events': len(contexts[0].get('event_sequence', [])) if contexts else 0,
263
- 'files': saved_files,
264
  'polarity_counts': polarity_counts,
265
  }
266
  except Exception as e:
@@ -284,7 +238,6 @@ def main():
284
 
285
  parser = argparse.ArgumentParser()
286
  parser.add_argument("--output_dir", type=str, default="data/cache")
287
- parser.add_argument("--max_samples", type=int, default=None)
288
  parser.add_argument("--start_date", type=str, default=None)
289
 
290
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
@@ -292,8 +245,8 @@ def main():
292
  parser.add_argument("--context_length", type=int, default=8192)
293
  parser.add_argument("--min_trades", type=int, default=10)
294
  parser.add_argument("--samples_per_token", type=int, default=1)
 
295
  parser.add_argument("--context_oversample_factor", type=int, default=4)
296
- parser.add_argument("--cache_balance_mode", type=str, default="hybrid", choices=["class", "uniform", "hybrid"])
297
  parser.add_argument("--positive_balance_min_class", type=int, default=2)
298
  parser.add_argument("--positive_context_ratio", type=float, default=0.5)
299
  parser.add_argument("--horizons_seconds", type=int, nargs="+", default=[30, 60, 120, 240, 420])
@@ -308,6 +261,10 @@ def main():
308
 
309
  if args.num_workers == 0:
310
  args.num_workers = max(1, mp.cpu_count() - 4)
 
 
 
 
311
 
312
  output_dir = Path(args.output_dir)
313
  output_dir.mkdir(parents=True, exist_ok=True)
@@ -334,7 +291,7 @@ def main():
334
  quality_scores_map = get_token_quality_scores(clickhouse_client)
335
  print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
336
 
337
- dataset = OracleDataset(data_fetcher=data_fetcher, max_samples=args.max_samples, start_date=start_date_dt, horizons_seconds=args.horizons_seconds, quantiles=args.quantiles, min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length, p99_clamps=p99_clamps)
338
 
339
  if len(dataset) == 0:
340
  print("WARNING: No samples. Exiting.")
@@ -370,93 +327,73 @@ def main():
370
  print(f"INFO: Workers: {args.num_workers}")
371
 
372
  db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
373
- dataset_config = {'max_samples': args.max_samples, 'start_date': start_date_dt, 'horizons_seconds': args.horizons_seconds, 'quantiles': args.quantiles, 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints, 'p99_clamps': p99_clamps}
374
 
375
- # Build tasks with class-aware multi-sampling for balanced cache
376
  import random
377
- from collections import Counter, defaultdict
378
 
379
- # Count eligible tokens per class
380
  eligible_class_counts = Counter()
381
- mints_by_class = defaultdict(list)
382
  for i, m in enumerate(filtered_mints):
383
  cid = return_class_map.get(m['mint_address'])
384
  if cid is not None:
385
  eligible_class_counts[cid] += 1
386
- mints_by_class[cid].append((i, m))
387
 
388
  print(f"INFO: Eligible tokens per class: {dict(sorted(eligible_class_counts.items()))}")
389
 
390
- num_classes = len(eligible_class_counts)
391
- if args.max_samples:
392
- target_total = args.max_samples
393
- else:
394
- target_total = 15000 # Default target: 15k balanced files
395
- target_per_class = target_total // max(num_classes, 1)
396
-
397
- token_plans, class_targets, class_polarity_targets = _allocate_class_targets(
398
- mints_by_class=mints_by_class,
399
- target_total=target_total,
400
- positive_balance_min_class=args.positive_balance_min_class,
401
- positive_ratio=args.positive_context_ratio,
402
- )
403
 
404
- print(f"INFO: Target total: {target_total}, Target per class: {target_per_class}")
 
405
  print(f"INFO: Exact class targets: {dict(sorted(class_targets.items()))}")
406
  print(f"INFO: Class polarity targets: {dict(sorted(class_polarity_targets.items()))}")
407
 
408
- # Build balanced task list
409
- tasks = []
410
- if args.cache_balance_mode == "uniform":
411
- target_tokens = len(filtered_mints)
412
- if args.max_samples:
413
- target_tokens = min(len(filtered_mints), max(1, math.ceil(args.max_samples / max(args.samples_per_token, 1))))
414
- mint_pool = list(enumerate(filtered_mints))
415
- random.shuffle(mint_pool)
416
- for i, m in mint_pool[:target_tokens]:
417
- tasks.append((i, m['mint_address'], args.samples_per_token, str(output_dir), args.context_oversample_factor, 0, args.samples_per_token))
418
- else:
419
- for (token_idx, mint_addr), plan in token_plans.items():
420
- tasks.append((
421
- token_idx,
422
- mint_addr,
423
- plan["samples_to_keep"],
424
- str(output_dir),
425
- args.context_oversample_factor,
426
- plan["desired_positive"],
427
- plan["desired_negative"],
428
- ))
429
-
430
- random.shuffle(tasks) # Shuffle tasks for even load distribution across workers
431
- expected_files = sum(task[2] for task in tasks)
432
- print(f"INFO: Total tasks: {len(tasks)} (expected ~{expected_files} output files, target ~{target_total})")
433
 
434
  success_count, skipped_count, error_count = 0, 0, 0
435
- class_distribution = {}
436
- polarity_distribution = {}
 
 
 
 
437
 
438
- # --- Resume support: skip tokens that already have cached files ---
439
  existing_files = set(f.name for f in output_dir.glob("sample_*.pt"))
440
  if existing_files:
441
- pre_resume = len(tasks)
442
- filtered_tasks = []
443
  already_cached = 0
444
- for task in tasks:
445
- mint_addr = task[1] # task = (idx, mint_addr, ...)
446
- # Check if any file exists for this mint (context mode: sample_MINT_0.pt, raw mode: sample_MINT.pt)
447
- mint_prefix = f"sample_{mint_addr[:16]}"
448
- has_cached = any(ef.startswith(mint_prefix) for ef in existing_files)
449
- if has_cached:
450
- already_cached += 1
451
- # Count existing files toward class distribution
452
- cid = return_class_map.get(mint_addr)
453
- if cid is not None:
454
- class_distribution[cid] = class_distribution.get(cid, 0) + 1
455
- success_count += 1
456
- else:
457
- filtered_tasks.append(task)
458
- tasks = filtered_tasks
459
- print(f"INFO: Resume: {already_cached} tokens already cached, {len(tasks)} remaining (was {pre_resume})")
 
 
 
 
460
 
461
  print(f"INFO: Starting to cache {len(tasks)} tokens...")
462
  process_fn = _process_single_token_context
@@ -484,64 +421,113 @@ def main():
484
  error_log_path = Path(args.output_dir) / "cache_errors.log"
485
  error_samples = [] # First 20 unique error messages
486
 
487
- if args.num_workers == 1:
488
- print("INFO: Single-threaded mode...")
489
- _init_worker(db_config, dataset_config, return_class_map, quality_scores_map)
490
- start_time = _time.perf_counter()
491
- recent_times = []
492
- for task_num, task in enumerate(tqdm(tasks, desc="Caching", unit="tok")):
493
- t0 = _time.perf_counter()
494
- result = process_fn(task)
495
- elapsed = _time.perf_counter() - t0
496
- recent_times.append(elapsed)
497
- if len(recent_times) > 50:
498
- recent_times.pop(0)
499
- if result['status'] == 'success':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
500
  success_count += 1
501
- class_distribution[result['class_id']] = class_distribution.get(result['class_id'], 0) + 1
502
- for polarity, count in result.get('polarity_counts', {}).items():
503
- polarity_distribution[polarity] = polarity_distribution.get(polarity, 0) + count
504
- elif result['status'] == 'skipped':
505
- skipped_count += 1
506
  else:
507
- error_count += 1
508
- err_msg = result.get('error', 'unknown')
509
- tqdm.write(f"ERROR: {result['mint'][:16]} - {err_msg}")
510
- if len(error_samples) < 20:
511
- error_samples.append({'mint': result.get('mint'), 'error': err_msg, 'traceback': result.get('traceback', '')})
 
 
 
 
 
 
 
512
  _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
513
- else:
514
- print(f"INFO: Running with {args.num_workers} workers...")
515
- start_time = _time.perf_counter()
516
- recent_times = []
517
- with ProcessPoolExecutor(max_workers=args.num_workers, initializer=_init_worker, initargs=(db_config, dataset_config, return_class_map, quality_scores_map)) as executor:
518
- futures = {executor.submit(process_fn, task): task for task in tasks}
519
- for task_num, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Caching", unit="tok")):
520
- t0 = _time.perf_counter()
521
- try:
522
- result = future.result(timeout=300)
523
- elapsed = _time.perf_counter() - t0
524
- recent_times.append(elapsed)
525
- if len(recent_times) > 50:
526
- recent_times.pop(0)
527
- if result['status'] == 'success':
528
- success_count += 1
529
- class_distribution[result['class_id']] = class_distribution.get(result['class_id'], 0) + 1
530
- for polarity, count in result.get('polarity_counts', {}).items():
531
- polarity_distribution[polarity] = polarity_distribution.get(polarity, 0) + count
532
- elif result['status'] == 'skipped':
533
- skipped_count += 1
534
- else:
535
- error_count += 1
536
- err_msg = result.get('error', 'unknown')
537
- if len(error_samples) < 20:
538
- error_samples.append({'mint': result.get('mint'), 'error': err_msg, 'traceback': result.get('traceback', '')})
539
- if error_count <= 5:
540
- tqdm.write(f"ERROR: {result.get('mint', '?')[:16]} - {err_msg}")
541
- except Exception as e:
542
- error_count += 1
543
- tqdm.write(f"WORKER ERROR: {e}")
544
- _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
545
 
546
  # Write error log
547
  if error_samples:
@@ -553,28 +539,24 @@ def main():
553
  print(f"INFO: First {len(error_samples)} error tracebacks saved to {error_log_path}")
554
 
555
  print("INFO: Building metadata...")
556
- file_class_map = {}
557
- for f in sorted(output_dir.glob("sample_*.pt")):
558
- try:
559
- file_class_map[f.name] = torch.load(f, map_location="cpu", weights_only=False).get("class_id", 0)
560
- except:
561
- pass
562
-
563
  with open(output_dir / "class_metadata.json", 'w') as f:
564
  json.dump({
565
  'file_class_map': file_class_map,
 
 
566
  'class_distribution': {str(k): v for k, v in class_distribution.items()},
567
  'num_workers': args.num_workers,
568
  'horizons_seconds': args.horizons_seconds,
569
  'quantiles': args.quantiles,
570
  'target_total': target_total,
571
- 'target_per_class': target_per_class,
572
- 'cache_balance_mode': args.cache_balance_mode,
573
  'context_polarity_distribution': polarity_distribution,
574
  'class_targets': {str(k): v for k, v in class_targets.items()},
575
  'class_polarity_targets': {str(k): v for k, v in class_polarity_targets.items()},
 
576
  'positive_balance_min_class': args.positive_balance_min_class,
577
  'positive_context_ratio': args.positive_context_ratio,
 
578
  }, f, indent=2)
579
 
580
  print(f"\n--- Done ---\nSuccess: {success_count}, Skipped: {skipped_count}, Errors: {error_count}\nFiles: {len(file_class_map)}\nLocation: {output_dir.resolve()}")
 
6
  import datetime
7
  import torch
8
  import json
 
9
  from pathlib import Path
10
  from tqdm import tqdm
11
  from dotenv import load_dotenv
12
  import huggingface_hub
13
  import logging
 
14
  import multiprocessing as mp
15
+ from collections import Counter, defaultdict
16
 
17
  logging.getLogger("httpx").setLevel(logging.WARNING)
18
  logging.getLogger("transformers").setLevel(logging.ERROR)
 
21
  sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
22
 
23
  from scripts.analyze_distribution import get_return_class_map, compute_p99_clamps
24
+ from scripts.compute_quality_score import get_token_quality_scores
25
+ from data.data_loader import summarize_context_window
26
+ from data.quant_ohlc_feature_schema import FEATURE_VERSION
27
 
28
  from clickhouse_driver import Client as ClickHouseClient
29
  from neo4j import GraphDatabase
 
58
  return "positive" if max(valid_returns) > 0.0 else "negative"
59
 
60
 
61
+ def _class_polarity_targets(class_id, target_contexts_per_class, positive_balance_min_class, positive_ratio):
62
+ if class_id >= positive_balance_min_class:
63
+ positive_target = int(round(target_contexts_per_class * positive_ratio))
64
+ positive_target = min(max(positive_target, 0), target_contexts_per_class)
65
+ else:
66
+ positive_target = 0
67
+ return {
68
+ "positive": positive_target,
69
+ "negative": max(0, target_contexts_per_class - positive_target),
70
+ }
71
+
72
+
73
def _remaining_polarity_targets(class_id, accepted_counts, target_contexts_per_class, positive_balance_min_class, positive_ratio):
    """Return how many positive/negative contexts class ``class_id`` still needs.

    Computes the class quotas via ``_class_polarity_targets`` and subtracts
    the counts already accepted for that class (``accepted_counts`` maps
    class_id -> {"positive": n, "negative": n, ...}), flooring each
    remainder at zero.
    """
    quota = _class_polarity_targets(
        class_id=class_id,
        target_contexts_per_class=target_contexts_per_class,
        positive_balance_min_class=positive_balance_min_class,
        positive_ratio=positive_ratio,
    )
    taken = accepted_counts[class_id]
    remaining = {}
    for polarity in ("positive", "negative"):
        remaining[polarity] = max(0, quota[polarity] - taken[polarity])
    return remaining
85
+
86
+
87
  def _select_contexts_by_polarity(contexts, max_keep, desired_positive=None, desired_negative=None):
88
  if len(contexts) <= max_keep:
89
  polarity_counts = {}
 
139
  return selected[:max_keep], polarity_counts
140
 
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map):
143
  global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
144
  from data.data_loader import OracleDataset
 
164
 
165
  _worker_dataset = OracleDataset(
166
  data_fetcher=data_fetcher,
 
167
  start_date=dataset_config['start_date'],
168
  horizons_seconds=dataset_config['horizons_seconds'],
169
  quantiles=dataset_config['quantiles'],
 
177
 
178
 
179
  def _process_single_token_context(args):
180
+ idx, mint_addr, samples_per_token, oversample_factor, desired_positive, desired_negative = args
181
  global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
182
  try:
183
  class_id = _worker_return_class_map.get(mint_addr)
 
207
  q_score = _worker_quality_scores_map.get(mint_addr)
208
  if q_score is None:
209
  return {'status': 'skipped', 'reason': 'no quality score', 'mint': mint_addr}
 
 
 
 
 
 
 
 
 
210
  return {
211
  'status': 'success',
212
  'mint': mint_addr,
 
214
  'q_score': q_score,
215
  'n_contexts': len(contexts),
216
  'n_events': len(contexts[0].get('event_sequence', [])) if contexts else 0,
217
+ 'contexts': contexts,
218
  'polarity_counts': polarity_counts,
219
  }
220
  except Exception as e:
 
238
 
239
  parser = argparse.ArgumentParser()
240
  parser.add_argument("--output_dir", type=str, default="data/cache")
 
241
  parser.add_argument("--start_date", type=str, default=None)
242
 
243
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
 
245
  parser.add_argument("--context_length", type=int, default=8192)
246
  parser.add_argument("--min_trades", type=int, default=10)
247
  parser.add_argument("--samples_per_token", type=int, default=1)
248
+ parser.add_argument("--target_contexts_per_class", type=int, default=2500)
249
  parser.add_argument("--context_oversample_factor", type=int, default=4)
 
250
  parser.add_argument("--positive_balance_min_class", type=int, default=2)
251
  parser.add_argument("--positive_context_ratio", type=float, default=0.5)
252
  parser.add_argument("--horizons_seconds", type=int, nargs="+", default=[30, 60, 120, 240, 420])
 
261
 
262
  if args.num_workers == 0:
263
  args.num_workers = max(1, mp.cpu_count() - 4)
264
+ if args.num_workers != 1:
265
+ raise RuntimeError("Quota-based caching requires --num_workers 1 so class counters remain exact.")
266
+ if args.target_contexts_per_class <= 0:
267
+ raise RuntimeError("--target_contexts_per_class must be positive.")
268
 
269
  output_dir = Path(args.output_dir)
270
  output_dir.mkdir(parents=True, exist_ok=True)
 
291
  quality_scores_map = get_token_quality_scores(clickhouse_client)
292
  print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
293
 
294
+ dataset = OracleDataset(data_fetcher=data_fetcher, start_date=start_date_dt, horizons_seconds=args.horizons_seconds, quantiles=args.quantiles, min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length, p99_clamps=p99_clamps)
295
 
296
  if len(dataset) == 0:
297
  print("WARNING: No samples. Exiting.")
 
327
  print(f"INFO: Workers: {args.num_workers}")
328
 
329
  db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
330
+ dataset_config = {'start_date': start_date_dt, 'horizons_seconds': args.horizons_seconds, 'quantiles': args.quantiles, 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints, 'p99_clamps': p99_clamps}
331
 
 
332
  import random
 
333
 
 
334
  eligible_class_counts = Counter()
 
335
  for i, m in enumerate(filtered_mints):
336
  cid = return_class_map.get(m['mint_address'])
337
  if cid is not None:
338
  eligible_class_counts[cid] += 1
 
339
 
340
  print(f"INFO: Eligible tokens per class: {dict(sorted(eligible_class_counts.items()))}")
341
 
342
+ class_targets = {
343
+ int(class_id): int(args.target_contexts_per_class)
344
+ for class_id in sorted(eligible_class_counts.keys())
345
+ }
346
+ class_polarity_targets = {
347
+ class_id: _class_polarity_targets(
348
+ class_id=class_id,
349
+ target_contexts_per_class=args.target_contexts_per_class,
350
+ positive_balance_min_class=args.positive_balance_min_class,
351
+ positive_ratio=args.positive_context_ratio,
352
+ )
353
+ for class_id in class_targets
354
+ }
355
 
356
+ target_total = args.target_contexts_per_class * len(class_targets)
357
+ print(f"INFO: Target total: {target_total}, Target per class: {args.target_contexts_per_class}")
358
  print(f"INFO: Exact class targets: {dict(sorted(class_targets.items()))}")
359
  print(f"INFO: Class polarity targets: {dict(sorted(class_polarity_targets.items()))}")
360
 
361
+ tasks = list(enumerate(filtered_mints))
362
+ random.shuffle(tasks)
363
+ print(f"INFO: Total candidate tokens: {len(tasks)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
  success_count, skipped_count, error_count = 0, 0, 0
366
+ class_distribution = defaultdict(int)
367
+ polarity_distribution = defaultdict(int)
368
+ file_class_map = {}
369
+ file_context_bucket_map = {}
370
+ file_context_summary_map = {}
371
+ accepted_counts = defaultdict(lambda: {"total": 0, "positive": 0, "negative": 0})
372
 
373
+ # Resume support: count existing files toward quotas.
374
  existing_files = set(f.name for f in output_dir.glob("sample_*.pt"))
375
  if existing_files:
 
 
376
  already_cached = 0
377
+ for f in sorted(output_dir.glob("sample_*.pt")):
378
+ try:
379
+ cached = torch.load(f, map_location="cpu", weights_only=False)
380
+ except Exception:
381
+ continue
382
+ class_id = cached.get("class_id")
383
+ if class_id is None or int(class_id) not in class_targets:
384
+ continue
385
+ class_id = int(class_id)
386
+ context_summary = summarize_context_window(cached.get("labels"), cached.get("labels_mask"))
387
+ polarity = _representative_context_polarity(cached)
388
+ file_class_map[f.name] = class_id
389
+ file_context_bucket_map[f.name] = context_summary["context_bucket"]
390
+ file_context_summary_map[f.name] = context_summary
391
+ class_distribution[class_id] += 1
392
+ polarity_distribution[polarity] += 1
393
+ accepted_counts[class_id]["total"] += 1
394
+ accepted_counts[class_id][polarity] += 1
395
+ already_cached += 1
396
+ print(f"INFO: Resume: counted {already_cached} cached contexts toward quotas.")
397
 
398
  print(f"INFO: Starting to cache {len(tasks)} tokens...")
399
  process_fn = _process_single_token_context
 
421
  error_log_path = Path(args.output_dir) / "cache_errors.log"
422
  error_samples = [] # First 20 unique error messages
423
 
424
+ print("INFO: Single-threaded mode...")
425
+ _init_worker(db_config, dataset_config, return_class_map, quality_scores_map)
426
+ start_time = _time.perf_counter()
427
+ recent_times = []
428
+ completed_classes = set()
429
+ for task_num, (idx, mint_record) in enumerate(tqdm(tasks, desc="Caching", unit="tok")):
430
+ mint_addr = mint_record["mint_address"]
431
+ class_id = return_class_map.get(mint_addr)
432
+ if class_id is None:
433
+ skipped_count += 1
434
+ _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
435
+ continue
436
+ if accepted_counts[class_id]["total"] >= class_targets[class_id]:
437
+ if class_id not in completed_classes:
438
+ completed_classes.add(class_id)
439
+ tqdm.write(f"INFO: Class {class_id} quota filled. Skipping remaining tokens for this class.")
440
+ skipped_count += 1
441
+ _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
442
+ continue
443
+
444
+ remaining_total = class_targets[class_id] - accepted_counts[class_id]["total"]
445
+ remaining_polarity = _remaining_polarity_targets(
446
+ class_id=class_id,
447
+ accepted_counts=accepted_counts,
448
+ target_contexts_per_class=args.target_contexts_per_class,
449
+ positive_balance_min_class=args.positive_balance_min_class,
450
+ positive_ratio=args.positive_context_ratio,
451
+ )
452
+ desired_positive = min(remaining_polarity["positive"], args.samples_per_token, remaining_total)
453
+ desired_negative = min(
454
+ remaining_polarity["negative"],
455
+ max(0, min(args.samples_per_token, remaining_total) - desired_positive),
456
+ )
457
+ samples_to_keep = min(args.samples_per_token, remaining_total)
458
+ task = (
459
+ idx,
460
+ mint_addr,
461
+ samples_to_keep,
462
+ args.context_oversample_factor,
463
+ desired_positive,
464
+ desired_negative,
465
+ )
466
+
467
+ t0 = _time.perf_counter()
468
+ result = process_fn(task)
469
+ elapsed = _time.perf_counter() - t0
470
+ recent_times.append(elapsed)
471
+ if len(recent_times) > 50:
472
+ recent_times.pop(0)
473
+
474
+ if result["status"] == "success":
475
+ saved_contexts = 0
476
+ for ctx in result.get("contexts", []):
477
+ if accepted_counts[class_id]["total"] >= class_targets[class_id]:
478
+ break
479
+
480
+ polarity = _representative_context_polarity(ctx)
481
+ remaining_polarity = _remaining_polarity_targets(
482
+ class_id=class_id,
483
+ accepted_counts=accepted_counts,
484
+ target_contexts_per_class=args.target_contexts_per_class,
485
+ positive_balance_min_class=args.positive_balance_min_class,
486
+ positive_ratio=args.positive_context_ratio,
487
+ )
488
+ other_polarity = "negative" if polarity == "positive" else "positive"
489
+ if remaining_polarity[polarity] <= 0 and remaining_polarity[other_polarity] > 0:
490
+ continue
491
+
492
+ ctx["quality_score"] = result["q_score"]
493
+ ctx["class_id"] = class_id
494
+ ctx["source_token"] = mint_addr
495
+ context_summary = summarize_context_window(ctx.get("labels"), ctx.get("labels_mask"))
496
+ ctx["context_bucket"] = context_summary["context_bucket"]
497
+ ctx["context_score"] = context_summary["context_score"]
498
+ file_idx = accepted_counts[class_id]["total"]
499
+ filename = f"sample_{mint_addr[:16]}_{file_idx}.pt"
500
+ output_path = output_dir / filename
501
+ torch.save(ctx, output_path)
502
+
503
+ file_class_map[filename] = class_id
504
+ file_context_bucket_map[filename] = context_summary["context_bucket"]
505
+ file_context_summary_map[filename] = context_summary
506
+ class_distribution[class_id] += 1
507
+ polarity_distribution[polarity] += 1
508
+ accepted_counts[class_id]["total"] += 1
509
+ accepted_counts[class_id][polarity] += 1
510
+ saved_contexts += 1
511
+
512
+ if saved_contexts > 0:
513
  success_count += 1
 
 
 
 
 
514
  else:
515
+ skipped_count += 1
516
+ elif result["status"] == "skipped":
517
+ skipped_count += 1
518
+ else:
519
+ error_count += 1
520
+ err_msg = result.get("error", "unknown")
521
+ tqdm.write(f"ERROR: {result['mint'][:16]} - {err_msg}")
522
+ if len(error_samples) < 20:
523
+ error_samples.append({'mint': result.get('mint'), 'error': err_msg, 'traceback': result.get('traceback', '')})
524
+
525
+ if all(accepted_counts[cid]["total"] >= class_targets[cid] for cid in class_targets):
526
+ tqdm.write("INFO: All class quotas filled. Stopping early.")
527
  _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
528
+ break
529
+
530
+ _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
531
 
532
  # Write error log
533
  if error_samples:
 
539
  print(f"INFO: First {len(error_samples)} error tracebacks saved to {error_log_path}")
540
 
541
  print("INFO: Building metadata...")
 
 
 
 
 
 
 
542
  with open(output_dir / "class_metadata.json", 'w') as f:
543
  json.dump({
544
  'file_class_map': file_class_map,
545
+ 'file_context_bucket_map': file_context_bucket_map,
546
+ 'file_context_summary_map': file_context_summary_map,
547
  'class_distribution': {str(k): v for k, v in class_distribution.items()},
548
  'num_workers': args.num_workers,
549
  'horizons_seconds': args.horizons_seconds,
550
  'quantiles': args.quantiles,
551
  'target_total': target_total,
552
+ 'target_contexts_per_class': args.target_contexts_per_class,
 
553
  'context_polarity_distribution': polarity_distribution,
554
  'class_targets': {str(k): v for k, v in class_targets.items()},
555
  'class_polarity_targets': {str(k): v for k, v in class_polarity_targets.items()},
556
+ 'accepted_counts': {str(k): v for k, v in accepted_counts.items()},
557
  'positive_balance_min_class': args.positive_balance_min_class,
558
  'positive_context_ratio': args.positive_context_ratio,
559
+ 'quant_feature_version': FEATURE_VERSION,
560
  }, f, indent=2)
561
 
562
  print(f"\n--- Done ---\nSuccess: {success_count}, Skipped: {skipped_count}, Errors: {error_count}\nFiles: {len(file_class_map)}\nLocation: {output_dir.resolve()}")
scripts/dump_cache_sample.py CHANGED
@@ -121,7 +121,7 @@ def main():
121
  "__metadata__": {
122
  "source_file": str(filepath.absolute()),
123
  "dumped_at": datetime.now().isoformat(),
124
- "cache_mode": data.get("cache_mode", "unknown") if isinstance(data, dict) else "unknown"
125
  },
126
  "data": serializable_data
127
  }
@@ -143,7 +143,7 @@ def main():
143
  if isinstance(data, dict):
144
  print(f"\n=== Summary ===")
145
  print(f"Top-level keys: {list(data.keys())}")
146
- print(f"Cache mode: {data.get('cache_mode', 'not specified')}")
147
  if 'event_sequence' in data:
148
  print(f"Event count: {len(data['event_sequence'])}")
149
  if 'trades' in data:
@@ -152,6 +152,10 @@ def main():
152
  print(f"Source token: {data['source_token']}")
153
  if 'class_id' in data:
154
  print(f"Class ID: {data['class_id']}")
 
 
 
 
155
  if 'quality_score' in data:
156
  print(f"Quality score: {data['quality_score']}")
157
 
 
121
  "__metadata__": {
122
  "source_file": str(filepath.absolute()),
123
  "dumped_at": datetime.now().isoformat(),
124
+ "cache_format": "context" if isinstance(data, dict) and "event_sequence" in data else "legacy"
125
  },
126
  "data": serializable_data
127
  }
 
143
  if isinstance(data, dict):
144
  print(f"\n=== Summary ===")
145
  print(f"Top-level keys: {list(data.keys())}")
146
+ print(f"Cache format: {'context' if 'event_sequence' in data else 'legacy'}")
147
  if 'event_sequence' in data:
148
  print(f"Event count: {len(data['event_sequence'])}")
149
  if 'trades' in data:
 
152
  print(f"Source token: {data['source_token']}")
153
  if 'class_id' in data:
154
  print(f"Class ID: {data['class_id']}")
155
+ if 'context_bucket' in data:
156
+ print(f"Context bucket: {data['context_bucket']}")
157
+ if 'context_score' in data:
158
+ print(f"Context score: {data['context_score']}")
159
  if 'quality_score' in data:
160
  print(f"Quality score: {data['quality_score']}")
161
 
scripts/evaluate_sample.py CHANGED
@@ -23,8 +23,10 @@ from models.token_encoder import TokenEncoder
23
  from models.wallet_encoder import WalletEncoder
24
  from models.graph_updater import GraphUpdater
25
  from models.ohlc_embedder import OHLCEmbedder
 
26
  from models.model import Oracle
27
  import models.vocabulary as vocab
 
28
  from train import create_balanced_split
29
  from dotenv import load_dotenv
30
  from clickhouse_driver import Client as ClickHouseClient
@@ -43,6 +45,12 @@ ABLATION_SWEEP_MODES = [
43
  "trade",
44
  "onchain",
45
  "wallet_graph",
 
 
 
 
 
 
46
  ]
47
 
48
  OHLC_PROBE_MODES = [
@@ -77,7 +85,7 @@ def parse_args():
77
  "--ablation",
78
  type=str,
79
  default="none",
80
- choices=["none", "wallet", "graph", "wallet_graph", "social", "token", "holder", "ohlc", "ohlc_wallet", "trade", "onchain", "all", "sweep", "ohlc_probe"],
81
  help="Run inference with selected signal families removed, or use 'sweep' to rank multiple families.",
82
  )
83
  return parser.parse_args()
@@ -164,6 +172,23 @@ def apply_ablation(batch, mode, device):
164
  ablated["ohlc_price_tensors"] = torch.zeros_like(ablated["ohlc_price_tensors"])
165
  if "ohlc_interval_ids" in ablated:
166
  ablated["ohlc_interval_ids"] = torch.zeros_like(ablated["ohlc_interval_ids"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  if mode in {"trade", "all"}:
169
  for key in (
@@ -627,6 +652,11 @@ def main():
627
  wallet_encoder = WalletEncoder(encoder=multi_modal_encoder, dtype=init_dtype)
628
  graph_updater = GraphUpdater(time_encoder=time_encoder, dtype=init_dtype)
629
  ohlc_embedder = OHLCEmbedder(num_intervals=vocab.NUM_OHLC_INTERVALS, dtype=init_dtype)
 
 
 
 
 
630
 
631
  collator = MemecoinCollator(
632
  event_type_to_id=vocab.EVENT_TO_ID,
@@ -641,6 +671,7 @@ def main():
641
  wallet_encoder=wallet_encoder,
642
  graph_updater=graph_updater,
643
  ohlc_embedder=ohlc_embedder,
 
644
  time_encoder=time_encoder,
645
  num_event_types=vocab.NUM_EVENT_TYPES,
646
  multi_modal_dim=multi_modal_encoder.embedding_dim,
 
23
  from models.wallet_encoder import WalletEncoder
24
  from models.graph_updater import GraphUpdater
25
  from models.ohlc_embedder import OHLCEmbedder
26
+ from models.quant_ohlc_embedder import QuantOHLCEmbedder
27
  from models.model import Oracle
28
  import models.vocabulary as vocab
29
+ from data.quant_ohlc_feature_schema import FEATURE_GROUPS, NUM_QUANT_OHLC_FEATURES, TOKENS_PER_SEGMENT, group_feature_indices
30
  from train import create_balanced_split
31
  from dotenv import load_dotenv
32
  from clickhouse_driver import Client as ClickHouseClient
 
45
  "trade",
46
  "onchain",
47
  "wallet_graph",
48
+ "quant_ohlc",
49
+ "quant_levels",
50
+ "quant_trendline",
51
+ "quant_breaks",
52
+ "quant_rolling",
53
+ "quant_patterns",
54
  ]
55
 
56
  OHLC_PROBE_MODES = [
 
85
  "--ablation",
86
  type=str,
87
  default="none",
88
+ choices=["none", "wallet", "graph", "wallet_graph", "social", "token", "holder", "ohlc", "ohlc_wallet", "trade", "onchain", "all", "sweep", "ohlc_probe", "quant_ohlc", "quant_levels", "quant_trendline", "quant_breaks", "quant_rolling", "quant_patterns"],
89
  help="Run inference with selected signal families removed, or use 'sweep' to rank multiple families.",
90
  )
91
  return parser.parse_args()
 
172
  ablated["ohlc_price_tensors"] = torch.zeros_like(ablated["ohlc_price_tensors"])
173
  if "ohlc_interval_ids" in ablated:
174
  ablated["ohlc_interval_ids"] = torch.zeros_like(ablated["ohlc_interval_ids"])
175
+ if "quant_ohlc_feature_tensors" in ablated:
176
+ ablated["quant_ohlc_feature_tensors"] = torch.zeros_like(ablated["quant_ohlc_feature_tensors"])
177
+ if "quant_ohlc_feature_mask" in ablated:
178
+ ablated["quant_ohlc_feature_mask"] = torch.zeros_like(ablated["quant_ohlc_feature_mask"])
179
+
180
+ quant_group_map = {
181
+ "quant_ohlc": list(FEATURE_GROUPS.keys()),
182
+ "quant_levels": ["levels_breaks"],
183
+ "quant_trendline": ["trendlines"],
184
+ "quant_breaks": ["relative_structure", "levels_breaks"],
185
+ "quant_rolling": ["rolling_quant"],
186
+ "quant_patterns": ["patterns"],
187
+ }
188
+ if mode in quant_group_map and "quant_ohlc_feature_tensors" in ablated:
189
+ idxs = group_feature_indices(quant_group_map[mode])
190
+ if idxs:
191
+ ablated["quant_ohlc_feature_tensors"][:, :, idxs] = 0
192
 
193
  if mode in {"trade", "all"}:
194
  for key in (
 
652
  wallet_encoder = WalletEncoder(encoder=multi_modal_encoder, dtype=init_dtype)
653
  graph_updater = GraphUpdater(time_encoder=time_encoder, dtype=init_dtype)
654
  ohlc_embedder = OHLCEmbedder(num_intervals=vocab.NUM_OHLC_INTERVALS, dtype=init_dtype)
655
+ quant_ohlc_embedder = QuantOHLCEmbedder(
656
+ num_features=NUM_QUANT_OHLC_FEATURES,
657
+ sequence_length=TOKENS_PER_SEGMENT,
658
+ dtype=init_dtype,
659
+ )
660
 
661
  collator = MemecoinCollator(
662
  event_type_to_id=vocab.EVENT_TO_ID,
 
671
  wallet_encoder=wallet_encoder,
672
  graph_updater=graph_updater,
673
  ohlc_embedder=ohlc_embedder,
674
+ quant_ohlc_embedder=quant_ohlc_embedder,
675
  time_encoder=time_encoder,
676
  num_event_types=vocab.NUM_EVENT_TYPES,
677
  multi_modal_dim=multi_modal_encoder.embedding_dim,
scripts/rebuild_metadata.py CHANGED
@@ -6,6 +6,7 @@ from pathlib import Path
6
  from tqdm import tqdm
7
  from collections import defaultdict
8
  from data.data_loader import summarize_context_window
 
9
 
10
  def rebuild_metadata(cache_dir="data/cache"):
11
  cache_path = Path(cache_dir)
@@ -52,6 +53,7 @@ def rebuild_metadata(cache_dir="data/cache"):
52
  'num_workers': 1,
53
  'horizons_seconds': [300, 900, 1800, 3600, 7200], # From user's pre_cache.sh
54
  'quantiles': [0.1, 0.5, 0.9],
 
55
  }
56
 
57
  out_file = cache_path / "class_metadata.json"
 
6
  from tqdm import tqdm
7
  from collections import defaultdict
8
  from data.data_loader import summarize_context_window
9
+ from data.quant_ohlc_feature_schema import FEATURE_VERSION
10
 
11
  def rebuild_metadata(cache_dir="data/cache"):
12
  cache_path = Path(cache_dir)
 
53
  'num_workers': 1,
54
  'horizons_seconds': [300, 900, 1800, 3600, 7200], # From user's pre_cache.sh
55
  'quantiles': [0.1, 0.5, 0.9],
56
+ 'quant_feature_version': FEATURE_VERSION,
57
  }
58
 
59
  out_file = cache_path / "class_metadata.json"
signals/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Deterministic chart signal extraction package.
signals/patterns.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Tuple
2
+
3
+ import numpy as np
4
+ from scipy.signal import find_peaks
5
+
6
+ from data.quant_ohlc_feature_schema import PATTERN_NAMES
7
+
8
+
9
def _empty_pattern_output() -> Dict[str, float]:
    """Return a confidence dict with every known chart pattern zeroed.

    NOTE(review): ``pattern_available`` is set to 1.0 even for this empty
    baseline, while the trendline/SR extractors use 0.0 for their
    "unavailable" case — confirm this asymmetry is intended.
    """
    zeroed: Dict[str, float] = {}
    for pattern_name in PATTERN_NAMES:
        zeroed[f"pattern_{pattern_name}_confidence"] = 0.0
    zeroed["pattern_available"] = 1.0
    return zeroed
13
+
14
+
15
+ def _confidence_from_error(error: float, tolerance: float) -> float:
16
+ if tolerance <= 1e-8:
17
+ return 0.0
18
+ return float(max(0.0, min(1.0, 1.0 - (error / tolerance))))
19
+
20
+
21
+ def _recent_prominent_peaks(series: np.ndarray, distance: int, prominence: float) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
22
+ peaks, props = find_peaks(series, distance=distance, prominence=prominence)
23
+ if peaks.size == 0:
24
+ return peaks, props
25
+ order = np.argsort(props["prominences"])
26
+ keep = order[-5:]
27
+ keep_sorted = np.sort(keep)
28
+ peaks = peaks[keep_sorted]
29
+ props = {key: value[keep_sorted] for key, value in props.items()}
30
+ return peaks, props
31
+
32
+
33
def _double_top_confidence(highs: np.ndarray, current_price: float, tolerance: float) -> float:
    """Score a double-top pattern in the high series, in [0, 1].

    Four sub-scores are multiplied: how equal the two tops are, how far
    apart they sit, whether price has broken below the neckline, and how
    prominent the peaks are relative to *tolerance*.
    """
    peaks, props = _recent_prominent_peaks(highs, distance=3, prominence=tolerance * 0.5)
    if peaks.size < 2:
        return 0.0
    top1_idx, top2_idx = peaks[-2], peaks[-1]
    top1 = float(highs[top1_idx])
    top2 = float(highs[top2_idx])
    # "Neckline" proxy: the lowest high between the two tops.
    neckline = float(np.min(highs[top1_idx:top2_idx + 1])) if top2_idx > top1_idx else min(top1, top2)
    # Price making new highs above both tops invalidates the pattern.
    if current_price > max(top1, top2):
        return 0.0
    symmetry = _confidence_from_error(abs(top1 - top2), tolerance)
    # Separation saturates at 1.0 once the tops are >= 8 bars apart.
    separation = min(1.0, float(top2_idx - top1_idx) / 8.0)
    # Full credit after a neckline breakdown, partial (0.6) before it.
    breakdown = 1.0 if current_price <= neckline else 0.6
    prominence = min(1.0, float(np.mean(props["prominences"][-2:])) / max(tolerance, 1e-8))
    return float(max(0.0, min(1.0, symmetry * separation * breakdown * prominence)))
48
+
49
+
50
def _double_bottom_confidence(lows: np.ndarray, current_price: float, tolerance: float) -> float:
    """Score a double-bottom pattern in the low series, in [0, 1].

    Mirror image of the double-top score: equal lows, separation, a
    breakout above the intervening "ceiling", and trough prominence are
    multiplied together.
    """
    # Troughs of *lows* are peaks of the negated series.
    troughs, props = _recent_prominent_peaks(-lows, distance=3, prominence=tolerance * 0.5)
    if troughs.size < 2:
        return 0.0
    low1_idx, low2_idx = troughs[-2], troughs[-1]
    low1 = float(lows[low1_idx])
    low2 = float(lows[low2_idx])
    # "Ceiling" proxy: the highest low between the two bottoms.
    ceiling = float(np.max(lows[low1_idx:low2_idx + 1])) if low2_idx > low1_idx else max(low1, low2)
    # Price making new lows below both bottoms invalidates the pattern.
    if current_price < min(low1, low2):
        return 0.0
    symmetry = _confidence_from_error(abs(low1 - low2), tolerance)
    # Separation saturates at 1.0 once the bottoms are >= 8 bars apart.
    separation = min(1.0, float(low2_idx - low1_idx) / 8.0)
    # Full credit after a ceiling breakout, partial (0.6) before it.
    breakout = 1.0 if current_price >= ceiling else 0.6
    prominence = min(1.0, float(np.mean(props["prominences"][-2:])) / max(tolerance, 1e-8))
    return float(max(0.0, min(1.0, symmetry * separation * breakout * prominence)))
65
+
66
+
67
def _triangle_confidences(highs: np.ndarray, lows: np.ndarray, tolerance: float) -> Dict[str, float]:
    """Score ascending/descending triangle patterns, each in [0, 1].

    An ascending triangle needs a flat top (recent peak highs within
    *tolerance*) plus rising swing lows; a descending triangle is the
    mirror image.
    """
    out = {
        "ascending_triangle": 0.0,
        "descending_triangle": 0.0,
    }
    peak_idx, _ = _recent_prominent_peaks(highs, distance=3, prominence=tolerance * 0.5)
    trough_idx, _ = _recent_prominent_peaks(-lows, distance=3, prominence=tolerance * 0.5)
    if peak_idx.size < 2 or trough_idx.size < 2:
        return out

    # Fit a straight line through the last (up to) three pivots per side.
    peak_vals = highs[peak_idx[-3:]]
    trough_vals = lows[trough_idx[-3:]]
    peak_slope = np.polyfit(np.arange(len(peak_vals), dtype=np.float64), peak_vals.astype(np.float64), deg=1)[0]
    trough_slope = np.polyfit(np.arange(len(trough_vals), dtype=np.float64), trough_vals.astype(np.float64), deg=1)[0]
    peak_flatness = _confidence_from_error(float(np.max(peak_vals) - np.min(peak_vals)), tolerance)
    trough_flatness = _confidence_from_error(float(np.max(trough_vals) - np.min(trough_vals)), tolerance)

    # Slopes are normalised by *tolerance* before clamping to [0, 1].
    out["ascending_triangle"] = float(max(0.0, min(1.0, peak_flatness * max(0.0, trough_slope) / max(tolerance, 1e-8))))
    out["descending_triangle"] = float(max(0.0, min(1.0, trough_flatness * max(0.0, -peak_slope) / max(tolerance, 1e-8))))
    return out
87
+
88
+
89
def _head_shoulders_confidence(highs: np.ndarray, lows: np.ndarray, tolerance: float, inverse: bool = False) -> float:
    """Score a (possibly inverse) head-and-shoulders pattern in [0, 1].

    Uses the last three prominent pivots of the relevant series; the score
    multiplies shoulder symmetry, head dominance, pivot spacing, and pivot
    prominence.
    """
    # For the inverse pattern we negate the lows so that troughs become
    # peaks and the same pivot machinery applies.
    series = -lows if inverse else highs
    pivots, props = _recent_prominent_peaks(series, distance=3, prominence=tolerance * 0.5)
    if pivots.size < 3:
        return 0.0
    idxs = pivots[-3:]
    values = series[idxs]
    left, head, right = [float(v) for v in values]
    shoulders_match = _confidence_from_error(abs(left - right), tolerance)
    # *series* is already negated for the inverse case, so in BOTH cases a
    # valid head must stand above the shoulders of *series*. The previous
    # inverse branch applied un-negated semantics (min(shoulders) - head)
    # to the negated values, which made genuine inverse head-and-shoulders
    # patterns always score 0.
    head_margin = max(0.0, head - max(left, right))
    head_score = min(1.0, head_margin / max(tolerance, 1e-8))
    # Spacing saturates once consecutive pivots are >= 5 bars apart.
    spacing = min(1.0, float(min(idxs[1] - idxs[0], idxs[2] - idxs[1])) / 5.0)
    prominence = min(1.0, float(np.mean(props["prominences"][-3:])) / max(tolerance, 1e-8))
    return float(max(0.0, min(1.0, shoulders_match * head_score * spacing * prominence)))
106
+
107
+
108
def compute_pattern_features(closes, highs, lows, end_idx: int) -> Dict[str, float]:
    """Compute all chart-pattern confidence features up to bar *end_idx*.

    Only candles ``[0, end_idx]`` are used, so the features contain no
    look-ahead. Returns the zeroed baseline dict when fewer than 10
    candles are available.
    """
    out = _empty_pattern_output()
    closes_np = np.asarray(closes[: end_idx + 1], dtype=np.float64)
    highs_np = np.asarray(highs[: end_idx + 1], dtype=np.float64)
    lows_np = np.asarray(lows[: end_idx + 1], dtype=np.float64)
    if closes_np.size < 10:
        return out

    current_price = float(closes_np[-1])
    # Price tolerance: recent close volatility (last 20 bars when there
    # are enough), floored at 0.3% of price plus an absolute epsilon so a
    # flat series still gets a usable scale.
    tolerance = max(float(np.std(closes_np[-20:])) if closes_np.size >= 20 else float(np.std(closes_np)), current_price * 0.003, 1e-5)

    out["pattern_double_top_confidence"] = _double_top_confidence(highs_np, current_price, tolerance)
    out["pattern_double_bottom_confidence"] = _double_bottom_confidence(lows_np, current_price, tolerance)

    triangle = _triangle_confidences(highs_np, lows_np, tolerance)
    out["pattern_ascending_triangle_confidence"] = triangle["ascending_triangle"]
    out["pattern_descending_triangle_confidence"] = triangle["descending_triangle"]
    out["pattern_head_shoulders_confidence"] = _head_shoulders_confidence(highs_np, lows_np, tolerance, inverse=False)
    out["pattern_inverse_head_shoulders_confidence"] = _head_shoulders_confidence(highs_np, lows_np, tolerance, inverse=True)
    return out
signals/rolling_quant.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from ta.trend import ema_indicator, sma_indicator
6
+
7
+
8
+ def _finite_or_zero(value: float) -> float:
9
+ try:
10
+ value = float(value)
11
+ except Exception:
12
+ return 0.0
13
+ if not np.isfinite(value):
14
+ return 0.0
15
+ return value
16
+
17
+
18
def compute_rolling_quant_features(closes: List[float], end_idx: int) -> Dict[str, float]:
    """Compute rolling-average / mean-reversion features up to *end_idx*.

    All price-level features are normalised by the current close so they
    are scale-free across tokens. Returns an all-zero dict when no
    candles are available.
    """
    closes_np = np.asarray(closes[: end_idx + 1], dtype=np.float64)
    if closes_np.size == 0:
        return {
            "ema_fast": 0.0,
            "ema_medium": 0.0,
            "sma_fast": 0.0,
            "sma_medium": 0.0,
            "price_minus_ema_fast": 0.0,
            "price_minus_ema_medium": 0.0,
            "ema_spread": 0.0,
            "price_zscore": 0.0,
            "mean_reversion_score": 0.0,
            "rolling_vol_zscore": 0.0,
        }

    close_series = pd.Series(closes_np)
    current = float(closes_np[-1])
    # Guard against a zero close so divisions below stay finite.
    current_scale = max(abs(current), 1e-8)

    # ``fillna=True`` lets the indicators emit values even when the series
    # is shorter than the window.
    ema_fast = _finite_or_zero(ema_indicator(close_series, window=8, fillna=True).iloc[-1])
    ema_medium = _finite_or_zero(ema_indicator(close_series, window=21, fillna=True).iloc[-1])
    sma_fast = _finite_or_zero(sma_indicator(close_series, window=8, fillna=True).iloc[-1])
    sma_medium = _finite_or_zero(sma_indicator(close_series, window=21, fillna=True).iloc[-1])

    mean_all = _finite_or_zero(close_series.mean())
    std_all = _finite_or_zero(close_series.std(ddof=0))
    price_zscore = 0.0 if std_all <= 1e-8 else (current - mean_all) / std_all

    # Volatility proxy: z-score of recent log-return volatility against
    # the whole-history absolute log-return distribution.
    log_returns = np.diff(np.log(np.clip(closes_np, 1e-8, None)))
    if log_returns.size == 0:
        rolling_vol = 0.0
        mean_vol = 0.0
        std_vol = 0.0
    else:
        abs_log_returns = np.abs(log_returns)
        rolling_vol = _finite_or_zero(np.std(log_returns[-20:]))
        mean_vol = _finite_or_zero(np.mean(abs_log_returns))
        std_vol = _finite_or_zero(np.std(abs_log_returns))
    rolling_vol_zscore = 0.0 if std_vol <= 1e-8 else (rolling_vol - mean_vol) / std_vol

    denom = max(abs(sma_medium), 1e-8)
    return {
        "ema_fast": ema_fast / current_scale,
        "ema_medium": ema_medium / current_scale,
        "sma_fast": sma_fast / current_scale,
        "sma_medium": sma_medium / current_scale,
        "price_minus_ema_fast": (current - ema_fast) / current_scale,
        "price_minus_ema_medium": (current - ema_medium) / current_scale,
        "ema_spread": (ema_fast - ema_medium) / current_scale,
        "price_zscore": _finite_or_zero(price_zscore),
        # Positive when price is below its long-run mean (reversion up).
        "mean_reversion_score": (mean_all - current) / denom,
        "rolling_vol_zscore": _finite_or_zero(rolling_vol_zscore),
    }
signals/support_resistance.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional
2
+
3
+ import numpy as np
4
+ from scipy.signal import argrelextrema
5
+
6
+
7
+ def _compute_pivots(prices: np.ndarray, order: int = 2) -> Dict[str, List[int]]:
8
+ if prices.size < (2 * order + 1):
9
+ return {"highs": [], "lows": []}
10
+ highs = argrelextrema(prices, np.greater_equal, order=order, mode="clip")[0].tolist()
11
+ lows = argrelextrema(prices, np.less_equal, order=order, mode="clip")[0].tolist()
12
+ highs = [idx for idx in highs if order <= idx < len(prices) - order]
13
+ lows = [idx for idx in lows if order <= idx < len(prices) - order]
14
+ return {"highs": highs, "lows": lows}
15
+
16
+
17
+ def _cluster_levels(prices: np.ndarray, pivot_indices: List[int], tolerance: float) -> List[Dict[str, float]]:
18
+ levels: List[Dict[str, float]] = []
19
+ for pivot_idx in pivot_indices:
20
+ price = float(prices[pivot_idx])
21
+ matched = None
22
+ for level in levels:
23
+ if abs(price - level["price"]) <= tolerance:
24
+ matched = level
25
+ break
26
+ if matched is None:
27
+ levels.append({
28
+ "price": price,
29
+ "touches": 1.0,
30
+ "last_idx": float(pivot_idx),
31
+ "first_idx": float(pivot_idx),
32
+ })
33
+ continue
34
+ touches = matched["touches"] + 1.0
35
+ matched["price"] = ((matched["price"] * matched["touches"]) + price) / touches
36
+ matched["touches"] = touches
37
+ matched["last_idx"] = float(pivot_idx)
38
+ return levels
39
+
40
+
41
+ def _nearest_level(levels: List[Dict[str, float]], current_price: float, below: bool) -> Optional[Dict[str, float]]:
42
+ candidates = [level for level in levels if (level["price"] <= current_price if below else level["price"] >= current_price)]
43
+ if not candidates:
44
+ return None
45
+ return min(candidates, key=lambda level: abs(level["price"] - current_price))
46
+
47
+
48
def compute_support_resistance_features(
    closes: List[float],
    highs: List[float],
    lows: List[float],
    end_idx: int,
    window_start: int,
    window_end: int,
    timestamps: List[int],
) -> Dict[str, float]:
    """Compute support/resistance level features up to bar *end_idx*.

    Levels are clustered from swing pivots in the history ``[0, end_idx]``;
    sweep/reclaim flags are evaluated against the context window
    ``[window_start, window_end)``. All distances are normalised by the
    current close. Returns an all-zero dict when no candles exist.
    """
    closes_np = np.asarray(closes[: end_idx + 1], dtype=np.float64)
    highs_np = np.asarray(highs[: end_idx + 1], dtype=np.float64)
    lows_np = np.asarray(lows[: end_idx + 1], dtype=np.float64)
    if closes_np.size == 0:
        return {key: 0.0 for key in [
            "nearest_support_dist", "nearest_resistance_dist", "support_touch_count",
            "resistance_touch_count", "support_age_sec", "resistance_age_sec",
            "support_strength", "resistance_strength", "inside_support_zone",
            "inside_resistance_zone", "support_swept", "resistance_swept",
            "support_reclaim", "resistance_reject", "sr_available",
        ]}

    current_price = float(closes_np[-1])
    # Clustering tolerance: 8% of the visible high-low range, floored at
    # 0.25% of price so flat charts still cluster.
    local_range = max(float(np.max(highs_np) - np.min(lows_np)), current_price * 1e-3, 1e-8)
    tolerance = max(local_range * 0.08, current_price * 0.0025)

    # Resistance levels come from swing-high pivots, support from swing lows.
    pivots_high = _compute_pivots(highs_np, order=2)["highs"]
    pivots_low = _compute_pivots(lows_np, order=2)["lows"]
    support_levels = _cluster_levels(lows_np, pivots_low, tolerance)
    resistance_levels = _cluster_levels(highs_np, pivots_high, tolerance)

    support = _nearest_level(support_levels, current_price, below=True)
    resistance = _nearest_level(resistance_levels, current_price, below=False)
    # Fall back to the bar index as a pseudo-timestamp when none are given.
    current_ts = float(timestamps[min(end_idx, len(timestamps) - 1)]) if timestamps else float(end_idx)

    def _level_stats(level: Optional[Dict[str, float]], is_support: bool) -> Dict[str, float]:
        # Per-level sub-features; all zeros when no level exists on that side.
        if level is None:
            return {
                "dist": 0.0,
                "touch_count": 0.0,
                "age_sec": 0.0,
                "strength": 0.0,
                "inside_zone": 0.0,
                "swept": 0.0,
                "confirm": 0.0,
            }
        level_price = float(level["price"])
        zone_half_width = max(tolerance, abs(level_price) * 0.002)
        # Sweep detection uses the raw context-window slice, which is not
        # clipped to end_idx (the window may extend past it).
        window_prices_high = highs[window_start:window_end]
        window_prices_low = lows[window_start:window_end]
        swept = 0.0
        confirm = 0.0
        if is_support:
            # Support is "swept" when the window traded below its zone;
            # "reclaimed" when price is back at/above the level afterwards.
            min_low = min(window_prices_low) if window_prices_low else current_price
            swept = 1.0 if min_low < (level_price - zone_half_width) else 0.0
            confirm = 1.0 if swept > 0 and current_price >= level_price else 0.0
        else:
            # Resistance is "swept" on a poke above its zone; "rejected"
            # when price fell back to/below the level afterwards.
            max_high = max(window_prices_high) if window_prices_high else current_price
            swept = 1.0 if max_high > (level_price + zone_half_width) else 0.0
            confirm = 1.0 if swept > 0 and current_price <= level_price else 0.0

        return {
            # Signed so that a positive value means price is on the
            # "expected" side of the level (above support / below resistance).
            "dist": (current_price - level_price) / max(abs(current_price), 1e-8) if is_support else (level_price - current_price) / max(abs(current_price), 1e-8),
            "touch_count": float(level["touches"]),
            # Seconds since the level's last touch (bar-index delta when
            # timestamps are unavailable); ternary binds over the whole
            # subtraction, then max() floors at zero.
            "age_sec": max(0.0, current_ts - float(timestamps[int(level["last_idx"])]) if timestamps else current_ts - level["last_idx"]),
            "strength": float(level["touches"]) / max(1.0, float(end_idx + 1)),
            "inside_zone": 1.0 if abs(current_price - level_price) <= zone_half_width else 0.0,
            "swept": swept,
            "confirm": confirm,
        }

    support_stats = _level_stats(support, True)
    resistance_stats = _level_stats(resistance, False)
    return {
        "nearest_support_dist": support_stats["dist"],
        "nearest_resistance_dist": resistance_stats["dist"],
        "support_touch_count": support_stats["touch_count"],
        "resistance_touch_count": resistance_stats["touch_count"],
        "support_age_sec": support_stats["age_sec"],
        "resistance_age_sec": resistance_stats["age_sec"],
        "support_strength": support_stats["strength"],
        "resistance_strength": resistance_stats["strength"],
        "inside_support_zone": support_stats["inside_zone"],
        "inside_resistance_zone": resistance_stats["inside_zone"],
        "support_swept": support_stats["swept"],
        "resistance_swept": resistance_stats["swept"],
        "support_reclaim": support_stats["confirm"],
        "resistance_reject": resistance_stats["confirm"],
        "sr_available": 1.0 if support_levels or resistance_levels else 0.0,
    }
signals/trendlines.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Iterable, Optional, Tuple
2
+
3
+ import numpy as np
4
+ import trendln
5
+
6
+
7
+ def _empty_trendline_features() -> Dict[str, float]:
8
+ return {
9
+ "lower_trendline_slope": 0.0,
10
+ "upper_trendline_slope": 0.0,
11
+ "dist_to_lower_line": 0.0,
12
+ "dist_to_upper_line": 0.0,
13
+ "trend_channel_width": 0.0,
14
+ "trend_convergence": 0.0,
15
+ "trend_breakout_upper": 0.0,
16
+ "trend_breakdown_lower": 0.0,
17
+ "trend_reentry": 0.0,
18
+ "trendline_available": 0.0,
19
+ }
20
+
21
+
22
+ def _extract_best_line(line_data: object) -> Optional[Tuple[float, float]]:
23
+ if not isinstance(line_data, Iterable):
24
+ return None
25
+ line_list = list(line_data)
26
+ if not line_list:
27
+ return None
28
+ best = line_list[0]
29
+ if not isinstance(best, tuple) or len(best) < 2 or not isinstance(best[1], tuple) or len(best[1]) < 2:
30
+ return None
31
+ slope = float(best[1][0])
32
+ intercept = float(best[1][1])
33
+ if not np.isfinite(slope) or not np.isfinite(intercept):
34
+ return None
35
+ return slope, intercept
36
+
37
+
38
+ def _extract_overall_line(summary_data: object) -> Optional[Tuple[float, float]]:
39
+ if isinstance(summary_data, (tuple, list)) and len(summary_data) >= 2:
40
+ slope = float(summary_data[0])
41
+ intercept = float(summary_data[1])
42
+ if np.isfinite(slope) and np.isfinite(intercept):
43
+ return slope, intercept
44
+ return None
45
+
46
+
47
def _fit_with_trendln(values: np.ndarray) -> Tuple[Optional[Tuple[float, float]], Optional[Tuple[float, float]]]:
    """Fit support/resistance trendlines with ``trendln``.

    Returns ``(support_line, resistance_line)`` as ``(slope, intercept)``
    tuples, preferring the best individual candidate line and falling back
    to the overall fitted line; either element may be None. May raise —
    callers wrap this in try/except.
    """
    # trendln needs a sane pivot window; clamp to [10, 125] bars.
    window = max(10, min(125, int(values.size)))
    out = trendln.calc_support_resistance(
        values,
        extmethod=trendln.METHOD_NAIVE,
        # (sic) METHOD_NSQUREDLOGN is trendln's actual constant spelling.
        method=trendln.METHOD_NSQUREDLOGN,
        window=window,
        errpct=0.01,
    )
    support_part, resistance_part = out
    # Index [2] holds candidate trend lines, [1] the overall best-fit line.
    # NOTE(review): result layout assumed from trendln's docs — confirm it
    # matches the installed trendln version.
    support_line = _extract_best_line(support_part[2]) or _extract_overall_line(support_part[1])
    resistance_line = _extract_best_line(resistance_part[2]) or _extract_overall_line(resistance_part[1])
    return support_line, resistance_line
60
+
61
+
62
def compute_trendline_features(closes, highs, lows, end_idx: int) -> Dict[str, float]:
    """Compute trendline/channel features up to bar *end_idx*.

    Fits support and resistance lines on the close series, falling back to
    lows/highs per side, and derives normalised distances, channel shape,
    and breakout/breakdown/reentry flags for the latest bar. Returns the
    zeroed dict when fewer than 5 candles exist or no line can be fitted.
    """
    closes_np = np.asarray(closes[: end_idx + 1], dtype=np.float64)
    highs_np = np.asarray(highs[: end_idx + 1], dtype=np.float64)
    lows_np = np.asarray(lows[: end_idx + 1], dtype=np.float64)
    if closes_np.size < 5:
        return _empty_trendline_features()

    current_price = float(closes_np[-1])
    # Lines are evaluated at the last bar (idx) and the one before it.
    idx = float(closes_np.size - 1)
    prev_idx = max(0.0, idx - 1.0)
    prev_close = float(closes_np[-2]) if closes_np.size > 1 else current_price
    price_scale = max(abs(current_price), 1e-8)

    try:
        support_line, resistance_line = _fit_with_trendln(closes_np)
    except Exception:
        # trendln can fail on degenerate series; treat as "no line".
        support_line, resistance_line = None, None

    # Per-side fallback: refit missing lines on the raw lows/highs.
    if support_line is None:
        try:
            support_line, _ = _fit_with_trendln(lows_np)
        except Exception:
            support_line = None
    if resistance_line is None:
        try:
            _, resistance_line = _fit_with_trendln(highs_np)
        except Exception:
            resistance_line = None

    if support_line is None and resistance_line is None:
        return _empty_trendline_features()

    # Project each available line onto the last two bars; a missing side
    # degenerates to the current price (zero distance).
    lower_pred = support_line[0] * idx + support_line[1] if support_line is not None else current_price
    upper_pred = resistance_line[0] * idx + resistance_line[1] if resistance_line is not None else current_price
    lower_prev = support_line[0] * prev_idx + support_line[1] if support_line is not None else lower_pred
    upper_prev = resistance_line[0] * prev_idx + resistance_line[1] if resistance_line is not None else upper_pred
    width = max(abs(upper_pred - lower_pred), 1e-8)

    # Breakout/breakdown fire only on the bar that crosses the line.
    breakout_upper = 1.0 if current_price > upper_pred and prev_close <= upper_prev else 0.0
    breakdown_lower = 1.0 if current_price < lower_pred and prev_close >= lower_prev else 0.0
    # Reentry: price was outside the channel last bar and is back inside now.
    reentry = 1.0 if (
        (prev_close > upper_prev and current_price <= upper_pred) or
        (prev_close < lower_prev and current_price >= lower_pred)
    ) else 0.0

    return {
        "lower_trendline_slope": 0.0 if support_line is None else support_line[0] / price_scale,
        "upper_trendline_slope": 0.0 if resistance_line is None else resistance_line[0] / price_scale,
        "dist_to_lower_line": (current_price - lower_pred) / price_scale,
        "dist_to_upper_line": (upper_pred - current_price) / price_scale,
        "trend_channel_width": width / price_scale,
        # Negative convergence means the channel is narrowing (wedge).
        "trend_convergence": 0.0 if support_line is None or resistance_line is None else (resistance_line[0] - support_line[0]) / price_scale,
        "trend_breakout_upper": breakout_upper,
        "trend_breakdown_lower": breakdown_lower,
        "trend_reentry": reentry,
        "trendline_available": 1.0,
    }
+ }
train.py CHANGED
@@ -59,8 +59,10 @@ from models.token_encoder import TokenEncoder
59
  from models.wallet_encoder import WalletEncoder
60
  from models.graph_updater import GraphUpdater
61
  from models.ohlc_embedder import OHLCEmbedder
 
62
  from models.model import Oracle
63
  import models.vocabulary as vocab
 
64
 
65
  # Setup Logger
66
  logger = get_logger(__name__)
@@ -671,6 +673,11 @@ def main() -> None:
671
  num_intervals=vocab.NUM_OHLC_INTERVALS,
672
  dtype=init_dtype
673
  )
 
 
 
 
 
674
 
675
  collator = MemecoinCollator(
676
  event_type_to_id=vocab.EVENT_TO_ID,
@@ -809,6 +816,7 @@ def main() -> None:
809
  wallet_encoder=wallet_encoder,
810
  graph_updater=graph_updater,
811
  ohlc_embedder=ohlc_embedder,
 
812
  time_encoder=time_encoder,
813
  num_event_types=vocab.NUM_EVENT_TYPES,
814
  multi_modal_dim=multi_modal_encoder.embedding_dim,
 
59
  from models.wallet_encoder import WalletEncoder
60
  from models.graph_updater import GraphUpdater
61
  from models.ohlc_embedder import OHLCEmbedder
62
+ from models.quant_ohlc_embedder import QuantOHLCEmbedder
63
  from models.model import Oracle
64
  import models.vocabulary as vocab
65
+ from data.quant_ohlc_feature_schema import NUM_QUANT_OHLC_FEATURES, TOKENS_PER_SEGMENT
66
 
67
  # Setup Logger
68
  logger = get_logger(__name__)
 
673
  num_intervals=vocab.NUM_OHLC_INTERVALS,
674
  dtype=init_dtype
675
  )
676
+ quant_ohlc_embedder = QuantOHLCEmbedder(
677
+ num_features=NUM_QUANT_OHLC_FEATURES,
678
+ sequence_length=TOKENS_PER_SEGMENT,
679
+ dtype=init_dtype,
680
+ )
681
 
682
  collator = MemecoinCollator(
683
  event_type_to_id=vocab.EVENT_TO_ID,
 
816
  wallet_encoder=wallet_encoder,
817
  graph_updater=graph_updater,
818
  ohlc_embedder=ohlc_embedder,
819
+ quant_ohlc_embedder=quant_ohlc_embedder,
820
  time_encoder=time_encoder,
821
  num_event_types=vocab.NUM_EVENT_TYPES,
822
  multi_modal_dim=multi_modal_encoder.embedding_dim,