zirobtc committed on
Commit
3780496
·
1 Parent(s): bb2313b

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  log.log filter=lfs diff=lfs merge=lfs -text
37
  store/74c/74c70007-cccd-4669-bfd4-e25f8348ad8c/all_1_35_2/primary.cidx filter=lfs diff=lfs merge=lfs -text
38
  data/quality_scores.jsonl filter=lfs diff=lfs merge=lfs -text
 
 
36
  log.log filter=lfs diff=lfs merge=lfs -text
37
  store/74c/74c70007-cccd-4669-bfd4-e25f8348ad8c/all_1_35_2/primary.cidx filter=lfs diff=lfs merge=lfs -text
38
  data/quality_scores.jsonl filter=lfs diff=lfs merge=lfs -text
39
+ data/backup_20260225_073238.log filter=lfs diff=lfs merge=lfs -text
audit_cache.py CHANGED
@@ -7,7 +7,7 @@ from collections import defaultdict
7
  import glob
8
  from tqdm import tqdm
9
 
10
- def audit_cache(cache_dir, num_samples=1000):
11
  files = glob.glob(os.path.join(cache_dir, "sample_*.pt"))
12
  if not files:
13
  print(f"No .pt files found in {cache_dir}")
 
7
  import glob
8
  from tqdm import tqdm
9
 
10
+ def audit_cache(cache_dir, num_samples=10000):
11
  files = glob.glob(os.path.join(cache_dir, "sample_*.pt"))
12
  if not files:
13
  print(f"No .pt files found in {cache_dir}")
cache_dataset.py CHANGED
@@ -2,7 +2,7 @@
2
  import os
3
  import sys
4
  import argparse
5
- import numpy as np
6
  import datetime
7
  import torch
8
  import json
@@ -45,7 +45,6 @@ def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map
45
  data_fetcher=data_fetcher,
46
  max_samples=dataset_config['max_samples'],
47
  start_date=dataset_config['start_date'],
48
- ohlc_stats_path=dataset_config['ohlc_stats_path'],
49
  horizons_seconds=dataset_config['horizons_seconds'],
50
  quantiles=dataset_config['quantiles'],
51
  min_trade_usd=dataset_config['min_trade_usd'],
@@ -110,21 +109,6 @@ def _process_single_token_raw(args):
110
  return {'status': 'error', 'mint': mint_addr, 'error': str(e), 'traceback': traceback.format_exc()}
111
 
112
 
113
- def compute_save_ohlc_stats(client, output_path):
114
- print(f"INFO: Computing OHLC stats...")
115
- query = """SELECT AVG(t.price_usd), stddevPop(t.price_usd), AVG(t.price), stddevPop(t.price), AVG(t.total_usd), stddevPop(t.total_usd) FROM trades AS t WHERE t.price_usd > 0 AND t.total_usd > 0"""
116
- try:
117
- result = client.execute(query)
118
- if result and result[0]:
119
- row = result[0]
120
- stats = {"mean_price_usd": float(row[0] or 0), "std_price_usd": float(row[1] or 1), "mean_price_native": float(row[2] or 0), "std_price_native": float(row[3] or 1), "mean_trade_value_usd": float(row[4] or 0), "std_trade_value_usd": float(row[5] or 1)}
121
- else:
122
- stats = {"mean_price_usd": 0.0, "std_price_usd": 1.0, "mean_price_native": 0.0, "std_price_native": 1.0, "mean_trade_value_usd": 0.0, "std_trade_value_usd": 1.0}
123
- Path(output_path).parent.mkdir(parents=True, exist_ok=True)
124
- np.savez(output_path, **stats)
125
- print(f"INFO: Saved OHLC stats to {output_path}")
126
- except Exception as e:
127
- print(f"ERROR: Failed to compute OHLC stats: {e}")
128
 
129
 
130
  def main():
@@ -140,7 +124,7 @@ def main():
140
  parser.add_argument("--output_dir", type=str, default="data/cache")
141
  parser.add_argument("--max_samples", type=int, default=None)
142
  parser.add_argument("--start_date", type=str, default=None)
143
- parser.add_argument("--ohlc_stats_path", type=str, default="data/ohlc_stats.npz")
144
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
145
  parser.add_argument("--cache_mode", type=str, default="raw", choices=["raw", "context"])
146
  parser.add_argument("--context_length", type=int, default=8192)
@@ -166,7 +150,6 @@ def main():
166
  neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=(args.neo4j_user, args.neo4j_password))
167
 
168
  try:
169
- compute_save_ohlc_stats(clickhouse_client, args.ohlc_stats_path)
170
 
171
  from data.data_loader import OracleDataset
172
  from data.data_fetcher import DataFetcher
@@ -180,7 +163,7 @@ def main():
180
  quality_scores_map = get_token_quality_scores(clickhouse_client)
181
  print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
182
 
183
- dataset = OracleDataset(data_fetcher=data_fetcher, max_samples=args.max_samples, start_date=start_date_dt, ohlc_stats_path=args.ohlc_stats_path, horizons_seconds=[60, 180, 300, 600, 1800, 3600, 7200], quantiles=[0.5], min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length)
184
 
185
  if len(dataset) == 0:
186
  print("WARNING: No samples. Exiting.")
@@ -198,7 +181,7 @@ def main():
198
  print(f"INFO: Cache mode: {args.cache_mode}, Workers: {args.num_workers}")
199
 
200
  db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
201
- dataset_config = {'max_samples': args.max_samples, 'start_date': start_date_dt, 'ohlc_stats_path': args.ohlc_stats_path, 'horizons_seconds': [60, 180, 300, 600, 1800, 3600, 7200], 'quantiles': [0.5], 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints}
202
 
203
  # Build tasks from filtered_mints directly
204
  tasks = []
 
2
  import os
3
  import sys
4
  import argparse
5
+
6
  import datetime
7
  import torch
8
  import json
 
45
  data_fetcher=data_fetcher,
46
  max_samples=dataset_config['max_samples'],
47
  start_date=dataset_config['start_date'],
 
48
  horizons_seconds=dataset_config['horizons_seconds'],
49
  quantiles=dataset_config['quantiles'],
50
  min_trade_usd=dataset_config['min_trade_usd'],
 
109
  return {'status': 'error', 'mint': mint_addr, 'error': str(e), 'traceback': traceback.format_exc()}
110
 
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
 
114
  def main():
 
124
  parser.add_argument("--output_dir", type=str, default="data/cache")
125
  parser.add_argument("--max_samples", type=int, default=None)
126
  parser.add_argument("--start_date", type=str, default=None)
127
+
128
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
129
  parser.add_argument("--cache_mode", type=str, default="raw", choices=["raw", "context"])
130
  parser.add_argument("--context_length", type=int, default=8192)
 
150
  neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=(args.neo4j_user, args.neo4j_password))
151
 
152
  try:
 
153
 
154
  from data.data_loader import OracleDataset
155
  from data.data_fetcher import DataFetcher
 
163
  quality_scores_map = get_token_quality_scores(clickhouse_client)
164
  print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
165
 
166
+ dataset = OracleDataset(data_fetcher=data_fetcher, max_samples=args.max_samples, start_date=start_date_dt, horizons_seconds=[60, 180, 300, 600, 1800, 3600, 7200], quantiles=[0.5], min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length)
167
 
168
  if len(dataset) == 0:
169
  print("WARNING: No samples. Exiting.")
 
181
  print(f"INFO: Cache mode: {args.cache_mode}, Workers: {args.num_workers}")
182
 
183
  db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
184
+ dataset_config = {'max_samples': args.max_samples, 'start_date': start_date_dt, 'horizons_seconds': [60, 180, 300, 600, 1800, 3600, 7200], 'quantiles': [0.5], 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints}
185
 
186
  # Build tasks from filtered_mints directly
187
  tasks = []
data/all_files.txt ADDED
The diff for this file is too large to render. See raw diff
 
data/backup_20260225_073238.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0454111eda3beeb85fd701110438e8dbced5f4440037a6964bcd0d5c527607d
3
+ size 13931583
data/batch_list_aa ADDED
The diff for this file is too large to render. See raw diff
 
data/batch_list_ab ADDED
The diff for this file is too large to render. See raw diff
 
data/batch_list_ac ADDED
The diff for this file is too large to render. See raw diff
 
data/batch_list_ad ADDED
The diff for this file is too large to render. See raw diff
 
data/batch_list_ae ADDED
The diff for this file is too large to render. See raw diff
 
data/data_loader.py CHANGED
@@ -122,7 +122,7 @@ class OracleDataset(Dataset):
122
  horizons_seconds: List[int] = [],
123
  quantiles: List[float] = [],
124
  max_samples: Optional[int] = None,
125
- ohlc_stats_path: Union[str, Path] = "./data/ohlc_stats.npz",
126
  token_allowlist: Optional[List[str]] = None,
127
  t_cutoff_seconds: int = 60,
128
  cache_dir: Optional[Union[str, Path]] = None,
@@ -136,7 +136,6 @@ class OracleDataset(Dataset):
136
  # --- P99 data-driven clamp values (replace hardcoded min/max) ---
137
  self.p99_clamps = {
138
  'slippage': 1.0,
139
- 'priority_fee': 0.1,
140
  'total_usd': 100000.0,
141
  'history_bought_cost_sol': 30.0,
142
  'realized_profit_sol': 150.0,
@@ -316,20 +315,7 @@ class OracleDataset(Dataset):
316
  else:
317
  self.max_cache_horizon_seconds = 3600
318
 
319
- # --- NEW: Load global OHLC normalization stats ---
320
- self.ohlc_price_mean = 0.0
321
- self.ohlc_price_std = 1.0
322
-
323
- if ohlc_stats_path:
324
- stats_path = Path(ohlc_stats_path)
325
- if stats_path.exists():
326
- stats = np.load(stats_path)
327
- self.ohlc_price_mean = float(stats.get('mean_price_usd', 0.0))
328
- self.ohlc_price_std = float(stats.get('std_price_usd', 1.0)) or 1.0
329
- else:
330
- print(f"WARNING: OHLC stats file not found at {stats_path}. Using default normalization (mean=0, std=1).")
331
- else:
332
- print("INFO: No OHLC stats path provided. Using default normalization.")
333
 
334
  self.min_trade_usd = min_trade_usd
335
  self._uri_fail_counts: Dict[str, int] = {}
 
122
  horizons_seconds: List[int] = [],
123
  quantiles: List[float] = [],
124
  max_samples: Optional[int] = None,
125
+
126
  token_allowlist: Optional[List[str]] = None,
127
  t_cutoff_seconds: int = 60,
128
  cache_dir: Optional[Union[str, Path]] = None,
 
136
  # --- P99 data-driven clamp values (replace hardcoded min/max) ---
137
  self.p99_clamps = {
138
  'slippage': 1.0,
 
139
  'total_usd': 100000.0,
140
  'history_bought_cost_sol': 30.0,
141
  'realized_profit_sol': 150.0,
 
315
  else:
316
  self.max_cache_horizon_seconds = 3600
317
 
318
+
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  self.min_trade_usd = min_trade_usd
321
  self._uri_fail_counts: Dict[str, int] = {}
data/ohlc_stats.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d757990a0158118444be61f3d944dfb125237928809b4568ac209ab260f032e
3
  size 1660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:667badf0d42d97e84ec60d58a1a4594f3141199325ab02b652adcf474d0a34f7
3
  size 1660
ingest.sh CHANGED
@@ -20,7 +20,7 @@ error() { echo -e "${RED}[ERROR]${NC} $1"; exit 1; }
20
  #===============================================================================
21
  header "Step 5-6/7: Processing Epochs (Download → Ingest → Delete)"
22
 
23
- EPOCHS=(844)
24
 
25
 
26
  log "Processing epochs one at a time to minimize disk usage..."
 
20
  #===============================================================================
21
  header "Step 5-6/7: Processing Epochs (Download → Ingest → Delete)"
22
 
23
+ EPOCHS=(844 845 846 847 848 849 850 851)
24
 
25
 
26
  log "Processing epochs one at a time to minimize disk usage..."
pre_cache.sh CHANGED
@@ -5,7 +5,7 @@ CONTEXT_LENGTH=4096
5
  MIN_TRADES=10
6
  SAMPLES_PER_TOKEN=1
7
  NUM_WORKERS=1
8
- OHLC_STATS_PATH="/workspace/apollo/data/ohlc_stats.npz"
9
  OUTPUT_DIR="data/cache"
10
 
11
  # Label horizons in seconds, relative to each sampled T_cutoff.
@@ -24,7 +24,7 @@ echo "Num Workers: $NUM_WORKERS"
24
  echo "Horizons (sec): ${HORIZONS_SECONDS[*]}"
25
  echo "Quantiles: ${QUANTILES[*]}"
26
  echo "Output Directory: $OUTPUT_DIR"
27
- echo "OHLC Stats Path: $OHLC_STATS_PATH"
28
  echo "========================================"
29
 
30
  echo "Starting dataset caching..."
@@ -32,7 +32,6 @@ echo "Starting dataset caching..."
32
  mkdir -p "$OUTPUT_DIR"
33
 
34
  python3 scripts/cache_dataset.py \
35
- --ohlc_stats_path "$OHLC_STATS_PATH" \
36
  --output_dir "$OUTPUT_DIR" \
37
  --context_length "$CONTEXT_LENGTH" \
38
  --min_trades "$MIN_TRADES" \
@@ -40,7 +39,7 @@ python3 scripts/cache_dataset.py \
40
  --num_workers "$NUM_WORKERS" \
41
  --horizons_seconds "${HORIZONS_SECONDS[@]}" \
42
  --quantiles "${QUANTILES[@]}" \
43
- --max_samples 150000 \
44
  "$@"
45
 
46
  echo "Done!"
 
5
  MIN_TRADES=10
6
  SAMPLES_PER_TOKEN=1
7
  NUM_WORKERS=1
8
+
9
  OUTPUT_DIR="data/cache"
10
 
11
  # Label horizons in seconds, relative to each sampled T_cutoff.
 
24
  echo "Horizons (sec): ${HORIZONS_SECONDS[*]}"
25
  echo "Quantiles: ${QUANTILES[*]}"
26
  echo "Output Directory: $OUTPUT_DIR"
27
+
28
  echo "========================================"
29
 
30
  echo "Starting dataset caching..."
 
32
  mkdir -p "$OUTPUT_DIR"
33
 
34
  python3 scripts/cache_dataset.py \
 
35
  --output_dir "$OUTPUT_DIR" \
36
  --context_length "$CONTEXT_LENGTH" \
37
  --min_trades "$MIN_TRADES" \
 
39
  --num_workers "$NUM_WORKERS" \
40
  --horizons_seconds "${HORIZONS_SECONDS[@]}" \
41
  --quantiles "${QUANTILES[@]}" \
42
+ --max_samples 300000 \
43
  "$@"
44
 
45
  echo "Done!"
sample_121MxrQDsaY35gC4_0.json ADDED
The diff for this file is too large to render. See raw diff
 
sample_14CfRkQ9CFP4o9nV_3.json ADDED
The diff for this file is too large to render. See raw diff
 
sample_2tYvBaQqXYy7Y5Qk_3.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/analyze_distribution.py CHANGED
@@ -37,7 +37,6 @@ def compute_p99_clamps(client):
37
  trade_query = """
38
  SELECT
39
  quantile(0.99)(abs(slippage)) AS p99_slippage,
40
- quantile(0.99)(priority_fee) AS p99_priority_fee,
41
  quantile(0.99)(total_usd) AS p99_total_usd
42
  FROM trades
43
  WHERE success = 1
@@ -56,7 +55,7 @@ def compute_p99_clamps(client):
56
  clamps = {
57
  # Defaults as fallback if queries return nothing
58
  'slippage': 1.0,
59
- 'priority_fee': 0.1,
60
  'total_usd': 100000.0,
61
  'history_bought_cost_sol': 30.0,
62
  'realized_profit_sol': 150.0,
@@ -65,8 +64,7 @@ def compute_p99_clamps(client):
65
  if trade_row and trade_row[0]:
66
  r = trade_row[0]
67
  clamps['slippage'] = max(float(r[0]), 0.01)
68
- clamps['priority_fee'] = max(float(r[1]), 1e-9)
69
- clamps['total_usd'] = max(float(r[2]), 1.0)
70
 
71
  if holdings_row and holdings_row[0]:
72
  r = holdings_row[0]
 
37
  trade_query = """
38
  SELECT
39
  quantile(0.99)(abs(slippage)) AS p99_slippage,
 
40
  quantile(0.99)(total_usd) AS p99_total_usd
41
  FROM trades
42
  WHERE success = 1
 
55
  clamps = {
56
  # Defaults as fallback if queries return nothing
57
  'slippage': 1.0,
58
+
59
  'total_usd': 100000.0,
60
  'history_bought_cost_sol': 30.0,
61
  'realized_profit_sol': 150.0,
 
64
  if trade_row and trade_row[0]:
65
  r = trade_row[0]
66
  clamps['slippage'] = max(float(r[0]), 0.01)
67
+ clamps['total_usd'] = max(float(r[1]), 1.0)
 
68
 
69
  if holdings_row and holdings_row[0]:
70
  r = holdings_row[0]
scripts/cache_dataset.py CHANGED
@@ -2,7 +2,7 @@
2
  import os
3
  import sys
4
  import argparse
5
- import numpy as np
6
  import datetime
7
  import torch
8
  import json
@@ -61,7 +61,6 @@ def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map
61
  data_fetcher=data_fetcher,
62
  max_samples=dataset_config['max_samples'],
63
  start_date=dataset_config['start_date'],
64
- ohlc_stats_path=dataset_config['ohlc_stats_path'],
65
  horizons_seconds=dataset_config['horizons_seconds'],
66
  quantiles=dataset_config['quantiles'],
67
  min_trade_usd=dataset_config['min_trade_usd'],
@@ -112,21 +111,6 @@ def _process_single_token_context(args):
112
 
113
 
114
 
115
- def compute_save_ohlc_stats(client, output_path):
116
- print(f"INFO: Computing OHLC stats...")
117
- query = """SELECT AVG(t.price_usd), stddevPop(t.price_usd), AVG(t.price), stddevPop(t.price), AVG(t.total_usd), stddevPop(t.total_usd) FROM trades AS t WHERE t.price_usd > 0 AND t.total_usd > 0"""
118
- try:
119
- result = client.execute(query)
120
- if result and result[0]:
121
- row = result[0]
122
- stats = {"mean_price_usd": float(row[0] or 0), "std_price_usd": float(row[1] or 1), "mean_price_native": float(row[2] or 0), "std_price_native": float(row[3] or 1), "mean_trade_value_usd": float(row[4] or 0), "std_trade_value_usd": float(row[5] or 1)}
123
- else:
124
- stats = {"mean_price_usd": 0.0, "std_price_usd": 1.0, "mean_price_native": 0.0, "std_price_native": 1.0, "mean_trade_value_usd": 0.0, "std_trade_value_usd": 1.0}
125
- Path(output_path).parent.mkdir(parents=True, exist_ok=True)
126
- np.savez(output_path, **stats)
127
- print(f"INFO: Saved OHLC stats to {output_path}")
128
- except Exception as e:
129
- print(f"ERROR: Failed to compute OHLC stats: {e}")
130
 
131
 
132
  def main():
@@ -142,7 +126,7 @@ def main():
142
  parser.add_argument("--output_dir", type=str, default="data/cache")
143
  parser.add_argument("--max_samples", type=int, default=None)
144
  parser.add_argument("--start_date", type=str, default=None)
145
- parser.add_argument("--ohlc_stats_path", type=str, default="data/ohlc_stats.npz")
146
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
147
 
148
  parser.add_argument("--context_length", type=int, default=8192)
@@ -170,7 +154,6 @@ def main():
170
  neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=(args.neo4j_user, args.neo4j_password))
171
 
172
  try:
173
- compute_save_ohlc_stats(clickhouse_client, args.ohlc_stats_path)
174
 
175
  from data.data_loader import OracleDataset
176
  from data.data_fetcher import DataFetcher
@@ -187,7 +170,7 @@ def main():
187
  quality_scores_map = get_token_quality_scores(clickhouse_client)
188
  print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
189
 
190
- dataset = OracleDataset(data_fetcher=data_fetcher, max_samples=args.max_samples, start_date=start_date_dt, ohlc_stats_path=args.ohlc_stats_path, horizons_seconds=args.horizons_seconds, quantiles=args.quantiles, min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length, p99_clamps=p99_clamps)
191
 
192
  if len(dataset) == 0:
193
  print("WARNING: No samples. Exiting.")
@@ -223,7 +206,7 @@ def main():
223
  print(f"INFO: Workers: {args.num_workers}")
224
 
225
  db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
226
- dataset_config = {'max_samples': args.max_samples, 'start_date': start_date_dt, 'ohlc_stats_path': args.ohlc_stats_path, 'horizons_seconds': args.horizons_seconds, 'quantiles': args.quantiles, 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints, 'p99_clamps': p99_clamps}
227
 
228
  # Build tasks with class-aware multi-sampling for balanced cache
229
  import random
 
2
  import os
3
  import sys
4
  import argparse
5
+
6
  import datetime
7
  import torch
8
  import json
 
61
  data_fetcher=data_fetcher,
62
  max_samples=dataset_config['max_samples'],
63
  start_date=dataset_config['start_date'],
 
64
  horizons_seconds=dataset_config['horizons_seconds'],
65
  quantiles=dataset_config['quantiles'],
66
  min_trade_usd=dataset_config['min_trade_usd'],
 
111
 
112
 
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
 
116
  def main():
 
126
  parser.add_argument("--output_dir", type=str, default="data/cache")
127
  parser.add_argument("--max_samples", type=int, default=None)
128
  parser.add_argument("--start_date", type=str, default=None)
129
+
130
  parser.add_argument("--min_trade_usd", type=float, default=0.0)
131
 
132
  parser.add_argument("--context_length", type=int, default=8192)
 
154
  neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=(args.neo4j_user, args.neo4j_password))
155
 
156
  try:
 
157
 
158
  from data.data_loader import OracleDataset
159
  from data.data_fetcher import DataFetcher
 
170
  quality_scores_map = get_token_quality_scores(clickhouse_client)
171
  print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")
172
 
173
+ dataset = OracleDataset(data_fetcher=data_fetcher, max_samples=args.max_samples, start_date=start_date_dt, horizons_seconds=args.horizons_seconds, quantiles=args.quantiles, min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length, p99_clamps=p99_clamps)
174
 
175
  if len(dataset) == 0:
176
  print("WARNING: No samples. Exiting.")
 
206
  print(f"INFO: Workers: {args.num_workers}")
207
 
208
  db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
209
+ dataset_config = {'max_samples': args.max_samples, 'start_date': start_date_dt, 'horizons_seconds': args.horizons_seconds, 'quantiles': args.quantiles, 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints, 'p99_clamps': p99_clamps}
210
 
211
  # Build tasks with class-aware multi-sampling for balanced cache
212
  import random
scripts/cache_parallel.py CHANGED
@@ -29,7 +29,6 @@ def cache_chunk(args):
29
 
30
  ds = OracleDataset(
31
  data_fetcher=fetcher,
32
- ohlc_stats_path=db_args['ohlc_stats_path'],
33
  horizons_seconds=[30, 60, 120, 240, 420],
34
  quantiles=[0.1, 0.5, 0.9],
35
  )
@@ -91,7 +90,7 @@ def main():
91
 
92
  fetcher = DataFetcher(clickhouse_client=ch, neo4j_driver=neo)
93
  return_map, _ = get_return_class_map(ch)
94
- ds = OracleDataset(data_fetcher=fetcher, ohlc_stats_path="data/ohlc_stats.npz",
95
  horizons_seconds=[60, 180, 300, 600, 1800, 3600, 7200], quantiles=[0.5])
96
  ds.sampled_mints = [m for m in ds.sampled_mints if m['mint_address'] in return_map]
97
  total = len(ds)
@@ -108,7 +107,7 @@ def main():
108
  'neo4j_uri': os.getenv("NEO4J_URI", "bolt://localhost:7687"),
109
  'neo4j_user': os.getenv("NEO4J_USER", "neo4j"),
110
  'neo4j_password': os.getenv("NEO4J_PASSWORD", "neo4j123"),
111
- 'ohlc_stats_path': "data/ohlc_stats.npz",
112
  }
113
 
114
  tasks = [(i, i*chunk_size, (i+1)*chunk_size, args.output_dir, db_args) for i in range(args.workers)]
 
29
 
30
  ds = OracleDataset(
31
  data_fetcher=fetcher,
 
32
  horizons_seconds=[30, 60, 120, 240, 420],
33
  quantiles=[0.1, 0.5, 0.9],
34
  )
 
90
 
91
  fetcher = DataFetcher(clickhouse_client=ch, neo4j_driver=neo)
92
  return_map, _ = get_return_class_map(ch)
93
+ ds = OracleDataset(data_fetcher=fetcher,
94
  horizons_seconds=[60, 180, 300, 600, 1800, 3600, 7200], quantiles=[0.5])
95
  ds.sampled_mints = [m for m in ds.sampled_mints if m['mint_address'] in return_map]
96
  total = len(ds)
 
107
  'neo4j_uri': os.getenv("NEO4J_URI", "bolt://localhost:7687"),
108
  'neo4j_user': os.getenv("NEO4J_USER", "neo4j"),
109
  'neo4j_password': os.getenv("NEO4J_PASSWORD", "neo4j123"),
110
+
111
  }
112
 
113
  tasks = [(i, i*chunk_size, (i+1)*chunk_size, args.output_dir, db_args) for i in range(args.workers)]
train.py CHANGED
@@ -328,7 +328,7 @@ def parse_args() -> argparse.Namespace:
328
  parser.add_argument("--horizons_seconds", type=int, nargs="+", default=[30, 60, 120, 240, 420])
329
  parser.add_argument("--quantiles", type=float, nargs="+", default=[0.1, 0.5, 0.9])
330
  parser.add_argument("--max_samples", type=int, default=None)
331
- parser.add_argument("--ohlc_stats_path", type=str, default="./data/ohlc_stats.npz")
332
  parser.add_argument("--t_cutoff_seconds", type=int, default=60)
333
  parser.add_argument("--shuffle", dest="shuffle", action="store_true", default=True)
334
  parser.add_argument("--no-shuffle", dest="shuffle", action="store_false")
@@ -473,7 +473,6 @@ def main() -> None:
473
  horizons_seconds=horizons,
474
  quantiles=quantiles,
475
  max_samples=args.max_samples,
476
- ohlc_stats_path=args.ohlc_stats_path,
477
  t_cutoff_seconds=int(args.t_cutoff_seconds) if hasattr(args, 't_cutoff_seconds') else 60,
478
  cache_dir="/workspace/apollo/data/cache"
479
  )
 
328
  parser.add_argument("--horizons_seconds", type=int, nargs="+", default=[30, 60, 120, 240, 420])
329
  parser.add_argument("--quantiles", type=float, nargs="+", default=[0.1, 0.5, 0.9])
330
  parser.add_argument("--max_samples", type=int, default=None)
331
+
332
  parser.add_argument("--t_cutoff_seconds", type=int, default=60)
333
  parser.add_argument("--shuffle", dest="shuffle", action="store_true", default=True)
334
  parser.add_argument("--no-shuffle", dest="shuffle", action="store_false")
 
473
  horizons_seconds=horizons,
474
  quantiles=quantiles,
475
  max_samples=args.max_samples,
 
476
  t_cutoff_seconds=int(args.t_cutoff_seconds) if hasattr(args, 't_cutoff_seconds') else 60,
477
  cache_dir="/workspace/apollo/data/cache"
478
  )
train.sh CHANGED
@@ -16,7 +16,6 @@ accelerate launch train.py \
16
  --max_seq_len 4096 \
17
  --horizons_seconds 300 900 1800 3600 7200 \
18
  --quantiles 0.1 0.5 0.9 \
19
- --ohlc_stats_path ./data/ohlc_stats.npz \
20
  --num_workers 0 \
21
  --val_samples_per_class 2 \
22
  --val_every 100 \
 
16
  --max_seq_len 4096 \
17
  --horizons_seconds 300 900 1800 3600 7200 \
18
  --quantiles 0.1 0.5 0.9 \
 
19
  --num_workers 0 \
20
  --val_samples_per_class 2 \
21
  --val_every 100 \
train.yaml CHANGED
@@ -14,7 +14,6 @@ data:
14
  quantiles: [0.1, 0.5, 0.9]
15
  max_seq_len: 4096
16
  ohlc_seq_len: 300
17
- ohlc_stats_path: ./data/ohlc_stats.npz
18
  t_cutoff_seconds: 60
19
  shuffle: true
20
  num_workers: 4
 
14
  quantiles: [0.1, 0.5, 0.9]
15
  max_seq_len: 4096
16
  ohlc_seq_len: 300
 
17
  t_cutoff_seconds: 60
18
  shuffle: true
19
  num_workers: 4
validate.py CHANGED
@@ -100,7 +100,6 @@ def main() -> None:
100
  ohlc_seq_len = data_cfg.get("ohlc_seq_len", 60)
101
  default_t_cutoff = int(data_cfg.get("t_cutoff_seconds", 60))
102
  t_cutoff_seconds = int(args.t_cutoff_seconds) if args.t_cutoff_seconds is not None else default_t_cutoff
103
- ohlc_stats_path = data_cfg.get("ohlc_stats_path", "./data/ohlc_stats.npz")
104
 
105
  multi_modal_encoder = MultiModalEncoder(dtype=dtype)
106
  time_encoder = ContextualTimeEncoder(dtype=dtype)
@@ -140,7 +139,6 @@ def main() -> None:
140
  horizons_seconds=horizons,
141
  quantiles=quantiles,
142
  max_samples=max_samples,
143
- ohlc_stats_path=ohlc_stats_path,
144
  token_allowlist=[args.token_address] if args.token_address else None,
145
  t_cutoff_seconds=t_cutoff_seconds
146
  )
 
100
  ohlc_seq_len = data_cfg.get("ohlc_seq_len", 60)
101
  default_t_cutoff = int(data_cfg.get("t_cutoff_seconds", 60))
102
  t_cutoff_seconds = int(args.t_cutoff_seconds) if args.t_cutoff_seconds is not None else default_t_cutoff
 
103
 
104
  multi_modal_encoder = MultiModalEncoder(dtype=dtype)
105
  time_encoder = ContextualTimeEncoder(dtype=dtype)
 
139
  horizons_seconds=horizons,
140
  quantiles=quantiles,
141
  max_samples=max_samples,
 
142
  token_allowlist=[args.token_address] if args.token_address else None,
143
  t_cutoff_seconds=t_cutoff_seconds
144
  )