zirobtc committed on
Commit
b441d51
·
verified ·
1 Parent(s): 54191e5

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/evaluate_sample.py +304 -0
scripts/evaluate_sample.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import argparse
4
+ import random
5
+ import torch
6
+ from pathlib import Path
7
+
8
+ # Add project root to sys.path so we can import data and models
9
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
10
+
11
+ # Provide standard defaults
12
+ from accelerate import Accelerator
13
+ from torch.utils.data import DataLoader, Subset
14
+
15
+ from data.data_loader import OracleDataset
16
+ from data.data_collator import MemecoinCollator
17
+ from models.multi_modal_processor import MultiModalEncoder
18
+ from models.helper_encoders import ContextualTimeEncoder
19
+ from models.token_encoder import TokenEncoder
20
+ from models.wallet_encoder import WalletEncoder
21
+ from models.graph_updater import GraphUpdater
22
+ from models.ohlc_embedder import OHLCEmbedder
23
+ from models.model import Oracle
24
+ import models.vocabulary as vocab
25
+ from train import create_balanced_split
26
+
27
def unlog_transform(tensor):
    """Invert the signed log1p transform applied to labels during training.

    Training transforms labels as ``sign(x) * log1p(abs(x))``; this reverses
    it with ``sign(x) * expm1(abs(x))``. ``torch.expm1`` is used instead of
    ``exp(x) - 1`` because it is numerically accurate for values near zero,
    where ``exp(x) - 1`` suffers catastrophic cancellation.

    Args:
        tensor: Log-transformed values (any shape).

    Returns:
        Tensor of the same shape with the transform undone.
    """
    return torch.sign(tensor) * torch.expm1(torch.abs(tensor))
31
+
32
def parse_args():
    """Build and parse the command-line arguments for single-sample evaluation."""
    p = argparse.ArgumentParser()
    p.add_argument("--checkpoint", type=str, default="checkpoints/checkpoint-90000", help="Path to checkpoint dir")
    p.add_argument("--cache_dir", type=str, default="/workspace/apollo/data/cache", help="Path to dataset cache")
    p.add_argument("--sample_idx", type=int, default=None, help="Specific sample index to evaluate")
    p.add_argument("--mixed_precision", type=str, default="bf16")
    p.add_argument("--horizons_seconds", type=int, nargs="+", default=[300, 900, 1800, 3600, 7200])
    p.add_argument("--quantiles", type=float, nargs="+", default=[0.1, 0.5, 0.9])
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--min_horizon", type=int, default=900, help="Ensure the sampled coin has ground truth for at least this horizon (in seconds)")
    return p.parse_args()
43
+
44
def get_latest_checkpoint(checkpoint_dir):
    """Return the most recently modified subdirectory of ``checkpoint_dir``.

    Args:
        checkpoint_dir: Directory expected to contain checkpoint
            subdirectories (str or Path).

    Returns:
        str path of the newest subdirectory (by mtime), or None if the
        directory does not exist or contains no subdirectories.
    """
    ckpt_dir = Path(checkpoint_dir)
    if not ckpt_dir.exists():
        return None
    subdirs = [d for d in ckpt_dir.iterdir() if d.is_dir()]
    if not subdirs:
        return None
    # max() with an mtime key picks the newest in one pass instead of
    # sorting the whole list just to read the last element.
    return str(max(subdirs, key=lambda d: d.stat().st_mtime))
53
+
54
def main():
    """Load the cached dataset and a checkpoint, run inference on a single
    sample, and print predicted vs. ground-truth returns for each horizon.

    Flow: parse args -> load dataset -> pick a (validation) sample, optionally
    filtered so ground truth exists for --min_horizon -> build encoders/model
    -> load checkpoint weights -> collate the sample -> forward pass -> print
    per-horizon quantile predictions alongside ground truth.
    """
    args = parse_args()

    accelerator = Accelerator(mixed_precision=args.mixed_precision)
    device = accelerator.device

    # Map the accelerate mixed-precision setting onto the dtype used to
    # construct the encoders/model.
    init_dtype = torch.float32
    if accelerator.mixed_precision == 'bf16':
        init_dtype = torch.bfloat16
    elif accelerator.mixed_precision == 'fp16':
        init_dtype = torch.float16

    print(f"Loading cached dataset from {args.cache_dir}...")
    # data_fetcher/fetcher_config are None: we rely entirely on the on-disk
    # cache; no network fetching happens here.
    dataset = OracleDataset(
        data_fetcher=None,
        fetcher_config=None,
        horizons_seconds=args.horizons_seconds,
        quantiles=args.quantiles,
        max_samples=None,
        t_cutoff_seconds=60,
        cache_dir=args.cache_dir
    )

    if len(dataset) == 0:
        raise ValueError("Dataset is empty!")

    # Recreate the same balanced train/val split used in training so we
    # evaluate on a held-out sample.
    print("Creating balanced train/val split to pick a validation sample...")
    _, val_indices, _ = create_balanced_split(dataset, n_val_per_class=10, seed=args.seed)

    # Re-seed with system time so repeated runs don't pick the same sample.
    import time
    random.seed(time.time())

    # --- Filter candidates by minimum horizon if requested ---
    if args.min_horizon is not None and args.min_horizon in args.horizons_seconds:
        print(f"Filtering dataset to find samples with ground truth >= {args.min_horizon}s...")
        h_idx = args.horizons_seconds.index(args.min_horizon)
        # NOTE(review): num_quantiles is unused in this branch; it is
        # recomputed before the printing loop below.
        num_quantiles = len(args.quantiles)

        valid_indices = []
        # Search a shuffled pool (validation indices if any, otherwise the
        # whole dataset) so we don't scan every sample.
        search_pool = val_indices.copy()
        random.shuffle(search_pool)
        if not search_pool:
            search_pool = list(range(len(dataset)))
            random.shuffle(search_pool)

        for idx in search_pool:
            sample = dataset[idx]
            if sample is None:
                continue
            mask = sample.get('labels_mask')
            if mask is not None:
                # Assumes the mask is shape [H] (one entry per horizon), so
                # it is indexed by h_idx directly — per raw file inspection;
                # confirm against the dataset builder if it changes.
                if h_idx < len(mask) and mask[h_idx] > 0.0:
                    valid_indices.append(idx)
            # Stop early once we have a handful of valid candidates.
            if len(valid_indices) >= 10:
                break

        if valid_indices:
            print(f"Found {len(valid_indices)} candidate samples with >= {args.min_horizon}s horizon.")
            val_indices = valid_indices
        else:
            print(f"WARNING: No samples found with ground truth for horizon {args.min_horizon}s. Reverting to random pick.")

    # Choose the sample: explicit index wins, otherwise pick from the
    # (possibly filtered) validation pool, otherwise anywhere in the dataset.
    if args.sample_idx is not None:
        if args.sample_idx >= len(dataset):
            raise ValueError(f"Sample index {args.sample_idx} out of range [0, {len(dataset)-1}]")
        sample_idx = args.sample_idx
    else:
        if len(val_indices) > 0:
            sample_idx = random.choice(val_indices)
        else:
            print("No validation indices found. Picking random sample from entire set.")
            sample_idx = random.randint(0, len(dataset) - 1)

    print(f"\nEvaluating on sample index: {sample_idx}")

    # Build the encoder stack; these must mirror the training configuration
    # so the checkpoint's weights line up.
    print("Initializing encoders...")
    multi_modal_encoder = MultiModalEncoder(dtype=init_dtype, device=device)
    time_encoder = ContextualTimeEncoder(dtype=init_dtype)
    token_encoder = TokenEncoder(multi_dim=multi_modal_encoder.embedding_dim, dtype=init_dtype)
    wallet_encoder = WalletEncoder(encoder=multi_modal_encoder, dtype=init_dtype)
    graph_updater = GraphUpdater(time_encoder=time_encoder, dtype=init_dtype)
    ohlc_embedder = OHLCEmbedder(num_intervals=vocab.NUM_OHLC_INTERVALS, dtype=init_dtype)

    collator = MemecoinCollator(
        event_type_to_id=vocab.EVENT_TO_ID,
        device=device,
        dtype=init_dtype,
        max_seq_len=4096
    )

    print("Initializing model...")
    # model_config_name must match the architecture the checkpoint was
    # trained with — TODO confirm against train.py if checkpoints change.
    model = Oracle(
        token_encoder=token_encoder,
        wallet_encoder=wallet_encoder,
        graph_updater=graph_updater,
        ohlc_embedder=ohlc_embedder,
        time_encoder=time_encoder,
        num_event_types=vocab.NUM_EVENT_TYPES,
        multi_modal_dim=multi_modal_encoder.embedding_dim,
        event_pad_id=vocab.EVENT_TO_ID["__PAD__"],
        event_type_to_id=vocab.EVENT_TO_ID,
        model_config_name="llama3-12l-768d-gqa4-8k-random",
        quantiles=args.quantiles,
        horizons_seconds=args.horizons_seconds,
        dtype=init_dtype
    )

    # Drop the token-embedding table if present — presumably it is unused
    # because inputs are pre-embedded; verify against Oracle's forward.
    if hasattr(model.model, 'embed_tokens'):
        del model.model.embed_tokens

    # Resolve the checkpoint path; a trailing "latest" means "newest
    # subdirectory of the parent directory".
    ckpt_path = args.checkpoint
    if ckpt_path.endswith("latest"):
        base_dir = Path(ckpt_path).parent
        found = get_latest_checkpoint(base_dir)
        if found:
            ckpt_path = found

    if not os.path.exists(ckpt_path):
        print(f"Warning: Checkpoint {ckpt_path} not found. Running with random weights!")
        model = accelerator.prepare(model)
    else:
        print(f"Loading checkpoint from {ckpt_path}...")
        # accelerate checkpoints may be sharded or stored as
        # pytorch_model.bin / model.safetensors. The model must be wrapped
        # via accelerator.prepare() before accelerator.load_state() works.
        model = accelerator.prepare(model)
        try:
            accelerator.load_state(ckpt_path)
            print("Successfully loaded accelerator state.")
        except Exception as e:
            # Fallback: load the raw state dict file directly into the
            # unwrapped model.
            print(f"Could not load using accelerate.load_state: {e}")
            print("Trying to load model weights directly...")
            model_file = os.path.join(ckpt_path, "pytorch_model.bin")
            if not os.path.exists(model_file):
                model_file = os.path.join(ckpt_path, "model.safetensors")

            if os.path.exists(model_file):
                if model_file.endswith(".safetensors"):
                    from safetensors.torch import load_file
                    state_dict = load_file(model_file)
                else:
                    state_dict = torch.load(model_file, map_location="cpu")

                # strict=False tolerates keys we deleted (e.g. embed_tokens).
                uw_model = accelerator.unwrap_model(model)
                uw_model.load_state_dict(state_dict, strict=False)
                print("Successfully loaded weights directly.")
            else:
                print(f"Error: model weights not found in {ckpt_path}")

    model.eval()

    # Fetch and collate the single sample into a batch of size 1.
    raw_sample = dataset[sample_idx]
    if raw_sample is None:
        print("Sample is None!")
        return

    batch = collator([raw_sample])

    # Move tensors (and lists of tensors) to the accelerator device.
    for k, v in batch.items():
        if isinstance(v, torch.Tensor):
            batch[k] = v.to(device)
        elif isinstance(v, list) and len(v) > 0 and isinstance(v[0], torch.Tensor):
            batch[k] = [t.to(device) for t in v]

    # Backfill keys the model's safety checks expect even when the sample
    # has no textual events.
    if 'textual_event_indices' not in batch:
        B, L = batch['event_type_ids'].shape
        batch['textual_event_indices'] = torch.zeros((B, L), dtype=torch.long, device=device)
    if 'textual_event_data' not in batch:
        batch['textual_event_data'] = []

    print("\n--- Running Inference ---")
    with torch.no_grad():
        outputs = model(batch)

    # quantile_logits for the single batch element; assumed flat layout
    # [num_horizons * num_quantiles] — indexing below relies on this.
    preds = outputs["quantile_logits"][0].cpu()
    quality_preds = outputs["quality_logits"][0].cpu() if "quality_logits" in outputs else None

    # Raw labels from the dataset; these are NOT log-transformed.
    gt_labels = batch["labels"][0].cpu()
    gt_mask = batch["labels_mask"][0].cpu().bool()

    # Quality target, if the collated batch provides one.
    gt_quality = batch["quality_score"][0].item() if "quality_score" in batch else None

    # The model was trained on log1p-transformed returns (train.py applies
    # labels = sign(labels) * log1p(abs(labels))), so invert that here.
    real_preds = unlog_transform(preds)

    print("\n================== Results ==================")
    print(f"Token Address: {batch.get('token_addresses', ['Unknown'])[0]}")
    if gt_quality is not None:
        print(f"Quality Score: GT = {gt_quality:.4f} | Pred = {quality_preds.item() if quality_preds is not None else 'N/A'}")

    print("\nReturns per Horizon:")
    num_quantiles = len(args.quantiles)
    # The model outputs all configured horizons, but the cached labels may
    # have been generated with fewer — guard indexing on the mask length.
    num_gt_horizons = len(gt_mask)  # mask assumed shape [H]

    for h_idx, horizon in enumerate(args.horizons_seconds):
        horizon_min = horizon // 60
        print(f"\n--- Horizon: {horizon}s ({horizon_min}m) ---")

        if h_idx >= num_gt_horizons:
            print(" [No Ground Truth Available for this Horizon - Not in Dataset]")
            valid = False
        else:
            # Mask format is [H]: one validity flag per horizon.
            valid = gt_mask[h_idx].item()

        if not valid:
            print(" [No Ground Truth Available for this Horizon - Masked]")
            # Predictions are still printed even when GT is masked/missing.
            print(" Predictions:")
            for q_idx, q in enumerate(args.quantiles):
                flat_idx = h_idx * num_quantiles + q_idx
                pred_ret = real_preds[flat_idx].item()
                log_pred = preds[flat_idx].item()
                print(f" - p{int(q*100):02d}: {pred_ret * 100:>8.2f}% (raw log-val: {log_pred:7.4f})")
            continue

        # Ground truth return (raw fraction, printed as a percentage).
        gt_ret = gt_labels[h_idx].item()
        print(f" Ground Truth: {gt_ret * 100:.2f}%")

        # Predicted quantiles for this horizon.
        print(" Predictions:")
        for q_idx, q in enumerate(args.quantiles):
            flat_idx = h_idx * num_quantiles + q_idx
            pred_ret = real_preds[flat_idx].item()
            log_pred = preds[flat_idx].item()

            print(f" - p{int(q*100):02d}: {pred_ret * 100:>8.2f}% (raw log-val: {log_pred:7.4f})")

    print("=============================================\n")

if __name__ == "__main__":
    main()