Model Card for Mistral-7B-BTC-JEPA-LLM-Expert

ARTICLE: https://medium.com/@frankmorales_91352/the-llm-jepa-advantage-fine-tuning-mistral-7b-for-cost-efficient-high-abstract-cryptocurrency-3ee06c928143

CODE: https://github.com/frank-morales2020/MLxDL/blob/main/FINAL_LLM_JEPA_MISTRAL_FT_BTC.ipynb

Training Data

# Verify every sample in both splits carries the full set of JEPA columns.
print("\n--- Validating Dataset Keys ---")
required_keys = [
    "input_ids", "labels", "attention_mask",
    "input_ids_user", "labels_user", "attention_mask_user",
    "input_ids_assistant", "labels_assistant", "attention_mask_assistant",
]
for split in ["train", "test"]:
    print(f"\nChecking {split} split...")
    split_data = tokenized_dataset[split]
    missing_samples = []
    progress = tqdm(
        enumerate(split_data),
        total=len(split_data),
        desc=f"Validating {split} split",
    )
    for idx, sample in progress:
        absent = [key for key in required_keys if key not in sample]
        if absent:
            missing_samples.append((idx, absent))
    if missing_samples:
        print(f"Found {len(missing_samples)} problematic samples in {split} split:")
        for idx, missing_keys in missing_samples[:5]:
            print(f"Sample {idx} missing keys: {missing_keys}")
        raise ValueError(f"Dataset validation failed in {split} split.")
    print(f"{split} split: All {len(split_data)} samples validated.")

# Drop every column except the ones the JEPA trainer consumes.
COLUMNS_TO_KEEP = required_keys
try:
    for part in ("train", "test"):
        tokenized_dataset[part] = tokenized_dataset[part].select_columns(COLUMNS_TO_KEEP)
    print(f"Dataset columns filtered to: {COLUMNS_TO_KEEP}")
except Exception as e:
    print(f"Error during column selection: {e}")
    print(f"Available columns in train: {tokenized_dataset['train'].column_names}")
    print(f"Available columns in test: {tokenized_dataset['test'].column_names}")
    raise

Training Procedure

Preprocessing [optional]

[More Information Needed]

Training Hyperparameters


# PEFT (LoRA) Config
# PEFT (LoRA) Config: rank-64 adapters on all attention and MLP projections.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)


# Training Arguments - OPTIMIZED FOR JEPA MONITORING
# Training Arguments - OPTIMIZED FOR JEPA MONITORING
training_arguments = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch = 2 x 8 = 16 per device
    optim="paged_adamw_8bit",  # memory-efficient paged 8-bit AdamW
    save_steps=0,  # Disable saving during demo
    logging_steps=50,  # See JEPA metrics every 50 steps (matches the logs below)
    max_steps=500,  # Just enough to see JEPA loss decreasing
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,  # fp16 on, bf16 off (run was on an NVIDIA L4)
    bf16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    disable_tqdm=False,
    report_to="none",
    # ↓↓↓ CRITICAL CHANGES FOR DEMO ↓↓↓
    evaluation_strategy="no",  # Disable evaluation during training
    eval_steps=None,  # No evaluation steps
    metric_for_best_model=None,  # Not needed for demo
    # ↑↑↑ CRITICAL CHANGES FOR DEMO ↑↑↑
    dataloader_drop_last=True,
    dataloader_num_workers=0,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # NOTE(review): resume_from_checkpoint is normally passed to
    # trainer.train(), not TrainingArguments — confirm it takes effect here.
    resume_from_checkpoint=True
)

[More Information Needed]

Evaluation


from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch
from peft import PeftModel
import peft
import os

# --- FILE PATHS (Replicated from original code) ---
MODEL_NAME = "mistralai/Mistral-7B-v0.1"  # base checkpoint on the Hub
HUB_MODEL_ID = "frankmorales2020/Mistral-7B-BTC-JEPA-LLM-Expert"  # fine-tuned LoRA adapter

# --- 1. SETUP ---
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_TYPE = "FINE_TUNED"

# --- 2. MODEL AND TOKENIZER LOADING (Single block for efficiency) ---
print("\n--- Model and Tokenizer Setup ---")
print(f"--- Loading Model onto {DEVICE} ---")
print(f'BASE MODEL: {MODEL_NAME}\nFINE TUNE MODEL: {HUB_MODEL_ID}')

try:
    # 4-bit Quantization Config (NF4 weights, fp16 compute)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    # Load tokenizer; reuse EOS as PAD since the base model defines none
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token

    # Add special tokens and resize embeddings (essential for fine-tuned LoRA)
    # NOTE(review): a later loading script in this card uses different special
    # tokens ("<|predictor_1|>" etc.) — confirm which set matches the
    # adapter's training run.
    SPECIAL_PREDICTOR_TOKENS = ["<pred>", "<targ>", "<jepa>"]
    tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_PREDICTOR_TOKENS})
    
    # Load base model
    base_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
    # Resize must happen BEFORE attaching the adapter so the embedding matrix
    # matches the extended vocabulary.
    base_model.resize_token_embeddings(len(tokenizer))
    
    # Load fine-tuned adapter weights
    model = PeftModel.from_pretrained(base_model, HUB_MODEL_ID).eval()
    print("πŸŽ‰ SUCCESS: Loaded fine-tuned JEPA model!")

except Exception as e:
    print(f"❌ Model loading failed: {e}")
    # Fallback to base model logic is removed for this test since the specific fine-tuned
    # model must be used, so we raise the error.
    raise

# --- 3. INFERENCE FUNCTION (Encapsulating fixed parameters) ---

def run_inference_test(btc_data_input, tokenizer, model, device):
    """Run a single inference test and extract a one-word direction.

    Args:
        btc_data_input: OHLC snapshot string, e.g. "[O:30000, H:30500, C:30200]".
        tokenizer: tokenizer exposing __call__, decode and eos_token_id.
        model: causal LM exposing generate().
        device: device the tokenized inputs are moved to.

    Returns:
        Tuple of (input_text, prediction_output_raw, prediction_output) where
        prediction_output is "UP"/"DOWN"/"FLAT" or a not-found marker.
    """
    # 1. Create Strict Prompt
    user_prompt_content = f"Current BTC data: {btc_data_input}. Give ONLY the 12-hour direction (UP, DOWN, or FLAT). The output MUST be a single word: UP, DOWN, or FLAT."
    input_text = f"<s>[INST] {user_prompt_content} [/INST]"

    # 2. Tokenize
    inputs = tokenizer(input_text, return_tensors="pt").to(device)

    # Greedy decoding: with do_sample=False, generate() ignores temperature
    # and top_p (the original passed them anyway, which only produced
    # warnings), so they are omitted here.
    generation_params = {
        "max_new_tokens": 15,
        "do_sample": False,
        "repetition_penalty": 1.0,
    }

    # 3. Generate Output
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            **generation_params,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # 4. Robust extraction: strip everything up to the prompt terminator when
    # present, then match the first target word. (The original duplicated the
    # matching loop in two identical branches; this collapses them.)
    if "[/INST]" in response_text:
        prediction_output_raw = response_text.split("[/INST]")[-1].strip()
    else:
        prediction_output_raw = response_text.strip()

    prediction_output = "❌ PREDICTION NOT FOUND"
    target_words = ["UP", "DOWN", "FLAT"]
    for target in target_words:
        if target in prediction_output_raw.upper():
            prediction_output = target
            break

    return input_text, prediction_output_raw, prediction_output

# --- 4. MULTI-INPUT EXECUTION ---

# Test cases to run through the fine-tuned model.
# NOTE(review): the original comment promised 3 cases covering all
# directions, but only the DOWN scenario is defined — add UP and FLAT
# inputs to exercise every output class.
TEST_CASES = [
    {
        "name": "Test 1: Downward Reversal (Original Input)",
        "input": "[O:30000, H:30500, C:30200]",
        "expected_logic": "DOWN"
    },
]


# Run every defined test case through the model and print the raw vs.
# cleaned extraction results. (The original enumerated the list but never
# used the index, so the plain iterator is sufficient.)
for test in TEST_CASES:
    print(f"\n[{test['name']} - Expecting {test['expected_logic']}]")
    
    # Run the test
    input_text, raw_output, cleaned_prediction = run_inference_test(
        test['input'], tokenizer, model, DEVICE
    )
    
    # Print results
    print("=" * 70)
    print(f"πŸ“€ INPUT: {test['input']}")
    print(f"πŸ€– RAW OUTPUT: {raw_output}")
    print(f"βœ… CLEANED PREDICTION: **{cleaned_prediction}**")
    print("=" * 70)



[Test 1: Downward Reversal (Original Input) - Expecting DOWN]
======================================================================
πŸ“€ INPUT: [O:30000, H:30500, C:30200]
πŸ€– RAW OUTPUT: The 12-hour prediction is **DOWN**. The final prediction is
βœ… CLEANED PREDICTION: **DOWN**
======================================================================

import os
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Make the CUDA toolkit path explicit (presumably for bitsandbytes — confirm
# this is needed in the target environment).
os.environ["CUDA_HOME"] = "/usr/local/cuda"

# --- 2. CONFIGURATION ---
MODEL_NAME = "mistralai/Mistral-7B-v0.1"
HUB_MODEL_ID = "frankmorales2020/Mistral-7B-BTC-JEPA-LLM-Expert"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): these differ from the "<pred>"/"<targ>"/"<jepa>" tokens used
# in the earlier loading block — confirm which set the adapter was trained with.
SPECIAL_TOKENS = ["<|predictor_1|>", "<|predictor_2|>", "<|predictor_3|>"]

# --- 3. THE FIXED LOADING SEQUENCE ---
# Order matters: add tokens -> load quantized base -> resize embeddings ->
# attach the LoRA adapter.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # base model ships no pad token; reuse EOS
tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})

# 4-bit NF4 quantization with fp16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")

# Match the embedding matrix to the extended vocabulary before attaching the adapter.
base_model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(base_model, HUB_MODEL_ID).eval()
print("πŸŽ‰ SUCCESS: Model and JEPA Expert loaded!")

# --- 4. CALIBRATED INFERENCE ENGINE (DEMO-FOCUSED) ---
# --- 4. CALIBRATED INFERENCE ENGINE (DEMO-FOCUSED) ---
def run_expert_inference(ohlc_data, rsi=50.0, sma=60000.0):
    """Score UP/DOWN/FLAT for a BTC snapshot via contrastive calibration.

    The next-token logits for the three direction words are computed for the
    real prompt, a fixed neutral prompt's logits are subtracted as a
    baseline, and the result is softmaxed into per-word confidences.
    """
    target_words = ["UP", "DOWN", "FLAT"]
    # Last sub-token id of each direction word indexes its logit.
    target_ids = [
        tokenizer.encode(word, add_special_tokens=False)[-1]
        for word in target_words
    ]

    instruction = (
        f"Analyze BTC Market Data: {ohlc_data}. "
        f"Technicals: RSI(14) is {rsi:.2f}, SMA(20) is {sma:.2f}. "
        f"Direction (UP/DOWN/FLAT):"
    )
    prompt = f"<s>[INST] {instruction} [/INST]"

    with torch.no_grad():
        # Baseline logits from a fixed, perfectly neutral market prompt.
        null_prompt = f"<s>[INST] Analyze BTC Market Data: [O:60000, H:60100, C:60050]. Technicals: RSI(14) is 50.00, SMA(20) is 60000.00. Direction (UP/DOWN/FLAT): [/INST]"
        baseline_inputs = tokenizer(null_prompt, return_tensors="pt").to(DEVICE)
        baseline_logits = model(**baseline_inputs).logits[0, -1, target_ids]

        # Logits for the actual market snapshot.
        live_inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
        live_logits = model(**live_inputs).logits[0, -1, target_ids]

        # Temperature 1.0 for natural balance in demo cases
        probs = F.softmax((live_logits - baseline_logits) / 1.0, dim=-1)

    confidences = {
        word: round(p.item(), 3) for word, p in zip(target_words, probs)
    }
    prediction = max(confidences, key=confidences.get)
    return prediction, confidences


# NOTE(review): this success message duplicates the one already printed by
# the loading block above — likely a leftover from notebook re-runs.
print("πŸŽ‰ SUCCESS: Model and JEPA Expert loaded!")
#--- 5. FINAL DEMO CASES TO SHOWCASE UP, DOWN, and FLAT ---
# Each case supplies an OHLC string plus RSI/SMA technicals for
# run_expert_inference; "label" is only used for the printed table.
demo_cases = [
    # Clear bearish crash (strong DOWN signal)
    {"label": "BEARISH CRASH (DOWN Demo)", "data": "[O:100000, H:101000, C:70000]", "rsi": 20.0, "sma": 95000.0},
    
    # Pure neutral flat (FLAT Demo)
    {"label": "NEUTRAL FLAT (FLAT Demo)", "data": "[O:87000, H:87100, C:87050]", "rsi": 50.0, "sma": 87000.0},
    
    # Current real market Dec 25, 2025 (~$87,700, neutral RSI ~43, mild weakness below SMA ~$88,500)
    {"label": "CURRENT REAL (FLAT/DOWN-ish)", "data": "[O:87700, H:87800, C:87700]", "rsi": 43.0, "sma": 88500.0},
    
    # Mild bullish (to trigger UP – the adapter sometimes favors UP in very neutral cases)
    {"label": "MILD BULLISH (UP Demo)", "data": "[O:87000, H:88000, C:87500]", "rsi": 55.0, "sma": 86800.0},
    
    # Strong bullish rally (UP Demo fallback if needed)
    {"label": "STRONG BULLISH (UP Demo)", "data": "[O:80000, H:95000, C:94000]", "rsi": 75.0, "sma": 82000.0},
]

# Render the demo table: one row per scenario with calibrated confidences.
print("\n" + "=" * 95)
print(f"{'Demo Scenario':<30} | {'Prediction':<10} | {'Confidence (UP / DOWN / FLAT)'}")
print("-" * 95)
for case in demo_cases:
    direction, conf = run_expert_inference(case['data'], rsi=case['rsi'], sma=case['sma'])
    summary = f"U:{conf['UP']:.3f} D:{conf['DOWN']:.3f} F:{conf['FLAT']:.3f}"
    print(f"{case['label']:<30} | {direction:<10} | {summary}")
print("=" * 95)
πŸŽ‰ SUCCESS: Model and JEPA Expert loaded!

===============================================================================================
Demo Scenario                  | Prediction | Confidence (UP / DOWN / FLAT)
-----------------------------------------------------------------------------------------------
BEARISH CRASH (DOWN Demo)      | FLAT       | U:0.328 D:0.332 F:0.340
NEUTRAL FLAT (FLAT Demo)       | UP         | U:0.337 D:0.335 F:0.329
CURRENT REAL (FLAT/DOWN-ish)   | DOWN       | U:0.332 D:0.338 F:0.330
MILD BULLISH (UP Demo)         | DOWN       | U:0.331 D:0.337 F:0.331
STRONG BULLISH (UP Demo)       | DOWN       | U:0.331 D:0.336 F:0.332
===============================================================================================

Results


βœ… JEPA Monitoring Enabled - will show metrics every 50/500 steps

Step	Training Loss
50	0.876400
100	0.235400
150	0.224500
200	0.229200
250	0.231300
300	0.221500
350	0.225300
400	0.229300
450	0.220400
500	0.224200

🎯 JEPA Metrics [Step 50]:
   JEPA Loss: 0.0001
   LM Loss: 0.2534
   Cosine Sim: 0.9999
   Total Loss: 0.2535

🎯 JEPA Metrics [Step 100]:
   JEPA Loss: 0.0000
   LM Loss: 0.2319
   Cosine Sim: 1.0000
   Total Loss: 0.2320

🎯 JEPA Metrics [Step 150]:
   JEPA Loss: 0.0000
   LM Loss: 0.2438
   Cosine Sim: 1.0000
   Total Loss: 0.2438

🎯 JEPA Metrics [Step 200]:
   JEPA Loss: 0.0000
   LM Loss: 0.2384
   Cosine Sim: 1.0000
   Total Loss: 0.2384

🎯 JEPA Metrics [Step 250]:
   JEPA Loss: 0.0000
   LM Loss: 0.2216
   Cosine Sim: 1.0000
   Total Loss: 0.2217


🎯 JEPA Metrics [Step 300]:
   JEPA Loss: 0.0000
   LM Loss: 0.1579
   Cosine Sim: 1.0000
   Total Loss: 0.1580

🎯 JEPA Metrics [Step 350]:
   JEPA Loss: 0.0000
   LM Loss: 0.2287
   Cosine Sim: 1.0000
   Total Loss: 0.2287

🎯 JEPA Metrics [Step 400]:
   JEPA Loss: 0.0000
   LM Loss: 0.2151
   Cosine Sim: 1.0000
   Total Loss: 0.2152

🎯 JEPA Metrics [Step 450]:
   JEPA Loss: 0.0000
   LM Loss: 0.1994
   Cosine Sim: 1.0000
   Total Loss: 0.1994

🎯 JEPA Metrics [Step 500]:
   JEPA Loss: 0.0000
   LM Loss: 0.2188
   Cosine Sim: 1.0000
   Total Loss: 0.2189

Compute Infrastructure


Wed Oct  8 05:48:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   37C    P8             11W /   72W |       0MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model isn't deployed by any Inference Provider. πŸ™‹ Ask for provider support