# nemotron-opus-distillation / launch_nemotron_opus_distillation.py
# (uploaded by anbench via huggingface_hub, commit 165ee16, verified)
# unsloth-finetune/launch_nemotron_opus_distillation.py
import os
import subprocess
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
# ==========================================
# Phase 1: Hugging Face Storage Integration
# ==========================================
# We use HF CLI to create a dedicated bucket for our training artifacts.
# This prevents Git LFS bottlenecks and uses Xet deduplication for fast checkpoints.
HF_BUCKET_NAME = "nemotron-opus-distill-runs"
print(f"Ensuring HF Storage Bucket '{HF_BUCKET_NAME}' exists...")
try:
    # check=False: the create call legitimately fails when the bucket already
    # exists, so we inspect the result ourselves instead of raising.
    result = subprocess.run(
        ["hf", "buckets", "create", HF_BUCKET_NAME],
        check=False,
        capture_output=True,
        text=True,
    )
    if result.returncode == 0:
        print("HF Storage Bucket ready!")
    else:
        # Surface the CLI's stderr so auth/permission problems are visible
        # instead of being silently swallowed by the discarded result.
        print(
            f"Bucket create exited with {result.returncode} "
            f"(it may already exist): {result.stderr.strip()}"
        )
except FileNotFoundError:
    print("WARNING: 'hf' CLI not found. Make sure to install it: pip install -U huggingface_hub[cli]")
    print("Falling back to local storage only for now.")
# ==========================================
# Phase 2: Unsloth Model Loading
# ==========================================
max_seq_length = 4096  # context window used for training samples
# dtype=None defers the choice to Unsloth (presumably bf16/fp16 auto-detect
# based on hardware — confirm against Unsloth docs).
dtype = None
load_in_4bit = True  # 4-bit allows this 30B model to easily fit on a 24GB or 40GB GPU

print("\nLoading NVIDIA Nemotron-3-Nano-30B-A3B via Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Nemotron-3-Nano-30B-A3B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Apply Hybrid LatentMoE/Mamba LoRA Adapters
print("Applying Hybrid LoRA Adapters...")

# Target standard layers + Mamba projections for deep reasoning logic capture.
_LORA_TARGET_MODULES = [
    # attention projections
    "q_proj", "k_proj", "v_proj", "o_proj",
    # MLP projections
    "gate_proj", "up_proj", "down_proj",
    # Mamba projections
    "in_proj", "out_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=_LORA_TARGET_MODULES,
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)
# ==========================================
# Phase 3: Claude 4.6 Opus Reasoning Distillation
# ==========================================
# Attach the ChatML template so apply_chat_template renders
# <|im_start|>-delimited conversations.
tokenizer = get_chat_template(tokenizer, chat_template="chatml")

print("\nStreaming Opus-4.6-Reasoning Dataset from HF Hub...")
# Note: HF's dataset library natively streams from their CDN
dataset = load_dataset("nohurry/Opus-4.6-Reasoning-3000x-filtered", split="train")
# We format the dataset based on the exact columns in nohurry/Opus-4.6-Reasoning-3000x-filtered
# The columns are: problem, thinking, solution
def format_reasoning_prompts(examples):
    """Render batched (problem, thinking, solution) rows into ChatML text.

    The assistant turn is forced to emit a <think> block before the final
    answer so the student model learns to produce explicit reasoning.
    Returns a dict with a single "text" column for SFTTrainer.
    """
    def _render(problem, thinking, solution):
        # One user/assistant exchange per row; no generation prompt since
        # this is complete training text, not an inference prefix.
        convo = [
            {"role": "user", "content": problem},
            {"role": "assistant", "content": f"<think>\n{thinking}\n</think>\n{solution}"},
        ]
        return tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)

    rows = zip(examples["problem"], examples["thinking"], examples["solution"])
    return {"text": [_render(p, t, s) for p, t, s in rows]}

dataset = dataset.map(format_reasoning_prompts, batched=True)
# ==========================================
# Phase 4: Training & Xet Deduplication Checkpointing
# ==========================================
print("\nSetting up Trainer...")
# Checkpoints land here locally; a background sync later mirrors this
# directory to the HF storage bucket.
local_output_dir = "nemotron_outputs"
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",  # column produced by format_reasoning_prompts
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,  # keep one conversation per sample; no sequence packing
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,  # effective batch size of 16 per device
        warmup_steps = 10,
        max_steps = 500, # Increased steps for a true distillation run
        learning_rate = 1e-4,
        # Prefer bf16 where the GPU supports it; otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 5,
        save_steps = 50, # Save checkpoints every 50 steps
        optim = "adamw_8bit",  # 8-bit optimizer states to reduce GPU memory
        weight_decay = 0.05,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = local_output_dir,
    ),
)
# Mask the loss on the user/instruction tokens so gradients flow only
# through the assistant responses; the ChatML markers below delimit the
# two parts of each conversation.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user\n",
    response_part = "<|im_start|>assistant\n",
)
# Start a background process to sync checkpoints to HF Storage Bucket using Xet Deduplication
print(f"Starting background HF Bucket sync: local '{local_output_dir}' -> bucket '{HF_BUCKET_NAME}'")
try:
    # Use hf sync to continuously push changes. Because of Xet, it only uploads the tiny diffs!
    sync_process = subprocess.Popen(
        ["hf", "sync", local_output_dir, f"hf://buckets/{HF_BUCKET_NAME}"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
except FileNotFoundError:
    # 'hf' CLI missing: train locally without the background mirror.
    sync_process = None

print("Starting Reasoning Distillation Fine-tuning!")
try:
    trainer_stats = trainer.train()
finally:
    # Always stop the background uploader — even if training raises —
    # so it is not left running as an orphan process, and reap it with
    # wait() so it does not linger as a zombie.
    if sync_process:
        sync_process.terminate()
        sync_process.wait()
# ==========================================
# Phase 5: GGUF Export to Bucket
# ==========================================
print("\nTraining Complete! Exporting to GGUF and pushing directly to Storage Bucket...")
# We use Unsloth's native GGUF exporter, but target our high-speed HF Bucket instead of a standard repo
gguf_destination = f"hf://buckets/{HF_BUCKET_NAME}/Nemotron-3-Super-Opus-Reasoning-GGUF"
try:
    model.push_to_hub_gguf(
        gguf_destination,
        tokenizer,
        quantization_method="q4_k_m",
    )
except Exception as e:
    # Best-effort upload: report the failure but let the script finish.
    print(f"Failed to push GGUF (check HF Token). Error: {e}")
else:
    print("GGUF successfully uploaded to HF Storage Bucket!")
print("All tasks completed.")