Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit
Community Article · Published March 24, 2025 · tobit4
System: Ubuntu 22.04

Install the required packages:

pip install torch transformers bitsandbytes accelerate
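An optional quick check that the libraries import and a CUDA GPU is visible; nothing here is specific to this model:

# Optional sanity check: confirm the libraries import and a CUDA GPU is visible.
import torch, transformers, bitsandbytes

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("bitsandbytes:", bitsandbytes.__version__)
print("CUDA available:", torch.cuda.is_available())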
Quantize to bnb 4-bit
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Define the model name and path
model_name = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

# Configure quantization parameters
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load the model weights in 4-bit precision
    bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation
    bnb_4bit_quant_type="nf4",              # Use the "nf4" quantization type
    bnb_4bit_use_double_quant=True,         # Enable double quantization
    llm_int8_skip_modules=[                 # Modules to skip during quantization
        "lm_head",
        "multi_modal_projector",
        "merger",
        "modality_projection",
        "model.layers.1.mlp",
    ],
)

# Load the pre-trained model with the specified quantization configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",  # Automatically place layers on the available devices
)

# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Save the quantized model and tokenizer to a local directory
model.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit")
tokenizer.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit")
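As a sanity check before or after saving, you can print the model's memory footprint with transformers' built-in get_memory_footprint() helper. Roughly 5-6 GB would be a plausible figure for an 8B model in 4-bit, though the exact number depends on the modules kept in full precision:

# Optional: report how much memory the quantized model occupies.
# The ~5-6 GB figure mentioned above is an expectation, not a measured
# value; skipped modules stay in higher precision and add to the total.
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")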
Chat Test
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Configure quantization parameters (matching the saved checkpoint)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load the model weights in 4-bit precision
    bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation
    bnb_4bit_quant_type="nf4",              # Use the "nf4" quantization type
    bnb_4bit_use_double_quant=True,         # Enable double quantization
)

# Path to the quantized model saved in the previous step
model_name = "./Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit"

# Load the quantized model with the specified configuration
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",  # Automatically place layers on the available devices
)

# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Determine the device where the model is located
device = model.device

# Prepare the input text and move it to the same device as the model
input_text = "Once upon a time"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Perform inference
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=50)  # generate up to 50 new tokens

# Decode the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
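The prompt above is a plain text completion. Since Nemotron-Nano is an instruction-tuned model (the upstream model card describes toggling its reasoning mode with a "detailed thinking on/off" system prompt), a chat-formatted prompt may give better results. A minimal sketch, assuming the saved tokenizer kept the upstream chat template:

# Minimal chat-style generation sketch. Assumes the tokenizer ships a chat
# template (the upstream nvidia/Llama-3.1-Nemotron-Nano-8B-v1 tokenizer does).
messages = [
    {"role": "system", "content": "detailed thinking off"},  # reasoning toggle per the model card
    {"role": "user", "content": "Tell me a short story."},
]

# Build the prompt tensor from the chat template and move it to the model's device
chat_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant turn marker
    return_tensors="pt",
).to(device)

with torch.no_grad():
    chat_outputs = model.generate(chat_inputs, max_new_tokens=128)

# Decode only the newly generated tokens, not the prompt
print(tokenizer.decode(chat_outputs[0][chat_inputs.shape[-1]:], skip_special_tokens=True))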