import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
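# Model configuration: the Hub repo id and the path to the local safetensors checkpoint.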
MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"  # Hub repo id for the 8x7B mixture-of-experts model
SAFETENSORS_PATH = "path_to_your_model.safetensors"
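# Load the tokenizer.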
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
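# Instantiate the architecture on the "meta" device: init_empty_weights()
# builds the module tree without allocating memory for any parameters.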
config = AutoConfig.from_pretrained(MODEL_NAME)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)  # from_config, not from_pretrained: no weights are materialized here
# There is no separate load_file() step: load_checkpoint_and_dispatch reads
# the safetensors checkpoint directly and streams each layer onto the device
# chosen by device_map.
model = load_checkpoint_and_dispatch(
    model,
    SAFETENSORS_PATH,
    device_map="auto",  # Automatically handles GPU/CPU placement and offloading
    no_split_module_classes=["MixtralDecoderLayer"],  # Never split a decoder layer across devices
    dtype=torch.float16,  # Half precision for memory efficiency
)
# The dispatched model must not be moved with model.to(...): accelerate has
# already assigned every submodule a device. Inputs go to model.device instead.
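# Tokenize the prompt and move it to the model's first device (where the
# input embeddings live).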
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
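# Sample a continuation. Without do_sample=True, generate() decodes greedily
# and ignores temperature, top_k, and top_p.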
with torch.no_grad():
    outputs = model.generate(
        **inputs,  # input_ids together with the attention_mask
        max_new_tokens=50,  # Cap on newly generated tokens (max_length would count the prompt too)
        num_return_sequences=1,
        do_sample=True,  # Enable sampling
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )
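# Decode the generated token ids back into text, dropping special tokens.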
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)