import os
import sys

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
|
|
| |
# Base checkpoint to fine-tune (Mixtral 8x7B, instruction-tuned MoE).
MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Directory where the fine-tuned model and tokenizer are written.
OUTPUT_DIR = "./mixtral_finetuned"
|
|
| |
# Tokenizer for the base checkpoint. Mixtral's tokenizer ships without a
# dedicated pad token, so reuse EOS — fixed-length padding below needs one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token, tokenizer.pad_token_id = tokenizer.eos_token, tokenizer.eos_token_id
|
|
| |
# Load the base model in bfloat16, sharded across available devices.
model_load_kwargs = {
    "torch_dtype": torch.bfloat16,  # half the memory of fp32
    "device_map": "auto",           # let accelerate place shards automatically
    "low_cpu_mem_usage": True,      # stream weights instead of a full CPU copy
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)
|
|
| |
# Load plain-text train/validation splits, aborting with a non-zero exit code
# (instead of a raw traceback) when either file is missing.
try:
    dataset = load_dataset(
        "text",
        data_files={"train": "train.txt", "validation": "val.txt"},
    )
except FileNotFoundError:
    print("Error: train.txt or val.txt not found. Please provide valid files.")
    # sys.exit is the supported way to terminate a script; exit() is a
    # site-module convenience meant for the interactive REPL.
    sys.exit(1)
|
|
| |
def tokenize_function(examples):
    """Tokenize a batch of raw text lines for causal-LM fine-tuning.

    Pads/truncates every example to 512 tokens and derives ``labels`` from
    ``input_ids``, replacing padded positions with -100 (the ignore_index of
    PyTorch's cross-entropy) so no loss is computed on padding. This matters
    here because pad_token == eos_token: leaving pad ids in the labels would
    train the model to emit EOS at every padded position.

    Args:
        examples: Batch dict from ``datasets.Dataset.map`` with a "text" key.

    Returns:
        Dict of Python lists (``input_ids``, ``attention_mask``, ``labels``).
        Plain lists are what batched ``map`` expects; ``return_tensors="pt"``
        here is unnecessary — Arrow converts the tensors back to lists anyway.
    """
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    # Copy input ids to labels, masking positions where attention_mask == 0.
    tokenized["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized
|
|
# Tokenize every split; drop the raw "text" column once it has been encoded.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
|
|
| |
# Pull out the splits and fail fast on empty data.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"] if "validation" in tokenized_datasets else None

# NOTE: the emptiness check must compare explicitly against None — an *empty*
# eval dataset is falsy, so the original `eval_dataset and len(eval_dataset) == 0`
# short-circuited to the empty dataset itself and the guard could never fire.
if len(train_dataset) == 0 or (eval_dataset is not None and len(eval_dataset) == 0):
    print("Error: Empty training or validation dataset.")
    sys.exit(1)
|
|
| |
# Evaluation cadence depends on whether a (non-empty) validation split exists.
has_eval = bool(eval_dataset)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Schedule: 3 epochs, effective batch size 1 * 8 via gradient accumulation.
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    # Optimizer settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Mixed precision: bf16 matches the dtype the model was loaded in.
    bf16=True,
    fp16=False,
    # Evaluate/save once per epoch; keep only the two most recent checkpoints.
    evaluation_strategy="epoch" if has_eval else "no",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=has_eval,
    metric_for_best_model="loss",
    # Logging.
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)
|
|
| |
# Wire the model, hyper-parameters and datasets into the HF Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer = Trainer(**trainer_kwargs)
|
|
| |
# Run fine-tuning; a RuntimeError here is most commonly CUDA out-of-memory.
try:
    trainer.train()
except RuntimeError as e:
    print(f"Training failed: {e} (Likely OOM—reduce batch size or max_length)")
    # sys.exit instead of the REPL-only exit() builtin.
    sys.exit(1)
|
|
| |
# Persist the fine-tuned weights and tokenizer side by side so OUTPUT_DIR can
# be reloaded later with from_pretrained().
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(OUTPUT_DIR)

# Drop the model reference and release cached GPU memory now that everything
# is on disk.
del model
torch.cuda.empty_cache()
print(f"Model and tokenizer saved to {OUTPUT_DIR}")