import torch
import argparse
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
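# Example invocation (the script filename and data paths are illustrative, not
# prescribed by the project; flags match parse_args below):
#   python finetune_charm15.py --dataset train.json --eval_dataset val.json --epochs 3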
def parse_args():
    parser = argparse.ArgumentParser(description="Fine-tune Charm 15 AI Model")
    parser.add_argument("--model_name", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1",
                        help="Base model name or local path (default: Mixtral-8x7B)")
    parser.add_argument("--dataset", type=str, required=True,
                        help="Path to training dataset (JSON or text file)")
    parser.add_argument("--eval_dataset", type=str, default=None,
                        help="Path to optional validation dataset")
    parser.add_argument("--epochs", type=int, default=3,
                        help="Number of training epochs")
    parser.add_argument("--batch_size", type=int, default=1,
                        help="Per-device training batch size (lowered for GPU compatibility)")
    parser.add_argument("--lr", type=float, default=5e-5,
                        help="Learning rate")
    parser.add_argument("--output_dir", type=str, default="./finetuned_charm15",
                        help="Model save directory")
    parser.add_argument("--max_length", type=int, default=512,
                        help="Max token length for training")
    return parser.parse_args()

def tokenize_function(examples, tokenizer, max_length):
    """Tokenize dataset and prepare labels for causal LM."""
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    # For causal LM fine-tuning the model predicts the next token, so the
    # labels are simply a copy of the input IDs.
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized
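# tokenize_function assumes every record exposes a "text" field. For JSON input
# that means rows shaped like (content shown is illustrative):
#   {"text": "one training example as a single string"}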

def main():
    args = parse_args()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Create output and logging directories
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs("./logs", exist_ok=True)

    # Load tokenizer and make sure a padding token is defined
    print(f"Loading tokenizer from {args.model_name}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        exit(1)

    # Load the base model. device_map="auto" lets Accelerate place the weights
    # across available devices, so the model must not be moved again with .to(device).
    print(f"Loading model {args.model_name}...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            low_cpu_mem_usage=True
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        exit(1)

    # Load training (and optional validation) data; JSON files must provide a
    # "text" field, plain-text files are read one example per line.
    print(f"Loading dataset from {args.dataset}...")
    try:
        if args.dataset.endswith(".json"):
            dataset = load_dataset("json", data_files={"train": args.dataset})
        else:
            dataset = load_dataset("text", data_files={"train": args.dataset})

        eval_dataset = None
        if args.eval_dataset:
            if args.eval_dataset.endswith(".json"):
                eval_dataset = load_dataset("json", data_files={"train": args.eval_dataset})["train"]
            else:
                eval_dataset = load_dataset("text", data_files={"train": args.eval_dataset})["train"]
    except Exception as e:
        print(f"Error loading dataset: {e}")
        exit(1)

    # Tokenize the datasets
    print("Tokenizing dataset...")
    train_dataset = dataset["train"].map(
        lambda x: tokenize_function(x, tokenizer, args.max_length),
        batched=True,
        remove_columns=["text"]
    )
    if eval_dataset is not None:
        eval_dataset = eval_dataset.map(
            lambda x: tokenize_function(x, tokenizer, args.max_length),
            batched=True,
            remove_columns=["text"]
        )

    # Training configuration; evaluation_strategy and save_strategy are aligned
    # so load_best_model_at_end works when a validation set is provided.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        learning_rate=args.lr,
        gradient_accumulation_steps=8,
        bf16=True,
        fp16=False,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=100,
        report_to="none",
        evaluation_strategy="epoch" if eval_dataset else "no",
        save_strategy="epoch",
        load_best_model_at_end=bool(eval_dataset),
        metric_for_best_model="loss"
    )
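    # With these defaults the effective train batch size per device is
    # per_device_train_batch_size * gradient_accumulation_steps = 1 * 8 = 8.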

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer
    )
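    # Because tokenize_function already pads every example to max_length and sets
    # "labels", no custom data collator is passed here.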

    print("Starting fine-tuning...")
    try:
        trainer.train()
    except RuntimeError as e:
        print(f"Training failed: {e} (Try reducing batch_size or max_length)")
        exit(1)

    print(f"Saving fine-tuned model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
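    # To reuse the saved model later (sketch, outside this script):
    #   model = AutoModelForCausalLM.from_pretrained(args.output_dir)
    #   tokenizer = AutoTokenizer.from_pretrained(args.output_dir)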

    # Release GPU memory held by the model
    del model
    torch.cuda.empty_cache()
    print("Training complete. Memory cleared.")

if __name__ == "__main__":
    main()