import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset

MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
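# Llama-family tokenizers may ship without a pad token; if so, reuse EOS so padding works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
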
# Load the weights in float32 (the default) so that fp16=True in TrainingArguments can run
# mixed precision with float32 master weights; loading in float16 for a full fine-tune
# typically fails with "Attempting to unscale FP16 gradients".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
)

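# Each record in processed_dataset.json is expected to provide "prompt" and "response" fields.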
dataset = load_dataset("json", data_files="processed_dataset.json")

def tokenize_function(examples):
    # Concatenate prompt and response into one sequence for causal-LM training, appending EOS
    # so the model learns to stop; labels mirror input_ids so the Trainer can compute a loss.
    # max_length=512 is an assumption; adjust it to your data.
    texts = [p + r + tokenizer.eos_token for p, r in zip(examples["prompt"], examples["response"])]
    tokenized = tokenizer(texts, truncation=True, max_length=512)
    tokenized["labels"] = [ids.copy() for ids in tokenized["input_ids"]]
    return tokenized

dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.remove_columns(["prompt", "response"])

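# DataCollatorForSeq2Seq pads input_ids, attention_mask, and labels to the longest sequence
# in each batch, filling labels with -100 so padding tokens are ignored by the loss.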
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="pt",
)

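# fp16=True enables mixed-precision training on GPU; per_device_eval_batch_size is unused
# here because no eval_dataset is passed to the Trainer below.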
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=200,
    remove_unused_columns=False,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=data_collator,
)

trainer.train()
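
# Optionally persist the fine-tuned weights and tokenizer ("./results/final" is just an example path).
trainer.save_model("./results/final")
tokenizer.save_pretrained("./results/final")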