|
|
|
|
| from datasets import load_dataset |
| from transformers import T5Tokenizer, T5ForConditionalGeneration |
| import torch, csv |
# Map dataset split names to their CSV files for `load_dataset` below.
# NOTE(review): "train" and "test" point at the SAME file, so the test
# split is not held out and any evaluation on it would be meaningless —
# confirm whether a separate test CSV was intended.
file_dict = {
    "test": "name_dataset.csv",
    "train": "name_dataset.csv",
}
|
|
# Build train/test splits from the CSV files in `file_dict`.
# Column names are supplied explicitly, and `skiprows=1` (forwarded to the
# underlying CSV reader) presumably skips the file's own header row so it is
# not ingested as a data row — TODO confirm the CSV actually has a header.
# NOTE(review): both splits read the same file (see file_dict), so the two
# sizes printed below will always be equal.
dataset = load_dataset(
    'csv',
    data_files=file_dict,
    delimiter=',',
    column_names=['text', 'label'],
    skiprows=1
)


print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")
|
|
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
| from datasets import concatenate_datasets |
|
|
# Base checkpoint identifier. The tokenizer loaded here is reused for
# preprocessing, for the data collator, and is saved with the adapter later.
model_id = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
|
def tokenize_function(example):
    """Tokenize inputs and attach tokenized labels for seq2seq training.

    `example` is a (batched) dataset row with "text" and "label" columns.
    Returns the tokenized "text" encoding with an extra "labels" entry
    holding the label token ids, as expected by Seq2SeqTrainer.
    """
    encoded = tokenizer(example["text"], truncation=True)
    encoded["labels"] = tokenizer(example["label"], truncation=True)["input_ids"]
    return encoded
|
|
# Tokenize every split, then drop the raw string columns — the collator
# only needs the token ids, attention masks, and labels.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text", "label"])
|
|
| from transformers import DataCollatorForSeq2Seq |
# Full T5 base model; LoRA adapters are attached to it below.
model = T5ForConditionalGeneration.from_pretrained(model_id)
|
|
| from peft import LoraConfig, get_peft_model,TaskType |
|
|
|
|
# LoRA adapter spec: rank-16 updates on the attention query/value
# projection modules ("q", "v"), scaled by alpha=32, with light dropout
# and no trainable bias terms.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q", "v"],
    bias="none",
)

# Wrap the base model so only the adapter parameters are trainable,
# then report how few parameters that actually is.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
|
|
# Dynamic-padding collator for seq2seq batches. Label positions are padded
# with -100 so the loss ignores them; padding to a multiple of 8 is a
# common tensor-core-friendly choice.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)
|
|
| from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments |
|
|
# Training configuration. Intermediate checkpoints are disabled
# (save_strategy="no"); the final adapter is saved explicitly after training.
# NOTE(review): no evaluation is configured here even though a "test" split
# was loaded earlier — confirm whether evaluation was intended.
output_dir = "lora-t5"
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=100,
)
|
|
# Train the LoRA adapter on the train split only — no eval_dataset is
# passed, so the loaded "test" split goes unused by the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
)
# use_cache is disabled before training — presumably to suppress the
# generation-cache behavior/warning during training; confirm intent.
model.config.use_cache = False
trainer.train()
# Persist only the adapter weights (PEFT-style save) plus the tokenizer.
peft_model_id = "name-peft"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)
from transformers import T5ForConditionalGeneration, AutoTokenizer
from peft import PeftModel
# Reload the base checkpoint, apply the saved adapter from "name-peft",
# and fold the LoRA weights into the base model so the result can be used
# as a plain transformers model (no peft dependency at inference time).
base_model = T5ForConditionalGeneration.from_pretrained(model_id)
peft_model = PeftModel.from_pretrained(base_model, "name-peft")
peft_model = peft_model.merge_and_unload()
peft_model.save_pretrained("name-extraction")
# NOTE(review): this reloads the tokenizer from the same model_id although
# an identical tokenizer object already exists above — redundant but harmless.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained("name-extraction")
|
|