import os


def read_txt(file_path):
    """Return the contents of a text file, or an empty string if it cannot be read."""
    try:
        with open(file_path, "r") as file:
            return file.read()
    except (OSError, UnicodeDecodeError):
        # Skip unreadable files instead of aborting the whole run.
        return ""
|
|
| with open("train.txt", "w") as f: |
| f.write('') |
| |
| data = "" |
| for filename in os.listdir("./"): |
| file_path = os.path.join("./", filename) |
| if file_path.endswith(".txt") and (file_path != "train.txt"): |
| data += read_txt(file_path) |
| data = ' '.join(data.split('\n')) |
| |
| with open("train.txt", "a") as f: |
| f.write(data) |
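
# Assumption: concatenation order does not matter for training. If it does,
# note that os.listdir() returns entries in arbitrary, platform-dependent
# order; iterating over sorted(os.listdir("./")) makes the result reproducible.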
| |
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
|
|
def load_dataset(file_path, tokenizer, block_size=128):
    # TextDataset tokenizes the file and chunks it into fixed-length blocks.
    # Note: TextDataset is deprecated in recent transformers releases (see the
    # sketch below for an alternative).
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset
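
# Hedged alternative: TextDataset is deprecated in recent transformers
# releases in favour of the `datasets` library (an extra dependency assumed
# here). A rough, illustrative equivalent, not called in this script; unlike
# TextDataset it truncates long lines instead of chunking into fixed blocks:
#
#     from datasets import load_dataset as hf_load_dataset
#     raw = hf_load_dataset("text", data_files={"train": "train.txt"})
#     tokenized = raw["train"].map(
#         lambda batch: tokenizer(batch["text"], truncation=True, max_length=128),
#         batched=True,
#         remove_columns=["text"],
#     )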
|
|
def load_data_collator(tokenizer, mlm=False):
    # With mlm=False the collator builds causal-LM batches: the labels are the
    # input ids themselves, with padding positions masked out.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator
|
|
def train(train_file_path,
          model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Save the tokenizer and the base model up front so output_dir is a
    # complete, loadable checkpoint even before fine-tuning finishes.
    tokenizer.save_pretrained(output_dir)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,  # checkpoint every save_steps optimizer steps
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()
|
|
train_file_path = "train.txt"
model_name = "gpt2"
output_dir = "model"
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 50000
|
|
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)
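
# Trainer writes periodic checkpoints (checkpoint-<step>/ directories) under
# output_dir. resume_from_checkpoint is a standard Trainer.train() argument,
# so an interrupted run can pick up from the latest checkpoint, e.g.:
#
#     trainer.train(resume_from_checkpoint=True)
#
# (train() above would need to expose its Trainer for this; shown as a pointer.)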
|
|
from transformers import GPT2LMHeadModel, GPT2Tokenizer


def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer
|
|
def generate_text(model_path, sequence, max_length):
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)
    ids = tokenizer.encode(sequence, return_tensors="pt")
    # Top-k plus nucleus sampling: restrict to the 50 most likely tokens, then
    # to the smallest set whose cumulative probability exceeds 0.95.
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))
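
# Other standard generate() parameters worth knowing (example values are
# illustrative, not from the original script):
#
#     do_sample=False           # greedy decoding: deterministic output
#     temperature=0.7           # values < 1.0 sharpen the sampling distribution
#     num_return_sequences=3    # return several candidate continuations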
|
|
model_path = "model"  # the output_dir used during training above
sequence = "Hello!"
max_len = 50
generate_text(model_path, sequence, max_len)
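
# Generation runs on CPU as written. Assuming the default PyTorch backend,
# moving the model and inputs to a GPU (when one is available) speeds this up:
#
#     import torch
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     model.to(device)
#     ids = ids.to(device)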