import math

import torch
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast, get_cosine_schedule_with_warmup

from training import PreTrainer
from tynerox.modeling import TyneRoxModel, TyneRoxConfig
from dataset.pre_train import create_train_dataloader


if __name__ == "__main__":

    # Load the trained BPE tokenizer and wrap it in a HF fast tokenizer.
    tokenizer = Tokenizer.from_file("tokenizer/tokens-bpe-36k.json")
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="[UNK]",
        pad_token="<|endoftext|>",
        eos_token="<|endoftext|>",
    )
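
    # pad and eos share the <|endoftext|> id, so padding has to be masked out
    # via the attention mask / labels produced by the collator rather than by
    # token id alone. Quick sanity check:
    assert tokenizer.pad_token_id == tokenizer.eos_token_id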

    tokenizer.save_pretrained("../")

    # Model configuration derived from the tokenizer.
    config = TyneRoxConfig(
        vocab_size=tokenizer.vocab_size,
        pad_token_id=tokenizer.pad_token_id,
    )
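    # Caveat: PreTrainedTokenizerFast.vocab_size excludes tokens added on top
    # of the base vocab. If [UNK] or <|endoftext|> were not already in the 36k
    # BPE vocab, len(tokenizer) is the safer value for the embedding size.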

    model = TyneRoxModel(config)
    model.to("cuda")

    # Pre-training dataloader over the aroeira subset.
    folder_path = "bobboyms/subset-Itau-Unibanco-aroeira-1B-tokens"
    dataloader = create_train_dataloader(
        folder_path,
        tokenizer,
        batch_size=5,
        max_length=1024,
        drop_last=True,
        num_workers=10,
    )
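    # drop_last=True keeps every batch the same size; together with padding to
    # max_length this keeps tensor shapes static, which avoids torch.compile
    # recompilations (assuming create_train_dataloader pads to max_length).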

    # Compile the model, keeping `model` bound to the plain module so the
    # final save_pretrained writes a checkpoint with clean parameter names
    # (no `_orig_mod.` prefixes).
    compiled_model = torch.compile(model)
    optimizer = torch.optim.AdamW(
        compiled_model.parameters(),
        lr=0.000461,
        weight_decay=0.1,
    )
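    # Note: weight_decay=0.1 is applied uniformly here. Many pre-training
    # recipes exempt biases and norm parameters via parameter groups; a
    # possible sketch (not used in this run):
    #
    #     decay = [p for p in model.parameters() if p.dim() >= 2]
    #     no_decay = [p for p in model.parameters() if p.dim() < 2]
    #     optimizer = torch.optim.AdamW(
    #         [{"params": decay, "weight_decay": 0.1},
    #          {"params": no_decay, "weight_decay": 0.0}],
    #         lr=0.000461,
    #     )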

    # One epoch over the corpus, with linear warmup on the first 5% of steps.
    epochs = 1
    warmup_ratio = 0.05

    num_training_steps = len(dataloader) * epochs
    num_warmup_steps = math.floor(num_training_steps * warmup_ratio)
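    # Worked example: the subset holds 2,883,231 examples, so batch_size=5
    # with drop_last=True gives len(dataloader) = 576,646 steps per epoch,
    # and num_warmup_steps = floor(576_646 * 0.05) = 28,832 (assuming one
    # example per batch row; the code above uses len(dataloader) directly).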

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )
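    # With the default num_cycles=0.5, the scheduler scales the base lr by
    # step / num_warmup_steps during warmup and by 0.5 * (1 + cos(pi * p))
    # afterwards, where p = (step - num_warmup_steps) / (num_training_steps -
    # num_warmup_steps), so the lr decays from 0.000461 towards zero.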

    # Portuguese prompts used to sample generations during training.
    sample_prompts = [
        "Olá, como vai você? ",
        "Quando a manhã chegou, Iracema ainda estava ali, debruçada, como uma borboleta que ",
        "Não, respondeu; na verdade, estou com medo ",
        "O resultado representa uma desaceleração ",
        "No vídeo, é possível ver ",
        "Essa receita de torta de frango ",
        "Durante o primeiro mandato ",
        "Os donos de cães ",
    ]

    # Experiment-tracking configuration (tracking server on localhost:5000).
    logger_config = {
        "tracking_uri": "http://127.0.0.1:5000",
        "experiment": "Pre training LLM",
        "model_name": "Pre training LLM (Long Context)",
    }

    trainer = PreTrainer(
        model=compiled_model,
        optimizer=optimizer,
        scheduler=scheduler,
        tokenizer=tokenizer,
        train_loader=dataloader,
        test_loader=None,
        logger_config=logger_config,
        use_amp=True,  # mixed-precision training
    )

    trainer.train(num_epochs=epochs, sample_prompts=sample_prompts)

    # Save the uncompiled module so the checkpoint has clean weight names.
    model.save_pretrained("../")