adaptive-retro-gpt-1b / training_config.json
kyLELEng's picture
Train Adaptive-RETRO-GPT-1B
5a8b07f verified
{
"model_repo_id": "kyLELEng/adaptive-retro-gpt-1b",
"corpus_repo_id": "kyLELEng/adaptive-retro-gpt-1b-corpus",
"datastore_repo_id": "kyLELEng/adaptive-retro-gpt-1b-datastore",
"private_repo": true,
"push_to_hub": true,
"push_datasets": true,
"output_dir": "/tmp/adaptive-retro-gpt-1b",
"smoke_test": false,
"dataset_id": "HuggingFaceFW/fineweb-edu",
"dataset_config": "sample-10BT",
"train_split": "train",
"validation_split": "train",
"text_column": "text",
"streaming": true,
"min_text_chars": 200,
"datastore_dataset_id": "wikimedia/wikipedia",
"datastore_dataset_config": "20231101.en",
"datastore_split": "train",
"datastore_text_column": "text",
"corpus_train_docs": 80000,
"corpus_validation_docs": 4000,
"tokenizer_train_docs": 200000,
"vocab_size": 50000,
"max_train_docs": 120000,
"max_eval_docs": 2048,
"max_index_docs": 120000,
"max_index_chunks": 120000,
"chunk_tokens": 96,
"min_chunk_tokens": 24,
"hash_dim": 1024,
"top_k": 2,
"retrieval_seq_len": 512,
"seq_len": 2048,
"d_model": 2048,
"n_layers": 20,
"n_heads": 16,
"dropout": 0.0,
"retrieval_layers": "5,11,17",
"retrieval_budget_lambda": 0.001,
"no_retrieval_prob": 0.1,
"random_retrieval_prob": 0.1,
"max_steps": 20000,
"per_device_batch_size": 2,
"gradient_accumulation_steps": 2,
"learning_rate": 0.00018,
"min_lr_ratio": 0.1,
"warmup_steps": 1000,
"weight_decay": 0.1,
"grad_clip": 1.0,
"precision": "bf16",
"gradient_checkpointing": false,
"log_every": 10,
"eval_every": 2000,
"save_every": 5000,
"max_eval_batches": 32,
"seed": 17
}