{
  "model_repo_id": "kyLELEng/adaptive-retro-gpt-1b",
  "corpus_repo_id": "kyLELEng/adaptive-retro-gpt-1b-corpus",
  "datastore_repo_id": "kyLELEng/adaptive-retro-gpt-1b-datastore",
  "private_repo": true,
  "push_to_hub": true,
  "push_datasets": true,
  "output_dir": "/tmp/adaptive-retro-gpt-1b",
  "smoke_test": false,
  "dataset_id": "HuggingFaceFW/fineweb-edu",
  "dataset_config": "sample-10BT",
  "train_split": "train",
  "validation_split": "train",
  "text_column": "text",
  "streaming": true,
  "min_text_chars": 200,
  "datastore_dataset_id": "wikimedia/wikipedia",
  "datastore_dataset_config": "20231101.en",
  "datastore_split": "train",
  "datastore_text_column": "text",
  "corpus_train_docs": 80000,
  "corpus_validation_docs": 4000,
  "tokenizer_train_docs": 200000,
  "vocab_size": 50000,
  "max_train_docs": 120000,
  "max_eval_docs": 2048,
  "max_index_docs": 120000,
  "max_index_chunks": 120000,
  "chunk_tokens": 96,
  "min_chunk_tokens": 24,
  "hash_dim": 1024,
  "top_k": 2,
  "retrieval_seq_len": 512,
  "seq_len": 2048,
  "d_model": 2048,
  "n_layers": 20,
  "n_heads": 16,
  "dropout": 0.0,
  "retrieval_layers": "5,11,17",
  "retrieval_budget_lambda": 0.001,
  "no_retrieval_prob": 0.1,
  "random_retrieval_prob": 0.1,
  "max_steps": 20000,
  "per_device_batch_size": 2,
  "gradient_accumulation_steps": 2,
  "learning_rate": 0.00018,
  "min_lr_ratio": 0.1,
  "warmup_steps": 1000,
  "weight_decay": 0.1,
  "grad_clip": 1.0,
  "precision": "bf16",
  "gradient_checkpointing": false,
  "log_every": 10,
  "eval_every": 2000,
  "save_every": 5000,
  "max_eval_batches": 32,
  "seed": 17
}