harshad317's picture
Uploading the base model
2ca81ec verified
{
"step": 45444,
"val_bpb": 0.6811688407173832,
"model_config": {
"sequence_len": 8192,
"vocab_size": 131072,
"n_layer": 16,
"n_head": 8,
"n_kv_head": 8,
"n_embd": 1024,
"moe_num_experts": 8,
"moe_top_k": 2,
"moe_layer_interval": 3,
"moe_group_size": 4,
"moe_expert_intermediate_size": 1792,
"moe_adjugate_intermediate_size": 0,
"moe_adjugate_scale": 0.05,
"moe_router_aux_loss_coef": 0.015,
"moe_router_bias_lr": 0.001,
"moe_activation_checkpoint": true,
"moe_capacity_factor": 0.75,
"rotary_scaling_type": "yarn",
"rotary_scale_factor": 4.0,
"residual_scale": -1.0,
"attn_dropout": 0.01,
"label_smoothing": 0.0,
"z_loss_weight": 0.0,
"use_flash_attention": true,
"domain_router_dim": 32,
"num_domain_tags": 128,
"domain_router_features": {
"dataset": {
"capacity": 128,
"mode": "one_hot"
},
"quality": {
"capacity": 32,
"mode": "one_hot"
},
"specialty": {
"capacity": 64,
"mode": "one_hot"
},
"modality": {
"capacity": 32,
"mode": "one_hot"
},
"language": {
"capacity": 32,
"mode": "one_hot"
},
"origin": {
"capacity": 8,
"mode": "one_hot"
}
}
},
"user_config": {
"run": "continous_pretraining",
"device_type": "",
"depth": 16,
"max_seq_len": 8192,
"moe_num_experts": 8,
"moe_top_k": 2,
"moe_layer_interval": 3,
"moe_group_size": 4,
"moe_expert_intermediate_size": 1792,
"moe_adjugate_intermediate_size": 0,
"moe_adjugate_scale": 0.05,
"moe_router_aux_loss_coef": 0.015,
"moe_router_bias_lr": 0.001,
"moe_activation_checkpoint": true,
"moe_capacity_factor": 0.75,
"moe_router_bias_reset_interval": 0,
"domain_router_dim": 32,
"domain_tag_capacity": 128,
"quality_tag_capacity": 32,
"specialty_tag_capacity": 64,
"modality_tag_capacity": 32,
"language_tag_capacity": 32,
"origin_tag_capacity": 8,
"domain_router_feature_modes": "dataset:one_hot,quality:one_hot,specialty:one_hot,modality:one_hot,language:one_hot,origin:one_hot",
"rotary_scaling_type": "yarn",
"rotary_scale_factor": 4.0,
"residual_scale": -1.0,
"attn_dropout": 0.01,
"label_smoothing": 0.0,
"z_loss_weight": 0.0,
"use_flash_attention": true,
"use_distributed_muon": false,
"num_iterations": -1,
"target_flops": -1.0,
"target_param_data_ratio": 20,
"device_batch_size": 1,
"total_batch_size": 524288,
"max_grad_accum_steps": 0,
"embedding_lr": 0.2,
"unembedding_lr": 0.004,
"weight_decay": 0.0,
"freeze_embedding_optimizers": false,
"matrix_lr": 0.02,
"grad_clip": 1.0,
"warmup_ratio": 0.0,
"warmdown_ratio": 0.2,
"final_lr_frac": 0.0,
"eval_every": 500,
"eval_tokens": "[redacted]",
"core_metric_every": 1000,
"core_metric_max_per_task": 2500,
"sample_every": 1000,
"micro_eval_every": 250,
"micro_eval_tokens": "[redacted]",
"skip_initial_eval": false,
"fast_dev_run": false,
"fast_dev_num_iterations": 200,
"fast_dev_max_grad_accum_steps": 2,
"fast_dev_eval_tokens_multiplier": "[redacted]",
"model_tag": "d16_cont",
"hf_repo_id": "harshad317/base_Medical_continuous",
"hf_repo_type": "model",
"hf_path_in_repo": "",
"hf_commit_message": "Uploading the base model",
"hf_private": false,
"hf_token": "[redacted]",
"hf_max_shard_size": "2gb",
"resume_from_checkpoint": "d16",
"resume_from_step": 22722,
"resume_load_optimizer": true,
"base_dataset_num_shards": 1250,
"train_dataset_mix": "",
"train_dataset_mix_path": "configs/train_dataset_mix.json",
"loader_dedup_window": 300000,
"loader_enable_length_bucketing": true,
"loader_bucket_bins": "512,1024,2048,4096,8192",
"loader_prefetch_batches": 2,
"loader_tokenizer_threads": "[redacted]",
"loader_tokenizer_batch_size": "[redacted]",
"train_single_pass": true,
"enable_length_curriculum": false,
"sequence_length_schedule": "0.25:512,0.50:1024,0.75:2048,1.0:4096",
"length_schedule_round_to": 32,
"enable_dataset_reweighting": true,
"dataset_reweight_interval": 2000,
"dataset_reweight_strength": 0.75,
"dataset_reweight_smoothing": 0.2,
"dataset_reweight_min_multiplier": 0.25,
"dataset_reweight_warmup_steps": 4000,
"memory_profile_interval": 500,
"dataset_mix_source": "/home/jupyter/nanochat/configs/train_dataset_mix.json"
},
"device_batch_size": 1,
"max_seq_len": 8192
}