{ "step": 45444, "val_bpb": 0.6811688407173832, "model_config": { "sequence_len": 8192, "vocab_size": 131072, "n_layer": 16, "n_head": 8, "n_kv_head": 8, "n_embd": 1024, "moe_num_experts": 8, "moe_top_k": 2, "moe_layer_interval": 3, "moe_group_size": 4, "moe_expert_intermediate_size": 1792, "moe_adjugate_intermediate_size": 0, "moe_adjugate_scale": 0.05, "moe_router_aux_loss_coef": 0.015, "moe_router_bias_lr": 0.001, "moe_activation_checkpoint": true, "moe_capacity_factor": 0.75, "rotary_scaling_type": "yarn", "rotary_scale_factor": 4.0, "residual_scale": -1.0, "attn_dropout": 0.01, "label_smoothing": 0.0, "z_loss_weight": 0.0, "use_flash_attention": true, "domain_router_dim": 32, "num_domain_tags": 128, "domain_router_features": { "dataset": { "capacity": 128, "mode": "one_hot" }, "quality": { "capacity": 32, "mode": "one_hot" }, "specialty": { "capacity": 64, "mode": "one_hot" }, "modality": { "capacity": 32, "mode": "one_hot" }, "language": { "capacity": 32, "mode": "one_hot" }, "origin": { "capacity": 8, "mode": "one_hot" } } }, "user_config": { "run": "continous_pretraining", "device_type": "", "depth": 16, "max_seq_len": 8192, "moe_num_experts": 8, "moe_top_k": 2, "moe_layer_interval": 3, "moe_group_size": 4, "moe_expert_intermediate_size": 1792, "moe_adjugate_intermediate_size": 0, "moe_adjugate_scale": 0.05, "moe_router_aux_loss_coef": 0.015, "moe_router_bias_lr": 0.001, "moe_activation_checkpoint": true, "moe_capacity_factor": 0.75, "moe_router_bias_reset_interval": 0, "domain_router_dim": 32, "domain_tag_capacity": 128, "quality_tag_capacity": 32, "specialty_tag_capacity": 64, "modality_tag_capacity": 32, "language_tag_capacity": 32, "origin_tag_capacity": 8, "domain_router_feature_modes": "dataset:one_hot,quality:one_hot,specialty:one_hot,modality:one_hot,language:one_hot,origin:one_hot", "rotary_scaling_type": "yarn", "rotary_scale_factor": 4.0, "residual_scale": -1.0, "attn_dropout": 0.01, "label_smoothing": 0.0, "z_loss_weight": 0.0, "use_flash_attention": true, "use_distributed_muon": false, "num_iterations": -1, "target_flops": -1.0, "target_param_data_ratio": 20, "device_batch_size": 1, "total_batch_size": 524288, "max_grad_accum_steps": 0, "embedding_lr": 0.2, "unembedding_lr": 0.004, "weight_decay": 0.0, "freeze_embedding_optimizers": false, "matrix_lr": 0.02, "grad_clip": 1.0, "warmup_ratio": 0.0, "warmdown_ratio": 0.2, "final_lr_frac": 0.0, "eval_every": 500, "eval_tokens": "[redacted]", "core_metric_every": 1000, "core_metric_max_per_task": 2500, "sample_every": 1000, "micro_eval_every": 250, "micro_eval_tokens": "[redacted]", "skip_initial_eval": false, "fast_dev_run": false, "fast_dev_num_iterations": 200, "fast_dev_max_grad_accum_steps": 2, "fast_dev_eval_tokens_multiplier": "[redacted]", "model_tag": "d16_cont", "hf_repo_id": "harshad317/base_Medical_continuous", "hf_repo_type": "model", "hf_path_in_repo": "", "hf_commit_message": "Uploading the base model", "hf_private": false, "hf_token": "[redacted]", "hf_max_shard_size": "2gb", "resume_from_checkpoint": "d16", "resume_from_step": 22722, "resume_load_optimizer": true, "base_dataset_num_shards": 1250, "train_dataset_mix": "", "train_dataset_mix_path": "configs/train_dataset_mix.json", "loader_dedup_window": 300000, "loader_enable_length_bucketing": true, "loader_bucket_bins": "512,1024,2048,4096,8192", "loader_prefetch_batches": 2, "loader_tokenizer_threads": "[redacted]", "loader_tokenizer_batch_size": "[redacted]", "train_single_pass": true, "enable_length_curriculum": false, "sequence_length_schedule": "0.25:512,0.50:1024,0.75:2048,1.0:4096", "length_schedule_round_to": 32, "enable_dataset_reweighting": true, "dataset_reweight_interval": 2000, "dataset_reweight_strength": 0.75, "dataset_reweight_smoothing": 0.2, "dataset_reweight_min_multiplier": 0.25, "dataset_reweight_warmup_steps": 4000, "memory_profile_interval": 500, "dataset_mix_source": "/home/jupyter/nanochat/configs/train_dataset_mix.json" }, "device_batch_size": 1, "max_seq_len": 8192 }