| { |
| "step": 45444, |
| "val_bpb": 0.6811688407173832, |
| "model_config": { |
| "sequence_len": 8192, |
| "vocab_size": 131072, |
| "n_layer": 16, |
| "n_head": 8, |
| "n_kv_head": 8, |
| "n_embd": 1024, |
| "moe_num_experts": 8, |
| "moe_top_k": 2, |
| "moe_layer_interval": 3, |
| "moe_group_size": 4, |
| "moe_expert_intermediate_size": 1792, |
| "moe_adjugate_intermediate_size": 0, |
| "moe_adjugate_scale": 0.05, |
| "moe_router_aux_loss_coef": 0.015, |
| "moe_router_bias_lr": 0.001, |
| "moe_activation_checkpoint": true, |
| "moe_capacity_factor": 0.75, |
| "rotary_scaling_type": "yarn", |
| "rotary_scale_factor": 4.0, |
| "residual_scale": -1.0, |
| "attn_dropout": 0.01, |
| "label_smoothing": 0.0, |
| "z_loss_weight": 0.0, |
| "use_flash_attention": true, |
| "domain_router_dim": 32, |
| "num_domain_tags": 128, |
| "domain_router_features": { |
| "dataset": { |
| "capacity": 128, |
| "mode": "one_hot" |
| }, |
| "quality": { |
| "capacity": 32, |
| "mode": "one_hot" |
| }, |
| "specialty": { |
| "capacity": 64, |
| "mode": "one_hot" |
| }, |
| "modality": { |
| "capacity": 32, |
| "mode": "one_hot" |
| }, |
| "language": { |
| "capacity": 32, |
| "mode": "one_hot" |
| }, |
| "origin": { |
| "capacity": 8, |
| "mode": "one_hot" |
| } |
| } |
| }, |
| "user_config": { |
| "run": "continous_pretraining", |
| "device_type": "", |
| "depth": 16, |
| "max_seq_len": 8192, |
| "moe_num_experts": 8, |
| "moe_top_k": 2, |
| "moe_layer_interval": 3, |
| "moe_group_size": 4, |
| "moe_expert_intermediate_size": 1792, |
| "moe_adjugate_intermediate_size": 0, |
| "moe_adjugate_scale": 0.05, |
| "moe_router_aux_loss_coef": 0.015, |
| "moe_router_bias_lr": 0.001, |
| "moe_activation_checkpoint": true, |
| "moe_capacity_factor": 0.75, |
| "moe_router_bias_reset_interval": 0, |
| "domain_router_dim": 32, |
| "domain_tag_capacity": 128, |
| "quality_tag_capacity": 32, |
| "specialty_tag_capacity": 64, |
| "modality_tag_capacity": 32, |
| "language_tag_capacity": 32, |
| "origin_tag_capacity": 8, |
| "domain_router_feature_modes": "dataset:one_hot,quality:one_hot,specialty:one_hot,modality:one_hot,language:one_hot,origin:one_hot", |
| "rotary_scaling_type": "yarn", |
| "rotary_scale_factor": 4.0, |
| "residual_scale": -1.0, |
| "attn_dropout": 0.01, |
| "label_smoothing": 0.0, |
| "z_loss_weight": 0.0, |
| "use_flash_attention": true, |
| "use_distributed_muon": false, |
| "num_iterations": -1, |
| "target_flops": -1.0, |
| "target_param_data_ratio": 20, |
| "device_batch_size": 1, |
| "total_batch_size": 524288, |
| "max_grad_accum_steps": 0, |
| "embedding_lr": 0.2, |
| "unembedding_lr": 0.004, |
| "weight_decay": 0.0, |
| "freeze_embedding_optimizers": false, |
| "matrix_lr": 0.02, |
| "grad_clip": 1.0, |
| "warmup_ratio": 0.0, |
| "warmdown_ratio": 0.2, |
| "final_lr_frac": 0.0, |
| "eval_every": 500, |
| "eval_tokens": "[redacted]", |
| "core_metric_every": 1000, |
| "core_metric_max_per_task": 2500, |
| "sample_every": 1000, |
| "micro_eval_every": 250, |
| "micro_eval_tokens": "[redacted]", |
| "skip_initial_eval": false, |
| "fast_dev_run": false, |
| "fast_dev_num_iterations": 200, |
| "fast_dev_max_grad_accum_steps": 2, |
| "fast_dev_eval_tokens_multiplier": "[redacted]", |
| "model_tag": "d16_cont", |
| "hf_repo_id": "harshad317/base_Medical_continuous", |
| "hf_repo_type": "model", |
| "hf_path_in_repo": "", |
| "hf_commit_message": "Uploading the base model", |
| "hf_private": false, |
| "hf_token": "[redacted]", |
| "hf_max_shard_size": "2gb", |
| "resume_from_checkpoint": "d16", |
| "resume_from_step": 22722, |
| "resume_load_optimizer": true, |
| "base_dataset_num_shards": 1250, |
| "train_dataset_mix": "", |
| "train_dataset_mix_path": "configs/train_dataset_mix.json", |
| "loader_dedup_window": 300000, |
| "loader_enable_length_bucketing": true, |
| "loader_bucket_bins": "512,1024,2048,4096,8192", |
| "loader_prefetch_batches": 2, |
| "loader_tokenizer_threads": "[redacted]", |
| "loader_tokenizer_batch_size": "[redacted]", |
| "train_single_pass": true, |
| "enable_length_curriculum": false, |
| "sequence_length_schedule": "0.25:512,0.50:1024,0.75:2048,1.0:4096", |
| "length_schedule_round_to": 32, |
| "enable_dataset_reweighting": true, |
| "dataset_reweight_interval": 2000, |
| "dataset_reweight_strength": 0.75, |
| "dataset_reweight_smoothing": 0.2, |
| "dataset_reweight_min_multiplier": 0.25, |
| "dataset_reweight_warmup_steps": 4000, |
| "memory_profile_interval": 500, |
| "dataset_mix_source": "/home/jupyter/nanochat/configs/train_dataset_mix.json" |
| }, |
| "device_batch_size": 1, |
| "max_seq_len": 8192 |
| } |