Uploading the base model

2ca81ec verified 5 months ago

4.66 kB

	{
	"step": 45444,
	"val_bpb": 0.6811688407173832,
	"model_config": {
	"sequence_len": 8192,
	"vocab_size": 131072,
	"n_layer": 16,
	"n_head": 8,
	"n_kv_head": 8,
	"n_embd": 1024,
	"moe_num_experts": 8,
	"moe_top_k": 2,
	"moe_layer_interval": 3,
	"moe_group_size": 4,
	"moe_expert_intermediate_size": 1792,
	"moe_adjugate_intermediate_size": 0,
	"moe_adjugate_scale": 0.05,
	"moe_router_aux_loss_coef": 0.015,
	"moe_router_bias_lr": 0.001,
	"moe_activation_checkpoint": true,
	"moe_capacity_factor": 0.75,
	"rotary_scaling_type": "yarn",
	"rotary_scale_factor": 4.0,
	"residual_scale": -1.0,
	"attn_dropout": 0.01,
	"label_smoothing": 0.0,
	"z_loss_weight": 0.0,
	"use_flash_attention": true,
	"domain_router_dim": 32,
	"num_domain_tags": 128,
	"domain_router_features": {
	"dataset": {
	"capacity": 128,
	"mode": "one_hot"
	},
	"quality": {
	"capacity": 32,
	"mode": "one_hot"
	},
	"specialty": {
	"capacity": 64,
	"mode": "one_hot"
	},
	"modality": {
	"capacity": 32,
	"mode": "one_hot"
	},
	"language": {
	"capacity": 32,
	"mode": "one_hot"
	},
	"origin": {
	"capacity": 8,
	"mode": "one_hot"
	}
	}
	},
	"user_config": {
	"run": "continous_pretraining",
	"device_type": "",
	"depth": 16,
	"max_seq_len": 8192,
	"moe_num_experts": 8,
	"moe_top_k": 2,
	"moe_layer_interval": 3,
	"moe_group_size": 4,
	"moe_expert_intermediate_size": 1792,
	"moe_adjugate_intermediate_size": 0,
	"moe_adjugate_scale": 0.05,
	"moe_router_aux_loss_coef": 0.015,
	"moe_router_bias_lr": 0.001,
	"moe_activation_checkpoint": true,
	"moe_capacity_factor": 0.75,
	"moe_router_bias_reset_interval": 0,
	"domain_router_dim": 32,
	"domain_tag_capacity": 128,
	"quality_tag_capacity": 32,
	"specialty_tag_capacity": 64,
	"modality_tag_capacity": 32,
	"language_tag_capacity": 32,
	"origin_tag_capacity": 8,
	"domain_router_feature_modes": "dataset:one_hot,quality:one_hot,specialty:one_hot,modality:one_hot,language:one_hot,origin:one_hot",
	"rotary_scaling_type": "yarn",
	"rotary_scale_factor": 4.0,
	"residual_scale": -1.0,
	"attn_dropout": 0.01,
	"label_smoothing": 0.0,
	"z_loss_weight": 0.0,
	"use_flash_attention": true,
	"use_distributed_muon": false,
	"num_iterations": -1,
	"target_flops": -1.0,
	"target_param_data_ratio": 20,
	"device_batch_size": 1,
	"total_batch_size": 524288,
	"max_grad_accum_steps": 0,
	"embedding_lr": 0.2,
	"unembedding_lr": 0.004,
	"weight_decay": 0.0,
	"freeze_embedding_optimizers": false,
	"matrix_lr": 0.02,
	"grad_clip": 1.0,
	"warmup_ratio": 0.0,
	"warmdown_ratio": 0.2,
	"final_lr_frac": 0.0,
	"eval_every": 500,
	"eval_tokens": "[redacted]",
	"core_metric_every": 1000,
	"core_metric_max_per_task": 2500,
	"sample_every": 1000,
	"micro_eval_every": 250,
	"micro_eval_tokens": "[redacted]",
	"skip_initial_eval": false,
	"fast_dev_run": false,
	"fast_dev_num_iterations": 200,
	"fast_dev_max_grad_accum_steps": 2,
	"fast_dev_eval_tokens_multiplier": "[redacted]",
	"model_tag": "d16_cont",
	"hf_repo_id": "harshad317/base_Medical_continuous",
	"hf_repo_type": "model",
	"hf_path_in_repo": "",
	"hf_commit_message": "Uploading the base model",
	"hf_private": false,
	"hf_token": "[redacted]",
	"hf_max_shard_size": "2gb",
	"resume_from_checkpoint": "d16",
	"resume_from_step": 22722,
	"resume_load_optimizer": true,
	"base_dataset_num_shards": 1250,
	"train_dataset_mix": "",
	"train_dataset_mix_path": "configs/train_dataset_mix.json",
	"loader_dedup_window": 300000,
	"loader_enable_length_bucketing": true,
	"loader_bucket_bins": "512,1024,2048,4096,8192",
	"loader_prefetch_batches": 2,
	"loader_tokenizer_threads": "[redacted]",
	"loader_tokenizer_batch_size": "[redacted]",
	"train_single_pass": true,
	"enable_length_curriculum": false,
	"sequence_length_schedule": "0.25:512,0.50:1024,0.75:2048,1.0:4096",
	"length_schedule_round_to": 32,
	"enable_dataset_reweighting": true,
	"dataset_reweight_interval": 2000,
	"dataset_reweight_strength": 0.75,
	"dataset_reweight_smoothing": 0.2,
	"dataset_reweight_min_multiplier": 0.25,
	"dataset_reweight_warmup_steps": 4000,
	"memory_profile_interval": 500,
	"dataset_mix_source": "/home/jupyter/nanochat/configs/train_dataset_mix.json"
	},
	"device_batch_size": 1,
	"max_seq_len": 8192
	}